diff --git a/.gitignore b/.gitignore index 228be554..4bd469ff 100644 --- a/.gitignore +++ b/.gitignore @@ -12,9 +12,7 @@ tests/*.c tests/*.pyxbldc tests/*.sam tests/*.fai -tests/pysam_data/*.bam -tests/pysam_data/*.bam.bai -tests/pysam_test_work +tests/pysam_data # cython files pysam/TabProxies.c diff --git a/AUTHORS b/AUTHORS index 308641ea..4b005369 100644 --- a/AUTHORS +++ b/AUTHORS @@ -1,10 +1,12 @@ List of contributors: -Andreas Heger, Tildon Grant Belgard, Kevin B. Jacobs, Florian -Finkernagel, Leo Goodstadt, Martin Goodson all contributed code -to pysam. +Andreas Heger, Tildon Grant Belgard, Florian Finkernagel, Leo +Goodstadt, Martin Goodson all contributed code to pysam. -Gerton Lunter provided a VCF parser. +Kevin B. Jacobs implemented a Cython wrapper for the VCF/BCF +reader/writer in htslib. + +Gerton Lunter provided a validating VCF parser. Marcel Martin implemented python 3 compatibility. Ben Schiller contributed a Windows compatible clone. diff --git a/doc/faq.rst b/doc/faq.rst index 30393f1f..20246a03 100644 --- a/doc/faq.rst +++ b/doc/faq.rst @@ -84,6 +84,24 @@ Note that re-opening files incurs a performance penalty which can become severe when calling :meth:`~pysam.AlignmentFile.fetch` often. Thus, ``multiple_iterators`` is set to ``False`` by default. +AlignmentFile.fetch does not show unmapped reads +================================================ + +:meth:`~pysam.AlignmentFile.fetch` will only iterate over alignments +in the SAM/BAM file. 
The following thus always works:: + + bf = pysam.AlignmentFile(fname, "rb") + for r in bf.fetch(): + assert not r.is_unmapped + +If the SAM/BAM file contains unaligned reads, they can be included +in the iteration by adding the ``until_eof=True`` flag:: + + bf = pysam.AlignmentFile(fname, "rb") + for r in bf.fetch(until_eof=True): + if r.is_unmapped: + print "read is unmapped" + BAM files with a large number of reference sequences is slow ============================================================ @@ -100,7 +118,7 @@ header. This might require a lot of jumping around in the file. To avoid this, use:: track = pysam.AlignmentFile(fname, "rb") - for aln in track.fetch(until_eof = True): + for aln in track.fetch(until_eof=True): pass This will iterate through reads as they appear in the file. diff --git a/doc/release.rst b/doc/release.rst index dab6953a..e94e7157 100644 --- a/doc/release.rst +++ b/doc/release.rst @@ -3,11 +3,23 @@ Release notes ============= Release 0.8.2 +============= +* Pysam now wraps htslib 1.2.1 and samtools version 1.2 +* Added CRAM file support to pysam +* New alignment info interface. + * opt() and setTag are deprecated, use get_tag() and set_tag() + instead. + * added has_tag() + * tags is deprecated, use get_tags() and set_tags() instead. * FastqFile is now FastxFile to reflect that the latter permits iteration over both fastq- and fasta-formatted files. +* A Cython wrapper for htslib VCF/BCF reader/writer. The wrapper + provides a nearly complete Pythonic interface to VCF/BCF metadata + with reading and writing capability. However, the interface is still + incomplete and preliminary and lacks capability to mutate the + resulting data. - Release 0.8.1 ============= diff --git a/htslib/INSTALL b/htslib/INSTALL index 1e8df2f0..ba65bd31 100644 --- a/htslib/INSTALL +++ b/htslib/INSTALL @@ -1,25 +1,79 @@ -System Requirements -=================== +Basic Installation +================== -HTSlib depends on the zlib library . 
Building HTSlib requires -zlib development files to be installed on the build machine; you may need to -ensure a package such as zlib1g-dev (on Debian or Ubuntu Linux) or zlib-devel -(on RPM/yum-based distributions) is installed. +To build and install HTSlib, 'cd' to the htslib-1.x directory containing +the package's source and type the following commands: + ./configure + make + make install -Compilation -=========== +The './configure' command checks your build environment and allows various +optional functionality to be enabled (see Configuration below). If you +don't want to select any optional functionality, you may wish to omit +configure and just type 'make; make install' as for previous versions +of HTSlib. However if the build fails you should run './configure' as +it can diagnose the common reasons for build failures. -'cd' to the htslib-1.x directory containing the package's source and type -'make' to compile HTSlib. +The 'make' command builds the HTSlib library and various useful +utilities: bgzip, htsfile, and tabix. If compilation fails you should +run './configure' as it can diagnose problems with your build environment +that cause build failures. +The 'make install' command installs the libraries, library header files, +utilities, several manual pages, and a pkgconfig file to /usr/local. +The installation location can be changed by configuring with --prefix=DIR +or via 'make prefix=DIR install' (see Installation Locations below). -Installation -============ -Type 'make install' to install the bgzip and tabix utilities, library headers, -library archives, several manual pages, and a pkgconfig file to /usr/local. +Configuration +============= -Type 'make prefix=/path/to/dir install' to install everything under your -choice of installation directory. The install target also understands -DESTDIR and the other usual installation directory variables. 
+By default, './configure' examines your build environment, checking for +requirements such as the zlib development files, and arranges for a plain +HTSlib build. The following configure options can be used to enable +various features and specify further optional external requirements: + +--with-irods[=DIR] + Specifies the location of the iRODS client library to use to enable + access to data objects stored in iRODS () via file + paths like 'irods:DATAOBJ'. DIR is the base of an iRODS source tree + such that the library is present as DIR/lib/core/obj/libRodsAPI.* and + headers are present under DIR/lib/api/include and so on. If '=DIR' is + omitted, $IRODS_HOME will be used as a base directory. + +The configure script also accepts the usual options and environment variables +for tuning installation locations and compilers: type './configure --help' +for details. For example, + + ./configure CC=icc --prefix=/opt/icc-compiled + +would specify that HTSlib is to be built with icc and installed into bin, +lib, etc subdirectories under /opt/icc-compiled. + + +Installation Locations +====================== + +By default, 'make install' installs HTSlib libraries under /usr/local/lib, +HTSlib header files under /usr/local/include, utility programs under +/usr/local/bin, etc. (To be precise, the header files are installed within +a fixed 'htslib' subdirectory under the specified .../include location.) + +You can specify a different location to install HTSlib by configuring +with --prefix=DIR or specify locations for particular parts of HTSlib by +configuring with --libdir=DIR and so on. Type './configure --help' for +the full list of such install directory options. + +Alternatively you can specify different locations at install time by +typing 'make prefix=DIR install' or 'make libdir=DIR install' and so on. +Consult the list of prefix/exec_prefix/etc variables near the top of the +Makefile for the full list of such variables that can be overridden. 
+ +You can also specify a staging area by typing 'make DESTDIR=DIR install', +possibly in conjunction with other --prefix or prefix=DIR settings. +For example, + + make DESTDIR=/tmp/staging prefix=/opt + +would install into bin, lib, etc subdirectories under /tmp/staging/opt. diff --git a/htslib/Makefile b/htslib/Makefile index 69199030..5120b248 100644 --- a/htslib/Makefile +++ b/htslib/Makefile @@ -1,6 +1,6 @@ # Makefile for htslib, a C library for high-throughput sequencing data formats. # -# Copyright (C) 2013-2014 Genome Research Ltd. +# Copyright (C) 2013-2015 Genome Research Ltd. # # Author: John Marshall # @@ -26,20 +26,38 @@ CC = gcc AR = ar RANLIB = ranlib -# TODO: edit cram code to remove need for -DSAMTOOLS -CPPFLAGS = -I. -DSAMTOOLS=1 +CPPFLAGS = -I. # TODO: probably update cram code to make it compile cleanly with -Wc++-compat CFLAGS = -g -Wall -O2 EXTRA_CFLAGS_PIC = -fpic LDFLAGS = LDLIBS = -prefix = /ifs/apps/bio/htslib-1.1 +# For now these don't work too well as samtools also needs to know to +# add -lbz2 and -llzma if linking against the static libhts.a library. +# TODO This needs configury and adding to htslib.pc.in. +# +# # Bzip2 support; optionally used by CRAM. +# HAVE_LIBBZ2 := $(shell echo -e "\#include \012int main(void){return 0;}" > .test.c && $(CC) $(CFLAGS) $(CPPFLAGS) -o .test .test.c -lbz2 2>/dev/null && echo yes) +# ifeq "$(HAVE_LIBBZ2)" "yes" +# CPPFLAGS += -DHAVE_LIBBZ2 +# LDLIBS += -lbz2 +# endif +# +# # Lzma support; optionally used by CRAM. 
+# HAVE_LIBLZMA := $(shell echo -e "\#include \012int main(void){return 0;}" > .test.c && $(CC) $(CFLAGS) $(CPPFLAGS) -o .test .test.c -llzma 2>/dev/null && echo yes) +# ifeq "$(HAVE_LIBLZMA)" "yes" +# CPPFLAGS += -DHAVE_LIBLZMA +# LDLIBS += -llzma +# endif + +prefix = /usr/local exec_prefix = $(prefix) bindir = $(exec_prefix)/bin includedir = $(prefix)/include libdir = $(exec_prefix)/lib -mandir = $(prefix)/share/man +datarootdir = $(prefix)/share +mandir = $(datarootdir)/man man1dir = $(mandir)/man1 man5dir = $(mandir)/man5 pkgconfigdir= $(libdir)/pkgconfig @@ -52,12 +70,14 @@ INSTALL_DIR = $(MKDIR_P) -m 755 BUILT_PROGRAMS = \ bgzip \ + htsfile \ tabix BUILT_TEST_PROGRAMS = \ test/fieldarith \ test/hfile \ test/sam \ + test/test-regidx \ test/test_view \ test/test-vcf-api \ test/test-vcf-sweep @@ -81,7 +101,7 @@ lib-shared: libhts.so endif -PACKAGE_VERSION = 1.1 +PACKAGE_VERSION = 1.2.1 LIBHTS_SOVERSION = 1 @@ -113,6 +133,9 @@ endif version.h: echo '#define HTS_VERSION "$(PACKAGE_VERSION)"' > $@ +print-version: + @echo $(PACKAGE_VERSION) + .SUFFIXES: .c .o .pico @@ -132,6 +155,7 @@ LIBHTS_OBJS = \ hfile.o \ hfile_net.o \ hts.o \ + regidx.o \ sam.o \ synced_bcf_reader.o \ vcf_sweep.o \ @@ -150,12 +174,33 @@ LIBHTS_OBJS = \ cram/md5.o \ cram/open_trace_file.o \ cram/pooled_alloc.o \ + cram/rANS_static.o \ cram/sam_header.o \ cram/string_alloc.o \ cram/thread_pool.o \ cram/vlen.o \ cram/zfio.o +cram_h = cram/cram.h $(cram_samtools_h) $(cram_sam_header_h) $(cram_structs_h) $(cram_io_h) cram/cram_encode.h cram/cram_decode.h cram/cram_stats.h cram/cram_codecs.h cram/cram_index.h +cram_io_h = cram/cram_io.h $(cram_misc_h) +cram_misc_h = cram/misc.h cram/os.h +cram_sam_header_h = cram/sam_header.h cram/string_alloc.h cram/pooled_alloc.h htslib/khash.h htslib/kstring.h +cram_samtools_h = cram/cram_samtools.h $(htslib_sam_h) $(cram_sam_header_h) +cram_structs_h = cram/cram_structs.h cram/thread_pool.h cram/string_alloc.h htslib/khash.h +cram_open_trace_file_h = 
cram/open_trace_file.h cram/mFILE.h +hfile_internal_h = hfile_internal.h $(htslib_hfile_h) + + +# To be effective, config.mk needs to appear after most Makefile variables are +# set but before most rules appear, so that it can both use previously-set +# variables in its own rules' prerequisites and also update variables for use +# in later rules' prerequisites. + +# sinclude is GNU Make-specific. If you don't have GNU Make or another make +# that understands sinclude, change this to 'include' if you are using the +# configure script or just comment the line out if you are not. +sinclude config.mk + libhts.a: $(LIBHTS_OBJS) @-rm -f $@ @@ -181,35 +226,28 @@ libhts.dylib: $(LIBHTS_OBJS) ln -sf $@ libhts.$(LIBHTS_SOVERSION).dylib -cram_h = cram/cram.h $(cram_samtools_h) $(cram_sam_header_h) $(cram_structs_h) $(cram_io_h) cram/cram_encode.h cram/cram_decode.h cram/cram_stats.h cram/cram_codecs.h cram/cram_index.h -cram_io_h = cram/cram_io.h $(cram_misc_h) -cram_misc_h = cram/misc.h cram/os.h -cram_sam_header_h = cram/sam_header.h cram/string_alloc.h cram/pooled_alloc.h htslib/khash.h htslib/kstring.h -cram_samtools_h = cram/cram_samtools.h $(htslib_sam_h) $(cram_sam_header_h) -cram_structs_h = cram/cram_structs.h cram/thread_pool.h cram/string_alloc.h htslib/khash.h -cram_open_trace_file_h = cram/open_trace_file.h cram/mFILE.h -hfile_internal_h = hfile_internal.h $(htslib_hfile_h) - -bgzf.o bgzf.pico: bgzf.c config.h $(htslib_hts_h) $(htslib_bgzf_h) $(htslib_hfile_h) htslib/khash.h +bgzf.o bgzf.pico: bgzf.c $(htslib_hts_h) $(htslib_bgzf_h) $(htslib_hfile_h) htslib/khash.h kstring.o kstring.pico: kstring.c htslib/kstring.h knetfile.o knetfile.pico: knetfile.c htslib/knetfile.h hfile.o hfile.pico: hfile.c $(htslib_hfile_h) $(hfile_internal_h) +hfile_irods.o hfile_irods.pico: hfile_irods.c $(hfile_internal_h) hfile_net.o hfile_net.pico: hfile_net.c $(hfile_internal_h) htslib/knetfile.h hts.o hts.pico: hts.c version.h $(htslib_hts_h) $(htslib_bgzf_h) $(cram_h) 
$(htslib_hfile_h) htslib/khash.h htslib/kseq.h htslib/ksort.h vcf.o vcf.pico: vcf.c $(htslib_vcf_h) $(htslib_bgzf_h) $(htslib_tbx_h) $(htslib_hfile_h) htslib/khash.h htslib/kseq.h htslib/kstring.h sam.o sam.pico: sam.c $(htslib_sam_h) $(htslib_bgzf_h) $(cram_h) $(htslib_hfile_h) htslib/khash.h htslib/kseq.h htslib/kstring.h tbx.o tbx.pico: tbx.c $(htslib_tbx_h) $(htslib_bgzf_h) htslib/khash.h -faidx.o faidx.pico: faidx.c config.h $(htslib_bgzf_h) $(htslib_faidx_h) htslib/khash.h htslib/knetfile.h +faidx.o faidx.pico: faidx.c $(htslib_bgzf_h) $(htslib_faidx_h) $(htslib_hfile_h) htslib/khash.h synced_bcf_reader.o synced_bcf_reader.pico: synced_bcf_reader.c $(htslib_synced_bcf_reader_h) htslib/kseq.h htslib/khash_str2int.h vcf_sweep.o vcf_sweep.pico: vcf_sweep.c $(htslib_vcf_sweep_h) $(htslib_bgzf_h) vcfutils.o vcfutils.pico: vcfutils.c $(htslib_vcfutils_h) kfunc.o kfunc.pico: kfunc.c htslib/kfunc.h +regidx.o regidx.pico: regidx.c $(htslib_hts_h) $(HTSPREFIX)htslib/kstring.h $(HTSPREFIX)htslib/kseq.h $(HTSPREFIX)htslib/khash_str2int.h $(htslib_regidx_h) cram/cram_codecs.o cram/cram_codecs.pico: cram/cram_codecs.c $(cram_h) cram/cram_decode.o cram/cram_decode.pico: cram/cram_decode.c $(cram_h) cram/os.h cram/md5.h cram/cram_encode.o cram/cram_encode.pico: cram/cram_encode.c $(cram_h) cram/os.h cram/md5.h cram/cram_index.o cram/cram_index.pico: cram/cram_index.c $(htslib_hfile_h) $(cram_h) cram/os.h cram/zfio.h -cram/cram_io.o cram/cram_io.pico: cram/cram_io.c $(cram_h) cram/os.h cram/md5.h $(cram_open_trace_file_h) $(htslib_hfile_h) +cram/cram_io.o cram/cram_io.pico: cram/cram_io.c $(cram_h) cram/os.h cram/md5.h $(cram_open_trace_file_h) cram/rANS_static.h $(htslib_hfile_h) cram/cram_samtools.o cram/cram_samtools.pico: cram/cram_samtools.c $(cram_h) $(htslib_sam_h) cram/cram_stats.o cram/cram_stats.pico: cram/cram_stats.c $(cram_h) cram/os.h cram/files.o cram/files.pico: cram/files.c $(cram_misc_h) @@ -217,6 +255,7 @@ cram/mFILE.o cram/mFILE.pico: cram/mFILE.c 
cram/os.h cram/mFILE.h cram/vlen.h cram/md5.o cram/md5.pico: cram/md5.c cram/md5.h cram/open_trace_file.o cram/open_trace_file.pico: cram/open_trace_file.c $(cram_open_trace_file_h) $(cram_misc_h) $(htslib_hfile_h) cram/pooled_alloc.o cram/pooled_alloc.pico: cram/pooled_alloc.c cram/pooled_alloc.h +cram/rANS_static.o cram/rANS_static.pico: cram/rANS_static.c cram/rANS_static.h cram/rANS_byte.h cram/sam_header.o cram/sam_header.pico: cram/sam_header.c $(cram_sam_header_h) cram/string_alloc.h cram/string_alloc.o cram/string_alloc.pico: cram/string_alloc.c cram/string_alloc.h cram/thread_pool.o cram/thread_pool.pico: cram/thread_pool.c cram/thread_pool.h @@ -227,10 +266,14 @@ cram/zfio.o cram/zfio.pico: cram/zfio.c cram/os.h cram/zfio.h bgzip: bgzip.o libhts.a $(CC) -pthread $(LDFLAGS) -o $@ bgzip.o libhts.a $(LDLIBS) -lz +htsfile: htsfile.o libhts.a + $(CC) -pthread $(LDFLAGS) -o $@ htsfile.o libhts.a $(LDLIBS) -lz + tabix: tabix.o libhts.a $(CC) -pthread $(LDFLAGS) -o $@ tabix.o libhts.a $(LDLIBS) -lz bgzip.o: bgzip.c $(htslib_bgzf_h) $(htslib_hts_h) +htsfile.o: htsfile.c $(htslib_hfile_h) $(htslib_hts_h) $(htslib_sam_h) $(htslib_vcf_h) tabix.o: tabix.c $(htslib_tbx_h) $(htslib_sam_h) $(htslib_vcf_h) htslib/kseq.h $(htslib_bgzf_h) $(htslib_hts_h) @@ -239,7 +282,8 @@ tabix.o: tabix.c $(htslib_tbx_h) $(htslib_sam_h) $(htslib_vcf_h) htslib/kseq.h $ check test: $(BUILT_TEST_PROGRAMS) test/fieldarith test/fieldarith.sam test/hfile - test/sam + test/sam test/ce.fa + test/test-regidx cd test && REF_PATH=: ./test_view.pl cd test && ./test.pl @@ -252,6 +296,9 @@ test/hfile: test/hfile.o libhts.a test/sam: test/sam.o libhts.a $(CC) -pthread $(LDFLAGS) -o $@ test/sam.o libhts.a $(LDLIBS) -lz +test/test-regidx: test/test-regidx.o libhts.a + $(CC) -pthread $(LDFLAGS) -o $@ test/test-regidx.o libhts.a $(LDLIBS) -lz + test/test_view: test/test_view.o libhts.a $(CC) -pthread $(LDFLAGS) -o $@ test/test_view.o libhts.a $(LDLIBS) -lz @@ -263,7 +310,8 @@ test/test-vcf-sweep: 
test/test-vcf-sweep.o libhts.a test/fieldarith.o: test/fieldarith.c $(htslib_sam_h) test/hfile.o: test/hfile.c $(htslib_hfile_h) $(htslib_hts_defs_h) -test/sam.o: test/sam.c $(htslib_sam_h) htslib/kstring.h +test/test-regidx.o: test/test-regidx.c $(htslib_regidx_h) +test/sam.o: test/sam.c $(htslib_sam_h) $(htslib_faidx_h) htslib/kstring.h test/test_view.o: test/test_view.c $(cram_h) $(htslib_sam_h) test/test-vcf-api.o: test/test-vcf-api.c $(htslib_hts_h) $(htslib_vcf_h) htslib/kstring.h test/test-vcf-sweep.o: test/test-vcf-sweep.c $(htslib_vcf_sweep_h) @@ -273,7 +321,7 @@ install: libhts.a $(BUILT_PROGRAMS) installdirs install-$(SHLIB_FLAVOUR) install $(INSTALL_PROGRAM) $(BUILT_PROGRAMS) $(DESTDIR)$(bindir) $(INSTALL_DATA) htslib/*.h $(DESTDIR)$(includedir)/htslib $(INSTALL_DATA) libhts.a $(DESTDIR)$(libdir)/libhts.a - $(INSTALL_DATA) tabix.1 $(DESTDIR)$(man1dir) + $(INSTALL_DATA) htsfile.1 tabix.1 $(DESTDIR)$(man1dir) $(INSTALL_DATA) faidx.5 sam.5 vcf.5 $(DESTDIR)$(man5dir) installdirs: @@ -315,6 +363,7 @@ clean: mostlyclean clean-$(SHLIB_FLAVOUR) -rm -f libhts.a $(BUILT_PROGRAMS) $(BUILT_TEST_PROGRAMS) distclean: clean + -rm -f config.cache config.log config.mk config.status -rm -f TAGS *-uninstalled.pc clean-so: @@ -332,6 +381,6 @@ force: .PHONY: all check clean distclean force install install-pkgconfig installdirs -.PHONY: lib-shared lib-static mostlyclean tags test testclean +.PHONY: lib-shared lib-static mostlyclean print-version tags test testclean .PHONY: clean-so install-so .PHONY: clean-dylib install-dylib diff --git a/htslib/NEWS b/htslib/NEWS new file mode 100644 index 00000000..c135613d --- /dev/null +++ b/htslib/NEWS @@ -0,0 +1,50 @@ +Noteworthy changes in release 1.2.1 (3 February 2015) + +* Reinstated hts_file_type() and FT_* macros, which were available until 1.1 + but briefly removed in 1.2. 
This function is deprecated and will be removed + in a future release -- you should use hts_detect_format() etc instead + + +Noteworthy changes in release 1.2 (2 February 2015) + +* HTSlib now has a configure script which checks your build environment + and allows for selection of optional extras. See INSTALL for details + +* By default, reference sequences are fetched from the EBI CRAM Reference + Registry and cached in your $HOME cache directory. This behaviour can + be controlled by setting REF_PATH and REF_CACHE environment variables + (see the samtools(1) man page for details) + +* Numerous CRAM improvements: + - Support for CRAM v3.0, an upcoming revision to CRAM supporting + better compression and per-container checksums + - EOF checking for v2.1 and v3.0 (similar to checking BAM EOF blocks) + - Non-standard values for PNEXT and TLEN fields are now preserved + - hts_set_fai_filename() now provides a reference file when encoding + - Generated read names are now numbered from 1, rather than being + labelled 'slice:record-in-slice' + - Multi-threading and speed improvements + +* New htsfile command for identifying file formats, and corresponding + file format detection APIs + +* New tabix --regions FILE, --targets FILE options for filtering via BED files + +* Optional iRODS file access, disabled by default. Configure with --with-irods + to enable accessing iRODS data objects directly via 'irods:DATAOBJ' + +* All occurrences of 2^29 in the source have been eliminated, so indexing + and querying against reference sequences larger than 512Mbp works (when + using CSI indices) + +* Support for plain GZIP compression in various places + +* VCF header editing speed improvements + +* Added seq_nt16_int[] (equivalent to the samtools API's bam_nt16_nt4_table) + +* Reinstated faidx_fetch_nseq(), which was accidentally removed from 1.1. 
+ Now faidx_fetch_nseq() and faidx_nseq() are equivalent; eventually + faidx_fetch_nseq() will be deprecated and removed [#156] + +* Fixed bugs #141, #152, #155, #158, #159, and various memory leaks diff --git a/htslib/bgzf.c b/htslib/bgzf.c index 090bec7a..53064585 100644 --- a/htslib/bgzf.c +++ b/htslib/bgzf.c @@ -23,8 +23,6 @@ THE SOFTWARE. */ -#include "config.h" - #include #include #include @@ -39,6 +37,9 @@ #include "htslib/bgzf.h" #include "htslib/hfile.h" +#define BGZF_CACHE +#define BGZF_MT + #define BLOCK_HEADER_LENGTH 18 #define BLOCK_FOOTER_LENGTH 8 @@ -108,8 +109,8 @@ static inline void packInt32(uint8_t *buffer, uint32_t value) static BGZF *bgzf_read_init(hFILE *hfpr) { BGZF *fp; - uint8_t magic[2]; - ssize_t n = hpeek(hfpr, magic, 2); + uint8_t magic[18]; + ssize_t n = hpeek(hfpr, magic, 18); if (n < 0) return NULL; fp = (BGZF*)calloc(1, sizeof(BGZF)); @@ -119,17 +120,30 @@ static BGZF *bgzf_read_init(hFILE *hfpr) fp->is_compressed = (n==2 && magic[0]==0x1f && magic[1]==0x8b); fp->uncompressed_block = malloc(BGZF_MAX_BLOCK_SIZE); fp->compressed_block = malloc(BGZF_MAX_BLOCK_SIZE); + fp->is_compressed = (n==18 && magic[0]==0x1f && magic[1]==0x8b) ? 1 : 0; + fp->is_gzip = ( !fp->is_compressed || ((magic[3]&4) && memcmp(&magic[12], "BC\2\0",4)==0) ) ? 
0 : 1; #ifdef BGZF_CACHE fp->cache = kh_init(cache); #endif return fp; } -static BGZF *bgzf_write_init(int compress_level) // compress_level==-1 for the default level, -2 plain uncompressed +// get the compress level from the mode string: compress_level==-1 for the default level, -2 plain uncompressed +static int mode2level(const char *__restrict mode) +{ + int i, compress_level = -1; + for (i = 0; mode[i]; ++i) + if (mode[i] >= '0' && mode[i] <= '9') break; + if (mode[i]) compress_level = (int)mode[i] - '0'; + if (strchr(mode, 'u')) compress_level = -2; + return compress_level; +} +static BGZF *bgzf_write_init(const char *mode) { BGZF *fp; fp = (BGZF*)calloc(1, sizeof(BGZF)); fp->is_write = 1; + int compress_level = mode2level(mode); if ( compress_level==-2 ) { fp->is_compressed = 0; @@ -140,18 +154,17 @@ static BGZF *bgzf_write_init(int compress_level) // compress_level==-1 for the d fp->compressed_block = malloc(BGZF_MAX_BLOCK_SIZE); fp->compress_level = compress_level < 0? Z_DEFAULT_COMPRESSION : compress_level; // Z_DEFAULT_COMPRESSION==-1 if (fp->compress_level > 9) fp->compress_level = Z_DEFAULT_COMPRESSION; + if ( strchr(mode,'g') ) + { + // gzip output + fp->is_gzip = 1; + fp->gz_stream = (z_stream*)calloc(1,sizeof(z_stream)); + fp->gz_stream->zalloc = NULL; + fp->gz_stream->zfree = NULL; + if ( deflateInit2(fp->gz_stream, fp->compress_level, Z_DEFLATED, 15|16, 8, Z_DEFAULT_STRATEGY)!=Z_OK ) return NULL; + } return fp; } -// get the compress level from the mode string -static int mode2level(const char *__restrict mode) -{ - int i, compress_level = -1; - for (i = 0; mode[i]; ++i) - if (mode[i] >= '0' && mode[i] <= '9') break; - if (mode[i]) compress_level = (int)mode[i] - '0'; - if (strchr(mode, 'u')) compress_level = -2; - return compress_level; -} BGZF *bgzf_open(const char *path, const char *mode) { @@ -166,7 +179,7 @@ BGZF *bgzf_open(const char *path, const char *mode) } else if (strchr(mode, 'w') || strchr(mode, 'a')) { hFILE *fpw; if ((fpw = 
hopen(path, mode)) == 0) return 0; - fp = bgzf_write_init(mode2level(mode)); + fp = bgzf_write_init(mode); fp->fp = fpw; } else { errno = EINVAL; return 0; } @@ -188,7 +201,7 @@ BGZF *bgzf_dopen(int fd, const char *mode) } else if (strchr(mode, 'w') || strchr(mode, 'a')) { hFILE *fpw; if ((fpw = hdopen(fd, mode)) == 0) return 0; - fp = bgzf_write_init(mode2level(mode)); + fp = bgzf_write_init(mode); fp->fp = fpw; } else { errno = EINVAL; return 0; } @@ -205,7 +218,7 @@ BGZF *bgzf_hopen(hFILE *hfp, const char *mode) fp = bgzf_read_init(hfp); if (fp == NULL) return NULL; } else if (strchr(mode, 'w') || strchr(mode, 'a')) { - fp = bgzf_write_init(mode2level(mode)); + fp = bgzf_write_init(mode); } else { errno = EINVAL; return 0; } @@ -244,13 +257,6 @@ static int bgzf_gzip_compress(BGZF *fp, void *_dst, int *dlen, void *src, int sl { uint8_t *dst = (uint8_t*)_dst; z_stream *zs = fp->gz_stream; - if ( !zs ) - { - zs = fp->gz_stream = (z_stream*)calloc(1,sizeof(z_stream)); - zs->zalloc = NULL; - zs->zfree = NULL; - if ( deflateInit2(zs, level, Z_DEFLATED, 15|16, 8, Z_DEFAULT_STRATEGY)!=Z_OK ) return -1; // gzip output - } int flush = slen ? Z_NO_FLUSH : Z_FINISH; zs->next_in = (Bytef*)src; zs->avail_in = slen; @@ -433,7 +439,7 @@ int bgzf_read_block(BGZF *fp) // Reading compressed file int64_t block_address; block_address = htell(fp->fp); - if ( fp->is_gzip ) + if ( fp->is_gzip && fp->gz_stream ) // is this an initialized gzip stream? 
{ count = inflate_gzip_block(fp, 0); if ( count<0 ) diff --git a/htslib/bgzip.c b/htslib/bgzip.c index a8a88af2..2eeff3dd 100644 --- a/htslib/bgzip.c +++ b/htslib/bgzip.c @@ -156,8 +156,8 @@ int main(int argc, char **argv) strcpy(name, argv[optind]); strcat(name, ".gz"); f_dst = write_open(name, is_forced); - if (f_dst < 0) return 1; free(name); + if (f_dst < 0) return 1; } } else if (!pstdout && isatty(fileno((FILE *)stdout)) ) diff --git a/htslib/config.mk b/htslib/config.mk new file mode 100644 index 00000000..c4452d26 --- /dev/null +++ b/htslib/config.mk @@ -0,0 +1,72 @@ +# Optional configure Makefile overrides for htslib. +# +# Copyright (C) 2015 Genome Research Ltd. +# +# Author: John Marshall +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + +# This is config.mk. Generated from config.mk.in by configure. +# +# If you use configure, this file overrides variables and augments rules +# in the Makefile to reflect your configuration choices. 
If you don't run +# configure, the main Makefile contains suitable conservative defaults. + +prefix = /usr/local +exec_prefix = ${prefix} +bindir = ${exec_prefix}/bin +includedir = ${prefix}/include +libdir = ${exec_prefix}/lib +datarootdir = ${prefix}/share +mandir = ${datarootdir}/man + +CC = gcc +RANLIB = ranlib + +CFLAGS = -g -O2 +LDFLAGS = +LDLIBS = + + +# ifeq/.../endif, +=, and target-specific variables are GNU Make-specific. +# If you don't have GNU Make, comment out this conditional and note that +# to enable iRODS you will need to implement the following elsewhere. +ifeq "iRODS-disabled" "iRODS-enabled" + +IRODS_HOME ?= /disabled + +EXTRA_CPPFLAGS_IRODS = \ + -I$(IRODS_HOME)/lib/api/include \ + -I$(IRODS_HOME)/lib/core/include \ + -I$(IRODS_HOME)/lib/md5/include \ + -I$(IRODS_HOME)/lib/sha1/include \ + -I$(IRODS_HOME)/server/core/include \ + -I$(IRODS_HOME)/server/drivers/include \ + -I$(IRODS_HOME)/server/icat/include + +LDFLAGS += -L$(IRODS_HOME)/lib/core/obj +LDLIBS += -lRodsAPIs -lgssapi_krb5 + +LIBHTS_OBJS += hfile_irods.o + +hfile.o hfile.pico: CPPFLAGS += -DHAVE_IRODS + +hfile_irods.o hfile_irods.pico: CPPFLAGS += $(EXTRA_CPPFLAGS_IRODS) + +endif diff --git a/htslib/config.mk.in b/htslib/config.mk.in new file mode 100644 index 00000000..e058ee58 --- /dev/null +++ b/htslib/config.mk.in @@ -0,0 +1,72 @@ +# Optional configure Makefile overrides for htslib. +# +# Copyright (C) 2015 Genome Research Ltd. 
+# +# Author: John Marshall +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + +# This is @configure_input@ +# +# If you use configure, this file overrides variables and augments rules +# in the Makefile to reflect your configuration choices. If you don't run +# configure, the main Makefile contains suitable conservative defaults. + +prefix = @prefix@ +exec_prefix = @exec_prefix@ +bindir = @bindir@ +includedir = @includedir@ +libdir = @libdir@ +datarootdir = @datarootdir@ +mandir = @mandir@ + +CC = @CC@ +RANLIB = @RANLIB@ + +CFLAGS = @CFLAGS@ +LDFLAGS = @LDFLAGS@ +LDLIBS = @LIBS@ + + +# ifeq/.../endif, +=, and target-specific variables are GNU Make-specific. +# If you don't have GNU Make, comment out this conditional and note that +# to enable iRODS you will need to implement the following elsewhere. 
+ifeq "iRODS-@irods@" "iRODS-enabled" + +@define_IRODS_HOME@ + +EXTRA_CPPFLAGS_IRODS = \ + -I$(IRODS_HOME)/lib/api/include \ + -I$(IRODS_HOME)/lib/core/include \ + -I$(IRODS_HOME)/lib/md5/include \ + -I$(IRODS_HOME)/lib/sha1/include \ + -I$(IRODS_HOME)/server/core/include \ + -I$(IRODS_HOME)/server/drivers/include \ + -I$(IRODS_HOME)/server/icat/include + +LDFLAGS += -L$(IRODS_HOME)/lib/core/obj +LDLIBS += -lRodsAPIs -lgssapi_krb5 + +LIBHTS_OBJS += hfile_irods.o + +hfile.o hfile.pico: CPPFLAGS += -DHAVE_IRODS + +hfile_irods.o hfile_irods.pico: CPPFLAGS += $(EXTRA_CPPFLAGS_IRODS) + +endif diff --git a/htslib/configure b/htslib/configure new file mode 100755 index 00000000..9fc4bd2b --- /dev/null +++ b/htslib/configure @@ -0,0 +1,4012 @@ +#! /bin/sh +# Guess values for system-dependent variables and create Makefiles. +# Generated by GNU Autoconf 2.68 for HTSlib 1.2.1. +# +# Report bugs to . +# +# +# Copyright (C) 1992, 1993, 1994, 1995, 1996, 1998, 1999, 2000, 2001, +# 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 Free Software +# Foundation, Inc. +# +# +# This configure script is free software; the Free Software Foundation +# gives unlimited permission to copy, distribute and modify it. +# +# Portions copyright (C) 2015 Genome Research Ltd. +# +# This configure script is free software: you are free to change and +# redistribute it. There is NO WARRANTY, to the extent permitted by law. +## -------------------- ## +## M4sh Initialization. ## +## -------------------- ## + +# Be more Bourne compatible +DUALCASE=1; export DUALCASE # for MKS sh +if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then : + emulate sh + NULLCMD=: + # Pre-4.2 versions of Zsh do word splitting on ${1+"$@"}, which + # is contrary to our usage. Disable this feature. 
+ alias -g '${1+"$@"}'='"$@"' + setopt NO_GLOB_SUBST +else + case `(set -o) 2>/dev/null` in #( + *posix*) : + set -o posix ;; #( + *) : + ;; +esac +fi + + +as_nl=' +' +export as_nl +# Printing a long string crashes Solaris 7 /usr/bin/printf. +as_echo='\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\' +as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo +as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo$as_echo +# Prefer a ksh shell builtin over an external printf program on Solaris, +# but without wasting forks for bash or zsh. +if test -z "$BASH_VERSION$ZSH_VERSION" \ + && (test "X`print -r -- $as_echo`" = "X$as_echo") 2>/dev/null; then + as_echo='print -r --' + as_echo_n='print -rn --' +elif (test "X`printf %s $as_echo`" = "X$as_echo") 2>/dev/null; then + as_echo='printf %s\n' + as_echo_n='printf %s' +else + if test "X`(/usr/ucb/echo -n -n $as_echo) 2>/dev/null`" = "X-n $as_echo"; then + as_echo_body='eval /usr/ucb/echo -n "$1$as_nl"' + as_echo_n='/usr/ucb/echo -n' + else + as_echo_body='eval expr "X$1" : "X\\(.*\\)"' + as_echo_n_body='eval + arg=$1; + case $arg in #( + *"$as_nl"*) + expr "X$arg" : "X\\(.*\\)$as_nl"; + arg=`expr "X$arg" : ".*$as_nl\\(.*\\)"`;; + esac; + expr "X$arg" : "X\\(.*\\)" | tr -d "$as_nl" + ' + export as_echo_n_body + as_echo_n='sh -c $as_echo_n_body as_echo' + fi + export as_echo_body + as_echo='sh -c $as_echo_body as_echo' +fi + +# The user is always right. +if test "${PATH_SEPARATOR+set}" != set; then + PATH_SEPARATOR=: + (PATH='/bin;/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 && { + (PATH='/bin:/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 || + PATH_SEPARATOR=';' + } +fi + + +# IFS +# We need space, tab and new line, in precisely that order. Quoting is +# there to prevent editors from complaining about space-tab. +# (If _AS_PATH_WALK were called with IFS unset, it would disable word +# splitting by setting IFS to empty value.) +IFS=" "" $as_nl" + +# Find who we are. 
Look in the path if we contain no directory separator. +as_myself= +case $0 in #(( + *[\\/]* ) as_myself=$0 ;; + *) as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + test -r "$as_dir/$0" && as_myself=$as_dir/$0 && break + done +IFS=$as_save_IFS + + ;; +esac +# We did not find ourselves, most probably we were run as `sh COMMAND' +# in which case we are not to be found in the path. +if test "x$as_myself" = x; then + as_myself=$0 +fi +if test ! -f "$as_myself"; then + $as_echo "$as_myself: error: cannot find myself; rerun with an absolute file name" >&2 + exit 1 +fi + +# Unset variables that we do not need and which cause bugs (e.g. in +# pre-3.0 UWIN ksh). But do not cause bugs in bash 2.01; the "|| exit 1" +# suppresses any "Segmentation fault" message there. '((' could +# trigger a bug in pdksh 5.2.14. +for as_var in BASH_ENV ENV MAIL MAILPATH +do eval test x\${$as_var+set} = xset \ + && ( (unset $as_var) || exit 1) >/dev/null 2>&1 && unset $as_var || : +done +PS1='$ ' +PS2='> ' +PS4='+ ' + +# NLS nuisances. +LC_ALL=C +export LC_ALL +LANGUAGE=C +export LANGUAGE + +# CDPATH. +(unset CDPATH) >/dev/null 2>&1 && unset CDPATH + +if test "x$CONFIG_SHELL" = x; then + as_bourne_compatible="if test -n \"\${ZSH_VERSION+set}\" && (emulate sh) >/dev/null 2>&1; then : + emulate sh + NULLCMD=: + # Pre-4.2 versions of Zsh do word splitting on \${1+\"\$@\"}, which + # is contrary to our usage. Disable this feature. 
+ alias -g '\${1+\"\$@\"}'='\"\$@\"' + setopt NO_GLOB_SUBST +else + case \`(set -o) 2>/dev/null\` in #( + *posix*) : + set -o posix ;; #( + *) : + ;; +esac +fi +" + as_required="as_fn_return () { (exit \$1); } +as_fn_success () { as_fn_return 0; } +as_fn_failure () { as_fn_return 1; } +as_fn_ret_success () { return 0; } +as_fn_ret_failure () { return 1; } + +exitcode=0 +as_fn_success || { exitcode=1; echo as_fn_success failed.; } +as_fn_failure && { exitcode=1; echo as_fn_failure succeeded.; } +as_fn_ret_success || { exitcode=1; echo as_fn_ret_success failed.; } +as_fn_ret_failure && { exitcode=1; echo as_fn_ret_failure succeeded.; } +if ( set x; as_fn_ret_success y && test x = \"\$1\" ); then : + +else + exitcode=1; echo positional parameters were not saved. +fi +test x\$exitcode = x0 || exit 1" + as_suggested=" as_lineno_1=";as_suggested=$as_suggested$LINENO;as_suggested=$as_suggested" as_lineno_1a=\$LINENO + as_lineno_2=";as_suggested=$as_suggested$LINENO;as_suggested=$as_suggested" as_lineno_2a=\$LINENO + eval 'test \"x\$as_lineno_1'\$as_run'\" != \"x\$as_lineno_2'\$as_run'\" && + test \"x\`expr \$as_lineno_1'\$as_run' + 1\`\" = \"x\$as_lineno_2'\$as_run'\"' || exit 1" + if (eval "$as_required") 2>/dev/null; then : + as_have_required=yes +else + as_have_required=no +fi + if test x$as_have_required = xyes && (eval "$as_suggested") 2>/dev/null; then : + +else + as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +as_found=false +for as_dir in /bin$PATH_SEPARATOR/usr/bin$PATH_SEPARATOR$PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + as_found=: + case $as_dir in #( + /*) + for as_base in sh bash ksh sh5; do + # Try only shells that exist, to save several forks. 
+ as_shell=$as_dir/$as_base + if { test -f "$as_shell" || test -f "$as_shell.exe"; } && + { $as_echo "$as_bourne_compatible""$as_required" | as_run=a "$as_shell"; } 2>/dev/null; then : + CONFIG_SHELL=$as_shell as_have_required=yes + if { $as_echo "$as_bourne_compatible""$as_suggested" | as_run=a "$as_shell"; } 2>/dev/null; then : + break 2 +fi +fi + done;; + esac + as_found=false +done +$as_found || { if { test -f "$SHELL" || test -f "$SHELL.exe"; } && + { $as_echo "$as_bourne_compatible""$as_required" | as_run=a "$SHELL"; } 2>/dev/null; then : + CONFIG_SHELL=$SHELL as_have_required=yes +fi; } +IFS=$as_save_IFS + + + if test "x$CONFIG_SHELL" != x; then : + # We cannot yet assume a decent shell, so we have to provide a + # neutralization value for shells without unset; and this also + # works around shells that cannot unset nonexistent variables. + # Preserve -v and -x to the replacement shell. + BASH_ENV=/dev/null + ENV=/dev/null + (unset BASH_ENV) >/dev/null 2>&1 && unset BASH_ENV ENV + export CONFIG_SHELL + case $- in # (((( + *v*x* | *x*v* ) as_opts=-vx ;; + *v* ) as_opts=-v ;; + *x* ) as_opts=-x ;; + * ) as_opts= ;; + esac + exec "$CONFIG_SHELL" $as_opts "$as_myself" ${1+"$@"} +fi + + if test x$as_have_required = xno; then : + $as_echo "$0: This script requires a shell more modern than all" + $as_echo "$0: the shells that I found on your system." + if test x${ZSH_VERSION+set} = xset ; then + $as_echo "$0: In particular, zsh $ZSH_VERSION has bugs and should" + $as_echo "$0: be upgraded to zsh 4.3.4 or later." + else + $as_echo "$0: Please tell bug-autoconf@gnu.org and +$0: samtools-help@lists.sourceforge.net about your system, +$0: including any error possibly output before this +$0: message. Then install a modern shell, or manually run +$0: the script under such a shell if you do have one." + fi + exit 1 +fi +fi +fi +SHELL=${CONFIG_SHELL-/bin/sh} +export SHELL +# Unset more variables known to interfere with behavior of common tools. 
+CLICOLOR_FORCE= GREP_OPTIONS= +unset CLICOLOR_FORCE GREP_OPTIONS + +## --------------------- ## +## M4sh Shell Functions. ## +## --------------------- ## +# as_fn_unset VAR +# --------------- +# Portably unset VAR. +as_fn_unset () +{ + { eval $1=; unset $1;} +} +as_unset=as_fn_unset + +# as_fn_set_status STATUS +# ----------------------- +# Set $? to STATUS, without forking. +as_fn_set_status () +{ + return $1 +} # as_fn_set_status + +# as_fn_exit STATUS +# ----------------- +# Exit the shell with STATUS, even in a "trap 0" or "set -e" context. +as_fn_exit () +{ + set +e + as_fn_set_status $1 + exit $1 +} # as_fn_exit + +# as_fn_mkdir_p +# ------------- +# Create "$as_dir" as a directory, including parents if necessary. +as_fn_mkdir_p () +{ + + case $as_dir in #( + -*) as_dir=./$as_dir;; + esac + test -d "$as_dir" || eval $as_mkdir_p || { + as_dirs= + while :; do + case $as_dir in #( + *\'*) as_qdir=`$as_echo "$as_dir" | sed "s/'/'\\\\\\\\''/g"`;; #'( + *) as_qdir=$as_dir;; + esac + as_dirs="'$as_qdir' $as_dirs" + as_dir=`$as_dirname -- "$as_dir" || +$as_expr X"$as_dir" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ + X"$as_dir" : 'X\(//\)[^/]' \| \ + X"$as_dir" : 'X\(//\)$' \| \ + X"$as_dir" : 'X\(/\)' \| . 2>/dev/null || +$as_echo X"$as_dir" | + sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ + s//\1/ + q + } + /^X\(\/\/\)[^/].*/{ + s//\1/ + q + } + /^X\(\/\/\)$/{ + s//\1/ + q + } + /^X\(\/\).*/{ + s//\1/ + q + } + s/.*/./; q'` + test -d "$as_dir" && break + done + test -z "$as_dirs" || eval "mkdir $as_dirs" + } || test -d "$as_dir" || as_fn_error $? "cannot create directory $as_dir" + + +} # as_fn_mkdir_p +# as_fn_append VAR VALUE +# ---------------------- +# Append the text in VALUE to the end of the definition contained in VAR. Take +# advantage of any shell optimizations that allow amortized linear growth over +# repeated appends, instead of the typical quadratic growth present in naive +# implementations. 
+if (eval "as_var=1; as_var+=2; test x\$as_var = x12") 2>/dev/null; then : + eval 'as_fn_append () + { + eval $1+=\$2 + }' +else + as_fn_append () + { + eval $1=\$$1\$2 + } +fi # as_fn_append + +# as_fn_arith ARG... +# ------------------ +# Perform arithmetic evaluation on the ARGs, and store the result in the +# global $as_val. Take advantage of shells that can avoid forks. The arguments +# must be portable across $(()) and expr. +if (eval "test \$(( 1 + 1 )) = 2") 2>/dev/null; then : + eval 'as_fn_arith () + { + as_val=$(( $* )) + }' +else + as_fn_arith () + { + as_val=`expr "$@" || test $? -eq 1` + } +fi # as_fn_arith + + +# as_fn_error STATUS ERROR [LINENO LOG_FD] +# ---------------------------------------- +# Output "`basename $0`: error: ERROR" to stderr. If LINENO and LOG_FD are +# provided, also output the error to LOG_FD, referencing LINENO. Then exit the +# script with STATUS, using 1 if that was 0. +as_fn_error () +{ + as_status=$1; test $as_status -eq 0 && as_status=1 + if test "$4"; then + as_lineno=${as_lineno-"$3"} as_lineno_stack=as_lineno_stack=$as_lineno_stack + $as_echo "$as_me:${as_lineno-$LINENO}: error: $2" >&$4 + fi + $as_echo "$as_me: error: $2" >&2 + as_fn_exit $as_status +} # as_fn_error + +if expr a : '\(a\)' >/dev/null 2>&1 && + test "X`expr 00001 : '.*\(...\)'`" = X001; then + as_expr=expr +else + as_expr=false +fi + +if (basename -- /) >/dev/null 2>&1 && test "X`basename -- / 2>&1`" = "X/"; then + as_basename=basename +else + as_basename=false +fi + +if (as_dir=`dirname -- /` && test "X$as_dir" = X/) >/dev/null 2>&1; then + as_dirname=dirname +else + as_dirname=false +fi + +as_me=`$as_basename -- "$0" || +$as_expr X/"$0" : '.*/\([^/][^/]*\)/*$' \| \ + X"$0" : 'X\(//\)$' \| \ + X"$0" : 'X\(/\)' \| . 2>/dev/null || +$as_echo X/"$0" | + sed '/^.*\/\([^/][^/]*\)\/*$/{ + s//\1/ + q + } + /^X\/\(\/\/\)$/{ + s//\1/ + q + } + /^X\/\(\/\).*/{ + s//\1/ + q + } + s/.*/./; q'` + +# Avoid depending upon Character Ranges. 
+as_cr_letters='abcdefghijklmnopqrstuvwxyz' +as_cr_LETTERS='ABCDEFGHIJKLMNOPQRSTUVWXYZ' +as_cr_Letters=$as_cr_letters$as_cr_LETTERS +as_cr_digits='0123456789' +as_cr_alnum=$as_cr_Letters$as_cr_digits + + + as_lineno_1=$LINENO as_lineno_1a=$LINENO + as_lineno_2=$LINENO as_lineno_2a=$LINENO + eval 'test "x$as_lineno_1'$as_run'" != "x$as_lineno_2'$as_run'" && + test "x`expr $as_lineno_1'$as_run' + 1`" = "x$as_lineno_2'$as_run'"' || { + # Blame Lee E. McMahon (1931-1989) for sed's syntax. :-) + sed -n ' + p + /[$]LINENO/= + ' <$as_myself | + sed ' + s/[$]LINENO.*/&-/ + t lineno + b + :lineno + N + :loop + s/[$]LINENO\([^'$as_cr_alnum'_].*\n\)\(.*\)/\2\1\2/ + t loop + s/-\n.*// + ' >$as_me.lineno && + chmod +x "$as_me.lineno" || + { $as_echo "$as_me: error: cannot create $as_me.lineno; rerun with a POSIX shell" >&2; as_fn_exit 1; } + + # Don't try to exec as it changes $[0], causing all sort of problems + # (the dirname of $[0] is not the place where we might find the + # original and so on. Autoconf is especially sensitive to this). + . "./$as_me.lineno" + # Exit status is that of the last command. + exit +} + +ECHO_C= ECHO_N= ECHO_T= +case `echo -n x` in #((((( +-n*) + case `echo 'xy\c'` in + *c*) ECHO_T=' ';; # ECHO_T is single tab character. + xy) ECHO_C='\c';; + *) echo `echo ksh88 bug on AIX 6.1` > /dev/null + ECHO_T=' ';; + esac;; +*) + ECHO_N='-n';; +esac + +rm -f conf$$ conf$$.exe conf$$.file +if test -d conf$$.dir; then + rm -f conf$$.dir/conf$$.file +else + rm -f conf$$.dir + mkdir conf$$.dir 2>/dev/null +fi +if (echo >conf$$.file) 2>/dev/null; then + if ln -s conf$$.file conf$$ 2>/dev/null; then + as_ln_s='ln -s' + # ... but there are two gotchas: + # 1) On MSYS, both `ln -s file dir' and `ln file dir' fail. + # 2) DJGPP < 2.04 has no symlinks; `ln -s' creates a wrapper executable. + # In both cases, we have to default to `cp -p'. + ln -s conf$$.file conf$$.dir 2>/dev/null && test ! 
-f conf$$.exe || + as_ln_s='cp -p' + elif ln conf$$.file conf$$ 2>/dev/null; then + as_ln_s=ln + else + as_ln_s='cp -p' + fi +else + as_ln_s='cp -p' +fi +rm -f conf$$ conf$$.exe conf$$.dir/conf$$.file conf$$.file +rmdir conf$$.dir 2>/dev/null + +if mkdir -p . 2>/dev/null; then + as_mkdir_p='mkdir -p "$as_dir"' +else + test -d ./-p && rmdir ./-p + as_mkdir_p=false +fi + +if test -x / >/dev/null 2>&1; then + as_test_x='test -x' +else + if ls -dL / >/dev/null 2>&1; then + as_ls_L_option=L + else + as_ls_L_option= + fi + as_test_x=' + eval sh -c '\'' + if test -d "$1"; then + test -d "$1/."; + else + case $1 in #( + -*)set "./$1";; + esac; + case `ls -ld'$as_ls_L_option' "$1" 2>/dev/null` in #(( + ???[sx]*):;;*)false;;esac;fi + '\'' sh + ' +fi +as_executable_p=$as_test_x + +# Sed expression to map a string onto a valid CPP name. +as_tr_cpp="eval sed 'y%*$as_cr_letters%P$as_cr_LETTERS%;s%[^_$as_cr_alnum]%_%g'" + +# Sed expression to map a string onto a valid variable name. +as_tr_sh="eval sed 'y%*+%pp%;s%[^_$as_cr_alnum]%_%g'" + + +test -n "$DJDIR" || exec 7<&0 &1 + +# Name of the host. +# hostname on some systems (SVR3.2, old GNU/Linux) returns a bogus exit status, +# so uname gets run too. +ac_hostname=`(hostname || uname -n) 2>/dev/null | sed 1q` + +# +# Initializations. +# +ac_default_prefix=/usr/local +ac_clean_files= +ac_config_libobj_dir=. +LIBOBJS= +cross_compiling=no +subdirs= +MFLAGS= +MAKEFLAGS= + +# Identity of this package. 
+PACKAGE_NAME='HTSlib' +PACKAGE_TARNAME='htslib' +PACKAGE_VERSION='1.2.1' +PACKAGE_STRING='HTSlib 1.2.1' +PACKAGE_BUGREPORT='samtools-help@lists.sourceforge.net' +PACKAGE_URL='http://www.htslib.org/' + +ac_unique_file="hts.c" +ac_subst_vars='LTLIBOBJS +LIBOBJS +define_IRODS_HOME +irods +RANLIB +OBJEXT +EXEEXT +ac_ct_CC +CPPFLAGS +LDFLAGS +CFLAGS +CC +target_alias +host_alias +build_alias +LIBS +ECHO_T +ECHO_N +ECHO_C +DEFS +mandir +localedir +libdir +psdir +pdfdir +dvidir +htmldir +infodir +docdir +oldincludedir +includedir +localstatedir +sharedstatedir +sysconfdir +datadir +datarootdir +libexecdir +sbindir +bindir +program_transform_name +prefix +exec_prefix +PACKAGE_URL +PACKAGE_BUGREPORT +PACKAGE_STRING +PACKAGE_VERSION +PACKAGE_TARNAME +PACKAGE_NAME +PATH_SEPARATOR +SHELL' +ac_subst_files='' +ac_user_opts=' +enable_option_checking +with_irods +' + ac_precious_vars='build_alias +host_alias +target_alias +CC +CFLAGS +LDFLAGS +LIBS +CPPFLAGS' + + +# Initialize some variables set by options. +ac_init_help= +ac_init_version=false +ac_unrecognized_opts= +ac_unrecognized_sep= +# The variables have the same names as the options, with +# dashes changed to underlines. +cache_file=/dev/null +exec_prefix=NONE +no_create= +no_recursion= +prefix=NONE +program_prefix=NONE +program_suffix=NONE +program_transform_name=s,x,x, +silent= +site= +srcdir= +verbose= +x_includes=NONE +x_libraries=NONE + +# Installation directory options. +# These are left unexpanded so users can "make install exec_prefix=/foo" +# and all the variables that are supposed to be based on exec_prefix +# by default will actually change. +# Use braces instead of parens because sh, perl, etc. also accept them. +# (The list follows the same order as the GNU Coding Standards.) 
+bindir='${exec_prefix}/bin' +sbindir='${exec_prefix}/sbin' +libexecdir='${exec_prefix}/libexec' +datarootdir='${prefix}/share' +datadir='${datarootdir}' +sysconfdir='${prefix}/etc' +sharedstatedir='${prefix}/com' +localstatedir='${prefix}/var' +includedir='${prefix}/include' +oldincludedir='/usr/include' +docdir='${datarootdir}/doc/${PACKAGE_TARNAME}' +infodir='${datarootdir}/info' +htmldir='${docdir}' +dvidir='${docdir}' +pdfdir='${docdir}' +psdir='${docdir}' +libdir='${exec_prefix}/lib' +localedir='${datarootdir}/locale' +mandir='${datarootdir}/man' + +ac_prev= +ac_dashdash= +for ac_option +do + # If the previous option needs an argument, assign it. + if test -n "$ac_prev"; then + eval $ac_prev=\$ac_option + ac_prev= + continue + fi + + case $ac_option in + *=?*) ac_optarg=`expr "X$ac_option" : '[^=]*=\(.*\)'` ;; + *=) ac_optarg= ;; + *) ac_optarg=yes ;; + esac + + # Accept the important Cygnus configure options, so we can diagnose typos. + + case $ac_dashdash$ac_option in + --) + ac_dashdash=yes ;; + + -bindir | --bindir | --bindi | --bind | --bin | --bi) + ac_prev=bindir ;; + -bindir=* | --bindir=* | --bindi=* | --bind=* | --bin=* | --bi=*) + bindir=$ac_optarg ;; + + -build | --build | --buil | --bui | --bu) + ac_prev=build_alias ;; + -build=* | --build=* | --buil=* | --bui=* | --bu=*) + build_alias=$ac_optarg ;; + + -cache-file | --cache-file | --cache-fil | --cache-fi \ + | --cache-f | --cache- | --cache | --cach | --cac | --ca | --c) + ac_prev=cache_file ;; + -cache-file=* | --cache-file=* | --cache-fil=* | --cache-fi=* \ + | --cache-f=* | --cache-=* | --cache=* | --cach=* | --cac=* | --ca=* | --c=*) + cache_file=$ac_optarg ;; + + --config-cache | -C) + cache_file=config.cache ;; + + -datadir | --datadir | --datadi | --datad) + ac_prev=datadir ;; + -datadir=* | --datadir=* | --datadi=* | --datad=*) + datadir=$ac_optarg ;; + + -datarootdir | --datarootdir | --datarootdi | --datarootd | --dataroot \ + | --dataroo | --dataro | --datar) + ac_prev=datarootdir ;; 
+ -datarootdir=* | --datarootdir=* | --datarootdi=* | --datarootd=* \ + | --dataroot=* | --dataroo=* | --dataro=* | --datar=*) + datarootdir=$ac_optarg ;; + + -disable-* | --disable-*) + ac_useropt=`expr "x$ac_option" : 'x-*disable-\(.*\)'` + # Reject names that are not valid shell variable names. + expr "x$ac_useropt" : ".*[^-+._$as_cr_alnum]" >/dev/null && + as_fn_error $? "invalid feature name: $ac_useropt" + ac_useropt_orig=$ac_useropt + ac_useropt=`$as_echo "$ac_useropt" | sed 's/[-+.]/_/g'` + case $ac_user_opts in + *" +"enable_$ac_useropt" +"*) ;; + *) ac_unrecognized_opts="$ac_unrecognized_opts$ac_unrecognized_sep--disable-$ac_useropt_orig" + ac_unrecognized_sep=', ';; + esac + eval enable_$ac_useropt=no ;; + + -docdir | --docdir | --docdi | --doc | --do) + ac_prev=docdir ;; + -docdir=* | --docdir=* | --docdi=* | --doc=* | --do=*) + docdir=$ac_optarg ;; + + -dvidir | --dvidir | --dvidi | --dvid | --dvi | --dv) + ac_prev=dvidir ;; + -dvidir=* | --dvidir=* | --dvidi=* | --dvid=* | --dvi=* | --dv=*) + dvidir=$ac_optarg ;; + + -enable-* | --enable-*) + ac_useropt=`expr "x$ac_option" : 'x-*enable-\([^=]*\)'` + # Reject names that are not valid shell variable names. + expr "x$ac_useropt" : ".*[^-+._$as_cr_alnum]" >/dev/null && + as_fn_error $? 
"invalid feature name: $ac_useropt" + ac_useropt_orig=$ac_useropt + ac_useropt=`$as_echo "$ac_useropt" | sed 's/[-+.]/_/g'` + case $ac_user_opts in + *" +"enable_$ac_useropt" +"*) ;; + *) ac_unrecognized_opts="$ac_unrecognized_opts$ac_unrecognized_sep--enable-$ac_useropt_orig" + ac_unrecognized_sep=', ';; + esac + eval enable_$ac_useropt=\$ac_optarg ;; + + -exec-prefix | --exec_prefix | --exec-prefix | --exec-prefi \ + | --exec-pref | --exec-pre | --exec-pr | --exec-p | --exec- \ + | --exec | --exe | --ex) + ac_prev=exec_prefix ;; + -exec-prefix=* | --exec_prefix=* | --exec-prefix=* | --exec-prefi=* \ + | --exec-pref=* | --exec-pre=* | --exec-pr=* | --exec-p=* | --exec-=* \ + | --exec=* | --exe=* | --ex=*) + exec_prefix=$ac_optarg ;; + + -gas | --gas | --ga | --g) + # Obsolete; use --with-gas. + with_gas=yes ;; + + -help | --help | --hel | --he | -h) + ac_init_help=long ;; + -help=r* | --help=r* | --hel=r* | --he=r* | -hr*) + ac_init_help=recursive ;; + -help=s* | --help=s* | --hel=s* | --he=s* | -hs*) + ac_init_help=short ;; + + -host | --host | --hos | --ho) + ac_prev=host_alias ;; + -host=* | --host=* | --hos=* | --ho=*) + host_alias=$ac_optarg ;; + + -htmldir | --htmldir | --htmldi | --htmld | --html | --htm | --ht) + ac_prev=htmldir ;; + -htmldir=* | --htmldir=* | --htmldi=* | --htmld=* | --html=* | --htm=* \ + | --ht=*) + htmldir=$ac_optarg ;; + + -includedir | --includedir | --includedi | --included | --include \ + | --includ | --inclu | --incl | --inc) + ac_prev=includedir ;; + -includedir=* | --includedir=* | --includedi=* | --included=* | --include=* \ + | --includ=* | --inclu=* | --incl=* | --inc=*) + includedir=$ac_optarg ;; + + -infodir | --infodir | --infodi | --infod | --info | --inf) + ac_prev=infodir ;; + -infodir=* | --infodir=* | --infodi=* | --infod=* | --info=* | --inf=*) + infodir=$ac_optarg ;; + + -libdir | --libdir | --libdi | --libd) + ac_prev=libdir ;; + -libdir=* | --libdir=* | --libdi=* | --libd=*) + libdir=$ac_optarg ;; + + -libexecdir 
| --libexecdir | --libexecdi | --libexecd | --libexec \ + | --libexe | --libex | --libe) + ac_prev=libexecdir ;; + -libexecdir=* | --libexecdir=* | --libexecdi=* | --libexecd=* | --libexec=* \ + | --libexe=* | --libex=* | --libe=*) + libexecdir=$ac_optarg ;; + + -localedir | --localedir | --localedi | --localed | --locale) + ac_prev=localedir ;; + -localedir=* | --localedir=* | --localedi=* | --localed=* | --locale=*) + localedir=$ac_optarg ;; + + -localstatedir | --localstatedir | --localstatedi | --localstated \ + | --localstate | --localstat | --localsta | --localst | --locals) + ac_prev=localstatedir ;; + -localstatedir=* | --localstatedir=* | --localstatedi=* | --localstated=* \ + | --localstate=* | --localstat=* | --localsta=* | --localst=* | --locals=*) + localstatedir=$ac_optarg ;; + + -mandir | --mandir | --mandi | --mand | --man | --ma | --m) + ac_prev=mandir ;; + -mandir=* | --mandir=* | --mandi=* | --mand=* | --man=* | --ma=* | --m=*) + mandir=$ac_optarg ;; + + -nfp | --nfp | --nf) + # Obsolete; use --without-fp. 
+ with_fp=no ;; + + -no-create | --no-create | --no-creat | --no-crea | --no-cre \ + | --no-cr | --no-c | -n) + no_create=yes ;; + + -no-recursion | --no-recursion | --no-recursio | --no-recursi \ + | --no-recurs | --no-recur | --no-recu | --no-rec | --no-re | --no-r) + no_recursion=yes ;; + + -oldincludedir | --oldincludedir | --oldincludedi | --oldincluded \ + | --oldinclude | --oldinclud | --oldinclu | --oldincl | --oldinc \ + | --oldin | --oldi | --old | --ol | --o) + ac_prev=oldincludedir ;; + -oldincludedir=* | --oldincludedir=* | --oldincludedi=* | --oldincluded=* \ + | --oldinclude=* | --oldinclud=* | --oldinclu=* | --oldincl=* | --oldinc=* \ + | --oldin=* | --oldi=* | --old=* | --ol=* | --o=*) + oldincludedir=$ac_optarg ;; + + -prefix | --prefix | --prefi | --pref | --pre | --pr | --p) + ac_prev=prefix ;; + -prefix=* | --prefix=* | --prefi=* | --pref=* | --pre=* | --pr=* | --p=*) + prefix=$ac_optarg ;; + + -program-prefix | --program-prefix | --program-prefi | --program-pref \ + | --program-pre | --program-pr | --program-p) + ac_prev=program_prefix ;; + -program-prefix=* | --program-prefix=* | --program-prefi=* \ + | --program-pref=* | --program-pre=* | --program-pr=* | --program-p=*) + program_prefix=$ac_optarg ;; + + -program-suffix | --program-suffix | --program-suffi | --program-suff \ + | --program-suf | --program-su | --program-s) + ac_prev=program_suffix ;; + -program-suffix=* | --program-suffix=* | --program-suffi=* \ + | --program-suff=* | --program-suf=* | --program-su=* | --program-s=*) + program_suffix=$ac_optarg ;; + + -program-transform-name | --program-transform-name \ + | --program-transform-nam | --program-transform-na \ + | --program-transform-n | --program-transform- \ + | --program-transform | --program-transfor \ + | --program-transfo | --program-transf \ + | --program-trans | --program-tran \ + | --progr-tra | --program-tr | --program-t) + ac_prev=program_transform_name ;; + -program-transform-name=* | --program-transform-name=* \ + | 
--program-transform-nam=* | --program-transform-na=* \ + | --program-transform-n=* | --program-transform-=* \ + | --program-transform=* | --program-transfor=* \ + | --program-transfo=* | --program-transf=* \ + | --program-trans=* | --program-tran=* \ + | --progr-tra=* | --program-tr=* | --program-t=*) + program_transform_name=$ac_optarg ;; + + -pdfdir | --pdfdir | --pdfdi | --pdfd | --pdf | --pd) + ac_prev=pdfdir ;; + -pdfdir=* | --pdfdir=* | --pdfdi=* | --pdfd=* | --pdf=* | --pd=*) + pdfdir=$ac_optarg ;; + + -psdir | --psdir | --psdi | --psd | --ps) + ac_prev=psdir ;; + -psdir=* | --psdir=* | --psdi=* | --psd=* | --ps=*) + psdir=$ac_optarg ;; + + -q | -quiet | --quiet | --quie | --qui | --qu | --q \ + | -silent | --silent | --silen | --sile | --sil) + silent=yes ;; + + -sbindir | --sbindir | --sbindi | --sbind | --sbin | --sbi | --sb) + ac_prev=sbindir ;; + -sbindir=* | --sbindir=* | --sbindi=* | --sbind=* | --sbin=* \ + | --sbi=* | --sb=*) + sbindir=$ac_optarg ;; + + -sharedstatedir | --sharedstatedir | --sharedstatedi \ + | --sharedstated | --sharedstate | --sharedstat | --sharedsta \ + | --sharedst | --shareds | --shared | --share | --shar \ + | --sha | --sh) + ac_prev=sharedstatedir ;; + -sharedstatedir=* | --sharedstatedir=* | --sharedstatedi=* \ + | --sharedstated=* | --sharedstate=* | --sharedstat=* | --sharedsta=* \ + | --sharedst=* | --shareds=* | --shared=* | --share=* | --shar=* \ + | --sha=* | --sh=*) + sharedstatedir=$ac_optarg ;; + + -site | --site | --sit) + ac_prev=site ;; + -site=* | --site=* | --sit=*) + site=$ac_optarg ;; + + -srcdir | --srcdir | --srcdi | --srcd | --src | --sr) + ac_prev=srcdir ;; + -srcdir=* | --srcdir=* | --srcdi=* | --srcd=* | --src=* | --sr=*) + srcdir=$ac_optarg ;; + + -sysconfdir | --sysconfdir | --sysconfdi | --sysconfd | --sysconf \ + | --syscon | --sysco | --sysc | --sys | --sy) + ac_prev=sysconfdir ;; + -sysconfdir=* | --sysconfdir=* | --sysconfdi=* | --sysconfd=* | --sysconf=* \ + | --syscon=* | --sysco=* | --sysc=* 
| --sys=* | --sy=*) + sysconfdir=$ac_optarg ;; + + -target | --target | --targe | --targ | --tar | --ta | --t) + ac_prev=target_alias ;; + -target=* | --target=* | --targe=* | --targ=* | --tar=* | --ta=* | --t=*) + target_alias=$ac_optarg ;; + + -v | -verbose | --verbose | --verbos | --verbo | --verb) + verbose=yes ;; + + -version | --version | --versio | --versi | --vers | -V) + ac_init_version=: ;; + + -with-* | --with-*) + ac_useropt=`expr "x$ac_option" : 'x-*with-\([^=]*\)'` + # Reject names that are not valid shell variable names. + expr "x$ac_useropt" : ".*[^-+._$as_cr_alnum]" >/dev/null && + as_fn_error $? "invalid package name: $ac_useropt" + ac_useropt_orig=$ac_useropt + ac_useropt=`$as_echo "$ac_useropt" | sed 's/[-+.]/_/g'` + case $ac_user_opts in + *" +"with_$ac_useropt" +"*) ;; + *) ac_unrecognized_opts="$ac_unrecognized_opts$ac_unrecognized_sep--with-$ac_useropt_orig" + ac_unrecognized_sep=', ';; + esac + eval with_$ac_useropt=\$ac_optarg ;; + + -without-* | --without-*) + ac_useropt=`expr "x$ac_option" : 'x-*without-\(.*\)'` + # Reject names that are not valid shell variable names. + expr "x$ac_useropt" : ".*[^-+._$as_cr_alnum]" >/dev/null && + as_fn_error $? "invalid package name: $ac_useropt" + ac_useropt_orig=$ac_useropt + ac_useropt=`$as_echo "$ac_useropt" | sed 's/[-+.]/_/g'` + case $ac_user_opts in + *" +"with_$ac_useropt" +"*) ;; + *) ac_unrecognized_opts="$ac_unrecognized_opts$ac_unrecognized_sep--without-$ac_useropt_orig" + ac_unrecognized_sep=', ';; + esac + eval with_$ac_useropt=no ;; + + --x) + # Obsolete; use --with-x. 
+ with_x=yes ;; + + -x-includes | --x-includes | --x-include | --x-includ | --x-inclu \ + | --x-incl | --x-inc | --x-in | --x-i) + ac_prev=x_includes ;; + -x-includes=* | --x-includes=* | --x-include=* | --x-includ=* | --x-inclu=* \ + | --x-incl=* | --x-inc=* | --x-in=* | --x-i=*) + x_includes=$ac_optarg ;; + + -x-libraries | --x-libraries | --x-librarie | --x-librari \ + | --x-librar | --x-libra | --x-libr | --x-lib | --x-li | --x-l) + ac_prev=x_libraries ;; + -x-libraries=* | --x-libraries=* | --x-librarie=* | --x-librari=* \ + | --x-librar=* | --x-libra=* | --x-libr=* | --x-lib=* | --x-li=* | --x-l=*) + x_libraries=$ac_optarg ;; + + -*) as_fn_error $? "unrecognized option: \`$ac_option' +Try \`$0 --help' for more information" + ;; + + *=*) + ac_envvar=`expr "x$ac_option" : 'x\([^=]*\)='` + # Reject names that are not valid shell variable names. + case $ac_envvar in #( + '' | [0-9]* | *[!_$as_cr_alnum]* ) + as_fn_error $? "invalid variable name: \`$ac_envvar'" ;; + esac + eval $ac_envvar=\$ac_optarg + export $ac_envvar ;; + + *) + # FIXME: should be removed in autoconf 3.0. + $as_echo "$as_me: WARNING: you should use --build, --host, --target" >&2 + expr "x$ac_option" : ".*[^-._$as_cr_alnum]" >/dev/null && + $as_echo "$as_me: WARNING: invalid host type: $ac_option" >&2 + : "${build_alias=$ac_option} ${host_alias=$ac_option} ${target_alias=$ac_option}" + ;; + + esac +done + +if test -n "$ac_prev"; then + ac_option=--`echo $ac_prev | sed 's/_/-/g'` + as_fn_error $? "missing argument to $ac_option" +fi + +if test -n "$ac_unrecognized_opts"; then + case $enable_option_checking in + no) ;; + fatal) as_fn_error $? "unrecognized options: $ac_unrecognized_opts" ;; + *) $as_echo "$as_me: WARNING: unrecognized options: $ac_unrecognized_opts" >&2 ;; + esac +fi + +# Check all directory arguments for consistency. 
+for ac_var in exec_prefix prefix bindir sbindir libexecdir datarootdir \ + datadir sysconfdir sharedstatedir localstatedir includedir \ + oldincludedir docdir infodir htmldir dvidir pdfdir psdir \ + libdir localedir mandir +do + eval ac_val=\$$ac_var + # Remove trailing slashes. + case $ac_val in + */ ) + ac_val=`expr "X$ac_val" : 'X\(.*[^/]\)' \| "X$ac_val" : 'X\(.*\)'` + eval $ac_var=\$ac_val;; + esac + # Be sure to have absolute directory names. + case $ac_val in + [\\/$]* | ?:[\\/]* ) continue;; + NONE | '' ) case $ac_var in *prefix ) continue;; esac;; + esac + as_fn_error $? "expected an absolute directory name for --$ac_var: $ac_val" +done + +# There might be people who depend on the old broken behavior: `$host' +# used to hold the argument of --host etc. +# FIXME: To remove some day. +build=$build_alias +host=$host_alias +target=$target_alias + +# FIXME: To remove some day. +if test "x$host_alias" != x; then + if test "x$build_alias" = x; then + cross_compiling=maybe + $as_echo "$as_me: WARNING: if you wanted to set the --build type, don't use --host. + If a cross compiler is detected then cross compile mode will be used" >&2 + elif test "x$build_alias" != "x$host_alias"; then + cross_compiling=yes + fi +fi + +ac_tool_prefix= +test -n "$host_alias" && ac_tool_prefix=$host_alias- + +test "$silent" = yes && exec 6>/dev/null + + +ac_pwd=`pwd` && test -n "$ac_pwd" && +ac_ls_di=`ls -di .` && +ac_pwd_ls_di=`cd "$ac_pwd" && ls -di .` || + as_fn_error $? "working directory cannot be determined" +test "X$ac_ls_di" = "X$ac_pwd_ls_di" || + as_fn_error $? "pwd does not report name of working directory" + + +# Find the source files, if location was not specified. +if test -z "$srcdir"; then + ac_srcdir_defaulted=yes + # Try the directory containing this script, then the parent directory. 
+ ac_confdir=`$as_dirname -- "$as_myself" || +$as_expr X"$as_myself" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ + X"$as_myself" : 'X\(//\)[^/]' \| \ + X"$as_myself" : 'X\(//\)$' \| \ + X"$as_myself" : 'X\(/\)' \| . 2>/dev/null || +$as_echo X"$as_myself" | + sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ + s//\1/ + q + } + /^X\(\/\/\)[^/].*/{ + s//\1/ + q + } + /^X\(\/\/\)$/{ + s//\1/ + q + } + /^X\(\/\).*/{ + s//\1/ + q + } + s/.*/./; q'` + srcdir=$ac_confdir + if test ! -r "$srcdir/$ac_unique_file"; then + srcdir=.. + fi +else + ac_srcdir_defaulted=no +fi +if test ! -r "$srcdir/$ac_unique_file"; then + test "$ac_srcdir_defaulted" = yes && srcdir="$ac_confdir or .." + as_fn_error $? "cannot find sources ($ac_unique_file) in $srcdir" +fi +ac_msg="sources are in $srcdir, but \`cd $srcdir' does not work" +ac_abs_confdir=`( + cd "$srcdir" && test -r "./$ac_unique_file" || as_fn_error $? "$ac_msg" + pwd)` +# When building in place, set srcdir=. +if test "$ac_abs_confdir" = "$ac_pwd"; then + srcdir=. +fi +# Remove unnecessary trailing slashes from srcdir. +# Double slashes in file names in object file debugging info +# mess up M-x gdb in Emacs. +case $srcdir in +*/) srcdir=`expr "X$srcdir" : 'X\(.*[^/]\)' \| "X$srcdir" : 'X\(.*\)'`;; +esac +for ac_var in $ac_precious_vars; do + eval ac_env_${ac_var}_set=\${${ac_var}+set} + eval ac_env_${ac_var}_value=\$${ac_var} + eval ac_cv_env_${ac_var}_set=\${${ac_var}+set} + eval ac_cv_env_${ac_var}_value=\$${ac_var} +done + +# +# Report the --help message. +# +if test "$ac_init_help" = "long"; then + # Omit some internal or obsolete options to make the list less imposing. + # This message is too long to be a string in the A/UX 3.1 sh. + cat <<_ACEOF +\`configure' configures HTSlib 1.2.1 to adapt to many kinds of systems. + +Usage: $0 [OPTION]... [VAR=VALUE]... + +To assign environment variables (e.g., CC, CFLAGS...), specify them as +VAR=VALUE. See below for descriptions of some of the useful variables. 
+ +Defaults for the options are specified in brackets. + +Configuration: + -h, --help display this help and exit + --help=short display options specific to this package + --help=recursive display the short help of all the included packages + -V, --version display version information and exit + -q, --quiet, --silent do not print \`checking ...' messages + --cache-file=FILE cache test results in FILE [disabled] + -C, --config-cache alias for \`--cache-file=config.cache' + -n, --no-create do not create output files + --srcdir=DIR find the sources in DIR [configure dir or \`..'] + +Installation directories: + --prefix=PREFIX install architecture-independent files in PREFIX + [$ac_default_prefix] + --exec-prefix=EPREFIX install architecture-dependent files in EPREFIX + [PREFIX] + +By default, \`make install' will install all the files in +\`$ac_default_prefix/bin', \`$ac_default_prefix/lib' etc. You can specify +an installation prefix other than \`$ac_default_prefix' using \`--prefix', +for instance \`--prefix=\$HOME'. + +For better control, use the options below. 
+ +Fine tuning of the installation directories: + --bindir=DIR user executables [EPREFIX/bin] + --sbindir=DIR system admin executables [EPREFIX/sbin] + --libexecdir=DIR program executables [EPREFIX/libexec] + --sysconfdir=DIR read-only single-machine data [PREFIX/etc] + --sharedstatedir=DIR modifiable architecture-independent data [PREFIX/com] + --localstatedir=DIR modifiable single-machine data [PREFIX/var] + --libdir=DIR object code libraries [EPREFIX/lib] + --includedir=DIR C header files [PREFIX/include] + --oldincludedir=DIR C header files for non-gcc [/usr/include] + --datarootdir=DIR read-only arch.-independent data root [PREFIX/share] + --datadir=DIR read-only architecture-independent data [DATAROOTDIR] + --infodir=DIR info documentation [DATAROOTDIR/info] + --localedir=DIR locale-dependent data [DATAROOTDIR/locale] + --mandir=DIR man documentation [DATAROOTDIR/man] + --docdir=DIR documentation root [DATAROOTDIR/doc/htslib] + --htmldir=DIR html documentation [DOCDIR] + --dvidir=DIR dvi documentation [DOCDIR] + --pdfdir=DIR pdf documentation [DOCDIR] + --psdir=DIR ps documentation [DOCDIR] +_ACEOF + + cat <<\_ACEOF +_ACEOF +fi + +if test -n "$ac_init_help"; then + case $ac_init_help in + short | recursive ) echo "Configuration of HTSlib 1.2.1:";; + esac + cat <<\_ACEOF + +Optional Packages: + --with-PACKAGE[=ARG] use PACKAGE [ARG=yes] + --without-PACKAGE do not use PACKAGE (same as --with-PACKAGE=no) + --with-irods[=DIR] use RodsAPIs library (in DIR) to support iRODS URLs + +Some influential environment variables: + CC C compiler command + CFLAGS C compiler flags + LDFLAGS linker flags, e.g. -L<lib dir> if you have libraries in a + nonstandard directory <lib dir> + LIBS libraries to pass to the linker, e.g. -l<library> + CPPFLAGS (Objective) C/C++ preprocessor flags, e.g. -I<include dir> if + you have headers in a nonstandard directory <include dir> + +Use these variables to override the choices made by `configure' or to help +it to find libraries and programs with nonstandard names/locations. 
+ +Report bugs to <samtools-help@lists.sourceforge.net>. +HTSlib home page: <http://www.htslib.org/>. +_ACEOF +ac_status=$? +fi + +if test "$ac_init_help" = "recursive"; then + # If there are subdirs, report their specific --help. + for ac_dir in : $ac_subdirs_all; do test "x$ac_dir" = x: && continue + test -d "$ac_dir" || + { cd "$srcdir" && ac_pwd=`pwd` && srcdir=. && test -d "$ac_dir"; } || + continue + ac_builddir=. + +case "$ac_dir" in +.) ac_dir_suffix= ac_top_builddir_sub=. ac_top_build_prefix= ;; +*) + ac_dir_suffix=/`$as_echo "$ac_dir" | sed 's|^\.[\\/]||'` + # A ".." for each directory in $ac_dir_suffix. + ac_top_builddir_sub=`$as_echo "$ac_dir_suffix" | sed 's|/[^\\/]*|/..|g;s|/||'` + case $ac_top_builddir_sub in + "") ac_top_builddir_sub=. ac_top_build_prefix= ;; + *) ac_top_build_prefix=$ac_top_builddir_sub/ ;; + esac ;; +esac +ac_abs_top_builddir=$ac_pwd +ac_abs_builddir=$ac_pwd$ac_dir_suffix +# for backward compatibility: +ac_top_builddir=$ac_top_build_prefix + +case $srcdir in + .) # We are building in place. + ac_srcdir=. + ac_top_srcdir=$ac_top_builddir_sub + ac_abs_top_srcdir=$ac_pwd ;; + [\\/]* | ?:[\\/]* ) # Absolute name. + ac_srcdir=$srcdir$ac_dir_suffix; + ac_top_srcdir=$srcdir + ac_abs_top_srcdir=$srcdir ;; + *) # Relative name. + ac_srcdir=$ac_top_build_prefix$srcdir$ac_dir_suffix + ac_top_srcdir=$ac_top_build_prefix$srcdir + ac_abs_top_srcdir=$ac_pwd/$srcdir ;; +esac +ac_abs_srcdir=$ac_abs_top_srcdir$ac_dir_suffix + + cd "$ac_dir" || { ac_status=$?; continue; } + # Check for guested configure. + if test -f "$ac_srcdir/configure.gnu"; then + echo && + $SHELL "$ac_srcdir/configure.gnu" --help=recursive + elif test -f "$ac_srcdir/configure"; then + echo && + $SHELL "$ac_srcdir/configure" --help=recursive + else + $as_echo "$as_me: WARNING: no configuration information is in $ac_dir" >&2 + fi || ac_status=$? 
+ cd "$ac_pwd" || { ac_status=$?; break; } + done +fi + +test -n "$ac_init_help" && exit $ac_status +if $ac_init_version; then + cat <<\_ACEOF +HTSlib configure 1.2.1 +generated by GNU Autoconf 2.68 + +Copyright (C) 2010 Free Software Foundation, Inc. +This configure script is free software; the Free Software Foundation +gives unlimited permission to copy, distribute and modify it. + +Portions copyright (C) 2015 Genome Research Ltd. + +This configure script is free software: you are free to change and +redistribute it. There is NO WARRANTY, to the extent permitted by law. +_ACEOF + exit +fi + +## ------------------------ ## +## Autoconf initialization. ## +## ------------------------ ## + +# ac_fn_c_try_compile LINENO +# -------------------------- +# Try to compile conftest.$ac_ext, and return whether this succeeded. +ac_fn_c_try_compile () +{ + as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack + rm -f conftest.$ac_objext + if { { ac_try="$ac_compile" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" +$as_echo "$ac_try_echo"; } >&5 + (eval "$ac_compile") 2>conftest.err + ac_status=$? + if test -s conftest.err; then + grep -v '^ *+' conftest.err >conftest.er1 + cat conftest.er1 >&5 + mv -f conftest.er1 conftest.err + fi + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; } && { + test -z "$ac_c_werror_flag" || + test ! 
-s conftest.err + } && test -s conftest.$ac_objext; then : + ac_retval=0 +else + $as_echo "$as_me: failed program was:" >&5 +sed 's/^/| /' conftest.$ac_ext >&5 + + ac_retval=1 +fi + eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno + as_fn_set_status $ac_retval + +} # ac_fn_c_try_compile + +# ac_fn_c_check_header_compile LINENO HEADER VAR INCLUDES +# ------------------------------------------------------- +# Tests whether HEADER exists and can be compiled using the include files in +# INCLUDES, setting the cache variable VAR accordingly. +ac_fn_c_check_header_compile () +{ + as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $2" >&5 +$as_echo_n "checking for $2... " >&6; } +if eval \${$3+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +$4 +#include <$2> +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + eval "$3=yes" +else + eval "$3=no" +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +fi +eval ac_res=\$$3 + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5 +$as_echo "$ac_res" >&6; } + eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno + +} # ac_fn_c_check_header_compile + +# ac_fn_c_try_link LINENO +# ----------------------- +# Try to link conftest.$ac_ext, and return whether this succeeded. +ac_fn_c_try_link () +{ + as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack + rm -f conftest.$ac_objext conftest$ac_exeext + if { { ac_try="$ac_link" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" +$as_echo "$ac_try_echo"; } >&5 + (eval "$ac_link") 2>conftest.err + ac_status=$? 
+ if test -s conftest.err; then + grep -v '^ *+' conftest.err >conftest.er1 + cat conftest.er1 >&5 + mv -f conftest.er1 conftest.err + fi + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; } && { + test -z "$ac_c_werror_flag" || + test ! -s conftest.err + } && test -s conftest$ac_exeext && { + test "$cross_compiling" = yes || + $as_test_x conftest$ac_exeext + }; then : + ac_retval=0 +else + $as_echo "$as_me: failed program was:" >&5 +sed 's/^/| /' conftest.$ac_ext >&5 + + ac_retval=1 +fi + # Delete the IPA/IPO (Inter Procedural Analysis/Optimization) information + # created by the PGI compiler (conftest_ipa8_conftest.oo), as it would + # interfere with the next link command; also delete a directory that is + # left behind by Apple's compiler. We do this before executing the actions. + rm -rf conftest.dSYM conftest_ipa8_conftest.oo + eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno + as_fn_set_status $ac_retval + +} # ac_fn_c_try_link +cat >config.log <<_ACEOF +This file contains any messages produced by compilers while +running configure, to aid debugging if configure makes a mistake. + +It was created by HTSlib $as_me 1.2.1, which was +generated by GNU Autoconf 2.68. Invocation command line was + + $ $0 $@ + +_ACEOF +exec 5>>config.log +{ +cat <<_ASUNAME +## --------- ## +## Platform. 
## +## --------- ## + +hostname = `(hostname || uname -n) 2>/dev/null | sed 1q` +uname -m = `(uname -m) 2>/dev/null || echo unknown` +uname -r = `(uname -r) 2>/dev/null || echo unknown` +uname -s = `(uname -s) 2>/dev/null || echo unknown` +uname -v = `(uname -v) 2>/dev/null || echo unknown` + +/usr/bin/uname -p = `(/usr/bin/uname -p) 2>/dev/null || echo unknown` +/bin/uname -X = `(/bin/uname -X) 2>/dev/null || echo unknown` + +/bin/arch = `(/bin/arch) 2>/dev/null || echo unknown` +/usr/bin/arch -k = `(/usr/bin/arch -k) 2>/dev/null || echo unknown` +/usr/convex/getsysinfo = `(/usr/convex/getsysinfo) 2>/dev/null || echo unknown` +/usr/bin/hostinfo = `(/usr/bin/hostinfo) 2>/dev/null || echo unknown` +/bin/machine = `(/bin/machine) 2>/dev/null || echo unknown` +/usr/bin/oslevel = `(/usr/bin/oslevel) 2>/dev/null || echo unknown` +/bin/universe = `(/bin/universe) 2>/dev/null || echo unknown` + +_ASUNAME + +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + $as_echo "PATH: $as_dir" + done +IFS=$as_save_IFS + +} >&5 + +cat >&5 <<_ACEOF + + +## ----------- ## +## Core tests. ## +## ----------- ## + +_ACEOF + + +# Keep a trace of the command line. +# Strip out --no-create and --no-recursion so they do not pile up. +# Strip out --silent because we don't want to record it for future runs. +# Also quote any args containing shell meta-characters. +# Make two passes to allow for proper duplicate-argument suppression. 
+ac_configure_args= +ac_configure_args0= +ac_configure_args1= +ac_must_keep_next=false +for ac_pass in 1 2 +do + for ac_arg + do + case $ac_arg in + -no-create | --no-c* | -n | -no-recursion | --no-r*) continue ;; + -q | -quiet | --quiet | --quie | --qui | --qu | --q \ + | -silent | --silent | --silen | --sile | --sil) + continue ;; + *\'*) + ac_arg=`$as_echo "$ac_arg" | sed "s/'/'\\\\\\\\''/g"` ;; + esac + case $ac_pass in + 1) as_fn_append ac_configure_args0 " '$ac_arg'" ;; + 2) + as_fn_append ac_configure_args1 " '$ac_arg'" + if test $ac_must_keep_next = true; then + ac_must_keep_next=false # Got value, back to normal. + else + case $ac_arg in + *=* | --config-cache | -C | -disable-* | --disable-* \ + | -enable-* | --enable-* | -gas | --g* | -nfp | --nf* \ + | -q | -quiet | --q* | -silent | --sil* | -v | -verb* \ + | -with-* | --with-* | -without-* | --without-* | --x) + case "$ac_configure_args0 " in + "$ac_configure_args1"*" '$ac_arg' "* ) continue ;; + esac + ;; + -* ) ac_must_keep_next=true ;; + esac + fi + as_fn_append ac_configure_args " '$ac_arg'" + ;; + esac + done +done +{ ac_configure_args0=; unset ac_configure_args0;} +{ ac_configure_args1=; unset ac_configure_args1;} + +# When interrupted or exit'd, cleanup temporary files, and complete +# config.log. We remove comments because anyway the quotes in there +# would cause problems or look ugly. +# WARNING: Use '\'' to represent an apostrophe within the trap. +# WARNING: Do not start the trap code with a newline, due to a FreeBSD 4.0 bug. +trap 'exit_status=$? + # Save into config.log some information that might help in debugging. + { + echo + + $as_echo "## ---------------- ## +## Cache variables. 
## +## ---------------- ##" + echo + # The following way of writing the cache mishandles newlines in values, +( + for ac_var in `(set) 2>&1 | sed -n '\''s/^\([a-zA-Z_][a-zA-Z0-9_]*\)=.*/\1/p'\''`; do + eval ac_val=\$$ac_var + case $ac_val in #( + *${as_nl}*) + case $ac_var in #( + *_cv_*) { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: cache variable $ac_var contains a newline" >&5 +$as_echo "$as_me: WARNING: cache variable $ac_var contains a newline" >&2;} ;; + esac + case $ac_var in #( + _ | IFS | as_nl) ;; #( + BASH_ARGV | BASH_SOURCE) eval $ac_var= ;; #( + *) { eval $ac_var=; unset $ac_var;} ;; + esac ;; + esac + done + (set) 2>&1 | + case $as_nl`(ac_space='\'' '\''; set) 2>&1` in #( + *${as_nl}ac_space=\ *) + sed -n \ + "s/'\''/'\''\\\\'\'''\''/g; + s/^\\([_$as_cr_alnum]*_cv_[_$as_cr_alnum]*\\)=\\(.*\\)/\\1='\''\\2'\''/p" + ;; #( + *) + sed -n "/^[_$as_cr_alnum]*_cv_[_$as_cr_alnum]*=/p" + ;; + esac | + sort +) + echo + + $as_echo "## ----------------- ## +## Output variables. ## +## ----------------- ##" + echo + for ac_var in $ac_subst_vars + do + eval ac_val=\$$ac_var + case $ac_val in + *\'\''*) ac_val=`$as_echo "$ac_val" | sed "s/'\''/'\''\\\\\\\\'\'''\''/g"`;; + esac + $as_echo "$ac_var='\''$ac_val'\''" + done | sort + echo + + if test -n "$ac_subst_files"; then + $as_echo "## ------------------- ## +## File substitutions. ## +## ------------------- ##" + echo + for ac_var in $ac_subst_files + do + eval ac_val=\$$ac_var + case $ac_val in + *\'\''*) ac_val=`$as_echo "$ac_val" | sed "s/'\''/'\''\\\\\\\\'\'''\''/g"`;; + esac + $as_echo "$ac_var='\''$ac_val'\''" + done | sort + echo + fi + + if test -s confdefs.h; then + $as_echo "## ----------- ## +## confdefs.h. 
## +## ----------- ##" + echo + cat confdefs.h + echo + fi + test "$ac_signal" != 0 && + $as_echo "$as_me: caught signal $ac_signal" + $as_echo "$as_me: exit $exit_status" + } >&5 + rm -f core *.core core.conftest.* && + rm -f -r conftest* confdefs* conf$$* $ac_clean_files && + exit $exit_status +' 0 +for ac_signal in 1 2 13 15; do + trap 'ac_signal='$ac_signal'; as_fn_exit 1' $ac_signal +done +ac_signal=0 + +# confdefs.h avoids OS command line length limits that DEFS can exceed. +rm -f -r conftest* confdefs.h + +$as_echo "/* confdefs.h */" > confdefs.h + +# Predefined preprocessor variables. + +cat >>confdefs.h <<_ACEOF +#define PACKAGE_NAME "$PACKAGE_NAME" +_ACEOF + +cat >>confdefs.h <<_ACEOF +#define PACKAGE_TARNAME "$PACKAGE_TARNAME" +_ACEOF + +cat >>confdefs.h <<_ACEOF +#define PACKAGE_VERSION "$PACKAGE_VERSION" +_ACEOF + +cat >>confdefs.h <<_ACEOF +#define PACKAGE_STRING "$PACKAGE_STRING" +_ACEOF + +cat >>confdefs.h <<_ACEOF +#define PACKAGE_BUGREPORT "$PACKAGE_BUGREPORT" +_ACEOF + +cat >>confdefs.h <<_ACEOF +#define PACKAGE_URL "$PACKAGE_URL" +_ACEOF + + +# Let the site file select an alternate cache file if it wants to. +# Prefer an explicitly selected file to automatically selected ones. +ac_site_file1=NONE +ac_site_file2=NONE +if test -n "$CONFIG_SITE"; then + # We do not want a PATH search for config.site. 
+ case $CONFIG_SITE in #(( + -*) ac_site_file1=./$CONFIG_SITE;; + */*) ac_site_file1=$CONFIG_SITE;; + *) ac_site_file1=./$CONFIG_SITE;; + esac +elif test "x$prefix" != xNONE; then + ac_site_file1=$prefix/share/config.site + ac_site_file2=$prefix/etc/config.site +else + ac_site_file1=$ac_default_prefix/share/config.site + ac_site_file2=$ac_default_prefix/etc/config.site +fi +for ac_site_file in "$ac_site_file1" "$ac_site_file2" +do + test "x$ac_site_file" = xNONE && continue + if test /dev/null != "$ac_site_file" && test -r "$ac_site_file"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: loading site script $ac_site_file" >&5 +$as_echo "$as_me: loading site script $ac_site_file" >&6;} + sed 's/^/| /' "$ac_site_file" >&5 + . "$ac_site_file" \ + || { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 +$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} +as_fn_error $? "failed to load site script $ac_site_file +See \`config.log' for more details" "$LINENO" 5; } + fi +done + +if test -r "$cache_file"; then + # Some versions of bash will fail to source /dev/null (special files + # actually), so we avoid doing that. DJGPP emulates it as a regular file. + if test /dev/null != "$cache_file" && test -f "$cache_file"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: loading cache $cache_file" >&5 +$as_echo "$as_me: loading cache $cache_file" >&6;} + case $cache_file in + [\\/]* | ?:[\\/]* ) . "$cache_file";; + *) . "./$cache_file";; + esac + fi +else + { $as_echo "$as_me:${as_lineno-$LINENO}: creating cache $cache_file" >&5 +$as_echo "$as_me: creating cache $cache_file" >&6;} + >$cache_file +fi + +# Check that the precious variables saved in the cache have kept the same +# value. 
+ac_cache_corrupted=false +for ac_var in $ac_precious_vars; do + eval ac_old_set=\$ac_cv_env_${ac_var}_set + eval ac_new_set=\$ac_env_${ac_var}_set + eval ac_old_val=\$ac_cv_env_${ac_var}_value + eval ac_new_val=\$ac_env_${ac_var}_value + case $ac_old_set,$ac_new_set in + set,) + { $as_echo "$as_me:${as_lineno-$LINENO}: error: \`$ac_var' was set to \`$ac_old_val' in the previous run" >&5 +$as_echo "$as_me: error: \`$ac_var' was set to \`$ac_old_val' in the previous run" >&2;} + ac_cache_corrupted=: ;; + ,set) + { $as_echo "$as_me:${as_lineno-$LINENO}: error: \`$ac_var' was not set in the previous run" >&5 +$as_echo "$as_me: error: \`$ac_var' was not set in the previous run" >&2;} + ac_cache_corrupted=: ;; + ,);; + *) + if test "x$ac_old_val" != "x$ac_new_val"; then + # differences in whitespace do not lead to failure. + ac_old_val_w=`echo x $ac_old_val` + ac_new_val_w=`echo x $ac_new_val` + if test "$ac_old_val_w" != "$ac_new_val_w"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: error: \`$ac_var' has changed since the previous run:" >&5 +$as_echo "$as_me: error: \`$ac_var' has changed since the previous run:" >&2;} + ac_cache_corrupted=: + else + { $as_echo "$as_me:${as_lineno-$LINENO}: warning: ignoring whitespace changes in \`$ac_var' since the previous run:" >&5 +$as_echo "$as_me: warning: ignoring whitespace changes in \`$ac_var' since the previous run:" >&2;} + eval $ac_var=\$ac_old_val + fi + { $as_echo "$as_me:${as_lineno-$LINENO}: former value: \`$ac_old_val'" >&5 +$as_echo "$as_me: former value: \`$ac_old_val'" >&2;} + { $as_echo "$as_me:${as_lineno-$LINENO}: current value: \`$ac_new_val'" >&5 +$as_echo "$as_me: current value: \`$ac_new_val'" >&2;} + fi;; + esac + # Pass precious variables to config.status. + if test "$ac_new_set" = set; then + case $ac_new_val in + *\'*) ac_arg=$ac_var=`$as_echo "$ac_new_val" | sed "s/'/'\\\\\\\\''/g"` ;; + *) ac_arg=$ac_var=$ac_new_val ;; + esac + case " $ac_configure_args " in + *" '$ac_arg' "*) ;; # Avoid dups. 
Use of quotes ensures accuracy. + *) as_fn_append ac_configure_args " '$ac_arg'" ;; + esac + fi +done +if $ac_cache_corrupted; then + { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 +$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} + { $as_echo "$as_me:${as_lineno-$LINENO}: error: changes in the environment can compromise the build" >&5 +$as_echo "$as_me: error: changes in the environment can compromise the build" >&2;} + as_fn_error $? "run \`make distclean' and/or \`rm $cache_file' and start over" "$LINENO" 5 +fi +## -------------------- ## +## Main body of script. ## +## -------------------- ## + +ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu + + + + + + +ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu +if test -n "$ac_tool_prefix"; then + # Extract the first word of "${ac_tool_prefix}gcc", so it can be a program name with args. +set dummy ${ac_tool_prefix}gcc; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_CC+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$CC"; then + ac_cv_prog_CC="$CC" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. 
+ for ac_exec_ext in '' $ac_executable_extensions; do + if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then + ac_cv_prog_CC="${ac_tool_prefix}gcc" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + +fi +fi +CC=$ac_cv_prog_CC +if test -n "$CC"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CC" >&5 +$as_echo "$CC" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + +fi +if test -z "$ac_cv_prog_CC"; then + ac_ct_CC=$CC + # Extract the first word of "gcc", so it can be a program name with args. +set dummy gcc; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_ac_ct_CC+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$ac_ct_CC"; then + ac_cv_prog_ac_ct_CC="$ac_ct_CC" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. 
+ for ac_exec_ext in '' $ac_executable_extensions; do + if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then + ac_cv_prog_ac_ct_CC="gcc" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + +fi +fi +ac_ct_CC=$ac_cv_prog_ac_ct_CC +if test -n "$ac_ct_CC"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_CC" >&5 +$as_echo "$ac_ct_CC" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + if test "x$ac_ct_CC" = x; then + CC="" + else + case $cross_compiling:$ac_tool_warned in +yes:) +{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5 +$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;} +ac_tool_warned=yes ;; +esac + CC=$ac_ct_CC + fi +else + CC="$ac_cv_prog_CC" +fi + +if test -z "$CC"; then + if test -n "$ac_tool_prefix"; then + # Extract the first word of "${ac_tool_prefix}cc", so it can be a program name with args. +set dummy ${ac_tool_prefix}cc; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_CC+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$CC"; then + ac_cv_prog_CC="$CC" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. 
+ for ac_exec_ext in '' $ac_executable_extensions; do + if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then + ac_cv_prog_CC="${ac_tool_prefix}cc" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + +fi +fi +CC=$ac_cv_prog_CC +if test -n "$CC"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CC" >&5 +$as_echo "$CC" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + + fi +fi +if test -z "$CC"; then + # Extract the first word of "cc", so it can be a program name with args. +set dummy cc; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_CC+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$CC"; then + ac_cv_prog_CC="$CC" # Let the user override the test. +else + ac_prog_rejected=no +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + for ac_exec_ext in '' $ac_executable_extensions; do + if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then + if test "$as_dir/$ac_word$ac_exec_ext" = "/usr/ucb/cc"; then + ac_prog_rejected=yes + continue + fi + ac_cv_prog_CC="cc" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + +if test $ac_prog_rejected = yes; then + # We found a bogon in the path, so make sure we never use it. + set dummy $ac_cv_prog_CC + shift + if test $# != 0; then + # We chose a different compiler from the bogus one. + # However, it has the same basename, so the bogon will be chosen + # first if we set CC to just the basename; use the full file name. 
+ shift + ac_cv_prog_CC="$as_dir/$ac_word${1+' '}$@" + fi +fi +fi +fi +CC=$ac_cv_prog_CC +if test -n "$CC"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CC" >&5 +$as_echo "$CC" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + +fi +if test -z "$CC"; then + if test -n "$ac_tool_prefix"; then + for ac_prog in cl.exe + do + # Extract the first word of "$ac_tool_prefix$ac_prog", so it can be a program name with args. +set dummy $ac_tool_prefix$ac_prog; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_CC+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$CC"; then + ac_cv_prog_CC="$CC" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + for ac_exec_ext in '' $ac_executable_extensions; do + if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then + ac_cv_prog_CC="$ac_tool_prefix$ac_prog" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + +fi +fi +CC=$ac_cv_prog_CC +if test -n "$CC"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CC" >&5 +$as_echo "$CC" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + + test -n "$CC" && break + done +fi +if test -z "$CC"; then + ac_ct_CC=$CC + for ac_prog in cl.exe +do + # Extract the first word of "$ac_prog", so it can be a program name with args. +set dummy $ac_prog; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_ac_ct_CC+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$ac_ct_CC"; then + ac_cv_prog_ac_ct_CC="$ac_ct_CC" # Let the user override the test. 
+else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + for ac_exec_ext in '' $ac_executable_extensions; do + if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then + ac_cv_prog_ac_ct_CC="$ac_prog" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + +fi +fi +ac_ct_CC=$ac_cv_prog_ac_ct_CC +if test -n "$ac_ct_CC"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_CC" >&5 +$as_echo "$ac_ct_CC" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + + test -n "$ac_ct_CC" && break +done + + if test "x$ac_ct_CC" = x; then + CC="" + else + case $cross_compiling:$ac_tool_warned in +yes:) +{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5 +$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;} +ac_tool_warned=yes ;; +esac + CC=$ac_ct_CC + fi +fi + +fi + + +test -z "$CC" && { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 +$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} +as_fn_error $? "no acceptable C compiler found in \$PATH +See \`config.log' for more details" "$LINENO" 5; } + +# Provide some information about the compiler. +$as_echo "$as_me:${as_lineno-$LINENO}: checking for C compiler version" >&5 +set X $ac_compile +ac_compiler=$2 +for ac_option in --version -v -V -qversion; do + { { ac_try="$ac_compiler $ac_option >&5" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" +$as_echo "$ac_try_echo"; } >&5 + (eval "$ac_compiler $ac_option >&5") 2>conftest.err + ac_status=$? + if test -s conftest.err; then + sed '10a\ +... rest of stderr output deleted ... 
+ 10q' conftest.err >conftest.er1 + cat conftest.er1 >&5 + fi + rm -f conftest.er1 conftest.err + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; } +done + +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +int +main () +{ + + ; + return 0; +} +_ACEOF +ac_clean_files_save=$ac_clean_files +ac_clean_files="$ac_clean_files a.out a.out.dSYM a.exe b.out" +# Try to create an executable without -o first, disregard a.out. +# It will help us diagnose broken compilers, and finding out an intuition +# of exeext. +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether the C compiler works" >&5 +$as_echo_n "checking whether the C compiler works... " >&6; } +ac_link_default=`$as_echo "$ac_link" | sed 's/ -o *conftest[^ ]*//'` + +# The possible output files: +ac_files="a.out conftest.exe conftest a.exe a_out.exe b.out conftest.*" + +ac_rmfiles= +for ac_file in $ac_files +do + case $ac_file in + *.$ac_ext | *.xcoff | *.tds | *.d | *.pdb | *.xSYM | *.bb | *.bbg | *.map | *.inf | *.dSYM | *.o | *.obj ) ;; + * ) ac_rmfiles="$ac_rmfiles $ac_file";; + esac +done +rm -f $ac_rmfiles + +if { { ac_try="$ac_link_default" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" +$as_echo "$ac_try_echo"; } >&5 + (eval "$ac_link_default") 2>&5 + ac_status=$? + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; }; then : + # Autoconf-2.13 could set the ac_cv_exeext variable to `no'. +# So ignore a value of `no', otherwise this would lead to `EXEEXT = no' +# in a Makefile. We should not override ac_cv_exeext if it was cached, +# so that the user can short-circuit this test for compilers unknown to +# Autoconf. 
+for ac_file in $ac_files '' +do + test -f "$ac_file" || continue + case $ac_file in + *.$ac_ext | *.xcoff | *.tds | *.d | *.pdb | *.xSYM | *.bb | *.bbg | *.map | *.inf | *.dSYM | *.o | *.obj ) + ;; + [ab].out ) + # We found the default executable, but exeext='' is most + # certainly right. + break;; + *.* ) + if test "${ac_cv_exeext+set}" = set && test "$ac_cv_exeext" != no; + then :; else + ac_cv_exeext=`expr "$ac_file" : '[^.]*\(\..*\)'` + fi + # We set ac_cv_exeext here because the later test for it is not + # safe: cross compilers may not add the suffix if given an `-o' + # argument, so we may need to know it at that point already. + # Even if this section looks crufty: it has the advantage of + # actually working. + break;; + * ) + break;; + esac +done +test "$ac_cv_exeext" = no && ac_cv_exeext= + +else + ac_file='' +fi +if test -z "$ac_file"; then : + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +$as_echo "$as_me: failed program was:" >&5 +sed 's/^/| /' conftest.$ac_ext >&5 + +{ { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 +$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} +as_fn_error 77 "C compiler cannot create executables +See \`config.log' for more details" "$LINENO" 5; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 +$as_echo "yes" >&6; } +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for C compiler default output file name" >&5 +$as_echo_n "checking for C compiler default output file name... " >&6; } +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_file" >&5 +$as_echo "$ac_file" >&6; } +ac_exeext=$ac_cv_exeext + +rm -f -r a.out a.out.dSYM a.exe conftest$ac_cv_exeext b.out +ac_clean_files=$ac_clean_files_save +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for suffix of executables" >&5 +$as_echo_n "checking for suffix of executables... 
" >&6; } +if { { ac_try="$ac_link" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" +$as_echo "$ac_try_echo"; } >&5 + (eval "$ac_link") 2>&5 + ac_status=$? + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; }; then : + # If both `conftest.exe' and `conftest' are `present' (well, observable) +# catch `conftest.exe'. For instance with Cygwin, `ls conftest' will +# work properly (i.e., refer to `conftest.exe'), while it won't with +# `rm'. +for ac_file in conftest.exe conftest conftest.*; do + test -f "$ac_file" || continue + case $ac_file in + *.$ac_ext | *.xcoff | *.tds | *.d | *.pdb | *.xSYM | *.bb | *.bbg | *.map | *.inf | *.dSYM | *.o | *.obj ) ;; + *.* ) ac_cv_exeext=`expr "$ac_file" : '[^.]*\(\..*\)'` + break;; + * ) break;; + esac +done +else + { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 +$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} +as_fn_error $? "cannot compute suffix of executables: cannot compile and link +See \`config.log' for more details" "$LINENO" 5; } +fi +rm -f conftest conftest$ac_cv_exeext +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_exeext" >&5 +$as_echo "$ac_cv_exeext" >&6; } + +rm -f conftest.$ac_ext +EXEEXT=$ac_cv_exeext +ac_exeext=$EXEEXT +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include <stdio.h> +int +main () +{ +FILE *f = fopen ("conftest.out", "w"); + return ferror (f) || fclose (f) != 0; + + ; + return 0; +} +_ACEOF +ac_clean_files="$ac_clean_files conftest.out" +# Check that the compiler produces executables we can run. If not, either +# the compiler is broken, or we cross compile. +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we are cross compiling" >&5 +$as_echo_n "checking whether we are cross compiling... 
" >&6; } +if test "$cross_compiling" != yes; then + { { ac_try="$ac_link" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" +$as_echo "$ac_try_echo"; } >&5 + (eval "$ac_link") 2>&5 + ac_status=$? + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; } + if { ac_try='./conftest$ac_cv_exeext' + { { case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" +$as_echo "$ac_try_echo"; } >&5 + (eval "$ac_try") 2>&5 + ac_status=$? + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; }; }; then + cross_compiling=no + else + if test "$cross_compiling" = maybe; then + cross_compiling=yes + else + { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 +$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} +as_fn_error $? "cannot run C compiled programs. +If you meant to cross compile, use \`--host'. +See \`config.log' for more details" "$LINENO" 5; } + fi + fi +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $cross_compiling" >&5 +$as_echo "$cross_compiling" >&6; } + +rm -f conftest.$ac_ext conftest$ac_cv_exeext conftest.out +ac_clean_files=$ac_clean_files_save +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for suffix of object files" >&5 +$as_echo_n "checking for suffix of object files... " >&6; } +if ${ac_cv_objext+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +int +main () +{ + + ; + return 0; +} +_ACEOF +rm -f conftest.o conftest.obj +if { { ac_try="$ac_compile" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" +$as_echo "$ac_try_echo"; } >&5 + (eval "$ac_compile") 2>&5 + ac_status=$? 
+ $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; }; then : + for ac_file in conftest.o conftest.obj conftest.*; do + test -f "$ac_file" || continue; + case $ac_file in + *.$ac_ext | *.xcoff | *.tds | *.d | *.pdb | *.xSYM | *.bb | *.bbg | *.map | *.inf | *.dSYM ) ;; + *) ac_cv_objext=`expr "$ac_file" : '.*\.\(.*\)'` + break;; + esac +done +else + $as_echo "$as_me: failed program was:" >&5 +sed 's/^/| /' conftest.$ac_ext >&5 + +{ { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 +$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} +as_fn_error $? "cannot compute suffix of object files: cannot compile +See \`config.log' for more details" "$LINENO" 5; } +fi +rm -f conftest.$ac_cv_objext conftest.$ac_ext +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_objext" >&5 +$as_echo "$ac_cv_objext" >&6; } +OBJEXT=$ac_cv_objext +ac_objext=$OBJEXT +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we are using the GNU C compiler" >&5 +$as_echo_n "checking whether we are using the GNU C compiler... " >&6; } +if ${ac_cv_c_compiler_gnu+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +int +main () +{ +#ifndef __GNUC__ + choke me +#endif + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + ac_compiler_gnu=yes +else + ac_compiler_gnu=no +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +ac_cv_c_compiler_gnu=$ac_compiler_gnu + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_c_compiler_gnu" >&5 +$as_echo "$ac_cv_c_compiler_gnu" >&6; } +if test $ac_compiler_gnu = yes; then + GCC=yes +else + GCC= +fi +ac_test_CFLAGS=${CFLAGS+set} +ac_save_CFLAGS=$CFLAGS +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $CC accepts -g" >&5 +$as_echo_n "checking whether $CC accepts -g... 
" >&6; } +if ${ac_cv_prog_cc_g+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_save_c_werror_flag=$ac_c_werror_flag + ac_c_werror_flag=yes + ac_cv_prog_cc_g=no + CFLAGS="-g" + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +int +main () +{ + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + ac_cv_prog_cc_g=yes +else + CFLAGS="" + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +int +main () +{ + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + +else + ac_c_werror_flag=$ac_save_c_werror_flag + CFLAGS="-g" + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +int +main () +{ + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + ac_cv_prog_cc_g=yes +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + ac_c_werror_flag=$ac_save_c_werror_flag +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_cc_g" >&5 +$as_echo "$ac_cv_prog_cc_g" >&6; } +if test "$ac_test_CFLAGS" = set; then + CFLAGS=$ac_save_CFLAGS +elif test $ac_cv_prog_cc_g = yes; then + if test "$GCC" = yes; then + CFLAGS="-g -O2" + else + CFLAGS="-g" + fi +else + if test "$GCC" = yes; then + CFLAGS="-O2" + else + CFLAGS= + fi +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $CC option to accept ISO C89" >&5 +$as_echo_n "checking for $CC option to accept ISO C89... " >&6; } +if ${ac_cv_prog_cc_c89+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_cv_prog_cc_c89=no +ac_save_CC=$CC +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include +#include +#include +#include +/* Most of the following tests are stolen from RCS 5.7's src/conf.sh. 
*/ +struct buf { int x; }; +FILE * (*rcsopen) (struct buf *, struct stat *, int); +static char *e (p, i) + char **p; + int i; +{ + return p[i]; +} +static char *f (char * (*g) (char **, int), char **p, ...) +{ + char *s; + va_list v; + va_start (v,p); + s = g (p, va_arg (v,int)); + va_end (v); + return s; +} + +/* OSF 4.0 Compaq cc is some sort of almost-ANSI by default. It has + function prototypes and stuff, but not '\xHH' hex character constants. + These don't provoke an error unfortunately, instead are silently treated + as 'x'. The following induces an error, until -std is added to get + proper ANSI mode. Curiously '\x00'!='x' always comes out true, for an + array size at least. It's necessary to write '\x00'==0 to get something + that's true only with -std. */ +int osf4_cc_array ['\x00' == 0 ? 1 : -1]; + +/* IBM C 6 for AIX is almost-ANSI by default, but it replaces macro parameters + inside strings and character constants. */ +#define FOO(x) 'x' +int xlc6_cc_array[FOO(a) == 'x' ? 
1 : -1]; + +int test (int i, double x); +struct s1 {int (*f) (int a);}; +struct s2 {int (*f) (double a);}; +int pairnames (int, char **, FILE *(*)(struct buf *, struct stat *, int), int, int); +int argc; +char **argv; +int +main () +{ +return f (e, argv, 0) != argv[0] || f (e, argv, 1) != argv[1]; + ; + return 0; +} +_ACEOF +for ac_arg in '' -qlanglvl=extc89 -qlanglvl=ansi -std \ + -Ae "-Aa -D_HPUX_SOURCE" "-Xc -D__EXTENSIONS__" +do + CC="$ac_save_CC $ac_arg" + if ac_fn_c_try_compile "$LINENO"; then : + ac_cv_prog_cc_c89=$ac_arg +fi +rm -f core conftest.err conftest.$ac_objext + test "x$ac_cv_prog_cc_c89" != "xno" && break +done +rm -f conftest.$ac_ext +CC=$ac_save_CC + +fi +# AC_CACHE_VAL +case "x$ac_cv_prog_cc_c89" in + x) + { $as_echo "$as_me:${as_lineno-$LINENO}: result: none needed" >&5 +$as_echo "none needed" >&6; } ;; + xno) + { $as_echo "$as_me:${as_lineno-$LINENO}: result: unsupported" >&5 +$as_echo "unsupported" >&6; } ;; + *) + CC="$CC $ac_cv_prog_cc_c89" + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_cc_c89" >&5 +$as_echo "$ac_cv_prog_cc_c89" >&6; } ;; +esac +if test "x$ac_cv_prog_cc_c89" != xno; then : + +fi + +ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu + +if test -n "$ac_tool_prefix"; then + # Extract the first word of "${ac_tool_prefix}ranlib", so it can be a program name with args. +set dummy ${ac_tool_prefix}ranlib; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_RANLIB+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$RANLIB"; then + ac_cv_prog_RANLIB="$RANLIB" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. 
+ for ac_exec_ext in '' $ac_executable_extensions; do + if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then + ac_cv_prog_RANLIB="${ac_tool_prefix}ranlib" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + +fi +fi +RANLIB=$ac_cv_prog_RANLIB +if test -n "$RANLIB"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $RANLIB" >&5 +$as_echo "$RANLIB" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + +fi +if test -z "$ac_cv_prog_RANLIB"; then + ac_ct_RANLIB=$RANLIB + # Extract the first word of "ranlib", so it can be a program name with args. +set dummy ranlib; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_ac_ct_RANLIB+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$ac_ct_RANLIB"; then + ac_cv_prog_ac_ct_RANLIB="$ac_ct_RANLIB" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. 
+ for ac_exec_ext in '' $ac_executable_extensions; do + if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then + ac_cv_prog_ac_ct_RANLIB="ranlib" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + +fi +fi +ac_ct_RANLIB=$ac_cv_prog_ac_ct_RANLIB +if test -n "$ac_ct_RANLIB"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_RANLIB" >&5 +$as_echo "$ac_ct_RANLIB" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + if test "x$ac_ct_RANLIB" = x; then + RANLIB=":" + else + case $cross_compiling:$ac_tool_warned in +yes:) +{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5 +$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;} +ac_tool_warned=yes ;; +esac + RANLIB=$ac_ct_RANLIB + fi +else + RANLIB="$ac_cv_prog_RANLIB" +fi + + + +# Check whether --with-irods was given. +if test "${with_irods+set}" = set; then : + withval=$with_irods; case $withval in + no) irods=disabled ;; + yes) irods=enabled ;; + *) irods=enabled; IRODS_HOME=$withval ;; + esac +else + irods=disabled +fi + + +save_LIBS=$LIBS +zlib_devel=ok + +ac_fn_c_check_header_compile "$LINENO" "zlib.h" "ac_cv_header_zlib_h" "; +" +if test "x$ac_cv_header_zlib_h" = xyes; then : + +else + zlib_devel=missing +fi + + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for inflate in -lz" >&5 +$as_echo_n "checking for inflate in -lz... " >&6; } +if ${ac_cv_lib_z_inflate+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_check_lib_save_LIBS=$LIBS +LIBS="-lz $LIBS" +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +/* Override any GCC internal prototype to avoid an error. + Use char because int might match the return type of a GCC + builtin and then its argument prototype would still apply. 
*/ +#ifdef __cplusplus +extern "C" +#endif +char inflate (); +int +main () +{ +return inflate (); + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + ac_cv_lib_z_inflate=yes +else + ac_cv_lib_z_inflate=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +LIBS=$ac_check_lib_save_LIBS +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_z_inflate" >&5 +$as_echo "$ac_cv_lib_z_inflate" >&6; } +if test "x$ac_cv_lib_z_inflate" = xyes; then : + cat >>confdefs.h <<_ACEOF +#define HAVE_LIBZ 1 +_ACEOF + + LIBS="-lz $LIBS" + +else + zlib_devel=missing +fi + +LIBS=$save_LIBS + +if test $zlib_devel != ok; then + as_fn_error $? "zlib development files not found + +HTSlib uses compression routines from the zlib library . +Building HTSlib requires zlib development files to be installed on the build +machine; you may need to ensure a package such as zlib1g-dev (on Debian or +Ubuntu Linux) or zlib-devel (on RPM-based Linux distributions) is installed. + +FAILED. This error must be resolved in order to build HTSlib successfully." "$LINENO" 5 +fi + +if test $irods = enabled; then + # TODO Also test whether we require libgssapi_krb5 and AC_CHECK_LIB it + save_LDFLAGS=$LDFLAGS + LDFLAGS="$LDFLAGS -L$IRODS_HOME/lib/core/obj" + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for getRodsEnvFileName in -lRodsAPIs" >&5 +$as_echo_n "checking for getRodsEnvFileName in -lRodsAPIs... " >&6; } +if ${ac_cv_lib_RodsAPIs_getRodsEnvFileName+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_check_lib_save_LIBS=$LIBS +LIBS="-lRodsAPIs -lgssapi_krb5 -lpthread $LIBS" +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +/* Override any GCC internal prototype to avoid an error. + Use char because int might match the return type of a GCC + builtin and then its argument prototype would still apply. 
*/ +#ifdef __cplusplus +extern "C" +#endif +char getRodsEnvFileName (); +int +main () +{ +return getRodsEnvFileName (); + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + ac_cv_lib_RodsAPIs_getRodsEnvFileName=yes +else + ac_cv_lib_RodsAPIs_getRodsEnvFileName=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +LIBS=$ac_check_lib_save_LIBS +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_RodsAPIs_getRodsEnvFileName" >&5 +$as_echo "$ac_cv_lib_RodsAPIs_getRodsEnvFileName" >&6; } +if test "x$ac_cv_lib_RodsAPIs_getRodsEnvFileName" = xyes; then : + case $with_irods in + yes) define_IRODS_HOME='# Uses $(IRODS_HOME) from the environment' ;; + *) define_IRODS_HOME="IRODS_HOME = $with_irods" ;; + esac +else + as_fn_error $? "iRODS development files not found + +Support for iRODS URLs requires the libRodsAPI client library and headers. +Configure with --with-irods=DIR (or just --with-irods if \$IRODS_HOME has +been exported with a suitable value), where DIR is the base of an iRODS tree +such that the library is present as DIR/lib/core/obj/libRodsAPI.* and headers +are present under DIR/lib/api/include and so on." "$LINENO" 5 +fi + + LDFLAGS=$save_LDFLAGS +else + define_IRODS_HOME='IRODS_HOME ?= /disabled' +fi + + + +ac_config_files="$ac_config_files config.mk" + +cat >confcache <<\_ACEOF +# This file is a shell script that caches the results of configure +# tests run on this system so they can be shared between configure +# scripts and configure runs, see configure's option --config-cache. +# It is not useful on other systems. If it contains results you don't +# want to keep, you may remove or edit it. +# +# config.status only pays attention to the cache file if you give it +# the --recheck option to rerun configure. +# +# `ac_cv_env_foo' variables (set or unset) will be overridden when +# loading this file, other *unset* `ac_cv_foo' will be assigned the +# following values. 
+ +_ACEOF + +# The following way of writing the cache mishandles newlines in values, +# but we know of no workaround that is simple, portable, and efficient. +# So, we kill variables containing newlines. +# Ultrix sh set writes to stderr and can't be redirected directly, +# and sets the high bit in the cache file unless we assign to the vars. +( + for ac_var in `(set) 2>&1 | sed -n 's/^\([a-zA-Z_][a-zA-Z0-9_]*\)=.*/\1/p'`; do + eval ac_val=\$$ac_var + case $ac_val in #( + *${as_nl}*) + case $ac_var in #( + *_cv_*) { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: cache variable $ac_var contains a newline" >&5 +$as_echo "$as_me: WARNING: cache variable $ac_var contains a newline" >&2;} ;; + esac + case $ac_var in #( + _ | IFS | as_nl) ;; #( + BASH_ARGV | BASH_SOURCE) eval $ac_var= ;; #( + *) { eval $ac_var=; unset $ac_var;} ;; + esac ;; + esac + done + + (set) 2>&1 | + case $as_nl`(ac_space=' '; set) 2>&1` in #( + *${as_nl}ac_space=\ *) + # `set' does not quote correctly, so add quotes: double-quote + # substitution turns \\\\ into \\, and sed turns \\ into \. + sed -n \ + "s/'/'\\\\''/g; + s/^\\([_$as_cr_alnum]*_cv_[_$as_cr_alnum]*\\)=\\(.*\\)/\\1='\\2'/p" + ;; #( + *) + # `set' quotes correctly as required by POSIX, so do not add quotes. + sed -n "/^[_$as_cr_alnum]*_cv_[_$as_cr_alnum]*=/p" + ;; + esac | + sort +) | + sed ' + /^ac_cv_env_/b end + t clear + :clear + s/^\([^=]*\)=\(.*[{}].*\)$/test "${\1+set}" = set || &/ + t end + s/^\([^=]*\)=\(.*\)$/\1=${\1=\2}/ + :end' >>confcache +if diff "$cache_file" confcache >/dev/null 2>&1; then :; else + if test -w "$cache_file"; then + if test "x$cache_file" != "x/dev/null"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: updating cache $cache_file" >&5 +$as_echo "$as_me: updating cache $cache_file" >&6;} + if test ! 
-f "$cache_file" || test -h "$cache_file"; then + cat confcache >"$cache_file" + else + case $cache_file in #( + */* | ?:*) + mv -f confcache "$cache_file"$$ && + mv -f "$cache_file"$$ "$cache_file" ;; #( + *) + mv -f confcache "$cache_file" ;; + esac + fi + fi + else + { $as_echo "$as_me:${as_lineno-$LINENO}: not updating unwritable cache $cache_file" >&5 +$as_echo "$as_me: not updating unwritable cache $cache_file" >&6;} + fi +fi +rm -f confcache + +test "x$prefix" = xNONE && prefix=$ac_default_prefix +# Let make expand exec_prefix. +test "x$exec_prefix" = xNONE && exec_prefix='${prefix}' + +# Transform confdefs.h into DEFS. +# Protect against shell expansion while executing Makefile rules. +# Protect against Makefile macro expansion. +# +# If the first sed substitution is executed (which looks for macros that +# take arguments), then branch to the quote section. Otherwise, +# look for a macro that doesn't take arguments. +ac_script=' +:mline +/\\$/{ + N + s,\\\n,, + b mline +} +t clear +:clear +s/^[ ]*#[ ]*define[ ][ ]*\([^ (][^ (]*([^)]*)\)[ ]*\(.*\)/-D\1=\2/g +t quote +s/^[ ]*#[ ]*define[ ][ ]*\([^ ][^ ]*\)[ ]*\(.*\)/-D\1=\2/g +t quote +b any +:quote +s/[ `~#$^&*(){}\\|;'\''"<>?]/\\&/g +s/\[/\\&/g +s/\]/\\&/g +s/\$/$$/g +H +:any +${ + g + s/^\n// + s/\n/ /g + p +} +' +DEFS=`sed -n "$ac_script" confdefs.h` + + +ac_libobjs= +ac_ltlibobjs= +U= +for ac_i in : $LIBOBJS; do test "x$ac_i" = x: && continue + # 1. Remove the extension, and $U if already installed. + ac_script='s/\$U\././;s/\.o$//;s/\.obj$//' + ac_i=`$as_echo "$ac_i" | sed "$ac_script"` + # 2. Prepend LIBOBJDIR. When used with automake>=1.10 LIBOBJDIR + # will be set to the directory where LIBOBJS objects are built. 
+ as_fn_append ac_libobjs " \${LIBOBJDIR}$ac_i\$U.$ac_objext" + as_fn_append ac_ltlibobjs " \${LIBOBJDIR}$ac_i"'$U.lo' +done +LIBOBJS=$ac_libobjs + +LTLIBOBJS=$ac_ltlibobjs + + + +: "${CONFIG_STATUS=./config.status}" +ac_write_fail=0 +ac_clean_files_save=$ac_clean_files +ac_clean_files="$ac_clean_files $CONFIG_STATUS" +{ $as_echo "$as_me:${as_lineno-$LINENO}: creating $CONFIG_STATUS" >&5 +$as_echo "$as_me: creating $CONFIG_STATUS" >&6;} +as_write_fail=0 +cat >$CONFIG_STATUS <<_ASEOF || as_write_fail=1 +#! $SHELL +# Generated by $as_me. +# Run this file to recreate the current configuration. +# Compiler output produced by configure, useful for debugging +# configure, is in config.log if it exists. + +debug=false +ac_cs_recheck=false +ac_cs_silent=false + +SHELL=\${CONFIG_SHELL-$SHELL} +export SHELL +_ASEOF +cat >>$CONFIG_STATUS <<\_ASEOF || as_write_fail=1 +## -------------------- ## +## M4sh Initialization. ## +## -------------------- ## + +# Be more Bourne compatible +DUALCASE=1; export DUALCASE # for MKS sh +if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then : + emulate sh + NULLCMD=: + # Pre-4.2 versions of Zsh do word splitting on ${1+"$@"}, which + # is contrary to our usage. Disable this feature. + alias -g '${1+"$@"}'='"$@"' + setopt NO_GLOB_SUBST +else + case `(set -o) 2>/dev/null` in #( + *posix*) : + set -o posix ;; #( + *) : + ;; +esac +fi + + +as_nl=' +' +export as_nl +# Printing a long string crashes Solaris 7 /usr/bin/printf. +as_echo='\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\' +as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo +as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo$as_echo +# Prefer a ksh shell builtin over an external printf program on Solaris, +# but without wasting forks for bash or zsh. 
+if test -z "$BASH_VERSION$ZSH_VERSION" \ + && (test "X`print -r -- $as_echo`" = "X$as_echo") 2>/dev/null; then + as_echo='print -r --' + as_echo_n='print -rn --' +elif (test "X`printf %s $as_echo`" = "X$as_echo") 2>/dev/null; then + as_echo='printf %s\n' + as_echo_n='printf %s' +else + if test "X`(/usr/ucb/echo -n -n $as_echo) 2>/dev/null`" = "X-n $as_echo"; then + as_echo_body='eval /usr/ucb/echo -n "$1$as_nl"' + as_echo_n='/usr/ucb/echo -n' + else + as_echo_body='eval expr "X$1" : "X\\(.*\\)"' + as_echo_n_body='eval + arg=$1; + case $arg in #( + *"$as_nl"*) + expr "X$arg" : "X\\(.*\\)$as_nl"; + arg=`expr "X$arg" : ".*$as_nl\\(.*\\)"`;; + esac; + expr "X$arg" : "X\\(.*\\)" | tr -d "$as_nl" + ' + export as_echo_n_body + as_echo_n='sh -c $as_echo_n_body as_echo' + fi + export as_echo_body + as_echo='sh -c $as_echo_body as_echo' +fi + +# The user is always right. +if test "${PATH_SEPARATOR+set}" != set; then + PATH_SEPARATOR=: + (PATH='/bin;/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 && { + (PATH='/bin:/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 || + PATH_SEPARATOR=';' + } +fi + + +# IFS +# We need space, tab and new line, in precisely that order. Quoting is +# there to prevent editors from complaining about space-tab. +# (If _AS_PATH_WALK were called with IFS unset, it would disable word +# splitting by setting IFS to empty value.) +IFS=" "" $as_nl" + +# Find who we are. Look in the path if we contain no directory separator. +as_myself= +case $0 in #(( + *[\\/]* ) as_myself=$0 ;; + *) as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + test -r "$as_dir/$0" && as_myself=$as_dir/$0 && break + done +IFS=$as_save_IFS + + ;; +esac +# We did not find ourselves, most probably we were run as `sh COMMAND' +# in which case we are not to be found in the path. +if test "x$as_myself" = x; then + as_myself=$0 +fi +if test ! 
-f "$as_myself"; then + $as_echo "$as_myself: error: cannot find myself; rerun with an absolute file name" >&2 + exit 1 +fi + +# Unset variables that we do not need and which cause bugs (e.g. in +# pre-3.0 UWIN ksh). But do not cause bugs in bash 2.01; the "|| exit 1" +# suppresses any "Segmentation fault" message there. '((' could +# trigger a bug in pdksh 5.2.14. +for as_var in BASH_ENV ENV MAIL MAILPATH +do eval test x\${$as_var+set} = xset \ + && ( (unset $as_var) || exit 1) >/dev/null 2>&1 && unset $as_var || : +done +PS1='$ ' +PS2='> ' +PS4='+ ' + +# NLS nuisances. +LC_ALL=C +export LC_ALL +LANGUAGE=C +export LANGUAGE + +# CDPATH. +(unset CDPATH) >/dev/null 2>&1 && unset CDPATH + + +# as_fn_error STATUS ERROR [LINENO LOG_FD] +# ---------------------------------------- +# Output "`basename $0`: error: ERROR" to stderr. If LINENO and LOG_FD are +# provided, also output the error to LOG_FD, referencing LINENO. Then exit the +# script with STATUS, using 1 if that was 0. +as_fn_error () +{ + as_status=$1; test $as_status -eq 0 && as_status=1 + if test "$4"; then + as_lineno=${as_lineno-"$3"} as_lineno_stack=as_lineno_stack=$as_lineno_stack + $as_echo "$as_me:${as_lineno-$LINENO}: error: $2" >&$4 + fi + $as_echo "$as_me: error: $2" >&2 + as_fn_exit $as_status +} # as_fn_error + + +# as_fn_set_status STATUS +# ----------------------- +# Set $? to STATUS, without forking. +as_fn_set_status () +{ + return $1 +} # as_fn_set_status + +# as_fn_exit STATUS +# ----------------- +# Exit the shell with STATUS, even in a "trap 0" or "set -e" context. +as_fn_exit () +{ + set +e + as_fn_set_status $1 + exit $1 +} # as_fn_exit + +# as_fn_unset VAR +# --------------- +# Portably unset VAR. +as_fn_unset () +{ + { eval $1=; unset $1;} +} +as_unset=as_fn_unset +# as_fn_append VAR VALUE +# ---------------------- +# Append the text in VALUE to the end of the definition contained in VAR. 
Take +# advantage of any shell optimizations that allow amortized linear growth over +# repeated appends, instead of the typical quadratic growth present in naive +# implementations. +if (eval "as_var=1; as_var+=2; test x\$as_var = x12") 2>/dev/null; then : + eval 'as_fn_append () + { + eval $1+=\$2 + }' +else + as_fn_append () + { + eval $1=\$$1\$2 + } +fi # as_fn_append + +# as_fn_arith ARG... +# ------------------ +# Perform arithmetic evaluation on the ARGs, and store the result in the +# global $as_val. Take advantage of shells that can avoid forks. The arguments +# must be portable across $(()) and expr. +if (eval "test \$(( 1 + 1 )) = 2") 2>/dev/null; then : + eval 'as_fn_arith () + { + as_val=$(( $* )) + }' +else + as_fn_arith () + { + as_val=`expr "$@" || test $? -eq 1` + } +fi # as_fn_arith + + +if expr a : '\(a\)' >/dev/null 2>&1 && + test "X`expr 00001 : '.*\(...\)'`" = X001; then + as_expr=expr +else + as_expr=false +fi + +if (basename -- /) >/dev/null 2>&1 && test "X`basename -- / 2>&1`" = "X/"; then + as_basename=basename +else + as_basename=false +fi + +if (as_dir=`dirname -- /` && test "X$as_dir" = X/) >/dev/null 2>&1; then + as_dirname=dirname +else + as_dirname=false +fi + +as_me=`$as_basename -- "$0" || +$as_expr X/"$0" : '.*/\([^/][^/]*\)/*$' \| \ + X"$0" : 'X\(//\)$' \| \ + X"$0" : 'X\(/\)' \| . 2>/dev/null || +$as_echo X/"$0" | + sed '/^.*\/\([^/][^/]*\)\/*$/{ + s//\1/ + q + } + /^X\/\(\/\/\)$/{ + s//\1/ + q + } + /^X\/\(\/\).*/{ + s//\1/ + q + } + s/.*/./; q'` + +# Avoid depending upon Character Ranges. +as_cr_letters='abcdefghijklmnopqrstuvwxyz' +as_cr_LETTERS='ABCDEFGHIJKLMNOPQRSTUVWXYZ' +as_cr_Letters=$as_cr_letters$as_cr_LETTERS +as_cr_digits='0123456789' +as_cr_alnum=$as_cr_Letters$as_cr_digits + +ECHO_C= ECHO_N= ECHO_T= +case `echo -n x` in #((((( +-n*) + case `echo 'xy\c'` in + *c*) ECHO_T=' ';; # ECHO_T is single tab character. 
+ xy) ECHO_C='\c';; + *) echo `echo ksh88 bug on AIX 6.1` > /dev/null + ECHO_T=' ';; + esac;; +*) + ECHO_N='-n';; +esac + +rm -f conf$$ conf$$.exe conf$$.file +if test -d conf$$.dir; then + rm -f conf$$.dir/conf$$.file +else + rm -f conf$$.dir + mkdir conf$$.dir 2>/dev/null +fi +if (echo >conf$$.file) 2>/dev/null; then + if ln -s conf$$.file conf$$ 2>/dev/null; then + as_ln_s='ln -s' + # ... but there are two gotchas: + # 1) On MSYS, both `ln -s file dir' and `ln file dir' fail. + # 2) DJGPP < 2.04 has no symlinks; `ln -s' creates a wrapper executable. + # In both cases, we have to default to `cp -p'. + ln -s conf$$.file conf$$.dir 2>/dev/null && test ! -f conf$$.exe || + as_ln_s='cp -p' + elif ln conf$$.file conf$$ 2>/dev/null; then + as_ln_s=ln + else + as_ln_s='cp -p' + fi +else + as_ln_s='cp -p' +fi +rm -f conf$$ conf$$.exe conf$$.dir/conf$$.file conf$$.file +rmdir conf$$.dir 2>/dev/null + + +# as_fn_mkdir_p +# ------------- +# Create "$as_dir" as a directory, including parents if necessary. +as_fn_mkdir_p () +{ + + case $as_dir in #( + -*) as_dir=./$as_dir;; + esac + test -d "$as_dir" || eval $as_mkdir_p || { + as_dirs= + while :; do + case $as_dir in #( + *\'*) as_qdir=`$as_echo "$as_dir" | sed "s/'/'\\\\\\\\''/g"`;; #'( + *) as_qdir=$as_dir;; + esac + as_dirs="'$as_qdir' $as_dirs" + as_dir=`$as_dirname -- "$as_dir" || +$as_expr X"$as_dir" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ + X"$as_dir" : 'X\(//\)[^/]' \| \ + X"$as_dir" : 'X\(//\)$' \| \ + X"$as_dir" : 'X\(/\)' \| . 2>/dev/null || +$as_echo X"$as_dir" | + sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ + s//\1/ + q + } + /^X\(\/\/\)[^/].*/{ + s//\1/ + q + } + /^X\(\/\/\)$/{ + s//\1/ + q + } + /^X\(\/\).*/{ + s//\1/ + q + } + s/.*/./; q'` + test -d "$as_dir" && break + done + test -z "$as_dirs" || eval "mkdir $as_dirs" + } || test -d "$as_dir" || as_fn_error $? "cannot create directory $as_dir" + + +} # as_fn_mkdir_p +if mkdir -p . 
2>/dev/null; then + as_mkdir_p='mkdir -p "$as_dir"' +else + test -d ./-p && rmdir ./-p + as_mkdir_p=false +fi + +if test -x / >/dev/null 2>&1; then + as_test_x='test -x' +else + if ls -dL / >/dev/null 2>&1; then + as_ls_L_option=L + else + as_ls_L_option= + fi + as_test_x=' + eval sh -c '\'' + if test -d "$1"; then + test -d "$1/."; + else + case $1 in #( + -*)set "./$1";; + esac; + case `ls -ld'$as_ls_L_option' "$1" 2>/dev/null` in #(( + ???[sx]*):;;*)false;;esac;fi + '\'' sh + ' +fi +as_executable_p=$as_test_x + +# Sed expression to map a string onto a valid CPP name. +as_tr_cpp="eval sed 'y%*$as_cr_letters%P$as_cr_LETTERS%;s%[^_$as_cr_alnum]%_%g'" + +# Sed expression to map a string onto a valid variable name. +as_tr_sh="eval sed 'y%*+%pp%;s%[^_$as_cr_alnum]%_%g'" + + +exec 6>&1 +## ----------------------------------- ## +## Main body of $CONFIG_STATUS script. ## +## ----------------------------------- ## +_ASEOF +test $as_write_fail = 0 && chmod +x $CONFIG_STATUS || ac_write_fail=1 + +cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 +# Save the log message, to keep $0 and so on meaningful, and to +# report actual input values of CONFIG_FILES etc. instead of their +# values after options handling. +ac_log=" +This file was extended by HTSlib $as_me 1.2.1, which was +generated by GNU Autoconf 2.68. Invocation command line was + + CONFIG_FILES = $CONFIG_FILES + CONFIG_HEADERS = $CONFIG_HEADERS + CONFIG_LINKS = $CONFIG_LINKS + CONFIG_COMMANDS = $CONFIG_COMMANDS + $ $0 $@ + +on `(hostname || uname -n) 2>/dev/null | sed 1q` +" + +_ACEOF + +case $ac_config_files in *" +"*) set x $ac_config_files; shift; ac_config_files=$*;; +esac + + + +cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 +# Files that config.status was made for. +config_files="$ac_config_files" + +_ACEOF + +cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 +ac_cs_usage="\ +\`$as_me' instantiates files and other configuration actions +from templates according to the current configuration. 
Unless the files +and actions are specified as TAGs, all are instantiated by default. + +Usage: $0 [OPTION]... [TAG]... + + -h, --help print this help, then exit + -V, --version print version number and configuration settings, then exit + --config print configuration, then exit + -q, --quiet, --silent + do not print progress messages + -d, --debug don't remove temporary files + --recheck update $as_me by reconfiguring in the same conditions + --file=FILE[:TEMPLATE] + instantiate the configuration file FILE + +Configuration files: +$config_files + +Report bugs to . +HTSlib home page: ." + +_ACEOF +cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 +ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" +ac_cs_version="\\ +HTSlib config.status 1.2.1 +configured by $0, generated by GNU Autoconf 2.68, + with options \\"\$ac_cs_config\\" + +Copyright (C) 2010 Free Software Foundation, Inc. +This config.status script is free software; the Free Software Foundation +gives unlimited permission to copy, distribute and modify it." + +ac_pwd='$ac_pwd' +srcdir='$srcdir' +test -n "\$AWK" || AWK=awk +_ACEOF + +cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 +# The default lists apply if the user does not specify any file. +ac_need_defaults=: +while test $# != 0 +do + case $1 in + --*=?*) + ac_option=`expr "X$1" : 'X\([^=]*\)='` + ac_optarg=`expr "X$1" : 'X[^=]*=\(.*\)'` + ac_shift=: + ;; + --*=) + ac_option=`expr "X$1" : 'X\([^=]*\)='` + ac_optarg= + ac_shift=: + ;; + *) + ac_option=$1 + ac_optarg=$2 + ac_shift=shift + ;; + esac + + case $ac_option in + # Handling of the options. 
+ -recheck | --recheck | --rechec | --reche | --rech | --rec | --re | --r) + ac_cs_recheck=: ;; + --version | --versio | --versi | --vers | --ver | --ve | --v | -V ) + $as_echo "$ac_cs_version"; exit ;; + --config | --confi | --conf | --con | --co | --c ) + $as_echo "$ac_cs_config"; exit ;; + --debug | --debu | --deb | --de | --d | -d ) + debug=: ;; + --file | --fil | --fi | --f ) + $ac_shift + case $ac_optarg in + *\'*) ac_optarg=`$as_echo "$ac_optarg" | sed "s/'/'\\\\\\\\''/g"` ;; + '') as_fn_error $? "missing file argument" ;; + esac + as_fn_append CONFIG_FILES " '$ac_optarg'" + ac_need_defaults=false;; + --he | --h | --help | --hel | -h ) + $as_echo "$ac_cs_usage"; exit ;; + -q | -quiet | --quiet | --quie | --qui | --qu | --q \ + | -silent | --silent | --silen | --sile | --sil | --si | --s) + ac_cs_silent=: ;; + + # This is an error. + -*) as_fn_error $? "unrecognized option: \`$1' +Try \`$0 --help' for more information." ;; + + *) as_fn_append ac_config_targets " $1" + ac_need_defaults=false ;; + + esac + shift +done + +ac_configure_extra_args= + +if $ac_cs_silent; then + exec 6>/dev/null + ac_configure_extra_args="$ac_configure_extra_args --silent" +fi + +_ACEOF +cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 +if \$ac_cs_recheck; then + set X '$SHELL' '$0' $ac_configure_args \$ac_configure_extra_args --no-create --no-recursion + shift + \$as_echo "running CONFIG_SHELL=$SHELL \$*" >&6 + CONFIG_SHELL='$SHELL' + export CONFIG_SHELL + exec "\$@" +fi + +_ACEOF +cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 +exec 5>>config.log +{ + echo + sed 'h;s/./-/g;s/^.../## /;s/...$/ ##/;p;x;p;x' <<_ASBOX +## Running $as_me. ## +_ASBOX + $as_echo "$ac_log" +} >&5 + +_ACEOF +cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 +_ACEOF + +cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 + +# Handling of arguments. +for ac_config_target in $ac_config_targets +do + case $ac_config_target in + "config.mk") CONFIG_FILES="$CONFIG_FILES config.mk" ;; + + *) as_fn_error $? 
"invalid argument: \`$ac_config_target'" "$LINENO" 5;; + esac +done + + +# If the user did not use the arguments to specify the items to instantiate, +# then the envvar interface is used. Set only those that are not. +# We use the long form for the default assignment because of an extremely +# bizarre bug on SunOS 4.1.3. +if $ac_need_defaults; then + test "${CONFIG_FILES+set}" = set || CONFIG_FILES=$config_files +fi + +# Have a temporary directory for convenience. Make it in the build tree +# simply because there is no reason against having it here, and in addition, +# creating and moving files from /tmp can sometimes cause problems. +# Hook for its removal unless debugging. +# Note that there is a small window in which the directory will not be cleaned: +# after its creation but before its name has been assigned to `$tmp'. +$debug || +{ + tmp= ac_tmp= + trap 'exit_status=$? + : "${ac_tmp:=$tmp}" + { test ! -d "$ac_tmp" || rm -fr "$ac_tmp"; } && exit $exit_status +' 0 + trap 'as_fn_exit 1' 1 2 13 15 +} +# Create a (secure) tmp directory for tmp files. + +{ + tmp=`(umask 077 && mktemp -d "./confXXXXXX") 2>/dev/null` && + test -d "$tmp" +} || +{ + tmp=./conf$$-$RANDOM + (umask 077 && mkdir "$tmp") +} || as_fn_error $? "cannot create a temporary directory in ." "$LINENO" 5 +ac_tmp=$tmp + +# Set up the scripts for CONFIG_FILES section. +# No need to generate them if there are no CONFIG_FILES. +# This happens for instance with `./config.status config.h'. +if test -n "$CONFIG_FILES"; then + + +ac_cr=`echo X | tr X '\015'` +# On cygwin, bash can eat \r inside `` if the user requested igncr. +# But we know of no other shell where ac_cr would be empty at this +# point, so we can use a bashism as a fallback. 
+if test "x$ac_cr" = x; then + eval ac_cr=\$\'\\r\' +fi +ac_cs_awk_cr=`$AWK 'BEGIN { print "a\rb" }' /dev/null` +if test "$ac_cs_awk_cr" = "a${ac_cr}b"; then + ac_cs_awk_cr='\\r' +else + ac_cs_awk_cr=$ac_cr +fi + +echo 'BEGIN {' >"$ac_tmp/subs1.awk" && +_ACEOF + + +{ + echo "cat >conf$$subs.awk <<_ACEOF" && + echo "$ac_subst_vars" | sed 's/.*/&!$&$ac_delim/' && + echo "_ACEOF" +} >conf$$subs.sh || + as_fn_error $? "could not make $CONFIG_STATUS" "$LINENO" 5 +ac_delim_num=`echo "$ac_subst_vars" | grep -c '^'` +ac_delim='%!_!# ' +for ac_last_try in false false false false false :; do + . ./conf$$subs.sh || + as_fn_error $? "could not make $CONFIG_STATUS" "$LINENO" 5 + + ac_delim_n=`sed -n "s/.*$ac_delim\$/X/p" conf$$subs.awk | grep -c X` + if test $ac_delim_n = $ac_delim_num; then + break + elif $ac_last_try; then + as_fn_error $? "could not make $CONFIG_STATUS" "$LINENO" 5 + else + ac_delim="$ac_delim!$ac_delim _$ac_delim!! " + fi +done +rm -f conf$$subs.sh + +cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 +cat >>"\$ac_tmp/subs1.awk" <<\\_ACAWK && +_ACEOF +sed -n ' +h +s/^/S["/; s/!.*/"]=/ +p +g +s/^[^!]*!// +:repl +t repl +s/'"$ac_delim"'$// +t delim +:nl +h +s/\(.\{148\}\)..*/\1/ +t more1 +s/["\\]/\\&/g; s/^/"/; s/$/\\n"\\/ +p +n +b repl +:more1 +s/["\\]/\\&/g; s/^/"/; s/$/"\\/ +p +g +s/.\{148\}// +t nl +:delim +h +s/\(.\{148\}\)..*/\1/ +t more2 +s/["\\]/\\&/g; s/^/"/; s/$/"/ +p +b +:more2 +s/["\\]/\\&/g; s/^/"/; s/$/"\\/ +p +g +s/.\{148\}// +t delim +' >$CONFIG_STATUS || ac_write_fail=1 +rm -f conf$$subs.awk +cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 +_ACAWK +cat >>"\$ac_tmp/subs1.awk" <<_ACAWK && + for (key in S) S_is_set[key] = 1 + FS = "" + +} +{ + line = $ 0 + nfields = split(line, field, "@") + substed = 0 + len = length(field[1]) + for (i = 2; i < nfields; i++) { + key = field[i] + keylen = length(key) + if (S_is_set[key]) { + value = S[key] + line = substr(line, 1, len) "" value "" substr(line, len + keylen + 3) + len += length(value) + 
length(field[++i]) + substed = 1 + } else + len += 1 + keylen + } + + print line +} + +_ACAWK +_ACEOF +cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 +if sed "s/$ac_cr//" < /dev/null > /dev/null 2>&1; then + sed "s/$ac_cr\$//; s/$ac_cr/$ac_cs_awk_cr/g" +else + cat +fi < "$ac_tmp/subs1.awk" > "$ac_tmp/subs.awk" \ + || as_fn_error $? "could not setup config files machinery" "$LINENO" 5 +_ACEOF + +# VPATH may cause trouble with some makes, so we remove sole $(srcdir), +# ${srcdir} and @srcdir@ entries from VPATH if srcdir is ".", strip leading and +# trailing colons and then remove the whole line if VPATH becomes empty +# (actually we leave an empty line to preserve line numbers). +if test "x$srcdir" = x.; then + ac_vpsub='/^[ ]*VPATH[ ]*=[ ]*/{ +h +s/// +s/^/:/ +s/[ ]*$/:/ +s/:\$(srcdir):/:/g +s/:\${srcdir}:/:/g +s/:@srcdir@:/:/g +s/^:*// +s/:*$// +x +s/\(=[ ]*\).*/\1/ +G +s/\n// +s/^[^=]*=[ ]*$// +}' +fi + +cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 +fi # test -n "$CONFIG_FILES" + + +eval set X " :F $CONFIG_FILES " +shift +for ac_tag +do + case $ac_tag in + :[FHLC]) ac_mode=$ac_tag; continue;; + esac + case $ac_mode$ac_tag in + :[FHL]*:*);; + :L* | :C*:*) as_fn_error $? "invalid tag \`$ac_tag'" "$LINENO" 5;; + :[FH]-) ac_tag=-:-;; + :[FH]*) ac_tag=$ac_tag:$ac_tag.in;; + esac + ac_save_IFS=$IFS + IFS=: + set x $ac_tag + IFS=$ac_save_IFS + shift + ac_file=$1 + shift + + case $ac_mode in + :L) ac_source=$1;; + :[FH]) + ac_file_inputs= + for ac_f + do + case $ac_f in + -) ac_f="$ac_tmp/stdin";; + *) # Look for the file first in the build tree, then in the source tree + # (if the path is not absolute). The absolute path cannot be DOS-style, + # because $ac_f cannot contain `:'. 
+ test -f "$ac_f" || + case $ac_f in + [\\/$]*) false;; + *) test -f "$srcdir/$ac_f" && ac_f="$srcdir/$ac_f";; + esac || + as_fn_error 1 "cannot find input file: \`$ac_f'" "$LINENO" 5;; + esac + case $ac_f in *\'*) ac_f=`$as_echo "$ac_f" | sed "s/'/'\\\\\\\\''/g"`;; esac + as_fn_append ac_file_inputs " '$ac_f'" + done + + # Let's still pretend it is `configure' which instantiates (i.e., don't + # use $as_me), people would be surprised to read: + # /* config.h. Generated by config.status. */ + configure_input='Generated from '` + $as_echo "$*" | sed 's|^[^:]*/||;s|:[^:]*/|, |g' + `' by configure.' + if test x"$ac_file" != x-; then + configure_input="$ac_file. $configure_input" + { $as_echo "$as_me:${as_lineno-$LINENO}: creating $ac_file" >&5 +$as_echo "$as_me: creating $ac_file" >&6;} + fi + # Neutralize special characters interpreted by sed in replacement strings. + case $configure_input in #( + *\&* | *\|* | *\\* ) + ac_sed_conf_input=`$as_echo "$configure_input" | + sed 's/[\\\\&|]/\\\\&/g'`;; #( + *) ac_sed_conf_input=$configure_input;; + esac + + case $ac_tag in + *:-:* | *:-) cat >"$ac_tmp/stdin" \ + || as_fn_error $? "could not create $ac_file" "$LINENO" 5 ;; + esac + ;; + esac + + ac_dir=`$as_dirname -- "$ac_file" || +$as_expr X"$ac_file" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ + X"$ac_file" : 'X\(//\)[^/]' \| \ + X"$ac_file" : 'X\(//\)$' \| \ + X"$ac_file" : 'X\(/\)' \| . 2>/dev/null || +$as_echo X"$ac_file" | + sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ + s//\1/ + q + } + /^X\(\/\/\)[^/].*/{ + s//\1/ + q + } + /^X\(\/\/\)$/{ + s//\1/ + q + } + /^X\(\/\).*/{ + s//\1/ + q + } + s/.*/./; q'` + as_dir="$ac_dir"; as_fn_mkdir_p + ac_builddir=. + +case "$ac_dir" in +.) ac_dir_suffix= ac_top_builddir_sub=. ac_top_build_prefix= ;; +*) + ac_dir_suffix=/`$as_echo "$ac_dir" | sed 's|^\.[\\/]||'` + # A ".." for each directory in $ac_dir_suffix. 
+ ac_top_builddir_sub=`$as_echo "$ac_dir_suffix" | sed 's|/[^\\/]*|/..|g;s|/||'` + case $ac_top_builddir_sub in + "") ac_top_builddir_sub=. ac_top_build_prefix= ;; + *) ac_top_build_prefix=$ac_top_builddir_sub/ ;; + esac ;; +esac +ac_abs_top_builddir=$ac_pwd +ac_abs_builddir=$ac_pwd$ac_dir_suffix +# for backward compatibility: +ac_top_builddir=$ac_top_build_prefix + +case $srcdir in + .) # We are building in place. + ac_srcdir=. + ac_top_srcdir=$ac_top_builddir_sub + ac_abs_top_srcdir=$ac_pwd ;; + [\\/]* | ?:[\\/]* ) # Absolute name. + ac_srcdir=$srcdir$ac_dir_suffix; + ac_top_srcdir=$srcdir + ac_abs_top_srcdir=$srcdir ;; + *) # Relative name. + ac_srcdir=$ac_top_build_prefix$srcdir$ac_dir_suffix + ac_top_srcdir=$ac_top_build_prefix$srcdir + ac_abs_top_srcdir=$ac_pwd/$srcdir ;; +esac +ac_abs_srcdir=$ac_abs_top_srcdir$ac_dir_suffix + + + case $ac_mode in + :F) + # + # CONFIG_FILE + # + +_ACEOF + +cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 +# If the template does not know about datarootdir, expand it. +# FIXME: This hack should be removed a few years after 2.60. +ac_datarootdir_hack=; ac_datarootdir_seen= +ac_sed_dataroot=' +/datarootdir/ { + p + q +} +/@datadir@/p +/@docdir@/p +/@infodir@/p +/@localedir@/p +/@mandir@/p' +case `eval "sed -n \"\$ac_sed_dataroot\" $ac_file_inputs"` in +*datarootdir*) ac_datarootdir_seen=yes;; +*@datadir@*|*@docdir@*|*@infodir@*|*@localedir@*|*@mandir@*) + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $ac_file_inputs seems to ignore the --datarootdir setting" >&5 +$as_echo "$as_me: WARNING: $ac_file_inputs seems to ignore the --datarootdir setting" >&2;} +_ACEOF +cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 + ac_datarootdir_hack=' + s&@datadir@&$datadir&g + s&@docdir@&$docdir&g + s&@infodir@&$infodir&g + s&@localedir@&$localedir&g + s&@mandir@&$mandir&g + s&\\\${datarootdir}&$datarootdir&g' ;; +esac +_ACEOF + +# Neutralize VPATH when `$srcdir' = `.'. +# Shell code in configure.ac might set extrasub. 
+# FIXME: do we really want to maintain this feature? +cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 +ac_sed_extra="$ac_vpsub +$extrasub +_ACEOF +cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 +:t +/@[a-zA-Z_][a-zA-Z_0-9]*@/!b +s|@configure_input@|$ac_sed_conf_input|;t t +s&@top_builddir@&$ac_top_builddir_sub&;t t +s&@top_build_prefix@&$ac_top_build_prefix&;t t +s&@srcdir@&$ac_srcdir&;t t +s&@abs_srcdir@&$ac_abs_srcdir&;t t +s&@top_srcdir@&$ac_top_srcdir&;t t +s&@abs_top_srcdir@&$ac_abs_top_srcdir&;t t +s&@builddir@&$ac_builddir&;t t +s&@abs_builddir@&$ac_abs_builddir&;t t +s&@abs_top_builddir@&$ac_abs_top_builddir&;t t +$ac_datarootdir_hack +" +eval sed \"\$ac_sed_extra\" "$ac_file_inputs" | $AWK -f "$ac_tmp/subs.awk" \ + >$ac_tmp/out || as_fn_error $? "could not create $ac_file" "$LINENO" 5 + +test -z "$ac_datarootdir_hack$ac_datarootdir_seen" && + { ac_out=`sed -n '/\${datarootdir}/p' "$ac_tmp/out"`; test -n "$ac_out"; } && + { ac_out=`sed -n '/^[ ]*datarootdir[ ]*:*=/p' \ + "$ac_tmp/out"`; test -z "$ac_out"; } && + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $ac_file contains a reference to the variable \`datarootdir' +which seems to be undefined. Please make sure it is defined" >&5 +$as_echo "$as_me: WARNING: $ac_file contains a reference to the variable \`datarootdir' +which seems to be undefined. Please make sure it is defined" >&2;} + + rm -f "$ac_tmp/stdin" + case $ac_file in + -) cat "$ac_tmp/out" && rm -f "$ac_tmp/out";; + *) rm -f "$ac_file" && mv "$ac_tmp/out" "$ac_file";; + esac \ + || as_fn_error $? "could not create $ac_file" "$LINENO" 5 + ;; + + + + esac + +done # for ac_tag + + +as_fn_exit 0 +_ACEOF +ac_clean_files=$ac_clean_files_save + +test $ac_write_fail = 0 || + as_fn_error $? "write failure creating $CONFIG_STATUS" "$LINENO" 5 + + +# configure is writing to config.log, and then calls config.status. +# config.status does its own redirection, appending to config.log. 
+# Unfortunately, on DOS this fails, as config.log is still kept open +# by configure, so config.status won't be able to write to it; its +# output is simply discarded. So we exec the FD to /dev/null, +# effectively closing config.log, so it can be properly (re)opened and +# appended to by config.status. When coming back to configure, we +# need to make the FD available again. +if test "$no_create" != yes; then + ac_cs_success=: + ac_config_status_args= + test "$silent" = yes && + ac_config_status_args="$ac_config_status_args --quiet" + exec 5>/dev/null + $SHELL $CONFIG_STATUS $ac_config_status_args || ac_cs_success=false + exec 5>>config.log + # Use ||, not &&, to avoid exiting from the if with $? = 1, which + # would make configure fail if this is the last instruction. + $ac_cs_success || as_fn_exit 1 +fi +if test -n "$ac_unrecognized_opts" && test "$enable_option_checking" != no; then + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: unrecognized options: $ac_unrecognized_opts" >&5 +$as_echo "$as_me: WARNING: unrecognized options: $ac_unrecognized_opts" >&2;} +fi + diff --git a/htslib/configure.ac b/htslib/configure.ac new file mode 100644 index 00000000..77ce99c4 --- /dev/null +++ b/htslib/configure.ac @@ -0,0 +1,93 @@ +# Configure script for htslib, a C library for high-throughput sequencing data. +# +# Copyright (C) 2015 Genome Research Ltd. +# +# Author: John Marshall +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + +dnl Process this file with autoconf to produce a configure script +AC_INIT([HTSlib], m4_esyscmd_s([make print-version]), + [samtools-help@lists.sourceforge.net], [], [http://www.htslib.org/]) +AC_PREREQ(2.63) dnl This version introduced 4-argument AC_CHECK_HEADER +AC_CONFIG_SRCDIR(hts.c) + +dnl Copyright notice to be copied into the generated configure script +AC_COPYRIGHT([Portions copyright (C) 2015 Genome Research Ltd. + +This configure script is free software: you are free to change and +redistribute it. There is NO WARRANTY, to the extent permitted by law.]) + +AC_PROG_CC +AC_PROG_RANLIB + +AC_ARG_WITH([irods], + [AS_HELP_STRING([[--with-irods[=DIR]]], + [use RodsAPIs library (in DIR) to support iRODS URLs])], + [case $withval in + no) irods=disabled ;; + yes) irods=enabled ;; + *) irods=enabled; IRODS_HOME=$withval ;; + esac], + [irods=disabled]) + +save_LIBS=$LIBS +zlib_devel=ok +dnl Set a trivial non-empty INCLUDES to avoid excess default includes tests +AC_CHECK_HEADER([zlib.h], [], [zlib_devel=missing], [;]) +AC_CHECK_LIB(z, inflate, [], [zlib_devel=missing]) +LIBS=$save_LIBS + +if test $zlib_devel != ok; then + AC_MSG_ERROR([zlib development files not found + +HTSlib uses compression routines from the zlib library . +Building HTSlib requires zlib development files to be installed on the build +machine; you may need to ensure a package such as zlib1g-dev (on Debian or +Ubuntu Linux) or zlib-devel (on RPM-based Linux distributions) is installed. + +FAILED. 
This error must be resolved in order to build HTSlib successfully.]) +fi + +if test $irods = enabled; then + # TODO Also test whether we require libgssapi_krb5 and AC_CHECK_LIB it + save_LDFLAGS=$LDFLAGS + LDFLAGS="$LDFLAGS -L$IRODS_HOME/lib/core/obj" + AC_CHECK_LIB([RodsAPIs], [getRodsEnvFileName], + [case $with_irods in + yes) define_IRODS_HOME='# Uses $(IRODS_HOME) from the environment' ;; + *) define_IRODS_HOME="IRODS_HOME = $with_irods" ;; + esac], + [AC_MSG_ERROR([iRODS development files not found + +Support for iRODS URLs requires the libRodsAPI client library and headers. +Configure with --with-irods=DIR (or just --with-irods if \$IRODS_HOME has +been exported with a suitable value), where DIR is the base of an iRODS tree +such that the library is present as DIR/lib/core/obj/libRodsAPI.* and headers +are present under DIR/lib/api/include and so on.])], + [-lgssapi_krb5 -lpthread]) + LDFLAGS=$save_LDFLAGS +else + define_IRODS_HOME='IRODS_HOME ?= /disabled' +fi +AC_SUBST([irods]) +AC_SUBST([define_IRODS_HOME]) + +AC_CONFIG_FILES(config.mk) +AC_OUTPUT diff --git a/htslib/cram/cram.h b/htslib/cram/cram.h index 0b8b2916..02f7774d 100644 --- a/htslib/cram/cram.h +++ b/htslib/cram/cram.h @@ -31,8 +31,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /*! \file * CRAM interface. * - * Consider using the higher level scram_*() API for programs that wish to - * be file format agnostic. + * Consider using the higher level hts_*() API for programs that wish to + * be file format agnostic (see htslib/hts.h). * * This API should be used for CRAM specific code. The specifics of the * public API are implemented in cram_io.h, cram_encode.h and cram_decode.h @@ -43,13 +43,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
extern "C" { #endif -#ifdef SAMTOOLS -# include "cram/cram_samtools.h" -#endif - #ifndef _CRAM_H_ #define _CRAM_H_ +#include "cram/cram_samtools.h" #include "cram/sam_header.h" #include "cram_structs.h" #include "cram_io.h" diff --git a/htslib/cram/cram_codecs.c b/htslib/cram/cram_codecs.c index 3c3d13f9..c6bfb167 100644 --- a/htslib/cram/cram_codecs.c +++ b/htslib/cram/cram_codecs.c @@ -271,8 +271,7 @@ static int store_bits_MSB(cram_block *block, unsigned int val, int nbits) { } } - - + /* fits in current bit-field */ if (nbits <= block->bit+1) { block->data[block->byte] |= (val << (block->bit+1-nbits)); if ((block->bit-=nbits) == -1) { @@ -330,11 +329,11 @@ int cram_external_decode_int(cram_slice *slice, cram_codec *c, /* Find the external block */ if (slice->block_by_id) { if (!(b = slice->block_by_id[c->external.content_id])) - return -1; + return *out_size?-1:0; } else { for (i = 0; i < slice->hdr->num_blocks; i++) { b = slice->block[i]; - if (b->content_type == EXTERNAL && + if (b && b->content_type == EXTERNAL && b->content_id == c->external.content_id) { break; } @@ -361,11 +360,11 @@ int cram_external_decode_char(cram_slice *slice, cram_codec *c, /* Find the external block */ if (slice->block_by_id) { if (!(b = slice->block_by_id[c->external.content_id])) - return -1; + return *out_size?-1:0; } else { for (i = 0; i < slice->hdr->num_blocks; i++) { b = slice->block[i]; - if (b->content_type == EXTERNAL && + if (b && b->content_type == EXTERNAL && b->content_id == c->external.content_id) { break; } @@ -382,9 +381,9 @@ int cram_external_decode_char(cram_slice *slice, cram_codec *c, return 0; } -int cram_external_decode_block(cram_slice *slice, cram_codec *c, - cram_block *in, char *out_, - int *out_size) { +static int cram_external_decode_block(cram_slice *slice, cram_codec *c, + cram_block *in, char *out_, + int *out_size) { int i; char *cp; cram_block *b = NULL; @@ -393,11 +392,11 @@ int cram_external_decode_block(cram_slice *slice, cram_codec *c, /* Find 
the external block */ if (slice->block_by_id) { if (!(b = slice->block_by_id[c->external.content_id])) - return -1; + return *out_size?-1:0; } else { for (i = 0; i < slice->hdr->num_blocks; i++) { b = slice->block[i]; - if (b->content_type == EXTERNAL && + if (b && b->content_type == EXTERNAL && b->content_id == c->external.content_id) { break; } @@ -450,11 +449,17 @@ cram_codec *cram_external_decode_init(char *data, int size, return c; } -int cram_external_encode(cram_slice *slice, cram_codec *c, - cram_block *out, char *in, int in_size) { +int cram_external_encode_int(cram_slice *slice, cram_codec *c, + char *in, int in_size) { uint32_t *i32 = (uint32_t *)in; - itf8_put_blk(out, *i32); + itf8_put_blk(c->out, *i32); + return 0; +} + +int cram_external_encode_char(cram_slice *slice, cram_codec *c, + char *in, int in_size) { + BLOCK_APPEND(c->out, in, in_size); return 0; } @@ -495,7 +500,12 @@ cram_codec *cram_external_encode_init(cram_stats *st, return NULL; c->codec = E_EXTERNAL; c->free = cram_external_encode_free; - c->encode = cram_external_encode; + if (option == E_INT || option == E_LONG) + c->encode = cram_external_encode_int; + else if (option == E_BYTE_ARRAY || option == E_BYTE) + c->encode = cram_external_encode_char; + else + abort(); c->store = cram_external_encode_store; c->e_external.content_id = (size_t)dat; @@ -516,7 +526,7 @@ int cram_beta_decode_int(cram_slice *slice, cram_codec *c, cram_block *in, char out_i[i] = get_bits_MSB(in, c->beta.nbits) - c->beta.offset; } else { for (i = 0, n = *out_size; i < n; i++) - out_i[i] = 0; + out_i[i] = -c->beta.offset; } return 0; @@ -530,7 +540,7 @@ int cram_beta_decode_char(cram_slice *slice, cram_codec *c, cram_block *in, char out[i] = get_bits_MSB(in, c->beta.nbits) - c->beta.offset; } else { for (i = 0, n = *out_size; i < n; i++) - out[i] = 0; + out[i] = -c->beta.offset; } return 0; @@ -591,23 +601,25 @@ int cram_beta_encode_store(cram_codec *c, cram_block *b, } int cram_beta_encode_int(cram_slice *slice, 
cram_codec *c, - cram_block *out, char *in, int in_size) { + char *in, int in_size) { int *syms = (int *)in; int i, r = 0; for (i = 0; i < in_size; i++) - r |= store_bits_MSB(out, syms[i] + c->e_beta.offset, c->e_beta.nbits); + r |= store_bits_MSB(c->out, syms[i] + c->e_beta.offset, + c->e_beta.nbits); return r; } int cram_beta_encode_char(cram_slice *slice, cram_codec *c, - cram_block *out, char *in, int in_size) { + char *in, int in_size) { unsigned char *syms = (unsigned char *)in; int i, r = 0; for (i = 0; i < in_size; i++) - r |= store_bits_MSB(out, syms[i] + c->e_beta.offset, c->e_beta.nbits); + r |= store_bits_MSB(c->out, syms[i] + c->e_beta.offset, + c->e_beta.nbits); return r; } @@ -859,9 +871,9 @@ int cram_huffman_decode_char(cram_slice *slice, cram_codec *c, //val <<= dlen; //val |= get_bits_MSB(in, dlen); - //last_len = (len += dlen); + //last_len = (len += dlen); - last_len = (len += dlen); + last_len = (len += dlen); for (; dlen; dlen--) GET_BIT_MSB(in, val); idx = val - codes[idx].p; @@ -909,9 +921,9 @@ int cram_huffman_decode_int(cram_slice *slice, cram_codec *c, //val <<= dlen; //val |= get_bits_MSB(in, dlen); - //last_len = (len += dlen); + //last_len = (len += dlen); - last_len = (len += dlen); + last_len = (len += dlen); for (; dlen; dlen--) GET_BIT_MSB(in, val); idx = val - codes[idx].p; @@ -1051,12 +1063,12 @@ cram_codec *cram_huffman_decode_init(char *data, int size, } int cram_huffman_encode_char0(cram_slice *slice, cram_codec *c, - cram_block *out, char *in, int in_size) { + char *in, int in_size) { return 0; } int cram_huffman_encode_char(cram_slice *slice, cram_codec *c, - cram_block *out, char *in, int in_size) { + char *in, int in_size) { int i, code, len, r = 0; unsigned char *syms = (unsigned char *)in; @@ -1080,19 +1092,19 @@ int cram_huffman_encode_char(cram_slice *slice, cram_codec *c, len = c->e_huffman.codes[i].len; } - r |= store_bits_MSB(out, code, len); + r |= store_bits_MSB(c->out, code, len); } while (--in_size); return r; } 
int cram_huffman_encode_int0(cram_slice *slice, cram_codec *c, - cram_block *out, char *in, int in_size) { + char *in, int in_size) { return 0; } int cram_huffman_encode_int(cram_slice *slice, cram_codec *c, - cram_block *out, char *in, int in_size) { + char *in, int in_size) { int i, code, len, r = 0; int *syms = (int *)in; @@ -1117,7 +1129,7 @@ int cram_huffman_encode_int(cram_slice *slice, cram_codec *c, len = c->e_huffman.codes[i].len; } - r |= store_bits_MSB(out, code, len); + r |= store_bits_MSB(c->out, code, len); } while (--in_size); return r; @@ -1428,19 +1440,37 @@ cram_codec *cram_byte_array_len_decode_init(char *data, int size, } int cram_byte_array_len_encode(cram_slice *slice, cram_codec *c, - cram_block *out, char *in, int in_size) { - return -1; // not imp. + char *in, int in_size) { + int32_t i32 = in_size; + int r = 0; + + r |= c->e_byte_array_len.len_codec->encode(slice, + c->e_byte_array_len.len_codec, + (char *)&i32, 1); + r |= c->e_byte_array_len.val_codec->encode(slice, + c->e_byte_array_len.val_codec, + in, in_size); + return r; } void cram_byte_array_len_encode_free(cram_codec *c) { if (!c) return; + + if (c->e_byte_array_len.len_codec) + c->e_byte_array_len.len_codec->free(c->e_byte_array_len.len_codec); + + if (c->e_byte_array_len.val_codec) + c->e_byte_array_len.val_codec->free(c->e_byte_array_len.val_codec); + free(c); } int cram_byte_array_len_encode_store(cram_codec *c, cram_block *b, char *prefix, int version) { - int len = 0; + int len = 0, len2, len3; + cram_codec *tc; + cram_block *b_len, *b_val; if (prefix) { size_t l = strlen(prefix); @@ -1448,16 +1478,23 @@ int cram_byte_array_len_encode_store(cram_codec *c, cram_block *b, len += l; } + tc = c->e_byte_array_len.len_codec; + b_len = cram_new_block(0, 0); + len2 = tc->store(tc, b_len, NULL, version); + + tc = c->e_byte_array_len.val_codec; + b_val = cram_new_block(0, 0); + len3 = tc->store(tc, b_val, NULL, version); + len += itf8_put_blk(b, c->codec); - len += itf8_put_blk(b, 
c->e_byte_array_len.len_len + - c->e_byte_array_len.val_len); - BLOCK_APPEND(b, c->e_byte_array_len.len_dat, c->e_byte_array_len.len_len); - len += c->e_byte_array_len.len_len; + len += itf8_put_blk(b, len2+len3); + BLOCK_APPEND(b, BLOCK_DATA(b_len), BLOCK_SIZE(b_len)); + BLOCK_APPEND(b, BLOCK_DATA(b_val), BLOCK_SIZE(b_val)); - BLOCK_APPEND(b, c->e_byte_array_len.val_dat, c->e_byte_array_len.val_len); - len += c->e_byte_array_len.val_len; + cram_free_block(b_len); + cram_free_block(b_val); - return len; + return len + len2 + len3; } cram_codec *cram_byte_array_len_encode_init(cram_stats *st, @@ -1475,10 +1512,14 @@ cram_codec *cram_byte_array_len_encode_init(cram_stats *st, c->encode = cram_byte_array_len_encode; c->store = cram_byte_array_len_encode_store; - c->e_byte_array_len.len_len = e->len_len; - c->e_byte_array_len.len_dat = e->len_dat; - c->e_byte_array_len.val_len = e->val_len; - c->e_byte_array_len.val_dat = e->val_dat; + c->e_byte_array_len.len_codec = cram_encoder_init(e->len_encoding, + NULL, E_INT, + e->len_dat, + version); + c->e_byte_array_len.val_codec = cram_encoder_init(e->val_encoding, + NULL, E_BYTE_ARRAY, + e->val_dat, + version); return c; } @@ -1487,20 +1528,20 @@ cram_codec *cram_byte_array_len_encode_init(cram_stats *st, * --------------------------------------------------------------------------- * BYTE_ARRAY_STOP */ -int cram_byte_array_stop_decode_char(cram_slice *slice, cram_codec *c, - cram_block *in, char *out, - int *out_size) { +static int cram_byte_array_stop_decode_char(cram_slice *slice, cram_codec *c, + cram_block *in, char *out, + int *out_size) { int i; cram_block *b = NULL; char *cp, ch; if (slice->block_by_id) { if (!(b = slice->block_by_id[c->byte_array_stop.content_id])) - return -1; + return *out_size?-1:0; } else { for (i = 0; i < slice->hdr->num_blocks; i++) { b = slice->block[i]; - if (b->content_type == EXTERNAL && + if (b && b->content_type == EXTERNAL && b->content_id == c->byte_array_stop.content_id) { break; } @@ 
-1529,20 +1570,19 @@ int cram_byte_array_stop_decode_char(cram_slice *slice, cram_codec *c, int cram_byte_array_stop_decode_block(cram_slice *slice, cram_codec *c, cram_block *in, char *out_, int *out_size) { - int space = 256; cram_block *b = NULL; cram_block *out = (cram_block *)out_; - char *cp, ch, *out_cp, *cp_end, *out_end; + char *cp, *out_cp, *cp_end; char stop; if (slice->block_by_id) { if (!(b = slice->block_by_id[c->byte_array_stop.content_id])) - return -1; + return *out_size?-1:0; } else { int i; for (i = 0; i < slice->hdr->num_blocks; i++) { b = slice->block[i]; - if (b->content_type == EXTERNAL && + if (b && b->content_type == EXTERNAL && b->content_id == c->byte_array_stop.content_id) { break; } @@ -1555,25 +1595,20 @@ int cram_byte_array_stop_decode_block(cram_slice *slice, cram_codec *c, return -1; cp = (char *)b->data + b->idx; cp_end = (char *)b->data + b->uncomp_size; - BLOCK_GROW(out, space); out_cp = (char *)BLOCK_END(out); - out_end = out_cp + space; stop = c->byte_array_stop.stop; - while ((ch = *cp) != stop) { - if (cp++ == cp_end) - return -1; - *out_cp++ = ch; - - if (out_cp == out_end) { - BLOCK_SIZE(out) = out_cp - (char *)BLOCK_DATA(out); - space *= 2; - BLOCK_GROW(out, space); - out_cp = (char *)BLOCK_END(out); - out_end = out_cp + space; - } + if (cp_end - cp < out->alloc - out->byte) { + while (*cp != stop && cp != cp_end) + *out_cp++ = *cp++; + BLOCK_SIZE(out) = out_cp - (char *)BLOCK_DATA(out); + } else { + char *cp_start; + for (cp_start = cp; *cp != stop && cp != cp_end; cp++) + ; + BLOCK_APPEND(out, cp_start, cp - cp_start); + BLOCK_GROW(out, cp - cp_start); } - BLOCK_SIZE(out) = out_cp - (char *)BLOCK_DATA(out); *out_size = cp - (char *)(b->data + b->idx); b->idx = cp - (char *)b->data + 1; @@ -1603,7 +1638,7 @@ cram_codec *cram_byte_array_stop_decode_init(char *data, int size, c->free = cram_byte_array_stop_decode_free; c->byte_array_stop.stop = *cp++; - if (version == CRAM_1_VERS) { + if (CRAM_MAJOR_VERS(version) == 1) { 
c->byte_array_stop.content_id = cp[0] + (cp[1]<<8) + (cp[2]<<16) + (cp[3]<<24); cp += 4; @@ -1621,8 +1656,10 @@ cram_codec *cram_byte_array_stop_decode_init(char *data, int size, } int cram_byte_array_stop_encode(cram_slice *slice, cram_codec *c, - cram_block *out, char *in, int in_size) { - return -1; // not imp. + char *in, int in_size) { + BLOCK_APPEND(c->out, in, in_size); + BLOCK_APPEND_CHAR(c->out, c->e_byte_array_stop.stop); + return 0; } void cram_byte_array_stop_encode_free(cram_codec *c) { @@ -1644,7 +1681,7 @@ int cram_byte_array_stop_encode_store(cram_codec *c, cram_block *b, cp += itf8_put(cp, c->codec); - if (version == CRAM_1_VERS) { + if (CRAM_MAJOR_VERS(version) == 1) { cp += itf8_put(cp, 5); *cp++ = c->e_byte_array_stop.stop; *cp++ = (c->e_byte_array_stop.content_id >> 0) & 0xff; @@ -1756,9 +1793,54 @@ cram_codec *cram_encoder_init(enum cram_encoding codec, return NULL; if (encode_init[codec]) { - return encode_init[codec](st, option, dat, version); + cram_codec *r; + if ((r = encode_init[codec](st, option, dat, version))) + r->out = NULL; + return r; } else { fprintf(stderr, "Unimplemented codec of type %s\n", codec2str(codec)); abort(); } } + +/* + * Returns the content_id used by this codec, also in id2 if byte_array_len. + * Returns -1 for the CORE block and -2 for unneeded. + * id2 is only filled out for BYTE_ARRAY_LEN which uses 2 codecs. + */ +int cram_codec_to_id(cram_codec *c, int *id2) { + int bnum1, bnum2 = -2; + + switch (c->codec) { + case E_HUFFMAN: + bnum1 = c->huffman.ncodes == 1 ? 
-2 : -1; + break; + case E_GOLOMB: + case E_BETA: + case E_SUBEXP: + case E_GOLOMB_RICE: + case E_GAMMA: + bnum1 = -1; + break; + case E_EXTERNAL: + bnum1 = c->external.content_id; + break; + case E_BYTE_ARRAY_LEN: + bnum1 = cram_codec_to_id(c->byte_array_len.len_codec, NULL); + bnum2 = cram_codec_to_id(c->byte_array_len.value_codec, NULL); + break; + case E_BYTE_ARRAY_STOP: + bnum1 = c->byte_array_stop.content_id; + break; + case E_NULL: + bnum1 = -2; + break; + default: + fprintf(stderr, "Unknown codec type %d\n", c->codec); + bnum1 = -1; + } + + if (id2) + *id2 = bnum2; + return bnum1; +} diff --git a/htslib/cram/cram_codecs.h b/htslib/cram/cram_codecs.h index 7037814c..e047901b 100644 --- a/htslib/cram/cram_codecs.h +++ b/htslib/cram/cram_codecs.h @@ -97,10 +97,12 @@ typedef struct { } cram_byte_array_stop_decoder; typedef struct { - uint32_t len_len; - unsigned char *len_dat; - uint32_t val_len; - unsigned char *val_dat; + enum cram_encoding len_encoding; + enum cram_encoding val_encoding; + void *len_dat; + void *val_dat; + struct cram_codec *len_codec; + struct cram_codec *val_codec; } cram_byte_array_len_encoder; /* @@ -108,11 +110,12 @@ typedef struct { */ typedef struct cram_codec { enum cram_encoding codec; + cram_block *out; void (*free)(struct cram_codec *codec); int (*decode)(cram_slice *slice, struct cram_codec *codec, cram_block *in, char *out, int *out_size); int (*encode)(cram_slice *slice, struct cram_codec *codec, - cram_block *out, char *in, int in_size); + char *in, int in_size); int (*store)(struct cram_codec *codec, cram_block *b, char *prefix, int version); union { @@ -146,7 +149,14 @@ cram_codec *cram_encoder_init(enum cram_encoding codec, cram_stats *st, //#define GET_BIT_MSB(b,v) (void)(v<<=1, v|=(b->data[b->byte] >> b->bit)&1, (--b->bit == -1) && (b->bit = 7, b->byte++)) -#define GET_BIT_MSB(b,v) (void)(v<<=1, v|=(b->data[b->byte] >> b->bit)&1, b->byte += (b->bit==0), b->bit+=(b->bit==0)*8-1) +#define GET_BIT_MSB(b,v) (void)(v<<=1, 
v|=(b->data[b->byte] >> b->bit)&1, b->byte += (--b->bit<0), b->bit&=7) + +/* + * Returns the content_id used by this codec, also in id2 if byte_array_len. + * Returns -1 for the CORE block and -2 for unneeded. + * id2 is only filled out for BYTE_ARRAY_LEN which uses 2 codecs. + */ +int cram_codec_to_id(cram_codec *c, int *id2); #ifdef __cplusplus } diff --git a/htslib/cram/cram_decode.c b/htslib/cram/cram_decode.c index e002ac9f..1d6281ee 100644 --- a/htslib/cram/cram_decode.c +++ b/htslib/cram/cram_decode.c @@ -133,13 +133,15 @@ cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd, return NULL; if (b->method != RAW) { - if (cram_uncompress_block(b)) + if (cram_uncompress_block(b)) { + free(hdr); return NULL; + } } cp = (char *)b->data; - if (fd->version == CRAM_1_VERS) { + if (CRAM_MAJOR_VERS(fd->version) == 1) { cp += itf8_get(cp, &hdr->ref_seq_id); cp += itf8_get(cp, &hdr->ref_seq_start); cp += itf8_get(cp, &hdr->ref_seq_span); @@ -367,179 +369,212 @@ cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd, */ if (key[0] == 'B' && key[1] == 'F') { - if (!(hdr->BF_codec = cram_decoder_init(encoding, cp, size, E_INT, - fd->version))) { + if (!(hdr->codecs[DS_BF] = cram_decoder_init(encoding, cp, size, + E_INT, + fd->version))) { cram_free_compression_header(hdr); return NULL; } } else if (key[0] == 'C' && key[1] == 'F') { - if (!(hdr->CF_codec = cram_decoder_init(encoding, cp, size, E_INT, - fd->version))) { + if (!(hdr->codecs[DS_CF] = cram_decoder_init(encoding, cp, size, + E_INT, + fd->version))) { cram_free_compression_header(hdr); return NULL; } } else if (key[0] == 'R' && key[1] == 'I') { - if (!(hdr->RI_codec = cram_decoder_init(encoding, cp, size, E_INT, - fd->version))) { + if (!(hdr->codecs[DS_RI] = cram_decoder_init(encoding, cp, size, + E_INT, + fd->version))) { cram_free_compression_header(hdr); return NULL; } } else if (key[0] == 'R' && key[1] == 'L') { - if (!(hdr->RL_codec = cram_decoder_init(encoding, cp, size, 
E_INT, - fd->version))) { + if (!(hdr->codecs[DS_RL] = cram_decoder_init(encoding, cp, size, + E_INT, + fd->version))) { cram_free_compression_header(hdr); return NULL; } } else if (key[0] == 'A' && key[1] == 'P') { - if (!(hdr->AP_codec = cram_decoder_init(encoding, cp, size, E_INT, - fd->version))) { + if (!(hdr->codecs[DS_AP] = cram_decoder_init(encoding, cp, size, + E_INT, + fd->version))) { cram_free_compression_header(hdr); return NULL; } } else if (key[0] == 'R' && key[1] == 'G') { - if (!(hdr->RG_codec = cram_decoder_init(encoding, cp, size, E_INT, - fd->version))) { + if (!(hdr->codecs[DS_RG] = cram_decoder_init(encoding, cp, size, + E_INT, + fd->version))) { cram_free_compression_header(hdr); return NULL; } } else if (key[0] == 'M' && key[1] == 'F') { - if (!(hdr->MF_codec = cram_decoder_init(encoding, cp, size, E_INT, - fd->version))) { + if (!(hdr->codecs[DS_MF] = cram_decoder_init(encoding, cp, size, + E_INT, + fd->version))) { cram_free_compression_header(hdr); return NULL; } } else if (key[0] == 'N' && key[1] == 'S') { - if (!(hdr->NS_codec = cram_decoder_init(encoding, cp, size, E_INT, - fd->version))) { + if (!(hdr->codecs[DS_NS] = cram_decoder_init(encoding, cp, size, + E_INT, + fd->version))) { cram_free_compression_header(hdr); return NULL; } } else if (key[0] == 'N' && key[1] == 'P') { - if (!(hdr->NP_codec = cram_decoder_init(encoding, cp, size, E_INT, - fd->version))) { + if (!(hdr->codecs[DS_NP] = cram_decoder_init(encoding, cp, size, + E_INT, + fd->version))) { cram_free_compression_header(hdr); return NULL; } } else if (key[0] == 'T' && key[1] == 'S') { - if (!(hdr->TS_codec = cram_decoder_init(encoding, cp, size, E_INT, - fd->version))) { + if (!(hdr->codecs[DS_TS] = cram_decoder_init(encoding, cp, size, + E_INT, + fd->version))) { cram_free_compression_header(hdr); return NULL; } } else if (key[0] == 'N' && key[1] == 'F') { - if (!(hdr->NF_codec = cram_decoder_init(encoding, cp, size, E_INT, - fd->version))) { + if (!(hdr->codecs[DS_NF] 
= cram_decoder_init(encoding, cp, size, + E_INT, + fd->version))) { cram_free_compression_header(hdr); return NULL; } } else if (key[0] == 'T' && key[1] == 'C') { - if (!(hdr->TC_codec = cram_decoder_init(encoding, cp, size, E_BYTE, - fd->version))) { + if (!(hdr->codecs[DS_TC] = cram_decoder_init(encoding, cp, size, + E_BYTE, + fd->version))) { cram_free_compression_header(hdr); return NULL; } } else if (key[0] == 'T' && key[1] == 'N') { - if (!(hdr->TN_codec = cram_decoder_init(encoding, cp, size, E_INT, - fd->version))) { + if (!(hdr->codecs[DS_TN] = cram_decoder_init(encoding, cp, size, + E_INT, + fd->version))) { cram_free_compression_header(hdr); return NULL; } } else if (key[0] == 'F' && key[1] == 'N') { - if (!(hdr->FN_codec = cram_decoder_init(encoding, cp, size, E_INT, - fd->version))) { + if (!(hdr->codecs[DS_FN] = cram_decoder_init(encoding, cp, size, + E_INT, + fd->version))) { cram_free_compression_header(hdr); return NULL; } } else if (key[0] == 'F' && key[1] == 'C') { - if (!(hdr->FC_codec = cram_decoder_init(encoding, cp, size, E_BYTE, - fd->version))) { + if (!(hdr->codecs[DS_FC] = cram_decoder_init(encoding, cp, size, + E_BYTE, + fd->version))) { cram_free_compression_header(hdr); return NULL; } } else if (key[0] == 'F' && key[1] == 'P') { - if (!(hdr->FP_codec = cram_decoder_init(encoding, cp, size, E_INT, - fd->version))) { + if (!(hdr->codecs[DS_FP] = cram_decoder_init(encoding, cp, size, + E_INT, + fd->version))) { cram_free_compression_header(hdr); return NULL; } } else if (key[0] == 'B' && key[1] == 'S') { - if (!(hdr->BS_codec = cram_decoder_init(encoding, cp, size, E_BYTE, - fd->version))) { + if (!(hdr->codecs[DS_BS] = cram_decoder_init(encoding, cp, size, + E_BYTE, + fd->version))) { cram_free_compression_header(hdr); return NULL; } } else if (key[0] == 'I' && key[1] == 'N') { - if (!(hdr->IN_codec = cram_decoder_init(encoding, cp, size, - E_BYTE_ARRAY, - fd->version))) { + if (!(hdr->codecs[DS_IN] = cram_decoder_init(encoding, cp, 
size, + E_BYTE_ARRAY, + fd->version))) { cram_free_compression_header(hdr); return NULL; } } else if (key[0] == 'S' && key[1] == 'C') { - if (!(hdr->SC_codec = cram_decoder_init(encoding, cp, size, - E_BYTE_ARRAY, - fd->version))) { + if (!(hdr->codecs[DS_SC] = cram_decoder_init(encoding, cp, size, + E_BYTE_ARRAY, + fd->version))) { cram_free_compression_header(hdr); return NULL; } } else if (key[0] == 'D' && key[1] == 'L') { - if (!(hdr->DL_codec = cram_decoder_init(encoding, cp, size, E_INT, - fd->version))) { + if (!(hdr->codecs[DS_DL] = cram_decoder_init(encoding, cp, size, + E_INT, + fd->version))) { cram_free_compression_header(hdr); return NULL; } } else if (key[0] == 'B' && key[1] == 'A') { - if (!(hdr->BA_codec = cram_decoder_init(encoding, cp, size, E_BYTE, - fd->version))) { + if (!(hdr->codecs[DS_BA] = cram_decoder_init(encoding, cp, size, + E_BYTE, + fd->version))) { + cram_free_compression_header(hdr); + return NULL; + } + } else if (key[0] == 'B' && key[1] == 'B') { + if (!(hdr->codecs[DS_BB] = cram_decoder_init(encoding, cp, size, + E_BYTE_ARRAY, + fd->version))) { cram_free_compression_header(hdr); return NULL; } } else if (key[0] == 'R' && key[1] == 'S') { - if (!(hdr->RS_codec = cram_decoder_init(encoding, cp, size, E_INT, - fd->version))) { + if (!(hdr->codecs[DS_RS] = cram_decoder_init(encoding, cp, size, + E_INT, + fd->version))) { cram_free_compression_header(hdr); return NULL; } } else if (key[0] == 'P' && key[1] == 'D') { - if (!(hdr->PD_codec = cram_decoder_init(encoding, cp, size, E_INT, - fd->version))) { + if (!(hdr->codecs[DS_PD] = cram_decoder_init(encoding, cp, size, + E_INT, + fd->version))) { cram_free_compression_header(hdr); return NULL; } } else if (key[0] == 'H' && key[1] == 'C') { - if (!(hdr->HC_codec = cram_decoder_init(encoding, cp, size, E_INT, - fd->version))) { + if (!(hdr->codecs[DS_HC] = cram_decoder_init(encoding, cp, size, + E_INT, + fd->version))) { cram_free_compression_header(hdr); return NULL; } } else if (key[0] 
== 'M' && key[1] == 'Q') { - if (!(hdr->MQ_codec = cram_decoder_init(encoding, cp, size, E_INT, - fd->version))) { + if (!(hdr->codecs[DS_MQ] = cram_decoder_init(encoding, cp, size, + E_INT, + fd->version))) { cram_free_compression_header(hdr); return NULL; } } else if (key[0] == 'R' && key[1] == 'N') { - if (!(hdr->RN_codec = cram_decoder_init(encoding, cp, size, - E_BYTE_ARRAY_BLOCK, - fd->version))) { + if (!(hdr->codecs[DS_RN] = cram_decoder_init(encoding, cp, size, + E_BYTE_ARRAY_BLOCK, + fd->version))) { cram_free_compression_header(hdr); return NULL; } } else if (key[0] == 'Q' && key[1] == 'S') { - if (!(hdr->QS_codec = cram_decoder_init(encoding, cp, size, E_BYTE, - fd->version))) { + if (!(hdr->codecs[DS_QS] = cram_decoder_init(encoding, cp, size, + E_BYTE, + fd->version))) { cram_free_compression_header(hdr); return NULL; } - if (!(hdr->Qs_codec = cram_decoder_init(encoding, cp, size, - E_BYTE_ARRAY, - fd->version))) { + } else if (key[0] == 'Q' && key[1] == 'Q') { + if (!(hdr->codecs[DS_QQ] = cram_decoder_init(encoding, cp, size, + E_BYTE_ARRAY, + fd->version))) { cram_free_compression_header(hdr); return NULL; } } else if (key[0] == 'T' && key[1] == 'L') { - if (!(hdr->TL_codec = cram_decoder_init(encoding, cp, size, E_INT, - fd->version))) { + if (!(hdr->codecs[DS_TL] = cram_decoder_init(encoding, cp, size, + E_INT, + fd->version))) { cram_free_compression_header(hdr); return NULL; } @@ -601,6 +636,323 @@ cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd, return hdr; } +/* + * Note we also need to scan through the record encoding map to + * see which data series share the same block, either external or + * CORE. For example if we need the BF data series but MQ and CF + * are also encoded in the same block then we need to add those in + * as a dependency in order to correctly decode BF. 
+ * + * Returns 0 on success + * -1 on failure + */ +int cram_dependent_data_series(cram_fd *fd, + cram_block_compression_hdr *hdr, + cram_slice *s) { + int *block_used; + int core_used = 0; + int i; + static int i_to_id[] = { + DS_BF, DS_AP, DS_FP, DS_RL, DS_DL, DS_NF, DS_BA, DS_QS, + DS_FC, DS_FN, DS_BS, DS_IN, DS_RG, DS_MQ, DS_TL, DS_RN, + DS_NS, DS_NP, DS_TS, DS_MF, DS_CF, DS_RI, DS_RS, DS_PD, + DS_HC, DS_SC, DS_BB, DS_QQ, + }; + uint32_t orig_ds; + + /* + * Set the data_series bit field based on fd->required_fields + * contents. + */ + if (fd->required_fields && fd->required_fields != INT_MAX) { + hdr->data_series = 0; + + if (fd->required_fields & SAM_QNAME) + hdr->data_series |= CRAM_RN; + + if (fd->required_fields & SAM_FLAG) + hdr->data_series |= CRAM_BF; + + if (fd->required_fields & SAM_RNAME) + hdr->data_series |= CRAM_RI | CRAM_BF; + + if (fd->required_fields & SAM_POS) + hdr->data_series |= CRAM_AP | CRAM_BF; + + if (fd->required_fields & SAM_MAPQ) + hdr->data_series |= CRAM_MQ; + + if (fd->required_fields & SAM_CIGAR) + hdr->data_series |= CRAM_CIGAR; + + if (fd->required_fields & SAM_RNEXT) + hdr->data_series |= CRAM_CF | CRAM_NF | CRAM_RI | CRAM_NS |CRAM_BF; + + if (fd->required_fields & SAM_PNEXT) + hdr->data_series |= CRAM_CF | CRAM_NF | CRAM_AP | CRAM_NP | CRAM_BF; + + if (fd->required_fields & SAM_TLEN) + hdr->data_series |= CRAM_CF | CRAM_NF | CRAM_AP | CRAM_TS | + CRAM_BF | CRAM_MF | CRAM_RI | CRAM_CIGAR; + + if (fd->required_fields & SAM_SEQ) + hdr->data_series |= CRAM_SEQ; + + if (!(fd->required_fields & SAM_AUX)) + // No easy way to get MD/NM without other tags at present + fd->decode_md = 0; + + if (fd->required_fields & SAM_QUAL) + hdr->data_series |= CRAM_SEQ; + + if (fd->required_fields & SAM_AUX) + hdr->data_series |= CRAM_RG | CRAM_TL | CRAM_aux; + + if (fd->required_fields & SAM_RGAUX) + hdr->data_series |= CRAM_RG | CRAM_BF; + + // Always uncompress CORE block + if (cram_uncompress_block(s->block[0])) + return -1; + } else { + 
hdr->data_series = CRAM_ALL; + + for (i = 0; i < s->hdr->num_blocks; i++) { + if (cram_uncompress_block(s->block[i])) + return -1; + } + + return 0; + } + + block_used = calloc(s->hdr->num_blocks+1, sizeof(int)); + if (!block_used) + return -1; + + do { + /* + * Also set data_series based on code prerequisites. Eg if we need + * CRAM_QS then we also need to know CRAM_RL so we know how long it + * is, or if we need FC/FP then we also need FN (number of features). + * + * It's not reciprocal though. We may be needing to decode FN + * but have no need to decode FC, FP and cigar ops. + */ + if (hdr->data_series & CRAM_RS) hdr->data_series |= CRAM_FC|CRAM_FP; + if (hdr->data_series & CRAM_PD) hdr->data_series |= CRAM_FC|CRAM_FP; + if (hdr->data_series & CRAM_HC) hdr->data_series |= CRAM_FC|CRAM_FP; + if (hdr->data_series & CRAM_QS) hdr->data_series |= CRAM_FC|CRAM_FP; + if (hdr->data_series & CRAM_IN) hdr->data_series |= CRAM_FC|CRAM_FP; + if (hdr->data_series & CRAM_SC) hdr->data_series |= CRAM_FC|CRAM_FP; + if (hdr->data_series & CRAM_BS) hdr->data_series |= CRAM_FC|CRAM_FP; + if (hdr->data_series & CRAM_DL) hdr->data_series |= CRAM_FC|CRAM_FP; + if (hdr->data_series & CRAM_BA) hdr->data_series |= CRAM_FC|CRAM_FP; + if (hdr->data_series & CRAM_BB) hdr->data_series |= CRAM_FC|CRAM_FP; + if (hdr->data_series & CRAM_QQ) hdr->data_series |= CRAM_FC|CRAM_FP; + + // cram_decode_seq() needs seq[] array + if (hdr->data_series & (CRAM_SEQ|CRAM_CIGAR)) hdr->data_series |= CRAM_RL; + + if (hdr->data_series & CRAM_FP) hdr->data_series |= CRAM_FC; + if (hdr->data_series & CRAM_FC) hdr->data_series |= CRAM_FN; + if (hdr->data_series & CRAM_aux) hdr->data_series |= CRAM_TL; + if (hdr->data_series & CRAM_MF) hdr->data_series |= CRAM_CF; + if (hdr->data_series & CRAM_MQ) hdr->data_series |= CRAM_BF; + if (hdr->data_series & CRAM_BS) hdr->data_series |= CRAM_RI; + if (hdr->data_series & (CRAM_MF |CRAM_NS |CRAM_NP |CRAM_TS |CRAM_NF)) + hdr->data_series |= CRAM_CF; + if 
(!hdr->read_names_included && hdr->data_series & CRAM_RN) + hdr->data_series |= CRAM_CF | CRAM_NF; + if (hdr->data_series & (CRAM_BA | CRAM_QS | CRAM_BB | CRAM_QQ)) + hdr->data_series |= CRAM_BF | CRAM_CF | CRAM_RL; + + orig_ds = hdr->data_series; + + // Find which blocks are in use. + for (i = 0; i < sizeof(i_to_id)/sizeof(*i_to_id); i++) { + int bnum1, bnum2, j; + cram_codec *c = hdr->codecs[i_to_id[i]]; + + if (!(hdr->data_series & (1<hdr->num_blocks; j++) { + if (s->block[j]->content_type == EXTERNAL && + s->block[j]->content_id == bnum1) { + block_used[j] = 1; + if (cram_uncompress_block(s->block[j])) { + free(block_used); + return -1; + } + } + } + break; + } + + if (bnum2 == -2 || bnum1 == bnum2) + break; + + bnum1 = bnum2; // 2nd pass + } + } + + // Tags too + if ((fd->required_fields & SAM_AUX) || + (hdr->data_series & CRAM_aux)) { + for (i = 0; i < CRAM_MAP_HASH; i++) { + int bnum1, bnum2, j; + cram_map *m = hdr->tag_encoding_map[i]; + + while (m) { + cram_codec *c = m->codec; + if (!c) + continue; + + bnum1 = cram_codec_to_id(c, &bnum2); + + for (;;) { + switch (bnum1) { + case -2: + break; + + case -1: + core_used = 1; + break; + + default: + for (j = 0; j < s->hdr->num_blocks; j++) { + if (s->block[j]->content_type == EXTERNAL && + s->block[j]->content_id == bnum1) { + block_used[j] = 1; + if (cram_uncompress_block(s->block[j])) { + free(block_used); + return -1; + } + } + } + break; + } + + if (bnum2 == -2 || bnum1 == bnum2) + break; + + bnum1 = bnum2; // 2nd pass + } + + m = m->next; + } + } + } + + // We now know which blocks are in used, so repeat and find + // which other data series need to be added. 
+ for (i = 0; i < sizeof(i_to_id)/sizeof(*i_to_id); i++) { + int bnum1, bnum2, j; + cram_codec *c = hdr->codecs[i_to_id[i]]; + + if (!c) + continue; + + bnum1 = cram_codec_to_id(c, &bnum2); + + for (;;) { + switch (bnum1) { + case -2: + break; + + case -1: + if (core_used) { + //printf(" + data series %08x:\n", 1<data_series |= 1<hdr->num_blocks; j++) { + if (s->block[j]->content_type == EXTERNAL && + s->block[j]->content_id == bnum1) { + if (block_used[j]) { + //printf(" + data series %08x:\n", 1<data_series |= 1<tag_encoding_map[i]; + + while (m) { + cram_codec *c = m->codec; + if (!c) + continue; + + bnum1 = cram_codec_to_id(c, &bnum2); + + for (;;) { + switch (bnum1) { + case -2: + break; + + case -1: + //printf(" + data series %08x:\n", CRAM_aux); + hdr->data_series |= CRAM_aux; + break; + + default: + for (j = 0; j < s->hdr->num_blocks; j++) { + if (s->block[j]->content_type && + s->block[j]->content_id == bnum1) { + if (block_used[j]) { + //printf(" + data series %08x:\n", + // CRAM_aux); + hdr->data_series |= CRAM_aux; + } + } + } + break; + } + + if (bnum2 == -2 || bnum1 == bnum2) + break; + + bnum1 = bnum2; // 2nd pass + } + + m = m->next; + } + } + } while (orig_ds != hdr->data_series); + + free(block_used); + return 0; +} + /* ---------------------------------------------------------------------- * CRAM slices */ @@ -630,8 +982,15 @@ cram_block_slice_hdr *cram_decode_slice_header(cram_fd *fd, cram_block *b) { cp += itf8_get(cp, &hdr->ref_seq_span); } cp += itf8_get(cp, &hdr->num_records); - if (fd->version != CRAM_1_VERS) - cp += itf8_get(cp, &hdr->record_counter); + hdr->record_counter = 0; + if (CRAM_MAJOR_VERS(fd->version) == 2) { + int32_t i32; + cp += itf8_get(cp, &i32); + hdr->record_counter = i32; + } else if (CRAM_MAJOR_VERS(fd->version) >= 3) { + cp += ltf8_get(cp, &hdr->record_counter); + } + cp += itf8_get(cp, &hdr->num_blocks); cp += itf8_get(cp, &hdr->num_content_ids); @@ -649,7 +1008,7 @@ cram_block_slice_hdr 
*cram_decode_slice_header(cram_fd *fd, cram_block *b) { cp += itf8_get(cp, &hdr->ref_base_id); } - if (fd->version != CRAM_1_VERS) { + if (CRAM_MAJOR_VERS(fd->version) != 1) { memcpy(hdr->md5, cp, 16); } else { memset(hdr->md5, 0, 16); @@ -707,10 +1066,10 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, uint32_t cigar_alloc = s->cigar_alloc; uint32_t nm = 0, md_dist = 0; int orig_aux = 0; - int decode_md = fd->decode_md; - char buf[20]; + int decode_md = fd->decode_md && s->ref; + uint32_t ds = c->comp_hdr->data_series; - if (!(cf & CRAM_FLAG_PRESERVE_QUAL_SCORES)) { + if ((ds & CRAM_QS) && !(cf & CRAM_FLAG_PRESERVE_QUAL_SCORES)) { memset(qual, 30, cr->len); } @@ -719,14 +1078,22 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, BLOCK_APPEND(s->aux_blk, "MDZ", 3); } - if (!c->comp_hdr->FN_codec) return -1; - r |= c->comp_hdr->FN_codec->decode(s,c->comp_hdr->FN_codec, blk, - (char *)&fn, &out_sz); + if (ds & CRAM_FN) { + if (!c->comp_hdr->codecs[DS_FN]) return -1; + r |= c->comp_hdr->codecs[DS_FN]->decode(s,c->comp_hdr->codecs[DS_FN], + blk, (char *)&fn, &out_sz); + } else { + fn = 0; + } ref_pos--; // count from 0 cr->cigar = ncigar; + + if (!(ds & (CRAM_FC | CRAM_FP))) + goto skip_cigar; + for (f = 0; f < fn; f++) { - int32_t pos; + int32_t pos = 0; char op; if (ncigar+2 >= cigar_alloc) { @@ -736,12 +1103,22 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, return -1; } - if (!c->comp_hdr->FC_codec) return -1; - r |= c->comp_hdr->FC_codec->decode(s, c->comp_hdr->FC_codec, blk, - &op, &out_sz); - if (!c->comp_hdr->FP_codec) return -1; - r |= c->comp_hdr->FP_codec->decode(s, c->comp_hdr->FP_codec, blk, - (char *)&pos, &out_sz); + if (ds & CRAM_FC) { + if (!c->comp_hdr->codecs[DS_FC]) return -1; + r |= c->comp_hdr->codecs[DS_FC]->decode(s, + c->comp_hdr->codecs[DS_FC], + blk, + &op, &out_sz); + } + + if (!(ds & CRAM_FP)) + continue; + + if (!c->comp_hdr->codecs[DS_FP]) return -1; + r |= 
c->comp_hdr->codecs[DS_FP]->decode(s, + c->comp_hdr->codecs[DS_FP], + blk, + (char *)&pos, &out_sz); pos += prev_pos; if (pos > seq_pos) { @@ -781,6 +1158,12 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, prev_pos = pos; + if (!(ds & CRAM_FC)) + goto skip_cigar; + + if (!(ds & CRAM_FC)) + continue; + switch(op) { case 'S': { // soft clip: IN int32_t out_sz2 = 1; @@ -789,20 +1172,36 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, cigar[ncigar++] = (cig_len<<4) + cig_op; cig_len = 0; } - if (fd->version == CRAM_1_VERS) { - r |= c->comp_hdr->IN_codec - ? c->comp_hdr->IN_codec->decode(s, c->comp_hdr->IN_codec, - blk, &seq[pos-1], &out_sz2) - : (seq[pos-1] = 'N', out_sz2 = 1, 0); - } else { - r |= c->comp_hdr->SC_codec - ? c->comp_hdr->SC_codec->decode(s, c->comp_hdr->SC_codec, - blk, &seq[pos-1], &out_sz2) - : (seq[pos-1] = 'N', out_sz2 = 1, 0); - } - cigar[ncigar++] = (out_sz2<<4) + BAM_CSOFT_CLIP; - cig_op = BAM_CSOFT_CLIP; - seq_pos += out_sz2; + if (ds & CRAM_IN) { + switch (CRAM_MAJOR_VERS(fd->version)) { + case 1: + r |= c->comp_hdr->codecs[DS_IN] + ? c->comp_hdr->codecs[DS_IN] + ->decode(s, c->comp_hdr->codecs[DS_IN], + blk, &seq[pos-1], &out_sz2) + : (seq[pos-1] = 'N', out_sz2 = 1, 0); + break; + + case 2: + default: + r |= c->comp_hdr->codecs[DS_SC] + ? c->comp_hdr->codecs[DS_SC] + ->decode(s, c->comp_hdr->codecs[DS_SC], + blk, &seq[pos-1], &out_sz2) + : (seq[pos-1] = 'N', out_sz2 = 1, 0); + break; + +// default: +// r |= c->comp_hdr->codecs[DS_BB] +// ? 
c->comp_hdr->codecs[DS_BB] +// ->decode(s, c->comp_hdr->codecs[DS_BB], +// blk, &seq[pos-1], &out_sz2) +// : (seq[pos-1] = 'N', out_sz2 = 1, 0); + } + cigar[ncigar++] = (out_sz2<<4) + BAM_CSOFT_CLIP; + cig_op = BAM_CSOFT_CLIP; + seq_pos += out_sz2; + } break; } @@ -813,10 +1212,13 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, cigar[ncigar++] = (cig_len<<4) + cig_op; cig_len = 0; } - if (!c->comp_hdr->BS_codec) return -1; - r |= c->comp_hdr->BS_codec->decode(s, c->comp_hdr->BS_codec, blk, - (char *)&base, &out_sz); - seq[pos-1] = 'N'; // FIXME look up BS=base value + if (ds & CRAM_BS) { + if (!c->comp_hdr->codecs[DS_BS]) return -1; + r |= c->comp_hdr->codecs[DS_BS] + ->decode(s, c->comp_hdr->codecs[DS_BS], blk, + (char *)&base, &out_sz); + seq[pos-1] = 'N'; // FIXME look up BS=base value + } cig_op = BAM_CBASE_MISMATCH; #else int ref_base; @@ -824,18 +1226,23 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, cigar[ncigar++] = (cig_len<<4) + cig_op; cig_len = 0; } - if (!c->comp_hdr->BS_codec) return -1; - r |= c->comp_hdr->BS_codec->decode(s, c->comp_hdr->BS_codec, blk, - (char *)&base, &out_sz); - if (ref_pos >= bfd->ref[cr->ref_id].len || !s->ref) { - seq[pos-1] = 'N'; - } else { - ref_base = fd->L1[(uc)s->ref[ref_pos - s->ref_start +1]]; - seq[pos-1] = c->comp_hdr->substitution_matrix[ref_base][base]; - if (decode_md) { - BLOCK_APPENDF_2(s->aux_blk, buf, "%d%c", - md_dist, s->ref[ref_pos-s->ref_start +1]); - md_dist = 0; + if (ds & CRAM_BS) { + if (!c->comp_hdr->codecs[DS_BS]) return -1; + r |= c->comp_hdr->codecs[DS_BS] + ->decode(s, c->comp_hdr->codecs[DS_BS], blk, + (char *)&base, &out_sz); + if (ref_pos >= bfd->ref[cr->ref_id].len || !s->ref) { + seq[pos-1] = 'N'; + } else { + ref_base = fd->L1[(uc)s->ref[ref_pos - s->ref_start +1]]; + seq[pos-1] = c->comp_hdr-> + substitution_matrix[ref_base][base]; + if (decode_md) { + BLOCK_APPEND_UINT(s->aux_blk, md_dist); + BLOCK_APPEND_CHAR(s->aux_blk, + 
s->ref[ref_pos-s->ref_start +1]); + md_dist = 0; + } } } cig_op = BAM_CMATCH; @@ -852,20 +1259,25 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, cigar[ncigar++] = (cig_len<<4) + cig_op; cig_len = 0; } - if (!c->comp_hdr->DL_codec) return -1; - r |= c->comp_hdr->DL_codec->decode(s, c->comp_hdr->DL_codec, blk, - (char *)&i32, &out_sz); - if (decode_md) { - BLOCK_APPENDF_1(s->aux_blk, buf, "%d^", md_dist); - BLOCK_APPEND(s->aux_blk, &s->ref[ref_pos - s->ref_start +1], - i32); - md_dist = 0; - } - cig_op = BAM_CDEL; - cig_len += i32; - ref_pos += i32; - nm += i32; - //printf(" %d: DL = %d (ret %d)\n", f, i32, r); + if (ds & CRAM_DL) { + if (!c->comp_hdr->codecs[DS_DL]) return -1; + r |= c->comp_hdr->codecs[DS_DL] + ->decode(s, c->comp_hdr->codecs[DS_DL], blk, + (char *)&i32, &out_sz); + if (decode_md) { + BLOCK_APPEND_UINT(s->aux_blk, md_dist); + BLOCK_APPEND_CHAR(s->aux_blk, '^'); + BLOCK_APPEND(s->aux_blk, + &s->ref[ref_pos - s->ref_start +1], + i32); + md_dist = 0; + } + cig_op = BAM_CDEL; + cig_len += i32; + ref_pos += i32; + nm += i32; + //printf(" %d: DL = %d (ret %d)\n", f, i32, r); + } break; } @@ -877,14 +1289,17 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, cig_len = 0; } - if (!c->comp_hdr->IN_codec) return -1; - r |= c->comp_hdr->IN_codec->decode(s, c->comp_hdr->IN_codec, blk, - &seq[pos-1], &out_sz2); - cig_op = BAM_CINS; - cig_len += out_sz2; - seq_pos += out_sz2; - nm += out_sz2; - //printf(" %d: IN(I) = %.*s (ret %d, out_sz %d)\n", f, out_sz2, dat, r, out_sz2); + if (ds & CRAM_IN) { + if (!c->comp_hdr->codecs[DS_IN]) return -1; + r |= c->comp_hdr->codecs[DS_IN] + ->decode(s, c->comp_hdr->codecs[DS_IN], blk, + &seq[pos-1], &out_sz2); + cig_op = BAM_CINS; + cig_len += out_sz2; + seq_pos += out_sz2; + nm += out_sz2; + //printf(" %d: IN(I) = %.*s (ret %d, out_sz %d)\n", f, out_sz2, dat, r, out_sz2); + } break; } @@ -893,14 +1308,64 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, 
cram_slice *s, cigar[ncigar++] = (cig_len<<4) + cig_op; cig_len = 0; } - if (!c->comp_hdr->BA_codec) return -1; - r |= c->comp_hdr->BA_codec->decode(s, c->comp_hdr->BA_codec, blk, - (char *)&seq[pos-1], &out_sz); + if (ds & CRAM_BA) { + if (!c->comp_hdr->codecs[DS_BA]) return -1; + r |= c->comp_hdr->codecs[DS_BA] + ->decode(s, c->comp_hdr->codecs[DS_BA], blk, + (char *)&seq[pos-1], &out_sz); + } cig_op = BAM_CINS; cig_len++; seq_pos++; nm++; - //printf(" %d: BA = %c (ret %d)\n", f, seq[pos-1], r); + break; + } + + case 'b': { // Several bases + int32_t len = 1; + + if (cig_len && cig_op != BAM_CMATCH) { + cigar[ncigar++] = (cig_len<<4) + cig_op; + cig_len = 0; + } + + if (ds & CRAM_BB) { + if (!c->comp_hdr->codecs[DS_BB]) return -1; + r |= c->comp_hdr->codecs[DS_BB] + ->decode(s, c->comp_hdr->codecs[DS_BB], blk, + (char *)&seq[pos-1], &len); + } + + cig_op = BAM_CMATCH; + + cig_len+=len; + seq_pos+=len; + ref_pos+=len; + //prev_pos+=len; + break; + } + + case 'q': { // Several quality values + int32_t len = 1; + + if (cig_len && cig_op != BAM_CMATCH) { + cigar[ncigar++] = (cig_len<<4) + cig_op; + cig_len = 0; + } + + if (ds & CRAM_QQ) { + if (!c->comp_hdr->codecs[DS_QQ]) return -1; + r |= c->comp_hdr->codecs[DS_QQ] + ->decode(s, c->comp_hdr->codecs[DS_QQ], blk, + (char *)&qual[pos-1], &len); + } + + cig_op = BAM_CMATCH; + + cig_len+=len; + seq_pos+=len; + ref_pos+=len; + //prev_pos+=len; break; } @@ -916,12 +1381,18 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, cig_len = 0; } #endif - if (!c->comp_hdr->BA_codec) return -1; - r |= c->comp_hdr->BA_codec->decode(s, c->comp_hdr->BA_codec, blk, - (char *)&seq[pos-1], &out_sz); - if (!c->comp_hdr->QS_codec) return -1; - r |= c->comp_hdr->QS_codec->decode(s, c->comp_hdr->QS_codec, blk, - (char *)&qual[pos-1], &out_sz); + if (ds & CRAM_BA) { + if (!c->comp_hdr->codecs[DS_BA]) return -1; + r |= c->comp_hdr->codecs[DS_BA] + ->decode(s, c->comp_hdr->codecs[DS_BA], blk, + (char *)&seq[pos-1], 
&out_sz); + } + if (ds & CRAM_QS) { + if (!c->comp_hdr->codecs[DS_QS]) return -1; + r |= c->comp_hdr->codecs[DS_QS] + ->decode(s, c->comp_hdr->codecs[DS_QS], blk, + (char *)&qual[pos-1], &out_sz); + } #ifdef USE_X cig_op = BAM_CBASE_MISMATCH; #else @@ -935,10 +1406,13 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, } case 'Q': { // Quality score; QS - if (!c->comp_hdr->QS_codec) return -1; - r |= c->comp_hdr->QS_codec->decode(s, c->comp_hdr->QS_codec, blk, - (char *)&qual[pos-1], &out_sz); - //printf(" %d: QS = %d (ret %d)\n", f, qc, r); + if (ds & CRAM_QS) { + if (!c->comp_hdr->codecs[DS_QS]) return -1; + r |= c->comp_hdr->codecs[DS_QS] + ->decode(s, c->comp_hdr->codecs[DS_QS], blk, + (char *)&qual[pos-1], &out_sz); + //printf(" %d: QS = %d (ret %d)\n", f, qc, r); + } break; } @@ -947,12 +1421,15 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, cigar[ncigar++] = (cig_len<<4) + cig_op; cig_len = 0; } - if (!c->comp_hdr->HC_codec) return -1; - r |= c->comp_hdr->HC_codec->decode(s, c->comp_hdr->HC_codec, blk, - (char *)&i32, &out_sz); - cig_op = BAM_CHARD_CLIP; - cig_len += i32; - nm += i32; + if (ds & CRAM_HC) { + if (!c->comp_hdr->codecs[DS_HC]) return -1; + r |= c->comp_hdr->codecs[DS_HC] + ->decode(s, c->comp_hdr->codecs[DS_HC], blk, + (char *)&i32, &out_sz); + cig_op = BAM_CHARD_CLIP; + cig_len += i32; + nm += i32; + } break; } @@ -961,12 +1438,15 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, cigar[ncigar++] = (cig_len<<4) + cig_op; cig_len = 0; } - if (!c->comp_hdr->PD_codec) return -1; - r |= c->comp_hdr->PD_codec->decode(s, c->comp_hdr->PD_codec, blk, - (char *)&i32, &out_sz); - cig_op = BAM_CPAD; - cig_len += i32; - nm += i32; + if (ds & CRAM_PD) { + if (!c->comp_hdr->codecs[DS_PD]) return -1; + r |= c->comp_hdr->codecs[DS_PD] + ->decode(s, c->comp_hdr->codecs[DS_PD], blk, + (char *)&i32, &out_sz); + cig_op = BAM_CPAD; + cig_len += i32; + nm += i32; + } break; } @@ 
-975,13 +1455,16 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, cigar[ncigar++] = (cig_len<<4) + cig_op; cig_len = 0; } - if (!c->comp_hdr->RS_codec) return -1; - r |= c->comp_hdr->RS_codec->decode(s, c->comp_hdr->RS_codec, blk, - (char *)&i32, &out_sz); - cig_op = BAM_CREF_SKIP; - cig_len += i32; - ref_pos += i32; - nm += i32; + if (ds & CRAM_RS) { + if (!c->comp_hdr->codecs[DS_RS]) return -1; + r |= c->comp_hdr->codecs[DS_RS] + ->decode(s, c->comp_hdr->codecs[DS_RS], blk, + (char *)&i32, &out_sz); + cig_op = BAM_CREF_SKIP; + cig_len += i32; + ref_pos += i32; + nm += i32; + } break; } @@ -990,8 +1473,11 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, } } + if (!(ds & CRAM_FC)) + goto skip_cigar; + /* An implement match op for any unaccounted for bases */ - if (cr->len >= seq_pos) { + if ((ds & CRAM_FN) && cr->len >= seq_pos) { if (s->ref) { if (ref_pos + cr->len - seq_pos + 1 > bfd->ref[cr->ref_id].len) { static int whinged = 0; @@ -1027,8 +1513,11 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, #endif cig_len += cr->len - seq_pos+1; } - if (decode_md) { - BLOCK_APPENDF_1(s->aux_blk, buf, "%d", md_dist); + + skip_cigar: + + if ((ds & CRAM_FN) && decode_md) { + BLOCK_APPEND_UINT(s->aux_blk, md_dist); } if (cig_len) { @@ -1047,16 +1536,24 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, //printf("2: %.*s %d .. 
%d\n", cr->name_len, DSTRING_STR(name_ds) + cr->name, cr->apos, ref_pos); - if (!c->comp_hdr->MQ_codec) return -1; - r |= c->comp_hdr->MQ_codec->decode(s, c->comp_hdr->MQ_codec, blk, - (char *)&cr->mqual, &out_sz); + if (ds & CRAM_MQ) { + if (!c->comp_hdr->codecs[DS_MQ]) return -1; + r |= c->comp_hdr->codecs[DS_MQ] + ->decode(s, c->comp_hdr->codecs[DS_MQ], blk, + (char *)&cr->mqual, &out_sz); + } else { + cr->mqual = 40; + } - if (cf & CRAM_FLAG_PRESERVE_QUAL_SCORES) { + if ((ds & CRAM_QS) && (cf & CRAM_FLAG_PRESERVE_QUAL_SCORES)) { int32_t out_sz2 = cr->len; - if (!c->comp_hdr->Qs_codec) return -1; - r |= c->comp_hdr->Qs_codec->decode(s, c->comp_hdr->Qs_codec, blk, - qual, &out_sz2); + if (ds & CRAM_QS) { + if (!c->comp_hdr->codecs[DS_QS]) return -1; + r |= c->comp_hdr->codecs[DS_QS] + ->decode(s, c->comp_hdr->codecs[DS_QS], blk, + qual, &out_sz2); + } } s->cigar = cigar; @@ -1100,9 +1597,9 @@ static int cram_decode_aux_1_0(cram_container *c, cram_slice *s, int i, r = 0, out_sz = 1; unsigned char ntags; - if (!c->comp_hdr->TC_codec) return -1; - r |= c->comp_hdr->TC_codec->decode(s, c->comp_hdr->TC_codec, blk, - (char *)&ntags, &out_sz); + if (!c->comp_hdr->codecs[DS_TC]) return -1; + r |= c->comp_hdr->codecs[DS_TC]->decode(s, c->comp_hdr->codecs[DS_TC], blk, + (char *)&ntags, &out_sz); cr->ntags = ntags; //printf("TC=%d\n", cr->ntags); @@ -1115,9 +1612,9 @@ static int cram_decode_aux_1_0(cram_container *c, cram_slice *s, cram_map *m; //printf("Tag %d/%d\n", i+1, cr->ntags); - if (!c->comp_hdr->TN_codec) return -1; - r |= c->comp_hdr->TN_codec->decode(s, c->comp_hdr->TN_codec, - blk, (char *)&id, &out_sz); + if (!c->comp_hdr->codecs[DS_TN]) return -1; + r |= c->comp_hdr->codecs[DS_TN]->decode(s, c->comp_hdr->codecs[DS_TN], + blk, (char *)&id, &out_sz); if (out_sz == 3) { tag_data[0] = ((char *)&id)[0]; tag_data[1] = ((char *)&id)[1]; @@ -1145,12 +1642,19 @@ static int cram_decode_aux_1_0(cram_container *c, cram_slice *s, static int cram_decode_aux(cram_container 
*c, cram_slice *s, cram_block *blk, cram_record *cr) { int i, r = 0, out_sz = 1; - int32_t TL; + int32_t TL = 0; unsigned char *TN; + uint32_t ds = c->comp_hdr->data_series; - if (!c->comp_hdr->TL_codec) return -1; - r |= c->comp_hdr->TL_codec->decode(s, c->comp_hdr->TL_codec, blk, - (char *)&TL, &out_sz); + if (!(ds & (CRAM_TL|CRAM_aux))) { + cr->aux = 0; + cr->aux_size = 0; + return 0; + } + + if (!c->comp_hdr->codecs[DS_TL]) return -1; + r |= c->comp_hdr->codecs[DS_TL]->decode(s, c->comp_hdr->codecs[DS_TL], blk, + (char *)&TL, &out_sz); if (r || TL < 0 || TL >= c->comp_hdr->nTL) return -1; @@ -1161,6 +1665,9 @@ static int cram_decode_aux(cram_container *c, cram_slice *s, cr->aux_size = 0; cr->aux = BLOCK_SIZE(s->aux_blk); + if (!(ds & CRAM_aux)) + return 0; + for (i = 0; i < cr->ntags; i++) { int32_t id, out_sz = 1; unsigned char tag_data[3]; @@ -1186,9 +1693,21 @@ static int cram_decode_aux(cram_container *c, cram_slice *s, } /* Resolve mate pair cross-references between recs within this slice */ -static void cram_decode_slice_xref(cram_slice *s) { +static void cram_decode_slice_xref(cram_slice *s, int required_fields) { int rec; + if (!(required_fields & (SAM_RNEXT | SAM_PNEXT | SAM_TLEN))) { + for (rec = 0; rec < s->hdr->num_records; rec++) { + cram_record *cr = &s->crecs[rec]; + + cr->tlen = 0; + cr->mate_pos = 0; + cr->mate_ref_id = -1; + } + + return; + } + for (rec = 0; rec < s->hdr->num_records; rec++) { cram_record *cr = &s->crecs[rec]; @@ -1213,9 +1732,14 @@ static void cram_decode_slice_xref(cram_slice *s) { int tlen; int ref = cr->ref_id; + // number of segments starting at the same point. 
+ int left_cnt = 0; + do { if (aleft > s->crecs[id2].apos) - aleft = s->crecs[id2].apos; + aleft = s->crecs[id2].apos, left_cnt = 1; + else if (aleft == s->crecs[id2].apos) + left_cnt++; if (aright < s->crecs[id2].aend) aright = s->crecs[id2].aend; if (s->crecs[id2].mate_line == -1) { @@ -1239,9 +1763,8 @@ static void cram_decode_slice_xref(cram_slice *s) { * bit flags instead, as a tie breaker. */ if (s->crecs[id2].apos == aleft) { - if (s->crecs[id2].aend != aright) - s->crecs[id2].tlen = tlen; - else if (s->crecs[id2].flags & BAM_FREAD1) + if (left_cnt == 1 || + (s->crecs[id2].flags & BAM_FREAD1)) s->crecs[id2].tlen = tlen; else s->crecs[id2].tlen = -tlen; @@ -1252,9 +1775,8 @@ static void cram_decode_slice_xref(cram_slice *s) { id2 = s->crecs[id2].mate_line; while (id2 != id1) { if (s->crecs[id2].apos == aleft) { - if (s->crecs[id2].aend != aright) - s->crecs[id2].tlen = tlen; - else if (s->crecs[id2].flags & BAM_FREAD1) + if (left_cnt == 1 || + (s->crecs[id2].flags & BAM_FREAD1)) s->crecs[id2].tlen = tlen; else s->crecs[id2].tlen = -tlen; @@ -1339,15 +1861,16 @@ int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s, unsigned char cf; int out_sz, r = 0; int rec; - char *seq, *qual; + char *seq = NULL, *qual = NULL; int unknown_rg = -1; - int id, embed_ref; + int embed_ref; char **refs = NULL; + uint32_t ds; - for (id = 0; id < s->hdr->num_blocks; id++) { - if (cram_uncompress_block(s->block[id])) - return -1; - } + if (cram_dependent_data_series(fd, c->comp_hdr, s) != 0) + return -1; + + ds = c->comp_hdr->data_series; blk->bit = 7; // MSB first @@ -1378,6 +1901,7 @@ int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s, if (!s->block_by_id || !(b = s->block_by_id[s->hdr->ref_base_id])) return -1; + cram_uncompress_block(b); s->ref = (char *)BLOCK_DATA(b); s->ref_start = s->hdr->ref_seq_start; s->ref_end = s->hdr->ref_seq_start + s->hdr->ref_seq_span-1; @@ -1386,10 +1910,11 @@ int cram_decode_slice(cram_fd *fd, cram_container *c, 
cram_slice *s, //s->ref = cram_get_ref(fd, s->hdr->ref_seq_id, 1, 0); //s->ref_start = 1; - s->ref = - cram_get_ref(fd, s->hdr->ref_seq_id, - s->hdr->ref_seq_start, - s->hdr->ref_seq_start + s->hdr->ref_seq_span -1); + if (fd->required_fields & SAM_SEQ) + s->ref = + cram_get_ref(fd, s->hdr->ref_seq_id, + s->hdr->ref_seq_start, + s->hdr->ref_seq_start + s->hdr->ref_seq_span -1); s->ref_start = s->hdr->ref_seq_start; s->ref_end = s->hdr->ref_seq_start + s->hdr->ref_seq_span-1; @@ -1400,7 +1925,8 @@ int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s, } pthread_mutex_lock(&fd->ref_lock); pthread_mutex_lock(&fd->refs->lock); - if (s->ref_end > fd->refs->ref_id[ref_id]->length) { + if ((fd->required_fields & SAM_SEQ) && + s->ref_end > fd->refs->ref_id[ref_id]->length) { fprintf(stderr, "Slice ends beyond reference end.\n"); s->ref_end = fd->refs->ref_id[ref_id]->length; } @@ -1409,14 +1935,17 @@ int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s, } } - if (s->ref == NULL && s->hdr->ref_seq_id >= 0 && !fd->no_ref) { + if ((fd->required_fields & SAM_SEQ) && + s->ref == NULL && s->hdr->ref_seq_id >= 0 && !fd->no_ref) { fprintf(stderr, "Unable to fetch reference #%d %d..%d\n", s->hdr->ref_seq_id, s->hdr->ref_seq_start, s->hdr->ref_seq_start + s->hdr->ref_seq_span-1); return -1; } - if (fd->version != CRAM_1_VERS && s->hdr->ref_seq_id >= 0 + if (CRAM_MAJOR_VERS(fd->version) != 1 + && (fd->required_fields & SAM_SEQ) + && s->hdr->ref_seq_id >= 0 && !fd->ignore_md5 && memcmp(s->hdr->md5, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0", 16)) { MD5_CTX md5; @@ -1483,67 +2012,96 @@ int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s, cr->s = s; out_sz = 1; /* decode 1 item */ - if (!c->comp_hdr->BF_codec) return -1; - r |= c->comp_hdr->BF_codec->decode(s, c->comp_hdr->BF_codec, blk, - (char *)&bf, &out_sz); - if (bf < 0 || - bf >= sizeof(fd->bam_flag_swap)/sizeof(*fd->bam_flag_swap)) - return -1; - bf = fd->bam_flag_swap[bf]; - cr->flags = bf; - 
- if (fd->version == CRAM_1_VERS) { - /* CF is byte in 1.0, int32 in 2.0 */ - if (!c->comp_hdr->CF_codec) return -1; - r |= c->comp_hdr->CF_codec->decode(s, c->comp_hdr->CF_codec, blk, - (char *)&cf, &out_sz); - cr->cram_flags = cf; + if (ds & CRAM_BF) { + if (!c->comp_hdr->codecs[DS_BF]) return -1; + r |= c->comp_hdr->codecs[DS_BF] + ->decode(s, c->comp_hdr->codecs[DS_BF], blk, + (char *)&bf, &out_sz); + if (bf < 0 || + bf >= sizeof(fd->bam_flag_swap)/sizeof(*fd->bam_flag_swap)) + return -1; + bf = fd->bam_flag_swap[bf]; + cr->flags = bf; } else { - if (!c->comp_hdr->CF_codec) return -1; - r |= c->comp_hdr->CF_codec->decode(s, c->comp_hdr->CF_codec, blk, - (char *)&cr->cram_flags, - &out_sz); - cf = cr->cram_flags; + cr->flags = bf = 0x4; // unmapped + } + + if (ds & CRAM_CF) { + if (CRAM_MAJOR_VERS(fd->version) == 1) { + /* CF is byte in 1.0, int32 in 2.0 */ + if (!c->comp_hdr->codecs[DS_CF]) return -1; + r |= c->comp_hdr->codecs[DS_CF] + ->decode(s, c->comp_hdr->codecs[DS_CF], blk, + (char *)&cf, &out_sz); + cr->cram_flags = cf; + } else { + if (!c->comp_hdr->codecs[DS_CF]) return -1; + r |= c->comp_hdr->codecs[DS_CF] + ->decode(s, c->comp_hdr->codecs[DS_CF], blk, + (char *)&cr->cram_flags, + &out_sz); + cf = cr->cram_flags; + } } - if (fd->version != CRAM_1_VERS && ref_id == -2) { - if (!c->comp_hdr->RI_codec) return -1; - r |= c->comp_hdr->RI_codec->decode(s, c->comp_hdr->RI_codec, blk, - (char *)&cr->ref_id, &out_sz); - if (cr->ref_id >= 0) { - if (!fd->no_ref) { - if (!refs[cr->ref_id]) - refs[cr->ref_id] = cram_get_ref(fd, cr->ref_id, 1, 0); - s->ref = refs[cr->ref_id]; + if (CRAM_MAJOR_VERS(fd->version) != 1 && ref_id == -2) { + if (ds & CRAM_RI) { + if (!c->comp_hdr->codecs[DS_RI]) return -1; + r |= c->comp_hdr->codecs[DS_RI] + ->decode(s, c->comp_hdr->codecs[DS_RI], blk, + (char *)&cr->ref_id, &out_sz); + if ((fd->required_fields & (SAM_SEQ|SAM_TLEN)) + && cr->ref_id >= 0) { + if (!fd->no_ref) { + if (!refs[cr->ref_id]) + refs[cr->ref_id] = 
cram_get_ref(fd, cr->ref_id, + 1, 0); + s->ref = refs[cr->ref_id]; + } + s->ref_start = 1; + pthread_mutex_lock(&fd->ref_lock); + pthread_mutex_lock(&fd->refs->lock); + s->ref_end = fd->refs->ref_id[cr->ref_id]->length; + pthread_mutex_unlock(&fd->refs->lock); + pthread_mutex_unlock(&fd->ref_lock); } - s->ref_start = 1; - pthread_mutex_lock(&fd->ref_lock); - pthread_mutex_lock(&fd->refs->lock); - s->ref_end = fd->refs->ref_id[cr->ref_id]->length; - pthread_mutex_unlock(&fd->refs->lock); - pthread_mutex_unlock(&fd->ref_lock); + } else { + cr->ref_id = 0; } } else { cr->ref_id = ref_id; // Forced constant in CRAM 1.0 } - if (!c->comp_hdr->RL_codec) return -1; - r |= c->comp_hdr->RL_codec->decode(s, c->comp_hdr->RL_codec, blk, - (char *)&cr->len, &out_sz); + if (ds & CRAM_RL) { + if (!c->comp_hdr->codecs[DS_RL]) return -1; + r |= c->comp_hdr->codecs[DS_RL] + ->decode(s, c->comp_hdr->codecs[DS_RL], blk, + (char *)&cr->len, &out_sz); + } - if (!c->comp_hdr->AP_codec) return -1; - r |= c->comp_hdr->AP_codec->decode(s, c->comp_hdr->AP_codec, blk, - (char *)&cr->apos, &out_sz); - if (c->comp_hdr->AP_delta) - cr->apos += s->last_apos; - s->last_apos= cr->apos; + if (ds & CRAM_AP) { + if (!c->comp_hdr->codecs[DS_AP]) return -1; + r |= c->comp_hdr->codecs[DS_AP] + ->decode(s, c->comp_hdr->codecs[DS_AP], blk, + (char *)&cr->apos, &out_sz); + if (c->comp_hdr->AP_delta) + cr->apos += s->last_apos; + s->last_apos= cr->apos; + } else { + cr->apos = c->ref_seq_start; + } - if (!c->comp_hdr->RG_codec) return -1; - r |= c->comp_hdr->RG_codec->decode(s, c->comp_hdr->RG_codec, blk, - (char *)&cr->rg, &out_sz); - if (cr->rg == unknown_rg) + if (ds & CRAM_RG) { + if (!c->comp_hdr->codecs[DS_RG]) return -1; + r |= c->comp_hdr->codecs[DS_RG] + ->decode(s, c->comp_hdr->codecs[DS_RG], blk, + (char *)&cr->rg, &out_sz); + if (cr->rg == unknown_rg) + cr->rg = -1; + } else { cr->rg = -1; + } cr->name_len = 0; @@ -1552,28 +2110,38 @@ int cram_decode_slice(cram_fd *fd, cram_container *c, 
cram_slice *s, // Read directly into name cram_block cr->name = BLOCK_SIZE(s->name_blk); - if (!c->comp_hdr->RN_codec) return -1; - r |= c->comp_hdr->RN_codec->decode(s, c->comp_hdr->RN_codec, blk, - (char *)s->name_blk, &out_sz2); - cr->name_len = out_sz2; + if (ds & CRAM_RN) { + if (!c->comp_hdr->codecs[DS_RN]) return -1; + r |= c->comp_hdr->codecs[DS_RN] + ->decode(s, c->comp_hdr->codecs[DS_RN], blk, + (char *)s->name_blk, &out_sz2); + cr->name_len = out_sz2; + } } + cr->mate_pos = 0; cr->mate_line = -1; cr->mate_ref_id = -1; - if (cf & CRAM_FLAG_DETACHED) { - if (fd->version == CRAM_1_VERS) { - /* MF is byte in 1.0, int32 in 2.0 */ - unsigned char mf; - if (!c->comp_hdr->MF_codec) return -1; - r |= c->comp_hdr->MF_codec->decode(s, c->comp_hdr->MF_codec, - blk, (char *)&mf, &out_sz); - cr->mate_flags = mf; + if ((ds & CRAM_CF) && (cf & CRAM_FLAG_DETACHED)) { + if (ds & CRAM_MF) { + if (CRAM_MAJOR_VERS(fd->version) == 1) { + /* MF is byte in 1.0, int32 in 2.0 */ + unsigned char mf; + if (!c->comp_hdr->codecs[DS_MF]) return -1; + r |= c->comp_hdr->codecs[DS_MF] + ->decode(s, c->comp_hdr->codecs[DS_MF], + blk, (char *)&mf, &out_sz); + cr->mate_flags = mf; + } else { + if (!c->comp_hdr->codecs[DS_MF]) return -1; + r |= c->comp_hdr->codecs[DS_MF] + ->decode(s, c->comp_hdr->codecs[DS_MF], + blk, + (char *)&cr->mate_flags, + &out_sz); + } } else { - if (!c->comp_hdr->MF_codec) return -1; - r |= c->comp_hdr->MF_codec->decode(s, c->comp_hdr->MF_codec, - blk, - (char *)&cr->mate_flags, - &out_sz); + cr->mate_flags = 0; } if (!c->comp_hdr->read_names_included) { @@ -1581,16 +2149,22 @@ int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s, // Read directly into name cram_block cr->name = BLOCK_SIZE(s->name_blk); - if (!c->comp_hdr->RN_codec) return -1; - r |= c->comp_hdr->RN_codec->decode(s, c->comp_hdr->RN_codec, - blk, (char *)s->name_blk, - &out_sz2); - cr->name_len = out_sz2; + if (ds & CRAM_RN) { + if (!c->comp_hdr->codecs[DS_RN]) return -1; + r |= 
c->comp_hdr->codecs[DS_RN] + ->decode(s, c->comp_hdr->codecs[DS_RN], + blk, (char *)s->name_blk, + &out_sz2); + cr->name_len = out_sz2; + } } - if (!c->comp_hdr->NS_codec) return -1; - r |= c->comp_hdr->NS_codec->decode(s, c->comp_hdr->NS_codec, blk, - (char *)&cr->mate_ref_id, &out_sz); + if (ds & CRAM_NS) { + if (!c->comp_hdr->codecs[DS_NS]) return -1; + r |= c->comp_hdr->codecs[DS_NS] + ->decode(s, c->comp_hdr->codecs[DS_NS], blk, + (char *)&cr->mate_ref_id, &out_sz); + } // Skip as mate_ref of "*" is legit. It doesn't mean unmapped, just unknown. // if (cr->mate_ref_id == -1 && cr->flags & 0x01) { @@ -1598,25 +2172,40 @@ int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s, // cr->flags |= BAM_FMUNMAP; // } - if (!c->comp_hdr->NP_codec) return -1; - r |= c->comp_hdr->NP_codec->decode(s, c->comp_hdr->NP_codec, blk, - (char *)&cr->mate_pos, &out_sz); - if (!c->comp_hdr->TS_codec) return -1; - r |= c->comp_hdr->TS_codec->decode(s, c->comp_hdr->TS_codec, blk, - (char *)&cr->tlen, &out_sz); - } else if (cf & CRAM_FLAG_MATE_DOWNSTREAM) { - if (!c->comp_hdr->NF_codec) return -1; - r |= c->comp_hdr->NF_codec->decode(s, c->comp_hdr->NF_codec, blk, - (char *)&cr->mate_line, &out_sz); - cr->mate_line += rec + 1; - - //cr->name_len = sprintf(name, "%d", name_id++); - //cr->name = DSTRING_LEN(name_ds); - //dstring_nappend(name_ds, name, cr->name_len); + if (ds & CRAM_NP) { + if (!c->comp_hdr->codecs[DS_NP]) return -1; + r |= c->comp_hdr->codecs[DS_NP] + ->decode(s, c->comp_hdr->codecs[DS_NP], blk, + (char *)&cr->mate_pos, &out_sz); + } - cr->mate_ref_id = -1; - cr->tlen = INT_MIN; - cr->mate_pos = 0; + if (ds & CRAM_TS) { + if (!c->comp_hdr->codecs[DS_TS]) return -1; + r |= c->comp_hdr->codecs[DS_TS] + ->decode(s, c->comp_hdr->codecs[DS_TS], blk, + (char *)&cr->tlen, &out_sz); + } else { + cr->tlen = INT_MIN; + } + } else if ((ds & CRAM_CF) && (cf & CRAM_FLAG_MATE_DOWNSTREAM)) { + if (ds & CRAM_NF) { + if (!c->comp_hdr->codecs[DS_NF]) return -1; + r |= 
c->comp_hdr->codecs[DS_NF] + ->decode(s, c->comp_hdr->codecs[DS_NF], blk, + (char *)&cr->mate_line, &out_sz); + cr->mate_line += rec + 1; + + //cr->name_len = sprintf(name, "%d", name_id++); + //cr->name = DSTRING_LEN(name_ds); + //dstring_nappend(name_ds, name, cr->name_len); + + cr->mate_ref_id = -1; + cr->tlen = INT_MIN; + cr->mate_pos = 0; + } else { + cr->mate_flags = 0; + cr->tlen = INT_MIN; + } } else { cr->mate_flags = 0; cr->tlen = INT_MIN; @@ -1635,31 +2224,40 @@ int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s, */ /* Auxiliary tags */ - if (fd->version == CRAM_1_VERS) + if (CRAM_MAJOR_VERS(fd->version) == 1) r |= cram_decode_aux_1_0(c, s, blk, cr); else r |= cram_decode_aux(c, s, blk, cr); /* Fake up dynamic string growth and appending */ - cr->seq = BLOCK_SIZE(s->seqs_blk); - BLOCK_GROW(s->seqs_blk, cr->len); - seq = (char *)BLOCK_END(s->seqs_blk); - BLOCK_SIZE(s->seqs_blk) += cr->len; + if (ds & CRAM_RL) { + cr->seq = BLOCK_SIZE(s->seqs_blk); + BLOCK_GROW(s->seqs_blk, cr->len); + seq = (char *)BLOCK_END(s->seqs_blk); + BLOCK_SIZE(s->seqs_blk) += cr->len; - if (!seq) - return -1; + if (!seq) + return -1; - cr->qual = BLOCK_SIZE(s->qual_blk); - BLOCK_GROW(s->qual_blk, cr->len); - qual = (char *)BLOCK_END(s->qual_blk); - BLOCK_SIZE(s->qual_blk) += cr->len; + cr->qual = BLOCK_SIZE(s->qual_blk); + BLOCK_GROW(s->qual_blk, cr->len); + qual = (char *)BLOCK_END(s->qual_blk); + BLOCK_SIZE(s->qual_blk) += cr->len; - if (!s->ref) - memset(seq, '=', cr->len); + if (!s->ref) + memset(seq, '=', cr->len); + } if (!(bf & BAM_FUNMAP)) { /* Decode sequence and generate CIGAR */ - r |= cram_decode_seq(fd, c, s, blk, cr, bfd, cf, seq, qual); + if (ds & (CRAM_SEQ | CRAM_MQ)) { + r |= cram_decode_seq(fd, c, s, blk, cr, bfd, cf, seq, qual); + } else { + cr->cigar = 0; + cr->ncigar = 0; + cr->aend = cr->apos; + cr->mqual = 0; + } } else { int out_sz2 = cr->len; @@ -1669,17 +2267,24 @@ int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s, 
cr->aend = cr->apos; cr->mqual = 0; - if (!c->comp_hdr->BA_codec) return -1; - r |= c->comp_hdr->BA_codec->decode(s, c->comp_hdr->BA_codec, blk, - (char *)seq, &out_sz2); + if (ds & CRAM_BA) { + if (!c->comp_hdr->codecs[DS_BA]) return -1; + r |= c->comp_hdr->codecs[DS_BA] + ->decode(s, c->comp_hdr->codecs[DS_BA], blk, + (char *)seq, &out_sz2); + } - if (cf & CRAM_FLAG_PRESERVE_QUAL_SCORES) { + if ((ds & CRAM_CF) && (cf & CRAM_FLAG_PRESERVE_QUAL_SCORES)) { out_sz2 = cr->len; - if (!c->comp_hdr->Qs_codec) return -1; - r |= c->comp_hdr->Qs_codec->decode(s, c->comp_hdr->Qs_codec, - blk, qual, &out_sz2); + if (ds & CRAM_QS) { + if (!c->comp_hdr->codecs[DS_QS]) return -1; + r |= c->comp_hdr->codecs[DS_QS] + ->decode(s, c->comp_hdr->codecs[DS_QS], + blk, qual, &out_sz2); + } } else { - memset(qual, 30, cr->len); + if (ds & CRAM_RL) + memset(qual, 30, cr->len); } } } @@ -1698,7 +2303,7 @@ int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s, pthread_mutex_unlock(&fd->ref_lock); /* Resolve mate pair cross-references between recs within this slice */ - cram_decode_slice_xref(s); + cram_decode_slice_xref(s, fd->required_fields); return r; } @@ -1738,7 +2343,7 @@ int cram_decode_slice_mt(cram_fd *fd, cram_container *c, cram_slice *s, j->s = s; j->h = bfd; - nonblock = t_pool_results_queue_len(fd->rqueue) ? 0 : 1; + nonblock = t_pool_results_queue_sz(fd->rqueue) ? 
1 : 0; if (-1 == t_pool_dispatch2(fd->pool, fd->rqueue, cram_decode_slice_thread, j, nonblock)) { @@ -1776,21 +2381,33 @@ static int cram_to_bam(SAM_hdr *bfd, cram_fd *fd, cram_slice *s, char name_a[1024], *name; int name_len; char *aux, *aux_orig; + char *seq, *qual; /* Assign names if not explicitly set */ - if (cr->name_len) { - name = (char *)BLOCK_DATA(s->name_blk) + cr->name; - name_len = cr->name_len; + if (fd->required_fields & SAM_QNAME) { + if (cr->name_len) { + name = (char *)BLOCK_DATA(s->name_blk) + cr->name; + name_len = cr->name_len; + } else { + name = name_a; + name_len = strlen(fd->prefix); + memcpy(name, fd->prefix, name_len); + name += name_len; + *name++ = ':'; + if (cr->mate_line >= 0 && cr->mate_line < rec) + name = (char *)append_uint64((unsigned char *)name, + s->hdr->record_counter + + cr->mate_line + 1); + else + name = (char *)append_uint64((unsigned char *)name, + s->hdr->record_counter + + rec + 1); + name_len = name - name_a; + name = name_a; + } } else { - // FIXME: add prefix, container number, slice number, etc - name = name_a; - - if (cr->mate_line >= 0 && cr->mate_line < rec) - name_len = sprintf(name_a, "%s:%"PRId64":%d", - fd->prefix, s->id, cr->mate_line); - else - name_len = sprintf(name_a, "%s:%"PRId64":%d", - fd->prefix, s->id, rec); + name = "?"; + name_len = 1; } /* Generate BAM record */ @@ -1798,10 +2415,23 @@ static int cram_to_bam(SAM_hdr *bfd, cram_fd *fd, cram_slice *s, return -1; rg_len = (cr->rg != -1) ? 
bfd->rg[cr->rg].name_len + 4 : 0; - if (!BLOCK_DATA(s->seqs_blk)) - return -1; - if (!BLOCK_DATA(s->qual_blk)) - return -1; + if (fd->required_fields & (SAM_SEQ | SAM_QUAL)) { + if (!BLOCK_DATA(s->seqs_blk)) + return -1; + seq = (char *)BLOCK_DATA(s->seqs_blk) + cr->seq; + } else { + seq = "*"; + cr->len = 1; + } + + + if (fd->required_fields & SAM_QUAL) { + if (!BLOCK_DATA(s->qual_blk)) + return -1; + qual = (char *)BLOCK_DATA(s->qual_blk) + cr->qual; + } else { + qual = NULL; + } bam_idx = bam_construct_seq(bam, cr->aux_size + rg_len, name, name_len, @@ -1815,8 +2445,8 @@ static int cram_to_bam(SAM_hdr *bfd, cram_fd *fd, cram_slice *s, cr->mate_pos, cr->tlen, cr->len, - (char *)BLOCK_DATA(s->seqs_blk) + cr->seq, - (char *)BLOCK_DATA(s->qual_blk) + cr->qual); + seq, + qual); if (bam_idx == -1) return -1; @@ -1837,12 +2467,6 @@ static int cram_to_bam(SAM_hdr *bfd, cram_fd *fd, cram_slice *s, *aux++ = 0; } -#ifndef SAMTOOLS - bam_set_blk_size(*bam, bam_blk_size(*bam) + (aux - aux_orig)); -#endif - - *aux++ = 0; - return bam_idx + (aux - aux_orig); } @@ -1853,8 +2477,6 @@ static cram_slice *cram_next_slice(cram_fd *fd, cram_container **cp) { cram_container *c; cram_slice *s = NULL; - fd->eof = 0; - if (!(c = fd->ctr)) { // Load first container. 
do { @@ -1899,8 +2521,11 @@ static cram_slice *cram_next_slice(cram_fd *fd, cram_container **cp) { } } - if ((s = c->slice)) + if ((s = c->slice)) { + c->slice = NULL; cram_free_slice(s); + s = NULL; + } if (c->curr_slice == c->max_slice) { cram_free_container(c); @@ -1934,14 +2559,22 @@ static cram_slice *cram_next_slice(cram_fd *fd, cram_container **cp) { /* Skip containers not yet spanning our range */ if (fd->range.refid != -2 && c->ref_seq_id != -2) { + fd->required_fields |= SAM_POS; + if (c->ref_seq_id != fd->range.refid) { + cram_free_container(c); + fd->ctr = NULL; + fd->ooc = 1; fd->eof = 1; - return NULL; + break; } if (c->ref_seq_start > fd->range.end) { + cram_free_container(c); + fd->ctr = NULL; + fd->ooc = 1; fd->eof = 1; - return NULL; + break; } if (c->ref_seq_start + c->ref_seq_span-1 < @@ -2028,7 +2661,9 @@ static cram_slice *cram_next_slice(cram_fd *fd, cram_container **cp) { if (!fd->pool || fd->job_pending) break; - if (t_pool_results_queue_sz(fd->rqueue) > fd->pool->qsize) + // Push it a bit far, to qsize in queue rather than pending arrival, + // as cram tends to be a bit bursty in decode timings. + if (t_pool_results_queue_len(fd->rqueue) > fd->pool->qsize) break; } @@ -2054,6 +2689,8 @@ static cram_slice *cram_next_slice(cram_fd *fd, cram_container **cp) { c = j->c; s = j->s; + fd->ctr = c; + t_pool_delete_result(res, 1); } diff --git a/htslib/cram/cram_encode.c b/htslib/cram/cram_encode.c index 94c2cebe..8057e9c8 100644 --- a/htslib/cram/cram_encode.c +++ b/htslib/cram/cram_encode.c @@ -47,25 +47,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "cram/os.h" #include "cram/md5.h" -#ifdef SAMTOOLS -# define bam_copy(dst, src) bam_copy1(*(dst), (src)) -#else -void bam_copy(bam_seq_t **bt, bam_seq_t *bf) { - size_t a; - - if (bf->alloc > (*bt)->alloc) { - a = ((int)((bf->alloc+15)/16))*16; - *bt = realloc(*bt, a); - memcpy(*bt, bf, bf->alloc); - } else { - a = (*bt)->alloc; - memcpy(*bt, bf, bf->alloc); - } - - (*bt)->alloc = a; -} -#endif - #define Z_CRAM_STRAT Z_FILTERED //#define Z_CRAM_STRAT Z_RLE //#define Z_CRAM_STRAT Z_HUFFMAN_ONLY @@ -111,7 +92,7 @@ cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c, */ // Duplicated from container itself, and removed in 1.1 - if (fd->version == CRAM_1_VERS) { + if (CRAM_MAJOR_VERS(fd->version) == 1) { itf8_put_blk(cb, h->ref_seq_id); itf8_put_blk(cb, h->ref_seq_start); itf8_put_blk(cb, h->ref_seq_span); @@ -135,7 +116,7 @@ cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c, if (-1 == r) return NULL; kh_val(h->preservation_map, k).i = 1; - if (fd->version == CRAM_1_VERS) { + if (CRAM_MAJOR_VERS(fd->version) == 1) { k = kh_put(map, h->preservation_map, "PI", &r); if (-1 == r) return NULL; kh_val(h->preservation_map, k).i = 0; @@ -269,160 +250,197 @@ cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c, /* rec encoding map */ mc = 0; BLOCK_SIZE(map) = 0; - if (h->BF_codec) { - if (-1 == h->BF_codec->store(h->BF_codec, map, "BF", fd->version)) + if (h->codecs[DS_BF]) { + if (-1 == h->codecs[DS_BF]->store(h->codecs[DS_BF], map, "BF", + fd->version)) + return NULL; + mc++; + } + if (h->codecs[DS_CF]) { + if (-1 == h->codecs[DS_CF]->store(h->codecs[DS_CF], map, "CF", + fd->version)) return NULL; mc++; } - if (h->CF_codec) { - if (-1 == h->CF_codec->store(h->CF_codec, map, "CF", fd->version)) + if (h->codecs[DS_RL]) { + if (-1 == h->codecs[DS_RL]->store(h->codecs[DS_RL], map, "RL", + fd->version)) return NULL; mc++; } - if (h->RL_codec) { - if (-1 == h->RL_codec->store(h->RL_codec, map, "RL", 
fd->version)) + if (h->codecs[DS_AP]) { + if (-1 == h->codecs[DS_AP]->store(h->codecs[DS_AP], map, "AP", + fd->version)) return NULL; mc++; } - if (h->AP_codec) { - if (-1 == h->AP_codec->store(h->AP_codec, map, "AP", fd->version)) + if (h->codecs[DS_RG]) { + if (-1 == h->codecs[DS_RG]->store(h->codecs[DS_RG], map, "RG", + fd->version)) return NULL; mc++; } - if (h->RG_codec) { - if (-1 == h->RG_codec->store(h->RG_codec, map, "RG", fd->version)) + if (h->codecs[DS_MF]) { + if (-1 == h->codecs[DS_MF]->store(h->codecs[DS_MF], map, "MF", + fd->version)) return NULL; mc++; } - if (h->MF_codec) { - if (-1 == h->MF_codec->store(h->MF_codec, map, "MF", fd->version)) + if (h->codecs[DS_NS]) { + if (-1 == h->codecs[DS_NS]->store(h->codecs[DS_NS], map, "NS", + fd->version)) return NULL; mc++; } - if (h->NS_codec) { - if (-1 == h->NS_codec->store(h->NS_codec, map, "NS", fd->version)) + if (h->codecs[DS_NP]) { + if (-1 == h->codecs[DS_NP]->store(h->codecs[DS_NP], map, "NP", + fd->version)) return NULL; mc++; } - if (h->NP_codec) { - if (-1 == h->NP_codec->store(h->NP_codec, map, "NP", fd->version)) + if (h->codecs[DS_TS]) { + if (-1 == h->codecs[DS_TS]->store(h->codecs[DS_TS], map, "TS", + fd->version)) return NULL; mc++; } - if (h->TS_codec) { - if (-1 == h->TS_codec->store(h->TS_codec, map, "TS", fd->version)) + if (h->codecs[DS_NF]) { + if (-1 == h->codecs[DS_NF]->store(h->codecs[DS_NF], map, "NF", + fd->version)) return NULL; mc++; } - if (h->NF_codec) { - if (-1 == h->NF_codec->store(h->NF_codec, map, "NF", fd->version)) + if (h->codecs[DS_TC]) { + if (-1 == h->codecs[DS_TC]->store(h->codecs[DS_TC], map, "TC", + fd->version)) return NULL; mc++; } - if (h->TC_codec) { - if (-1 == h->TC_codec->store(h->TC_codec, map, "TC", fd->version)) + if (h->codecs[DS_TN]) { + if (-1 == h->codecs[DS_TN]->store(h->codecs[DS_TN], map, "TN", + fd->version)) return NULL; mc++; } - if (h->TN_codec) { - if (-1 == h->TN_codec->store(h->TN_codec, map, "TN", fd->version)) + if (h->codecs[DS_TL]) 
{ + if (-1 == h->codecs[DS_TL]->store(h->codecs[DS_TL], map, "TL", + fd->version)) return NULL; mc++; } - if (h->TL_codec) { - if (-1 == h->TL_codec->store(h->TL_codec, map, "TL", fd->version)) + if (h->codecs[DS_FN]) { + if (-1 == h->codecs[DS_FN]->store(h->codecs[DS_FN], map, "FN", + fd->version)) return NULL; mc++; } - if (h->FN_codec) { - if (-1 == h->FN_codec->store(h->FN_codec, map, "FN", fd->version)) + if (h->codecs[DS_FC]) { + if (-1 == h->codecs[DS_FC]->store(h->codecs[DS_FC], map, "FC", + fd->version)) return NULL; mc++; } - if (h->FC_codec) { - if (-1 == h->FC_codec->store(h->FC_codec, map, "FC", fd->version)) + if (h->codecs[DS_FP]) { + if (-1 == h->codecs[DS_FP]->store(h->codecs[DS_FP], map, "FP", + fd->version)) return NULL; mc++; } - if (h->FP_codec) { - if (-1 == h->FP_codec->store(h->FP_codec, map, "FP", fd->version)) + if (h->codecs[DS_BS]) { + if (-1 == h->codecs[DS_BS]->store(h->codecs[DS_BS], map, "BS", + fd->version)) return NULL; mc++; } - if (h->BS_codec) { - if (-1 == h->BS_codec->store(h->BS_codec, map, "BS", fd->version)) + if (h->codecs[DS_IN]) { + if (-1 == h->codecs[DS_IN]->store(h->codecs[DS_IN], map, "IN", + fd->version)) return NULL; mc++; } - if (h->IN_codec) { - if (-1 == h->IN_codec->store(h->IN_codec, map, "IN", fd->version)) + if (h->codecs[DS_DL]) { + if (-1 == h->codecs[DS_DL]->store(h->codecs[DS_DL], map, "DL", + fd->version)) return NULL; mc++; } - if (h->DL_codec) { - if (-1 == h->DL_codec->store(h->DL_codec, map, "DL", fd->version)) + if (h->codecs[DS_BA]) { + if (-1 == h->codecs[DS_BA]->store(h->codecs[DS_BA], map, "BA", + fd->version)) return NULL; mc++; } - if (h->BA_codec) { - if (-1 == h->BA_codec->store(h->BA_codec, map, "BA", fd->version)) + if (h->codecs[DS_BB]) { + if (-1 == h->codecs[DS_BB]->store(h->codecs[DS_BB], map, "BB", + fd->version)) return NULL; mc++; } - if (h->MQ_codec) { - if (-1 == h->MQ_codec->store(h->MQ_codec, map, "MQ", fd->version)) + if (h->codecs[DS_MQ]) { + if (-1 == 
h->codecs[DS_MQ]->store(h->codecs[DS_MQ], map, "MQ", + fd->version)) return NULL; mc++; } - if (h->RN_codec) { - if (-1 == h->RN_codec->store(h->RN_codec, map, "RN", fd->version)) + if (h->codecs[DS_RN]) { + if (-1 == h->codecs[DS_RN]->store(h->codecs[DS_RN], map, "RN", + fd->version)) return NULL; mc++; } - if (h->QS_codec) { - if (-1 == h->QS_codec->store(h->QS_codec, map, "QS", fd->version)) + if (h->codecs[DS_QS]) { + if (-1 == h->codecs[DS_QS]->store(h->codecs[DS_QS], map, "QS", + fd->version)) return NULL; mc++; } - if (h->Qs_codec) { - if (-1 == h->Qs_codec->store(h->Qs_codec, map, "Qs", fd->version)) + if (h->codecs[DS_QQ]) { + if (-1 == h->codecs[DS_QQ]->store(h->codecs[DS_QQ], map, "QQ", + fd->version)) return NULL; mc++; } - if (h->RI_codec) { - if (-1 == h->RI_codec->store(h->RI_codec, map, "RI", fd->version)) + if (h->codecs[DS_RI]) { + if (-1 == h->codecs[DS_RI]->store(h->codecs[DS_RI], map, "RI", + fd->version)) return NULL; mc++; } - if (fd->version != CRAM_1_VERS) { - if (h->SC_codec) { - if (-1 == h->SC_codec->store(h->SC_codec, map, "SC", fd->version)) + if (CRAM_MAJOR_VERS(fd->version) != 1) { + if (h->codecs[DS_SC]) { + if (-1 == h->codecs[DS_SC]->store(h->codecs[DS_SC], map, "SC", + fd->version)) return NULL; mc++; } - if (h->RS_codec) { - if (-1 == h->RS_codec->store(h->RS_codec, map, "RS", fd->version)) + if (h->codecs[DS_RS]) { + if (-1 == h->codecs[DS_RS]->store(h->codecs[DS_RS], map, "RS", + fd->version)) return NULL; mc++; } - if (h->PD_codec) { - if (-1 == h->PD_codec->store(h->PD_codec, map, "PD", fd->version)) + if (h->codecs[DS_PD]) { + if (-1 == h->codecs[DS_PD]->store(h->codecs[DS_PD], map, "PD", + fd->version)) return NULL; mc++; } - if (h->HC_codec) { - if (-1 == h->HC_codec->store(h->HC_codec, map, "HC", fd->version)) + if (h->codecs[DS_HC]) { + if (-1 == h->codecs[DS_HC]->store(h->codecs[DS_HC], map, "HC", + fd->version)) return NULL; mc++; } } - if (h->TM_codec) { - if (-1 == h->TM_codec->store(h->TM_codec, map, "TM", 
fd->version)) + if (h->codecs[DS_TM]) { + if (-1 == h->codecs[DS_TM]->store(h->codecs[DS_TM], map, "TM", + fd->version)) return NULL; mc++; } - if (h->TV_codec) { - if (-1 == h->TV_codec->store(h->TV_codec, map, "TV", fd->version)) + if (h->codecs[DS_TV]) { + if (-1 == h->codecs[DS_TV]->store(h->codecs[DS_TV], map, "TV", + fd->version)) return NULL; mc++; } @@ -458,7 +476,10 @@ cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c, if (c->tags_used) { khint_t k; +#define TAG_ID(a) ((#a[0]<<8)+#a[1]) + for (k = kh_begin(c->tags_used); k != kh_end(c->tags_used); k++) { + int key; if (!kh_exist(c->tags_used, k)) continue; @@ -466,23 +487,73 @@ cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c, itf8_put_blk(map, kh_key(c->tags_used, k)); // use block content id 4 - switch(kh_key(c->tags_used, k) & 0xff) { + switch((key = kh_key(c->tags_used, k)) & 0xff) { case 'Z': case 'H': // string as byte_array_stop - if (fd->version == CRAM_1_VERS) { + if (CRAM_MAJOR_VERS(fd->version) == 1) { BLOCK_APPEND(map, "\005" // BYTE_ARRAY_STOP "\005" // len "\t" // stop-byte is also SAM separator - CRAM_EXT_TAG_S "\000\000\000", + DS_aux_S "\000\000\000", 7); } else { - BLOCK_APPEND(map, - "\005" // BYTE_ARRAY_STOP - "\002" // len - "\t" // stop-byte is also SAM separator - CRAM_EXT_TAG_S, - 4); + if (key>>8 == TAG_ID(OQ)) + BLOCK_APPEND(map, + "\005" // BYTE_ARRAY_STOP + "\002" // len + "\t" // stop-byte is also SAM separator + DS_aux_OQ_S, + 4); + else if (key>>8 == TAG_ID(BQ)) + BLOCK_APPEND(map, + "\005" // BYTE_ARRAY_STOP + "\002" // len + "\t" // stop-byte is also SAM separator + DS_aux_BQ_S, + 4); + else if (key>>8 == TAG_ID(BD)) + BLOCK_APPEND(map, + "\005" // BYTE_ARRAY_STOP + "\002" // len + "\t" // stop-byte is also SAM separator + DS_aux_BD_S, + 4); + else if (key>>8 == TAG_ID(BI)) + BLOCK_APPEND(map, + "\005" // BYTE_ARRAY_STOP + "\002" // len + "\t" // stop-byte is also SAM separator + DS_aux_BI_S, + 4); + else if ((key>>8 == 
TAG_ID(Q2)) || + (key>>8 == TAG_ID(U2)) || + (key>>8 == TAG_ID(QT)) || + (key>>8 == TAG_ID(CQ))) + BLOCK_APPEND(map, + "\005" // BYTE_ARRAY_STOP + "\002" // len + "\t" // stop-byte is also SAM separator + DS_aux_oq_S, + 4); + else if ((key>>8 == TAG_ID(R2)) || + (key>>8 == TAG_ID(E2)) || + (key>>8 == TAG_ID(CS)) || + (key>>8 == TAG_ID(BC)) || + (key>>8 == TAG_ID(RT))) + BLOCK_APPEND(map, + "\005" // BYTE_ARRAY_STOP + "\002" // len + "\t" // stop-byte is also SAM separator + DS_aux_os_S, + 4); + else + BLOCK_APPEND(map, + "\005" // BYTE_ARRAY_STOP + "\002" // len + "\t" // stop-byte is also SAM separator + DS_aux_oz_S, + 4); } break; @@ -499,7 +570,7 @@ cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c, "\000" // length=0 "\001" // EXTERNAL (val) "\001" // external-len - CRAM_EXT_TAG_S,// content-id + DS_aux_S,// content-id 11); break; @@ -516,7 +587,7 @@ cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c, "\000" // length=0 "\001" // EXTERNAL (val) "\001" // external-len - CRAM_EXT_TAG_S,// content-id + DS_aux_S,// content-id 11); break; @@ -533,7 +604,7 @@ cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c, "\000" // length=0 "\001" // EXTERNAL (val) "\001" // external-len - CRAM_EXT_TAG_S,// content-id + DS_aux_S,// content-id 11); break; @@ -543,16 +614,28 @@ cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c, // after slice header construction). So we use // BYTE_ARRAY_LEN with the length codec being external // too. 
- BLOCK_APPEND(map, - "\004" // BYTE_ARRAY_LEN - "\006" // length - "\001" // EXTERNAL (len) - "\001" // external-len - "\004" // content-id - "\001" // EXTERNAL (val) - "\001" // external-len - CRAM_EXT_TAG_S,// content-id - 8); + if ((key>>8 == TAG_ID(FZ)) || (key>>8 == TAG_ID(ZM))) + BLOCK_APPEND(map, + "\004" // BYTE_ARRAY_LEN + "\006" // length + "\001" // EXTERNAL (len) + "\001" // external-len + DS_aux_FZ_S // content-id + "\001" // EXTERNAL (val) + "\001" // external-len + DS_aux_FZ_S,// content-id + 8); + else + BLOCK_APPEND(map, + "\004" // BYTE_ARRAY_LEN + "\006" // length + "\001" // EXTERNAL (len) + "\001" // external-len + DS_aux_S // content-id + "\001" // EXTERNAL (val) + "\001" // external-len + DS_aux_S,// content-id + 8); break; default: @@ -603,8 +686,10 @@ cram_block *cram_encode_slice_header(cram_fd *fd, cram_slice *s) { cp += itf8_put(cp, s->hdr->ref_seq_start); cp += itf8_put(cp, s->hdr->ref_seq_span); cp += itf8_put(cp, s->hdr->num_records); - if (fd->version != CRAM_1_VERS) + if (CRAM_MAJOR_VERS(fd->version) == 2) cp += itf8_put(cp, s->hdr->record_counter); + else if (CRAM_MAJOR_VERS(fd->version) >= 3) + cp += ltf8_put(cp, s->hdr->record_counter); cp += itf8_put(cp, s->hdr->num_blocks); cp += itf8_put(cp, s->hdr->num_content_ids); for (j = 0; j < s->hdr->num_content_ids; j++) { @@ -613,7 +698,7 @@ cram_block *cram_encode_slice_header(cram_fd *fd, cram_slice *s) { if (s->hdr->content_type == MAPPED_SLICE) cp += itf8_put(cp, s->hdr->ref_base_id); - if (fd->version != CRAM_1_VERS) { + if (CRAM_MAJOR_VERS(fd->version) != 1) { memcpy(cp, s->hdr->md5, 16); cp += 16; } @@ -627,451 +712,506 @@ cram_block *cram_encode_slice_header(cram_fd *fd, cram_slice *s) { /* - * Encodes a single slice from a container - * FIXME: break into smaller components. + * Encodes a single read. 
* * Returns 0 on success * -1 on failure */ -static int cram_encode_slice(cram_fd *fd, cram_container *c, - cram_block_compression_hdr *h, cram_slice *s) { - int rec, r = 0, last_pos; - cram_block *core; - int nblk, embed_ref; +static int cram_encode_slice_read(cram_fd *fd, + cram_container *c, + cram_block_compression_hdr *h, + cram_slice *s, + cram_record *cr, + int *last_pos) { + int r = 0; + int32_t i32; + unsigned char uc; - embed_ref = fd->embed_ref && s->hdr->ref_seq_id != -1 ? 1 : 0; + //fprintf(stderr, "Encode seq %d, %d/%d FN=%d, %s\n", rec, core->byte, core->bit, cr->nfeature, s->name_ds->str + cr->name); - /* - * Slice external blocks: - * ID 0 => base calls (insertions, soft-clip) - * ID 1 => qualities - * ID 2 => names - * ID 3 => TS (insert size), NP (next frag) - * ID 4 => tag values - * ID 5 => BA, ifdef BA_external - * ID 6 => tag IDs (TN), ifdef TN_external and CRAM_1_VERS - * ID 7 => TD tag dictionary, if !CRAM_1_VERS - */ + //printf("BF=0x%x\n", cr->flags); + // bf = cram_flag_swap[cr->flags]; + i32 = fd->cram_flag_swap[cr->flags & 0xfff]; + r |= h->codecs[DS_BF]->encode(s, h->codecs[DS_BF], (char *)&i32, 1); - /* Create cram slice header, num_blocks etc */ - s->hdr->ref_base_id = embed_ref ? CRAM_EXT_REF : -1; - s->hdr->record_counter = c->num_records + c->record_counter; - c->num_records += s->hdr->num_records; - nblk = (fd->version == CRAM_1_VERS) ? 
5 : 6; -#ifdef BA_external - nblk++; -#endif -#ifdef TN_external - if (fd->version == CRAM_1_VERS) { - nblk++; - } -#endif - if (embed_ref) - nblk++; - - s->hdr->num_content_ids = nblk; - s->hdr->num_blocks = s->hdr->num_content_ids+1; - s->block = calloc(s->hdr->num_blocks, sizeof(s->block[0])); - s->hdr->block_content_ids = malloc(s->hdr->num_content_ids * - sizeof(int32_t)); - if (!s->block || !s->hdr->block_content_ids) - return -1; - s->hdr->block_content_ids[0] = 0; // core - s->hdr->block_content_ids[1] = CRAM_EXT_QUAL; - s->hdr->block_content_ids[2] = CRAM_EXT_NAME; - s->hdr->block_content_ids[3] = CRAM_EXT_TS_NP; - s->hdr->block_content_ids[4] = CRAM_EXT_TAG; - s->hdr->block_content_ids[5] = CRAM_EXT_SC; - nblk = (fd->version == CRAM_1_VERS) ? 5 : 6; -#ifdef BA_external - s->hdr->block_content_ids[(s->ba_id = ++nblk)-1] = CRAM_EXT_BA; -#endif -#ifdef TN_external - if (fd->version == CRAM_1_VERS) { - s->hdr->block_content_ids[(s->tn_id = ++nblk)-1] = CRAM_EXT_TN; - } -#endif - if (embed_ref) - s->hdr->block_content_ids[(s->ref_id = ++nblk)-1] = CRAM_EXT_REF; - - if (!(s->block[0] = cram_new_block(CORE, 0))) return -1; - if (!(s->block[1] = cram_new_block(EXTERNAL, CRAM_EXT_IN))) return -1; - if (!(s->block[2] = cram_new_block(EXTERNAL, CRAM_EXT_QUAL))) return -1; - if (!(s->block[3] = cram_new_block(EXTERNAL, CRAM_EXT_NAME))) return -1; - if (!(s->block[4] = cram_new_block(EXTERNAL, CRAM_EXT_TS_NP))) return -1; - if (!(s->block[5] = cram_new_block(EXTERNAL, CRAM_EXT_TAG))) return -1; - if (fd->version != CRAM_1_VERS) { - if (!(s->block[6] = cram_new_block(EXTERNAL, CRAM_EXT_SC))) - return -1; - } -#ifdef BA_external - if (!(s->block[s->ba_id] = cram_new_block(EXTERNAL, CRAM_EXT_BA))) - return -1; -#endif -#ifdef TN_external - if (fd->version == CRAM_1_VERS) { - if (!(s->block[s->tn_id] = cram_new_block(EXTERNAL, CRAM_EXT_TN))) - return -1; - } -#endif - if (embed_ref) { - if (!(s->block[s->ref_id] = cram_new_block(EXTERNAL, CRAM_EXT_REF))) - return -1; - 
BLOCK_APPEND(s->block[s->ref_id], - c->ref + c->first_base - c->ref_start, - c->last_base - c->first_base + 1); - } + i32 = cr->cram_flags; + r |= h->codecs[DS_CF]->encode(s, h->codecs[DS_CF], (char *)&i32, 1); - core = s->block[0]; - - /* Create a formal method for stealing from dstrings! */ - s->block[4]->data = calloc(10, s->hdr->num_records); // NP TS - if (!s->block[4]->data) - return -1; - s->block[4]->comp_size = s->block[4]->uncomp_size = 0; + if (CRAM_MAJOR_VERS(fd->version) != 1 && s->hdr->ref_seq_id == -2) + r |= h->codecs[DS_RI]->encode(s, h->codecs[DS_RI], (char *)&cr->ref_id, 1); -#ifdef BA_external - s->block[s->ba_id]->data = calloc(1, s->BA_len); - if (!s->block[s->ba_id]->data) - return -1; - s->block[s->ba_id]->comp_size = s->block[s->ba_id]->uncomp_size = 0; -#endif + r |= h->codecs[DS_RL]->encode(s, h->codecs[DS_RL], (char *)&cr->len, 1); - /* Generate core block */ - if (!(s->hdr_block = cram_encode_slice_header(fd, s))) - return -1; + if (c->pos_sorted) { + i32 = cr->apos - *last_pos; + r |= h->codecs[DS_AP]->encode(s, h->codecs[DS_AP], (char *)&i32, 1); + *last_pos = cr->apos; + } else { + i32 = cr->apos; + r |= h->codecs[DS_AP]->encode(s, h->codecs[DS_AP], (char *)&i32, 1); + } - last_pos = s->hdr->ref_seq_start; - for (rec = 0; rec < s->hdr->num_records; rec++) { - cram_record *cr = &s->crecs[rec]; - int32_t i32; - unsigned char uc; + r |= h->codecs[DS_RG]->encode(s, h->codecs[DS_RG], (char *)&cr->rg, 1); - //fprintf(stderr, "Encode seq %d, %d/%d FN=%d, %s\n", rec, core->byte, core->bit, cr->nfeature, s->name_ds->str + cr->name); + if (c->comp_hdr->read_names_included) { + // RN codec: Already stored in block[3]. 
+ } - //printf("BF=0x%x\n", cr->flags); - // bf = cram_flag_swap[cr->flags]; - i32 = fd->cram_flag_swap[cr->flags & 0xfff]; - r |= h->BF_codec->encode(s, h->BF_codec, core, (char *)&i32, 1); + if (cr->cram_flags & CRAM_FLAG_DETACHED) { + i32 = cr->mate_flags; + r |= h->codecs[DS_MF]->encode(s, h->codecs[DS_MF], (char *)&i32, 1); - i32 = cr->cram_flags; - r |= h->CF_codec->encode(s, h->CF_codec, core, - (char *)&i32, 1); + if (!c->comp_hdr->read_names_included) { + // RN codec: Already stored in block[3]. + } - if (fd->version != CRAM_1_VERS) - r |= h->RI_codec->encode(s, h->RI_codec, core, - (char *)&cr->ref_id, 1); + r |= h->codecs[DS_NS]->encode(s, h->codecs[DS_NS], + (char *)&cr->mate_ref_id, 1); - r |= h->RL_codec->encode(s, h->RL_codec, core, - (char *)&cr->len, 1); + r |= h->codecs[DS_NP]->encode(s, h->codecs[DS_NP], + (char *)&cr->mate_pos, 1); - if (c->pos_sorted) { - i32 = cr->apos - last_pos; - r |= h->AP_codec->encode(s, h->AP_codec, core, (char *)&i32, 1); - last_pos = cr->apos; - } else { - i32 = cr->apos; - r |= h->AP_codec->encode(s, h->AP_codec, core, (char *)&i32, 1); - } + r |= h->codecs[DS_TS]->encode(s, h->codecs[DS_TS], + (char *)&cr->tlen, 1); + } else if (cr->cram_flags & CRAM_FLAG_MATE_DOWNSTREAM) { + r |= h->codecs[DS_NF]->encode(s, h->codecs[DS_NF], + (char *)&cr->mate_line, 1); + } - r |= h->RG_codec->encode(s, h->RG_codec, core, - (char *)&cr->rg, 1); + /* Aux tags */ + if (CRAM_MAJOR_VERS(fd->version) == 1) { + int j; + uc = cr->ntags; + r |= h->codecs[DS_TC]->encode(s, h->codecs[DS_TC], (char *)&uc, 1); - if (c->comp_hdr->read_names_included) { - // RN codec: Already stored in block[3]. 
+ for (j = 0; j < cr->ntags; j++) { + uint32_t i32 = s->TN[cr->TN_idx + j]; // id + r |= h->codecs[DS_TN]->encode(s, h->codecs[DS_TN], (char *)&i32, 1); } + } else { + r |= h->codecs[DS_TL]->encode(s, h->codecs[DS_TL], (char *)&cr->TL, 1); + } - if (cr->cram_flags & CRAM_FLAG_DETACHED) { - i32 = cr->mate_flags; - r |= h->MF_codec->encode(s, h->MF_codec, core, (char *)&i32, 1); + // qual + // QS codec : Already stored in block[2]. - if (!c->comp_hdr->read_names_included) { - // RN codec: Already stored in block[3]. - } + // features (diffs) + if (!(cr->flags & BAM_FUNMAP)) { + int prev_pos = 0, j; -#ifndef NS_external - r |= h->NS_codec->encode(s, h->NS_codec, core, - (char *)&cr->mate_ref_id, 1); -#else - s->block[4]->uncomp_size += - itf8_put(&s->block[4]->data[s->block[4]->uncomp_size], - cr->mate_ref_id); -#endif + r |= h->codecs[DS_FN]->encode(s, h->codecs[DS_FN], + (char *)&cr->nfeature, 1); + for (j = 0; j < cr->nfeature; j++) { + cram_feature *f = &s->features[cr->feature + j]; -#ifndef TS_external - r |= h->NP_codec->encode(s, h->NP_codec, core, - (char *)&cr->mate_pos, 1); + uc = f->X.code; + r |= h->codecs[DS_FC]->encode(s, h->codecs[DS_FC], (char *)&uc, 1); + i32 = f->X.pos - prev_pos; + r |= h->codecs[DS_FP]->encode(s, h->codecs[DS_FP], (char *)&i32, 1); + prev_pos = f->X.pos; - r |= h->TS_codec->encode(s, h->TS_codec, core, - (char *)&cr->tlen, 1); -#else - s->block[4]->uncomp_size += - itf8_put((char *)&s->block[4]->data[s->block[4]->uncomp_size], - cr->mate_pos); - s->block[4]->uncomp_size += - itf8_put((char *)&s->block[4]->data[s->block[4]->uncomp_size], - cr->tlen); -#endif - } else if (cr->cram_flags & CRAM_FLAG_MATE_DOWNSTREAM) { - r |= h->NF_codec->encode(s, h->NF_codec, core, - (char *)&cr->mate_line, 1); - } + switch(f->X.code) { + //char *seq; - /* Aux tags */ - if (fd->version == CRAM_1_VERS) { - uc = cr->ntags; - r |= h->TC_codec->encode(s, h->TC_codec, core, (char *)&uc, 1); -#ifndef TN_external - { - int j; - for (j = 0; j < cr->ntags; 
j++) { - uint32_t i32 = s->TN[cr->TN_idx + j]; // id - r |= h->TN_codec->encode(s, h->TN_codec, core, - (char *)&i32, 1); - } - } -#endif - } else { - r |= h->TL_codec->encode(s, h->TL_codec, core, (char *)&cr->TL, 1); - } - - // qual - // QS codec : Already stored in block[2]. + case 'X': + //fprintf(stderr, " FC=%c FP=%d base=%d\n", f->X.code, i32, f->X.base); + + uc = f->X.base; + r |= h->codecs[DS_BS]->encode(s, h->codecs[DS_BS], + (char *)&uc, 1); + break; + case 'S': + // Already done +// r |= h->codecs[DS_SC]->encode(s, h->codecs[DS_SC], +// BLOCK_DATA(s->soft_blk) + f->S.seq_idx, +// f->S.len); + +// if (IS_CRAM_3_VERS(fd)) { +// r |= h->codecs[DS_BB]->encode(s, h->codecs[DS_BB], +// BLOCK_DATA(s->seqs_blk) + f->S.seq_idx, +// f->S.len); +// } + break; + case 'I': + //seq = DSTRING_STR(s->seqs_ds) + f->S.seq_idx; + //r |= h->codecs[DS_IN]->encode(s, h->codecs[DS_IN], + // seq, f->S.len); +// if (IS_CRAM_3_VERS(fd)) { +// r |= h->codecs[DS_BB]->encode(s, h->codecs[DS_BB], +// BLOCK_DATA(s->seqs_blk) + f->I.seq_idx, +// f->I.len); +// } + break; + case 'i': + uc = f->i.base; + r |= h->codecs[DS_BA]->encode(s, h->codecs[DS_BA], + (char *)&uc, 1); + //seq = DSTRING_STR(s->seqs_ds) + f->S.seq_idx; + //r |= h->codecs[DS_IN]->encode(s, h->codecs[DS_IN], + // seq, 1); + break; + case 'D': + i32 = f->D.len; + r |= h->codecs[DS_DL]->encode(s, h->codecs[DS_DL], + (char *)&i32, 1); + break; - // features (diffs) - if (!(cr->flags & BAM_FUNMAP)) { - int prev_pos = 0, j; + case 'B': + // // Used when we try to store a non ACGTN base or an N + // // that aligns against a non ACGTN reference - r |= h->FN_codec->encode(s, h->FN_codec, core, - (char *)&cr->nfeature, 1); - for (j = 0; j < cr->nfeature; j++) { - cram_feature *f = &s->features[cr->feature + j]; + uc = f->B.base; + r |= h->codecs[DS_BA]->encode(s, h->codecs[DS_BA], + (char *)&uc, 1); - uc = f->X.code; - r |= h->FC_codec->encode(s, h->FC_codec, core, - (char *)&uc, 1); - i32 = f->X.pos - prev_pos; - r |= 
h->FP_codec->encode(s, h->FP_codec, core, - (char *)&i32, 1); - prev_pos = f->X.pos; + // Already added + // uc = f->B.qual; + // r |= h->codecs[DS_QS]->encode(s, h->codecs[DS_QS], + // (char *)&uc, 1); + break; - switch(f->X.code) { - //char *seq; + case 'b': + // string of bases + r |= h->codecs[DS_BB]->encode(s, h->codecs[DS_BB], + (char *)BLOCK_DATA(s->seqs_blk) + + f->b.seq_idx, + f->b.len); + break; - case 'X': - //fprintf(stderr, " FC=%c FP=%d base=%d\n", f->X.code, i32, f->X.base); - - uc = f->X.base; - r |= h->BS_codec->encode(s, h->BS_codec, core, - (char *)&uc, 1); - break; - case 'S': - //seq = DSTRING_STR(s->seqs_ds) + f->S.seq_idx; - //r |= h->SC_codec->encode(s, h->SC_codec, core, - // seq, f->S.len); - break; - case 'I': - //seq = DSTRING_STR(s->seqs_ds) + f->S.seq_idx; - //r |= h->IN_codec->encode(s, h->IN_codec, core, - // seq, f->S.len); - break; - case 'i': - uc = f->i.base; -#ifdef BA_external - s->block[s->ba_id]->data[s->block[s->ba_id]->uncomp_size++] = uc; -#else - r |= h->BA_codec->encode(s, h->BA_codec, core, - (char *)&uc, 1); -#endif - //seq = DSTRING_STR(s->seqs_ds) + f->S.seq_idx; - //r |= h->IN_codec->encode(s, h->IN_codec, core, - // seq, 1); - break; - case 'D': - i32 = f->D.len; - r |= h->DL_codec->encode(s, h->DL_codec, core, - (char *)&i32, 1); - break; - - case 'B': -// // Used when we try to store a non ACGTN base or an N -// // that aligns against a non ACGTN reference - - uc = f->B.base; -#ifdef BA_external - s->block[s->ba_id]->data[s->block[s->ba_id]->uncomp_size++] = uc; -#else - r |= h->BA_codec->encode(s, h->BA_codec, core, - (char *)&uc, 1); -#endif + case 'Q': + // Already added + // uc = f->B.qual; + // r |= h->codecs[DS_QS]->encode(s, h->codecs[DS_QS], + // (char *)&uc, 1); + break; -// Already added -// uc = f->B.qual; -// r |= h->QS_codec->encode(s, h->QS_codec, core, -// (char *)&uc, 1); - break; - - case 'Q': -// Already added -// uc = f->B.qual; -// r |= h->QS_codec->encode(s, h->QS_codec, core, -// (char 
*)&uc, 1); - break; - - case 'N': - i32 = f->N.len; - r |= h->RS_codec->encode(s, h->RS_codec, core, - (char *)&i32, 1); - break; + case 'N': + i32 = f->N.len; + r |= h->codecs[DS_RS]->encode(s, h->codecs[DS_RS], + (char *)&i32, 1); + break; - case 'P': - i32 = f->P.len; - r |= h->PD_codec->encode(s, h->PD_codec, core, - (char *)&i32, 1); - break; + case 'P': + i32 = f->P.len; + r |= h->codecs[DS_PD]->encode(s, h->codecs[DS_PD], + (char *)&i32, 1); + break; - case 'H': - i32 = f->H.len; - r |= h->HC_codec->encode(s, h->HC_codec, core, - (char *)&i32, 1); - break; + case 'H': + i32 = f->H.len; + r |= h->codecs[DS_HC]->encode(s, h->codecs[DS_HC], + (char *)&i32, 1); + break; - default: - fprintf(stderr, "unhandled feature code %c\n", - f->X.code); - return -1; - } + default: + fprintf(stderr, "unhandled feature code %c\n", + f->X.code); + return -1; } - - r |= h->MQ_codec->encode(s, h->MQ_codec, core, - (char *)&cr->mqual, 1); - } else { - char *seq = (char *)BLOCK_DATA(s->seqs_blk) + cr->seq; -#ifdef BA_external - memcpy(&s->block[s->ba_id]->data[s->block[s->ba_id]->uncomp_size], - seq, cr->len); - s->block[s->ba_id]->uncomp_size += cr->len; -#else - r |= h->BA_codec->encode(s, h->BA_codec, core, seq, cr->len); -#endif } - if (r) - return -1; - } - s->block[0]->uncomp_size = s->block[0]->byte + (s->block[0]->bit < 7); - s->block[0]->comp_size = s->block[0]->uncomp_size; - - // FIXME: we should avoid creating these in the first place and just - // point them to s->base_blk et al. 
- cram_free_block(s->block[1]); - cram_free_block(s->block[2]); - cram_free_block(s->block[3]); - cram_free_block(s->block[5]); - if (fd->version != CRAM_1_VERS) { - cram_free_block(s->block[6]); - BLOCK_UPLEN(s->soft_blk); - s->block[6] = s->soft_blk; - s->soft_blk = NULL; - } - BLOCK_UPLEN(s->base_blk); s->block[1] = s->base_blk; s->base_blk = NULL; - BLOCK_UPLEN(s->qual_blk); s->block[2] = s->qual_blk; s->qual_blk = NULL; - BLOCK_UPLEN(s->name_blk); s->block[3] = s->name_blk; s->name_blk = NULL; - BLOCK_UPLEN(s->aux_blk); s->block[5] = s->aux_blk; s->aux_blk = NULL; - -#ifdef TN_external - if (fd->version == CRAM_1_VERS) { - cram_free_block(s->block[s->tn_id]); - BLOCK_UPLEN(s->tn_blk); s->block[s->tn_id] = s->tn_blk; - s->tn_blk = NULL; + r |= h->codecs[DS_MQ]->encode(s, h->codecs[DS_MQ], + (char *)&cr->mqual, 1); + } else { + char *seq = (char *)BLOCK_DATA(s->seqs_blk) + cr->seq; + r |= h->codecs[DS_BA]->encode(s, h->codecs[DS_BA], seq, cr->len); } -#endif - s->block[4]->comp_size = s->block[4]->uncomp_size; - -#ifdef BA_external - s->block[s->ba_id]->comp_size = s->block[s->ba_id]->uncomp_size; -#endif + return r ? -1 : 0; +} - /* Compress the CORE Block too, with minimal zlib level */ - if (fd->level > 5) - cram_compress_block(fd, s->block[0], NULL, 1, Z_CRAM_STRAT, -1, -1); -#define USE_METRICS +/* + * Applies various compression methods to specific blocks, depending on + * known observations of how data series compress. 
+ * + * Returns 0 on success + * -1 on failure + */ +static int cram_compress_slice(cram_fd *fd, cram_slice *s) { + int level = fd->level, i; + int method = 1< 5 && s->block[0]->uncomp_size > 500) + cram_compress_block(fd, s->block[0], NULL, GZIP, 1); + + if (fd->use_bz2) + method |= 1<use_rans) + method |= (1<use_lzma) + method |= (1<= 6) + methodF = method; + - /* Compress the other blocks */ - if (cram_compress_block(fd, s->block[1], NULL, //IN (seq) - fd->level, Z_CRAM_STRAT, - -1, -1)) + /* Specific compression methods for certain block types */ + if (cram_compress_block(fd, s->block[DS_IN], fd->m[DS_IN], //IN (seq) + method, level)) return -1; if (fd->level == 0) { /* Do nothing */ } else if (fd->level == 1) { - if (cram_compress_block(fd, s->block[2], fd->m[1], //qual - 1, Z_RLE, -1, -1)) - return -1; - if (cram_compress_block(fd, s->block[5], fd->m[4], //Tags - 1, Z_RLE, -1, -1)) + if (cram_compress_block(fd, s->block[DS_QS], fd->m[DS_QS], + methodF, 1)) return -1; + for (i = DS_aux; i <= DS_aux_oz; i++) { + if (s->block[i]) + if (cram_compress_block(fd, s->block[i], fd->m[i], + method, 1)) + return -1; + } } else if (fd->level < 3) { - if (cram_compress_block(fd, s->block[2], fd->m[1], //qual - 1, Z_RLE, - 1, Z_HUFFMAN_ONLY)) + if (cram_compress_block(fd, s->block[DS_QS], fd->m[DS_QS], + method, 1)) return -1; - if (cram_compress_block(fd, s->block[5], fd->m[4], //Tags - 1, Z_RLE, - 1, Z_HUFFMAN_ONLY)) + if (cram_compress_block(fd, s->block[DS_BA], fd->m[DS_BA], + method, 1)) return -1; + if (s->block[DS_BB]) + if (cram_compress_block(fd, s->block[DS_BB], fd->m[DS_BB], + method, 1)) + return -1; + for (i = DS_aux; i <= DS_aux_oz; i++) { + if (s->block[i]) + if (cram_compress_block(fd, s->block[i], fd->m[i], + method, level)) + return -1; + } } else { - if (cram_compress_block(fd, s->block[2], fd->m[1], //qual - fd->level, Z_CRAM_STRAT, - LEVEL2, STRAT2)) + if (cram_compress_block(fd, s->block[DS_QS], fd->m[DS_QS], + method, level)) return -1; - if 
(cram_compress_block(fd, s->block[5], fd->m[4], //Tags - fd->level, Z_CRAM_STRAT, - LEVEL2, STRAT2)) + if (cram_compress_block(fd, s->block[DS_BA], fd->m[DS_BA], + method, level)) return -1; + if (s->block[DS_BB]) + if (cram_compress_block(fd, s->block[DS_BB], fd->m[DS_BB], + method, level)) + return -1; + for (i = DS_aux; i <= DS_aux_oz; i++) { + if (s->block[i]) + if (cram_compress_block(fd, s->block[i], fd->m[i], + method, level)) + return -1; + } } - if (cram_compress_block(fd, s->block[3], NULL, //Name - fd->level, Z_CRAM_STRAT, - -1, -1)) - return -1; - if (cram_compress_block(fd, s->block[4], NULL, //TS, NP - fd->level, Z_CRAM_STRAT, - -1, -1)) + + // NAME: best is generally xz, bzip2, zlib then rans1 + // It benefits well from a little bit extra compression level. + if (cram_compress_block(fd, s->block[DS_RN], fd->m[DS_RN], + method & ~(1<version != CRAM_1_VERS) { - if (cram_compress_block(fd, s->block[6], NULL, //SC (seq) - fd->level, Z_CRAM_STRAT, - -1, -1)) + + // NS shows strong local correlation as rearrangements are localised + if (s->block[DS_NS] != s->block[0]) + if (cram_compress_block(fd, s->block[DS_NS], fd->m[DS_NS], + method, level)) return -1; + + + /* + * Minimal compression of any block still uncompressed, bar CORE + */ + { + int i; + for (i = 1; i < DS_END; i++) { + if (!s->block[i] || s->block[i] == s->block[0]) + continue; + + // fast methods only + if (s->block[i]->method == RAW) { + cram_compress_block(fd, s->block[i], fd->m[i], + methodF, level); + } + } } -#ifdef BA_external - if (cram_compress_block(fd, s->block[s->ba_id], NULL, - fd->level, Z_CRAM_STRAT, -1, -1)) + + return 0; +} + +/* + * Encodes a single slice from a container + * + * Returns 0 on success + * -1 on failure + */ +static int cram_encode_slice(cram_fd *fd, cram_container *c, + cram_block_compression_hdr *h, cram_slice *s) { + int rec, r = 0, last_pos; + int embed_ref; + enum cram_DS_ID id; + + embed_ref = fd->embed_ref && s->hdr->ref_seq_id != -1 ? 
1 : 0; + + /* + * Slice external blocks: + * ID 0 => base calls (insertions, soft-clip) + * ID 1 => qualities + * ID 2 => names + * ID 3 => TS (insert size), NP (next frag) + * ID 4 => tag values + * ID 6 => tag IDs (TN), if CRAM_V1.0 + * ID 7 => TD tag dictionary, if !CRAM_V1.0 + */ + + /* Create cram slice header */ + s->hdr->ref_base_id = embed_ref ? DS_ref : -1; + s->hdr->record_counter = c->num_records + c->record_counter; + c->num_records += s->hdr->num_records; + + s->block = calloc(DS_END, sizeof(s->block[0])); + s->hdr->block_content_ids = malloc(DS_END * sizeof(int32_t)); + if (!s->block || !s->hdr->block_content_ids) return -1; -#endif -#ifdef TN_external - if (fd->version == CRAM_1_VERS) { - if (cram_compress_block(fd, s->block[s->tn_id], NULL, - fd->level, Z_DEFAULT_STRATEGY, -1, -1)) - return -1; + + // Create first fixed blocks, always external. + // CORE + if (!(s->block[0] = cram_new_block(CORE, 0))) + return -1; + + // TN block for CRAM v1 + if (CRAM_MAJOR_VERS(fd->version) == 1) { + if (h->codecs[DS_TN]->codec == E_EXTERNAL) { + if (!(s->block[DS_TN] = cram_new_block(EXTERNAL,DS_TN))) return -1; + h->codecs[DS_TN]->external.content_id = DS_TN; + } else { + s->block[DS_TN] = s->block[0]; + } + s->block[DS_TN] = s->block[DS_TN]; } -#endif + + // Embedded reference if (embed_ref) { - BLOCK_UPLEN(s->block[s->ref_id]); - if (cram_compress_block(fd, s->block[s->ref_id], NULL, - fd->level, Z_DEFAULT_STRATEGY, -1, -1)) + if (!(s->block[DS_ref] = cram_new_block(EXTERNAL, DS_ref))) + return -1; + s->ref_id = DS_ref; // needed? + BLOCK_APPEND(s->block[DS_ref], + c->ref + c->first_base - c->ref_start, + c->last_base - c->first_base + 1); + } + + /* + * All the data-series blocks if appropriate. 
+ */ + for (id = DS_BF; id < DS_TN; id++) { + if (h->codecs[id] && (h->codecs[id]->codec == E_EXTERNAL || + h->codecs[id]->codec == E_BYTE_ARRAY_STOP || + h->codecs[id]->codec == E_BYTE_ARRAY_LEN)) { + switch (h->codecs[id]->codec) { + case E_EXTERNAL: + if (!(s->block[id] = cram_new_block(EXTERNAL, id))) + return -1; + h->codecs[id]->external.content_id = id; + break; + + case E_BYTE_ARRAY_STOP: + if (!(s->block[id] = cram_new_block(EXTERNAL, id))) + return -1; + h->codecs[id]->byte_array_stop.content_id = id; + break; + + case E_BYTE_ARRAY_LEN: { + cram_codec *cc; + + cc = h->codecs[id]->e_byte_array_len.len_codec; + if (cc->codec == E_EXTERNAL) { + int eid = cc->external.content_id; + if (!(s->block[eid] = cram_new_block(EXTERNAL, eid))) + return -1; + cc->external.content_id = eid; + cc->out = s->block[eid]; + } + + cc = h->codecs[id]->e_byte_array_len.val_codec; + if (cc->codec == E_EXTERNAL) { + int eid = cc->external.content_id; + if (!s->block[eid]) + if (!(s->block[eid] = cram_new_block(EXTERNAL, eid))) + return -1; + cc->external.content_id = eid; + cc->out = s->block[eid]; + } + break; + } + default: + break; + } + } else { + if (!(id == DS_BB && !h->codecs[DS_BB])) + s->block[id] = s->block[0]; + } + if (h->codecs[id]) + h->codecs[id]->out = s->block[id]; + } + + /* Encode reads */ + last_pos = s->hdr->ref_seq_start; + for (rec = 0; rec < s->hdr->num_records; rec++) { + cram_record *cr = &s->crecs[rec]; + if (cram_encode_slice_read(fd, c, h, s, cr, &last_pos) == -1) + return -1; + } + + s->block[0]->uncomp_size = s->block[0]->byte + (s->block[0]->bit < 7); + s->block[0]->comp_size = s->block[0]->uncomp_size; + + // Make sure the fixed blocks point to the correct sources + s->block[DS_IN] = s->base_blk; s->base_blk = NULL; + s->block[DS_QS] = s->qual_blk; s->qual_blk = NULL; + s->block[DS_RN] = s->name_blk; s->name_blk = NULL; + s->block[DS_SC] = s->soft_blk; s->soft_blk = NULL; + s->block[DS_aux]= s->aux_blk; s->aux_blk = NULL; + s->block[DS_aux_OQ]= 
s->aux_OQ_blk; s->aux_OQ_blk = NULL; + s->block[DS_aux_BQ]= s->aux_BQ_blk; s->aux_BQ_blk = NULL; + s->block[DS_aux_BD]= s->aux_BD_blk; s->aux_BD_blk = NULL; + s->block[DS_aux_BI]= s->aux_BI_blk; s->aux_BI_blk = NULL; + s->block[DS_aux_FZ]= s->aux_FZ_blk; s->aux_FZ_blk = NULL; + s->block[DS_aux_oq]= s->aux_oq_blk; s->aux_oq_blk = NULL; + s->block[DS_aux_os]= s->aux_os_blk; s->aux_os_blk = NULL; + s->block[DS_aux_oz]= s->aux_oz_blk; s->aux_oz_blk = NULL; + + // Ensure block sizes are up to date. + for (id = 1; id < DS_END; id++) { + if (!s->block[id] || s->block[id] == s->block[0]) + continue; + + if (s->block[id]->uncomp_size == 0) + BLOCK_UPLEN(s->block[id]); + } + + // Compress it all + if (cram_compress_slice(fd, s) == -1) + return -1; + + // Collapse empty blocks and create hdr_block + { + int i, j; + for (i = j = 1; i < DS_END; i++) { + if (!s->block[i] || s->block[i] == s->block[0]) + continue; + if (s->block[i]->uncomp_size == 0) { + cram_free_block(s->block[i]); + s->block[i] = NULL; + continue; + } + s->block[j] = s->block[i]; + s->hdr->block_content_ids[j-1] = s->block[i]->content_id; + j++; + } + s->hdr->num_content_ids = j-1; + s->hdr->num_blocks = j; + + if (!(s->hdr_block = cram_encode_slice_header(fd, s))) return -1; } @@ -1096,14 +1236,14 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { nref = fd->refs->nref; pthread_mutex_unlock(&fd->ref_lock); - if (c->refs_used) { + if (!fd->no_ref && c->refs_used) { for (i = 0; i < nref; i++) { - if (c->refs_used[i]) { + if (c->refs_used[i]) cram_get_ref(fd, i, 1, 0); - } } } + /* To create M5 strings */ /* Fetch reference sequence */ if (!fd->no_ref) { bam_seq_t *b = c->bams[0]; @@ -1123,7 +1263,9 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { c->ref_seq_id = c->ref_id; // FIXME remove one var! } } else { - c->ref_seq_id = c->ref_id; // FIXME remove one var! 
+ c->ref_id = bam_ref(c->bams[0]); + cram_ref_incr(fd->refs, c->ref_id); + c->ref_seq_id = c->ref_id; } /* Turn bams into cram_records and gather basic stats */ @@ -1183,6 +1325,11 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { s->hdr->num_records = r2; } + if (c->multi_seq && !fd->no_ref) { + if (c->ref_seq_id >= 0) + cram_ref_decr(fd->refs, c->ref_seq_id); + } + /* Link our bams[] array onto the spare bam list for reuse */ spares = malloc(sizeof(*spares)); pthread_mutex_lock(&fd->bam_list_lock); @@ -1193,8 +1340,8 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { c->bams = NULL; /* Detect if a multi-seq container */ - cram_stats_encoding(fd, c->RI_stats); - multi_ref = c->RI_stats->nvals > 1; + cram_stats_encoding(fd, c->stats[DS_RI]); + multi_ref = c->stats[DS_RI]->nvals > 1; if (multi_ref) { if (fd->verbose) @@ -1209,7 +1356,7 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { for (i = 0; i < c->curr_slice; i++) { cram_slice *s = c->slices[i]; - if (fd->version != CRAM_1_VERS) { + if (CRAM_MAJOR_VERS(fd->version) != 1) { if (s->hdr->ref_seq_id >= 0 && c->multi_seq == 0 && !fd->no_ref) { MD5_CTX md5; MD5_Init(&md5); @@ -1228,201 +1375,203 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { c->length = 0; //fprintf(stderr, "=== BF ===\n"); - h->BF_codec = cram_encoder_init(cram_stats_encoding(fd, c->BF_stats), - c->BF_stats, E_INT, NULL, - fd->version); + h->codecs[DS_BF] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_BF]), + c->stats[DS_BF], E_INT, NULL, + fd->version); //fprintf(stderr, "=== CF ===\n"); - h->CF_codec = cram_encoder_init(cram_stats_encoding(fd, c->CF_stats), - c->CF_stats, E_INT, NULL, - fd->version); + h->codecs[DS_CF] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_CF]), + c->stats[DS_CF], E_INT, NULL, + fd->version); // fprintf(stderr, "=== RN ===\n"); -// h->RN_codec = cram_encoder_init(cram_stats_encoding(fd, c->RN_stats), -// c->RN_stats, E_BYTE_ARRAY, NULL, +// 
h->codecs[DS_RN] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RN]), +// c->stats[DS_RN], E_BYTE_ARRAY, NULL, // fd->version); //fprintf(stderr, "=== AP ===\n"); if (c->pos_sorted) { - h->AP_codec = cram_encoder_init(cram_stats_encoding(fd, c->AP_stats), - c->AP_stats, E_INT, NULL, - fd->version); + h->codecs[DS_AP] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_AP]), + c->stats[DS_AP], E_INT, NULL, + fd->version); } else { int p[2] = {0, c->max_apos}; - h->AP_codec = cram_encoder_init(E_BETA, NULL, E_INT, p, fd->version); + h->codecs[DS_AP] = cram_encoder_init(E_BETA, NULL, E_INT, p, + fd->version); } //fprintf(stderr, "=== RG ===\n"); - h->RG_codec = cram_encoder_init(cram_stats_encoding(fd, c->RG_stats), - c->RG_stats, E_INT, NULL, - fd->version); + h->codecs[DS_RG] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RG]), + c->stats[DS_RG], E_INT, NULL, + fd->version); //fprintf(stderr, "=== MQ ===\n"); - h->MQ_codec = cram_encoder_init(cram_stats_encoding(fd, c->MQ_stats), - c->MQ_stats, E_INT, NULL, - fd->version); + h->codecs[DS_MQ] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_MQ]), + c->stats[DS_MQ], E_INT, NULL, + fd->version); //fprintf(stderr, "=== NS ===\n"); -#ifdef NS_external - h->NS_codec = cram_encoder_init(E_EXTERNAL, NULL, E_INT, - (void *)CRAM_EXT_NS, - fd->version); -#else - h->NS_codec = cram_encoder_init(cram_stats_encoding(fd, c->NS_stats), - c->NS_stats, E_INT, NULL, - fd->version); -#endif + h->codecs[DS_NS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_NS]), + c->stats[DS_NS], E_INT, NULL, + fd->version); //fprintf(stderr, "=== MF ===\n"); - h->MF_codec = cram_encoder_init(cram_stats_encoding(fd, c->MF_stats), - c->MF_stats, E_INT, NULL, - fd->version); - -#ifdef TS_external - h->TS_codec = cram_encoder_init(E_EXTERNAL, NULL, E_INT, - (void *)CRAM_EXT_TS_NP, - fd->version); - h->NP_codec = cram_encoder_init(E_EXTERNAL, NULL, E_INT, - (void *)CRAM_EXT_TS_NP, - fd->version); -#else + 
h->codecs[DS_MF] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_MF]), + c->stats[DS_MF], E_INT, NULL, + fd->version); + //fprintf(stderr, "=== TS ===\n"); - h->TS_codec = cram_encoder_init(cram_stats_encoding(fd, c->TS_stats), - c->TS_stats, E_INT, NULL, - fd->version); + h->codecs[DS_TS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_TS]), + c->stats[DS_TS], E_INT, NULL, + fd->version); //fprintf(stderr, "=== NP ===\n"); - h->NP_codec = cram_encoder_init(cram_stats_encoding(fd, c->NP_stats), - c->NP_stats, E_INT, NULL, - fd->version); -#endif - + h->codecs[DS_NP] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_NP]), + c->stats[DS_NP], E_INT, NULL, + fd->version); //fprintf(stderr, "=== NF ===\n"); - h->NF_codec = cram_encoder_init(cram_stats_encoding(fd, c->NF_stats), - c->NF_stats, E_INT, NULL, - fd->version); + h->codecs[DS_NF] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_NF]), + c->stats[DS_NF], E_INT, NULL, + fd->version); //fprintf(stderr, "=== RL ===\n"); - h->RL_codec = cram_encoder_init(cram_stats_encoding(fd, c->RL_stats), - c->RL_stats, E_INT, NULL, - fd->version); + h->codecs[DS_RL] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RL]), + c->stats[DS_RL], E_INT, NULL, + fd->version); //fprintf(stderr, "=== FN ===\n"); - h->FN_codec = cram_encoder_init(cram_stats_encoding(fd, c->FN_stats), - c->FN_stats, E_INT, NULL, - fd->version); + h->codecs[DS_FN] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_FN]), + c->stats[DS_FN], E_INT, NULL, + fd->version); //fprintf(stderr, "=== FC ===\n"); - h->FC_codec = cram_encoder_init(cram_stats_encoding(fd, c->FC_stats), - c->FC_stats, E_BYTE, NULL, - fd->version); + h->codecs[DS_FC] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_FC]), + c->stats[DS_FC], E_BYTE, NULL, + fd->version); //fprintf(stderr, "=== FP ===\n"); - h->FP_codec = cram_encoder_init(cram_stats_encoding(fd, c->FP_stats), - c->FP_stats, E_INT, NULL, - fd->version); + h->codecs[DS_FP] = 
cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_FP]), + c->stats[DS_FP], E_INT, NULL, + fd->version); //fprintf(stderr, "=== DL ===\n"); - h->DL_codec = cram_encoder_init(cram_stats_encoding(fd, c->DL_stats), - c->DL_stats, E_INT, NULL, - fd->version); - -#ifdef BA_external - h->BA_codec = cram_encoder_init(E_EXTERNAL, NULL, E_BYTE, - (void *)CRAM_EXT_BA, - fd->version); -#else + h->codecs[DS_DL] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_DL]), + c->stats[DS_DL], E_INT, NULL, + fd->version); + //fprintf(stderr, "=== BA ===\n"); - h->BA_codec = cram_encoder_init(cram_stats_encoding(fd, c->BA_stats), - c->BA_stats, E_BYTE, NULL, - fd->version); -#endif + h->codecs[DS_BA] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_BA]), + c->stats[DS_BA], E_BYTE, NULL, + fd->version); + + if (CRAM_MAJOR_VERS(fd->version) >= 3) { + cram_byte_array_len_encoder e; + + e.len_encoding = E_EXTERNAL; + e.len_dat = (void *)DS_BB_len; + //e.len_dat = (void *)DS_BB; + + e.val_encoding = E_EXTERNAL; + e.val_dat = (void *)DS_BB; + + h->codecs[DS_BB] = cram_encoder_init(E_BYTE_ARRAY_LEN, NULL, + E_BYTE_ARRAY, (void *)&e, + fd->version); + } else { + h->codecs[DS_BB] = NULL; + } //fprintf(stderr, "=== BS ===\n"); - h->BS_codec = cram_encoder_init(cram_stats_encoding(fd, c->BS_stats), - c->BS_stats, E_BYTE, NULL, - fd->version); - - if (fd->version == CRAM_1_VERS) { - h->TL_codec = NULL; - h->RI_codec = NULL; - h->RS_codec = NULL; - h->PD_codec = NULL; - h->HC_codec = NULL; - h->SC_codec = NULL; + h->codecs[DS_BS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_BS]), + c->stats[DS_BS], E_BYTE, NULL, + fd->version); + + if (CRAM_MAJOR_VERS(fd->version) == 1) { + h->codecs[DS_TL] = NULL; + h->codecs[DS_RI] = NULL; + h->codecs[DS_RS] = NULL; + h->codecs[DS_PD] = NULL; + h->codecs[DS_HC] = NULL; + h->codecs[DS_SC] = NULL; //fprintf(stderr, "=== TC ===\n"); - h->TC_codec = cram_encoder_init(cram_stats_encoding(fd, c->TC_stats), - c->TC_stats, E_BYTE, NULL, - 
fd->version); + h->codecs[DS_TC] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_TC]), + c->stats[DS_TC], E_BYTE, NULL, + fd->version); //fprintf(stderr, "=== TN ===\n"); -#ifdef TN_external - h->TN_codec = cram_encoder_init(E_EXTERNAL, NULL, E_INT, - (void *)CRAM_EXT_TN, - fd->version); -#else - h->TN_codec = cram_encoder_init(cram_stats_encoding(fd, c->TN_stats), - c->TN_stats, E_INT, NULL, - fd->version); -#endif + h->codecs[DS_TN] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_TN]), + c->stats[DS_TN], E_INT, NULL, + fd->version); } else { - int i2[2] = {0, CRAM_EXT_SC}; - - h->TC_codec = NULL; - h->TN_codec = NULL; + h->codecs[DS_TC] = NULL; + h->codecs[DS_TN] = NULL; //fprintf(stderr, "=== TL ===\n"); - h->TL_codec = cram_encoder_init(cram_stats_encoding(fd, c->TL_stats), - c->TL_stats, E_INT, NULL, - fd->version); + h->codecs[DS_TL] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_TL]), + c->stats[DS_TL], E_INT, NULL, + fd->version); //fprintf(stderr, "=== RI ===\n"); - h->RI_codec = cram_encoder_init(cram_stats_encoding(fd, c->RI_stats), - c->RI_stats, E_INT, NULL, - fd->version); + h->codecs[DS_RI] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RI]), + c->stats[DS_RI], E_INT, NULL, + fd->version); //fprintf(stderr, "=== RS ===\n"); - h->RS_codec = cram_encoder_init(cram_stats_encoding(fd, c->RS_stats), - c->RS_stats, E_INT, NULL, - fd->version); + h->codecs[DS_RS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RS]), + c->stats[DS_RS], E_INT, NULL, + fd->version); //fprintf(stderr, "=== PD ===\n"); - h->PD_codec = cram_encoder_init(cram_stats_encoding(fd, c->PD_stats), - c->PD_stats, E_INT, NULL, - fd->version); + h->codecs[DS_PD] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_PD]), + c->stats[DS_PD], E_INT, NULL, + fd->version); //fprintf(stderr, "=== HC ===\n"); - h->HC_codec = cram_encoder_init(cram_stats_encoding(fd, c->HC_stats), - c->HC_stats, E_INT, NULL, - fd->version); + h->codecs[DS_HC] = 
cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_HC]), + c->stats[DS_HC], E_INT, NULL, + fd->version); //fprintf(stderr, "=== SC ===\n"); - h->SC_codec = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL, - E_BYTE_ARRAY, (void *)i2, - fd->version); + if (1) { + int i2[2] = {0, DS_SC}; + + h->codecs[DS_SC] = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL, + E_BYTE_ARRAY, (void *)i2, + fd->version); + } else { + // Appears to be no practical benefit to using this method, + // but it may work better if we start mixing SC, IN and BB + // elements into the same external block. + cram_byte_array_len_encoder e; + + e.len_encoding = E_EXTERNAL; + e.len_dat = (void *)DS_SC_len; + + e.val_encoding = E_EXTERNAL; + e.val_dat = (void *)DS_SC; + + h->codecs[DS_SC] = cram_encoder_init(E_BYTE_ARRAY_LEN, NULL, + E_BYTE_ARRAY, (void *)&e, + fd->version); + } } //fprintf(stderr, "=== IN ===\n"); { - int i2[2] = {0, CRAM_EXT_IN}; - h->IN_codec = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL, - E_BYTE_ARRAY, (void *)i2, - fd->version); + int i2[2] = {0, DS_IN}; + h->codecs[DS_IN] = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL, + E_BYTE_ARRAY, (void *)i2, + fd->version); } + h->codecs[DS_QS] = cram_encoder_init(E_EXTERNAL, NULL, E_BYTE, + (void *)DS_QS, + fd->version); { - //int i2[2] = {0, 1}; - //h->QS_codec = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL, (void *)i2, - // fd->version); - h->QS_codec = cram_encoder_init(E_EXTERNAL, NULL, E_BYTE, - (void *)CRAM_EXT_QUAL, - fd->version); - } - { - int i2[2] = {0, CRAM_EXT_NAME}; - h->RN_codec = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL, - E_BYTE_ARRAY, (void *)i2, - fd->version); + int i2[2] = {0, DS_RN}; + h->codecs[DS_RN] = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL, + E_BYTE_ARRAY, (void *)i2, + fd->version); } @@ -1465,7 +1614,7 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { slice_offset = c_hdr->method == RAW ? 
c_hdr->uncomp_size : c_hdr->comp_size; - slice_offset += 2 + + slice_offset += 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) + itf8_size(c_hdr->content_id) + itf8_size(c_hdr->comp_size) + itf8_size(c_hdr->uncomp_size); @@ -1490,13 +1639,13 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { ? s->hdr_block->uncomp_size : s->hdr_block->comp_size; - slice_offset += 2 + + slice_offset += 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) + itf8_size(s->hdr_block->content_id) + itf8_size(s->hdr_block->comp_size) + itf8_size(s->hdr_block->uncomp_size); for (j = 0; j < s->hdr->num_blocks; j++) { - slice_offset += 2 + + slice_offset += 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) + itf8_size(s->block[j]->content_id) + itf8_size(s->block[j]->comp_size) + itf8_size(s->block[j]->uncomp_size); @@ -1515,7 +1664,7 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { } /* Cache references up-front if we have unsorted access patterns */ - if (c->refs_used) { + if (!fd->no_ref && c->refs_used) { for (i = 0; i < fd->refs->nref; i++) { if (c->refs_used[i]) cram_ref_decr(fd->refs, i); @@ -1546,12 +1695,12 @@ static int cram_add_feature(cram_container *c, cram_slice *s, if (!r->nfeature++) { r->feature = s->nfeatures; - cram_stats_add(c->FP_stats, f->X.pos); + cram_stats_add(c->stats[DS_FP], f->X.pos); } else { - cram_stats_add(c->FP_stats, + cram_stats_add(c->stats[DS_FP], f->X.pos - s->features[r->feature + r->nfeature-2].X.pos); } - cram_stats_add(c->FC_stats, f->X.code); + cram_stats_add(c->stats[DS_FC], f->X.code); s->features[s->nfeatures++] = *f; @@ -1568,19 +1717,32 @@ static int cram_add_substitution(cram_fd *fd, cram_container *c, f.X.pos = pos+1; f.X.code = 'X'; f.X.base = fd->cram_sub_matrix[ref&0x1f][base&0x1f]; - cram_stats_add(c->BS_stats, f.X.base); + cram_stats_add(c->stats[DS_BS], f.X.base); } else { f.B.pos = pos+1; f.B.code = 'B'; f.B.base = base; f.B.qual = qual; - cram_stats_add(c->BA_stats, f.B.base); - cram_stats_add(c->QS_stats, f.B.qual); + 
cram_stats_add(c->stats[DS_BA], f.B.base); + cram_stats_add(c->stats[DS_QS], f.B.qual); BLOCK_APPEND_CHAR(s->qual_blk, qual); } return cram_add_feature(c, s, r, &f); } +static int cram_add_bases(cram_fd *fd, cram_container *c, + cram_slice *s, cram_record *r, + int pos, int len, char *base) { + cram_feature f; + + f.b.pos = pos+1; + f.b.code = 'b'; + f.b.seq_idx = base - (char *)BLOCK_DATA(s->seqs_blk); + f.b.len = len; + + return cram_add_feature(c, s, r, &f); +} + static int cram_add_base(cram_fd *fd, cram_container *c, cram_slice *s, cram_record *r, int pos, char base, char qual) { @@ -1589,12 +1751,8 @@ static int cram_add_base(cram_fd *fd, cram_container *c, f.B.code = 'B'; f.B.base = base; f.B.qual = qual; -#ifdef BA_external - s->BA_len++; -#else - cram_stats_add(c->BA_stats, base); -#endif - cram_stats_add(c->QS_stats, qual); + cram_stats_add(c->stats[DS_BA], base); + cram_stats_add(c->stats[DS_QS], qual); BLOCK_APPEND_CHAR(s->qual_blk, qual); return cram_add_feature(c, s, r, &f); } @@ -1606,7 +1764,7 @@ static int cram_add_quality(cram_fd *fd, cram_container *c, f.Q.pos = pos+1; f.Q.code = 'Q'; f.Q.qual = qual; - cram_stats_add(c->QS_stats, qual); + cram_stats_add(c->stats[DS_QS], qual); BLOCK_APPEND_CHAR(s->qual_blk, qual); return cram_add_feature(c, s, r, &f); } @@ -1617,7 +1775,7 @@ static int cram_add_deletion(cram_container *c, cram_slice *s, cram_record *r, f.D.pos = pos+1; f.D.code = 'D'; f.D.len = len; - cram_stats_add(c->DL_stats, len); + cram_stats_add(c->stats[DS_DL], len); return cram_add_feature(c, s, r, &f); } @@ -1627,11 +1785,15 @@ static int cram_add_softclip(cram_container *c, cram_slice *s, cram_record *r, f.S.pos = pos+1; f.S.code = 'S'; f.S.len = len; - if (version == CRAM_1_VERS) { + switch (CRAM_MAJOR_VERS(version)) { + case 1: f.S.seq_idx = BLOCK_SIZE(s->base_blk); BLOCK_APPEND(s->base_blk, base, len); BLOCK_APPEND_CHAR(s->base_blk, '\0'); - } else { + break; + + case 2: + default: f.S.seq_idx = BLOCK_SIZE(s->soft_blk); if (base) { 
BLOCK_APPEND(s->soft_blk, base, len); @@ -1641,6 +1803,11 @@ static int cram_add_softclip(cram_container *c, cram_slice *s, cram_record *r, BLOCK_APPEND_CHAR(s->soft_blk, 'N'); } BLOCK_APPEND_CHAR(s->soft_blk, '\0'); + break; + +// default: +// // v3.0 onwards uses BB data-series +// f.S.seq_idx = BLOCK_SIZE(s->soft_blk); } return cram_add_feature(c, s, r, &f); } @@ -1651,7 +1818,7 @@ static int cram_add_hardclip(cram_container *c, cram_slice *s, cram_record *r, f.S.pos = pos+1; f.S.code = 'H'; f.S.len = len; - cram_stats_add(c->HC_stats, len); + cram_stats_add(c->stats[DS_HC], len); return cram_add_feature(c, s, r, &f); } @@ -1661,7 +1828,7 @@ static int cram_add_skip(cram_container *c, cram_slice *s, cram_record *r, f.S.pos = pos+1; f.S.code = 'N'; f.S.len = len; - cram_stats_add(c->RS_stats, len); + cram_stats_add(c->stats[DS_RS], len); return cram_add_feature(c, s, r, &f); } @@ -1671,7 +1838,7 @@ static int cram_add_pad(cram_container *c, cram_slice *s, cram_record *r, f.S.pos = pos+1; f.S.code = 'P'; f.S.len = len; - cram_stats_add(c->PD_stats, len); + cram_stats_add(c->stats[DS_PD], len); return cram_add_feature(c, s, r, &f); } @@ -1683,11 +1850,7 @@ static int cram_add_insertion(cram_container *c, cram_slice *s, cram_record *r, char b = base ? 
*base : 'N'; f.i.code = 'i'; f.i.base = b; -#ifdef BA_external - s->BA_len++; -#else - cram_stats_add(c->BA_stats, b); -#endif + cram_stats_add(c->stats[DS_BA], b); } else { f.I.code = 'I'; f.I.len = len; @@ -1711,7 +1874,7 @@ static int cram_add_insertion(cram_container *c, cram_slice *s, cram_record *r, */ static char *cram_encode_aux_1_0(cram_fd *fd, bam_seq_t *b, cram_container *c, cram_slice *s, cram_record *cr) { - char *aux, *tmp, *rg = NULL, *tmp_tn; + char *aux, *tmp, *rg = NULL; int aux_size = bam_blk_size(b) - ((char *)bam_aux(b) - (char *)&bam_ref(b)); @@ -1719,15 +1882,9 @@ static char *cram_encode_aux_1_0(cram_fd *fd, bam_seq_t *b, cram_container *c, BLOCK_GROW(s->aux_blk, aux_size*1.34+1); tmp = (char *)BLOCK_END(s->aux_blk); -#ifdef TN_external - BLOCK_GROW(s->tn_blk, aux_size); - tmp_tn = (char *)BLOCK_END(s->tn_blk); -#endif - aux = (char *)bam_aux(b); -#ifndef TN_external cr->TN_idx = s->nTN; -#endif + while (aux[0] != 0) { int32_t i32; int r; @@ -1759,17 +1916,13 @@ static char *cram_encode_aux_1_0(cram_fd *fd, bam_seq_t *b, cram_container *c, if (-1 == r) return NULL; -#ifndef TN_external if (s->nTN >= s->aTN) { s->aTN = s->aTN ? 
s->aTN*2 : 1024; if (!(s->TN = realloc(s->TN, s->aTN * sizeof(*s->TN)))) return NULL; } s->TN[s->nTN++] = i32; - cram_stats_add(c->TN_stats, i32); -#else - tmp_tn += itf8_put(tmp_tn, i32); -#endif + cram_stats_add(c->stats[DS_TN], i32); switch(aux[2]) { case 'A': case 'C': case 'c': @@ -1842,19 +1995,13 @@ static char *cram_encode_aux_1_0(cram_fd *fd, bam_seq_t *b, cram_container *c, return NULL; } } - cram_stats_add(c->TC_stats, cr->ntags); + cram_stats_add(c->stats[DS_TC], cr->ntags); cr->aux = BLOCK_SIZE(s->aux_blk); cr->aux_size = (uc *)tmp - (BLOCK_DATA(s->aux_blk) + cr->aux); BLOCK_SIZE(s->aux_blk) = (uc *)tmp - BLOCK_DATA(s->aux_blk); assert(s->aux_blk->byte <= s->aux_blk->alloc); -#ifdef TN_external - cr->tn = BLOCK_SIZE(s->tn_blk); - BLOCK_SIZE(s->tn_blk) = (uc *)tmp_tn - BLOCK_DATA(s->tn_blk); - assert(s->tn_blk->byte <= s->tn_blk->alloc); -#endif - return rg; } @@ -1868,12 +2015,7 @@ static char *cram_encode_aux_1_0(cram_fd *fd, bam_seq_t *b, cram_container *c, static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c, cram_slice *s, cram_record *cr) { char *aux, *orig, *tmp, *rg = NULL; -#ifdef SAMTOOLS int aux_size = bam_get_l_aux(b); -#else - int aux_size = bam_blk_size(b) - - ((char *)bam_aux(b) - (char *)&bam_ref(b)); -#endif cram_block *td_b = c->comp_hdr->TD_blk; int TD_blk_size = BLOCK_SIZE(td_b), new; char *key; @@ -1920,6 +2062,150 @@ static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c, if (-1 == r) return NULL; + // BQ:Z + if (aux[0] == 'B' && aux[1] == 'Q' && aux[2] == 'Z') { + char *tmp; + if (!s->aux_BQ_blk) + if (!(s->aux_BQ_blk = cram_new_block(EXTERNAL, DS_aux_BQ))) + return NULL; + BLOCK_GROW(s->aux_BQ_blk, aux_size*1.34+1); + tmp = (char *)BLOCK_END(s->aux_BQ_blk); + aux += 3; + while ((*tmp++=*aux++)); + *tmp++ = '\t'; + BLOCK_SIZE(s->aux_BQ_blk) = (uc *)tmp - BLOCK_DATA(s->aux_BQ_blk); + continue; + } + + // BD:Z + if (aux[0] == 'B' && aux[1]=='D' && aux[2] == 'Z') { + char *tmp; + if 
(!s->aux_BD_blk) + if (!(s->aux_BD_blk = cram_new_block(EXTERNAL, DS_aux_BD))) + return NULL; + BLOCK_GROW(s->aux_BD_blk, aux_size*1.34+1); + tmp = (char *)BLOCK_END(s->aux_BD_blk); + aux += 3; + while ((*tmp++=*aux++)); + *tmp++ = '\t'; + BLOCK_SIZE(s->aux_BD_blk) = (uc *)tmp - BLOCK_DATA(s->aux_BD_blk); + continue; + } + + // BI:Z + if (aux[0] == 'B' && aux[1]=='I' && aux[2] == 'Z') { + char *tmp; + if (!s->aux_BI_blk) + if (!(s->aux_BI_blk = cram_new_block(EXTERNAL, DS_aux_BI))) + return NULL; + BLOCK_GROW(s->aux_BI_blk, aux_size*1.34+1); + tmp = (char *)BLOCK_END(s->aux_BI_blk); + aux += 3; + while ((*tmp++=*aux++)); + *tmp++ = '\t'; + BLOCK_SIZE(s->aux_BI_blk) = (uc *)tmp - BLOCK_DATA(s->aux_BI_blk); + continue; + } + + // OQ:Z: + if (aux[0] == 'O' && aux[1] == 'Q' && aux[2] == 'Z') { + char *tmp; + if (!s->aux_OQ_blk) + if (!(s->aux_OQ_blk = cram_new_block(EXTERNAL, DS_aux_OQ))) + return NULL; + BLOCK_GROW(s->aux_OQ_blk, aux_size*1.34+1); + tmp = (char *)BLOCK_END(s->aux_OQ_blk); + aux += 3; + while ((*tmp++=*aux++)); + *tmp++ = '\t'; + BLOCK_SIZE(s->aux_OQ_blk) = (uc *)tmp - BLOCK_DATA(s->aux_OQ_blk); + continue; + } + + // FZ:B or ZM:B + if ((aux[0] == 'F' && aux[1] == 'Z' && aux[2] == 'B') || + (aux[0] == 'Z' && aux[1] == 'M' && aux[2] == 'B')) { + int type = aux[3], blen; + uint32_t count = (uint32_t)((((unsigned char *)aux)[4]<< 0) + + (((unsigned char *)aux)[5]<< 8) + + (((unsigned char *)aux)[6]<<16) + + (((unsigned char *)aux)[7]<<24)); + char *tmp; + if (!s->aux_FZ_blk) + if (!(s->aux_FZ_blk = cram_new_block(EXTERNAL, DS_aux_FZ))) + return NULL; + BLOCK_GROW(s->aux_FZ_blk, aux_size*1.34+1); + tmp = (char *)BLOCK_END(s->aux_FZ_blk); + + // skip TN field + aux+=3; + + // We use BYTE_ARRAY_LEN with external length, so store that first + switch (type) { + case 'c': case 'C': + blen = count; + break; + case 's': case 'S': + blen = 2*count; + break; + case 'i': case 'I': case 'f': + blen = 4*count; + break; + default: + fprintf(stderr, "Unknown sub-type 
'%c' for aux type 'B'\n", + type); + return NULL; + + } + + blen += 5; // sub-type & length + tmp += itf8_put(tmp, blen); + + // The tag data itself + memcpy(tmp, aux, blen); tmp += blen; aux += blen; + + BLOCK_SIZE(s->aux_FZ_blk) = (uc *)tmp - BLOCK_DATA(s->aux_FZ_blk); + continue; + } + + // Other quality data - {Q2,E2,U2,CQ}:Z and similar + if (((aux[0] == 'Q' && aux[1] == '2') || + (aux[0] == 'U' && aux[1] == '2') || + (aux[0] == 'Q' && aux[1] == 'T') || + (aux[0] == 'C' && aux[1] == 'Q')) && aux[2] == 'Z') { + char *tmp; + if (!s->aux_oq_blk) + if (!(s->aux_oq_blk = cram_new_block(EXTERNAL, DS_aux_oq))) + return NULL; + BLOCK_GROW(s->aux_oq_blk, aux_size*1.34+1); + tmp = (char *)BLOCK_END(s->aux_oq_blk); + aux += 3; + while ((*tmp++=*aux++)); + *tmp++ = '\t'; + BLOCK_SIZE(s->aux_oq_blk) = (uc *)tmp - BLOCK_DATA(s->aux_oq_blk); + continue; + } + + // Other sequence data - {R2,E2,CS,BC,RT}:Z and similar + if (((aux[0] == 'R' && aux[1] == '2') || + (aux[0] == 'E' && aux[1] == '2') || + (aux[0] == 'C' && aux[1] == 'S') || + (aux[0] == 'B' && aux[1] == 'C') || + (aux[0] == 'R' && aux[1] == 'T')) && aux[2] == 'Z') { + char *tmp; + if (!s->aux_os_blk) + if (!(s->aux_os_blk = cram_new_block(EXTERNAL, DS_aux_os))) + return NULL; + BLOCK_GROW(s->aux_os_blk, aux_size*1.34+1); + tmp = (char *)BLOCK_END(s->aux_os_blk); + aux += 3; + while ((*tmp++=*aux++)); + *tmp++ = '\t'; + BLOCK_SIZE(s->aux_os_blk) = (uc *)tmp - BLOCK_DATA(s->aux_os_blk); + continue; + } + + switch(aux[2]) { case 'A': case 'C': case 'c': aux+=3; @@ -1940,11 +2226,22 @@ static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c, aux+=3; //*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; + break; case 'Z': case 'H': - aux+=3; - while ((*tmp++=*aux++)); - *tmp++ = '\t'; // stop byte + { + char *tmp; + if (!s->aux_oz_blk) + if (!(s->aux_oz_blk = cram_new_block(EXTERNAL, 
DS_aux_oz))) + return NULL; + BLOCK_GROW(s->aux_oz_blk, aux_size*1.34+1); + tmp = (char *)BLOCK_END(s->aux_oz_blk); + aux += 3; + while ((*tmp++=*aux++)); + *tmp++ = '\t'; + BLOCK_SIZE(s->aux_oz_blk) = (uc *)tmp - + BLOCK_DATA(s->aux_oz_blk); + } break; case 'B': { @@ -1974,10 +2271,8 @@ static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c, } - tmp += itf8_put(tmp, blen+5); - - *tmp++=*aux++; // sub-type & length - *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; + blen += 5; // sub-type & length + tmp += itf8_put(tmp, blen); // The tag data itself memcpy(tmp, aux, blen); tmp += blen; aux += blen; @@ -2011,7 +2306,7 @@ static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c, } cr->TL = kh_val(c->comp_hdr->TD_hash, k); - cram_stats_add(c->TL_stats, cr->TL); + cram_stats_add(c->stats[DS_TL], cr->TL); cr->aux = BLOCK_SIZE(s->aux_blk); cr->aux_size = (uc *)tmp - (BLOCK_DATA(s->aux_blk) + cr->aux); @@ -2135,13 +2430,14 @@ static cram_container *cram_next_container(cram_fd *fd, bam_seq_t *b) { static int process_one_read(cram_fd *fd, cram_container *c, cram_slice *s, cram_record *cr, bam_seq_t *b, int rnum) { - int i, fake_qual = 0; + int i, fake_qual = -1; char *cp, *rg; char *ref, *seq, *qual; // FIXME: multi-ref containers ref = c->ref; + cr->len = bam_seq_len(b); cram_stats_add(c->stats[DS_RL], cr->len); //fprintf(stderr, "%s => %d\n", rg ? 
rg : "\"\"", cr->rg); @@ -2149,8 +2445,8 @@ static int process_one_read(cram_fd *fd, cram_container *c, //cr->mate_line; // index to another cram_record //cr->mate_flags; // MF //cr->ntags; // TC - cr->ntags = 0; //cram_stats_add(c->TC_stats, cr->ntags); - if (fd->version == CRAM_1_VERS) + cr->ntags = 0; //cram_stats_add(c->stats[DS_TC], cr->ntags); + if (CRAM_MAJOR_VERS(fd->version) == 1) rg = cram_encode_aux_1_0(fd, b, c, s, cr); else rg = cram_encode_aux(fd, b, c, s, cr); @@ -2163,45 +2459,46 @@ static int process_one_read(cram_fd *fd, cram_container *c, if (rg) { SAM_RG *brg = sam_hdr_find_rg(fd->header, rg); cr->rg = brg ? brg->id : -1; - } else if (fd->version == CRAM_1_VERS) { + } else if (CRAM_MAJOR_VERS(fd->version) == 1) { SAM_RG *brg = sam_hdr_find_rg(fd->header, "UNKNOWN"); assert(brg); } else { cr->rg = -1; } - cram_stats_add(c->RG_stats, cr->rg); + cram_stats_add(c->stats[DS_RG], cr->rg); - cr->ref_id = bam_ref(b); cram_stats_add(c->RI_stats, cr->ref_id); + cr->ref_id = bam_ref(b); cram_stats_add(c->stats[DS_RI], cr->ref_id); cr->flags = bam_flag(b); if (bam_cigar_len(b) == 0) cr->flags |= BAM_FUNMAP; - cram_stats_add(c->BF_stats, fd->cram_flag_swap[cr->flags & 0xfff]); + cram_stats_add(c->stats[DS_BF], fd->cram_flag_swap[cr->flags & 0xfff]); - if (!fd->no_ref) + // Non reference based encoding means storing the bases verbatim as features, which in + // turn means every base also has a quality already stored. 
+ if (!fd->no_ref || CRAM_MAJOR_VERS(fd->version) >= 3) cr->cram_flags = CRAM_FLAG_PRESERVE_QUAL_SCORES; else cr->cram_flags = 0; - //cram_stats_add(c->CF_stats, cr->cram_flags); + //cram_stats_add(c->stats[DS_CF], cr->cram_flags); - cr->len = bam_seq_len(b); cram_stats_add(c->RL_stats, cr->len); c->num_bases += cr->len; cr->apos = bam_pos(b)+1; if (c->pos_sorted) { if (cr->apos < s->last_apos) { c->pos_sorted = 0; } else { - cram_stats_add(c->AP_stats, cr->apos - s->last_apos); + cram_stats_add(c->stats[DS_AP], cr->apos - s->last_apos); s->last_apos = cr->apos; } } else { - //cram_stats_add(c->AP_stats, cr->apos); + //cram_stats_add(c->stats[DS_AP], cr->apos); } c->max_apos += (cr->apos > c->max_apos) * (cr->apos - c->max_apos); cr->name = BLOCK_SIZE(s->name_blk); cr->name_len = bam_name_len(b); - cram_stats_add(c->RN_stats, cr->name_len); + cram_stats_add(c->stats[DS_RN], cr->name_len); BLOCK_APPEND(s->name_blk, bam_name(b), bam_name_len(b)); @@ -2209,7 +2506,7 @@ static int process_one_read(cram_fd *fd, cram_container *c, /* * This seqs_ds is largely pointless and it could reuse the same memory * over and over. - * s->base_ds is what we need for encoding. + * s->base_blk is what we need for encoding. */ cr->seq = BLOCK_SIZE(s->seqs_blk); cr->qual = BLOCK_SIZE(s->qual_blk); @@ -2218,14 +2515,57 @@ static int process_one_read(cram_fd *fd, cram_container *c, seq = cp = (char *)BLOCK_END(s->seqs_blk); *seq = 0; - for (i = 0; i < cr->len; i++) { - // FIXME: do 2 char at a time for efficiency -#ifdef SAMTOOLS - cp[i] = seq_nt16_str[bam_seqi(bam_seq(b), i)]; +#ifdef ALLOW_UAC + { + // Convert seq 2 bases at a time for speed. 
+ static const uint16_t code2base[256] = { + 15677, 16701, 17213, 19773, 18237, 21053, 21309, 22077, + 21565, 22333, 22845, 18493, 19261, 17469, 16957, 20029, + 15681, 16705, 17217, 19777, 18241, 21057, 21313, 22081, + 21569, 22337, 22849, 18497, 19265, 17473, 16961, 20033, + 15683, 16707, 17219, 19779, 18243, 21059, 21315, 22083, + 21571, 22339, 22851, 18499, 19267, 17475, 16963, 20035, + 15693, 16717, 17229, 19789, 18253, 21069, 21325, 22093, + 21581, 22349, 22861, 18509, 19277, 17485, 16973, 20045, + 15687, 16711, 17223, 19783, 18247, 21063, 21319, 22087, + 21575, 22343, 22855, 18503, 19271, 17479, 16967, 20039, + 15698, 16722, 17234, 19794, 18258, 21074, 21330, 22098, + 21586, 22354, 22866, 18514, 19282, 17490, 16978, 20050, + 15699, 16723, 17235, 19795, 18259, 21075, 21331, 22099, + 21587, 22355, 22867, 18515, 19283, 17491, 16979, 20051, + 15702, 16726, 17238, 19798, 18262, 21078, 21334, 22102, + 21590, 22358, 22870, 18518, 19286, 17494, 16982, 20054, + 15700, 16724, 17236, 19796, 18260, 21076, 21332, 22100, + 21588, 22356, 22868, 18516, 19284, 17492, 16980, 20052, + 15703, 16727, 17239, 19799, 18263, 21079, 21335, 22103, + 21591, 22359, 22871, 18519, 19287, 17495, 16983, 20055, + 15705, 16729, 17241, 19801, 18265, 21081, 21337, 22105, + 21593, 22361, 22873, 18521, 19289, 17497, 16985, 20057, + 15688, 16712, 17224, 19784, 18248, 21064, 21320, 22088, + 21576, 22344, 22856, 18504, 19272, 17480, 16968, 20040, + 15691, 16715, 17227, 19787, 18251, 21067, 21323, 22091, + 21579, 22347, 22859, 18507, 19275, 17483, 16971, 20043, + 15684, 16708, 17220, 19780, 18244, 21060, 21316, 22084, + 21572, 22340, 22852, 18500, 19268, 17476, 16964, 20036, + 15682, 16706, 17218, 19778, 18242, 21058, 21314, 22082, + 21570, 22338, 22850, 18498, 19266, 17474, 16962, 20034, + 15694, 16718, 17230, 19790, 18254, 21070, 21326, 22094, + 21582, 22350, 22862, 18510, 19278, 17486, 16974, 20046 + }; + + int l2 = cr->len / 2; + unsigned char *from = (unsigned char *)bam_seq(b); + uint16_t *cpi = 
(uint16_t *)cp; + cp[0] = 0; + for (i = 0; i < l2; i++) + cpi[i] = le_int2(code2base[from[i]]); + if ((i *= 2) < cr->len) + cp[i] = seq_nt16_str[bam_seqi(bam_seq(b), i)]; + } #else - cp[i] = bam_nt16_rev_table[bam_seqi(bam_seq(b), i)]; + for (i = 0; i < cr->len; i++) + cp[i] = seq_nt16_str[bam_seqi(bam_seq(b), i)]; #endif - } BLOCK_SIZE(s->seqs_blk) += cr->len; qual = cp = (char *)bam_qual(b); @@ -2269,24 +2609,63 @@ static int process_one_read(cram_fd *fd, cram_container *c, if (!fd->no_ref && cr->len) { int end = cig_len+apos < c->ref_end ? cig_len : c->ref_end - apos; - for (l = 0; l < end && seq[spos]; l++, apos++, spos++) { - if (ref[apos] != seq[spos]) { - //fprintf(stderr, "Subst: %d; %c vs %c\n", - // spos, ref[apos], seq[spos]); - if (cram_add_substitution(fd, c, s, cr, spos, - seq[spos], qual[spos], - ref[apos])) - return -1; + char *sp = &seq[spos]; + char *rp = &ref[apos]; + char *qp = &qual[spos]; + for (l = 0; l < end; l++) { + if (rp[l] != sp[l]) { + if (!sp[l]) + break; + if (0 && CRAM_MAJOR_VERS(fd->version) >= 3) { + // Disabled for the time being as it doesn't + // seem to gain us much. 
+ int ol=l; + while (l 1) { + if (cram_add_bases(fd, c, s, cr, spos+ol, + l-ol, &seq[spos+ol])) + return -1; + l--; + } else { + l = ol; + if (cram_add_substitution(fd, c, s, cr, + spos+l, sp[l], + qp[l], rp[l])) + return -1; + } + } else { + if (cram_add_substitution(fd, c, s, cr, spos+l, + sp[l], qp[l], rp[l])) + return -1; + } } } + spos += l; + apos += l; } if (l < cig_len && cr->len) { - /* off end of sequence or non-ref based output */ - for (; l < cig_len && seq[spos]; l++, spos++) { - if (cram_add_base(fd, c, s, cr, spos, - seq[spos], qual[spos])) - return -1; + if (fd->no_ref) { + if (CRAM_MAJOR_VERS(fd->version) == 3) { + if (cram_add_bases(fd, c, s, cr, spos, + cig_len-l, &seq[spos])) + return -1; + spos += cig_len-l; + } else { + for (; l < cig_len && seq[spos]; l++, spos++) { + if (cram_add_base(fd, c, s, cr, spos, + seq[spos], qual[spos])) + return -1; + } + } + } else { + /* off end of sequence or non-ref based output */ + for (; l < cig_len && seq[spos]; l++, spos++) { + if (cram_add_base(fd, c, s, cr, spos, + seq[spos], qual[spos])) + return -1; + } } apos += cig_len; } else if (!cr->len) { @@ -2326,7 +2705,8 @@ static int process_one_read(cram_fd *fd, cram_container *c, cr->len ? 
&seq[spos] : NULL, fd->version)) return -1; - if (fd->no_ref) { + if (fd->no_ref && + !(cr->cram_flags & CRAM_FLAG_PRESERVE_QUAL_SCORES)) { if (cr->len) { for (l = 0; l < cig_len; l++, spos++) { cram_add_quality(fd, c, s, cr, spos, qual[spos]); @@ -2354,7 +2734,7 @@ static int process_one_read(cram_fd *fd, cram_container *c, } fake_qual = spos; cr->aend = MIN(apos, c->ref_end); - cram_stats_add(c->FN_stats, cr->nfeature); + cram_stats_add(c->stats[DS_FN], cr->nfeature); } else { // Unmapped cr->cram_flags |= CRAM_FLAG_PRESERVE_QUAL_SCORES; @@ -2362,12 +2742,8 @@ static int process_one_read(cram_fd *fd, cram_container *c, cr->ncigar = 0; cr->nfeature = 0; cr->aend = cr->apos; -#ifdef BA_external - s->BA_len += cr->len; -#else for (i = 0; i < cr->len; i++) - cram_stats_add(c->BA_stats, seq[i]); -#endif + cram_stats_add(c->stats[DS_BA], seq[i]); } /* @@ -2378,7 +2754,7 @@ static int process_one_read(cram_fd *fd, cram_container *c, if (cr->cram_flags & CRAM_FLAG_PRESERVE_QUAL_SCORES) { /* Special case of seq "*" */ if (cr->len == 0) { - cram_stats_add(c->RL_stats, cr->len = fake_qual); + cram_stats_add(c->stats[DS_RL], cr->len = fake_qual); BLOCK_GROW(s->qual_blk, cr->len); cp = (char *)BLOCK_END(s->qual_blk); memset(cp, 255, cr->len); @@ -2393,7 +2769,8 @@ static int process_one_read(cram_fd *fd, cram_container *c, BLOCK_SIZE(s->qual_blk) += cr->len; } else { if (cr->len == 0) { - cram_stats_add(c->RL_stats, cr->len = cr->aend - cr->apos + 1); + cr->len = fake_qual >= 0 ? fake_qual : cr->aend - cr->apos + 1; + cram_stats_add(c->stats[DS_RL], cr->len); } } @@ -2401,6 +2778,7 @@ static int process_one_read(cram_fd *fd, cram_container *c, { int new; khint_t k; + int sec = (cr->flags & BAM_FSECONDARY) ? 
1 : 0; //fprintf(stderr, "Checking %"PRId64"/%.*s\t", rnum, // cr->name_len, DSTRING_STR(s->name_ds)+cr->name); @@ -2411,63 +2789,116 @@ static int process_one_read(cram_fd *fd, cram_container *c, if (!key) return -1; - k = kh_put(m_s2i, s->pair, key, &new); + k = kh_put(m_s2i, s->pair[sec], key, &new); if (-1 == new) return -1; else if (new > 0) - kh_val(s->pair, k) = rnum; + kh_val(s->pair[sec], k) = rnum; } else { new = 1; } if (new == 0) { - cram_record *p = &s->crecs[kh_val(s->pair, k)]; + cram_record *p = &s->crecs[kh_val(s->pair[sec], k)]; + int aleft, aright, sign; + + aleft = MIN(cr->apos, p->apos); + aright = MAX(cr->aend, p->aend); + if (cr->apos < p->apos) { + sign = 1; + } else if (cr->apos > p->apos) { + sign = -1; + } else if (cr->flags & BAM_FREAD1) { + sign = 1; + } else { + sign = -1; + } - //fprintf(stderr, "paired %"PRId64"\n", kh_val(s->pair, k)); + //fprintf(stderr, "paired %"PRId64"\n", kh_val(s->pair[sec], k)); - // copy from p to cr - cr->mate_pos = p->apos; - cram_stats_add(c->NP_stats, cr->mate_pos); + // This vs p: tlen, matepos, flags + if (bam_ins_size(b) != sign*(aright-aleft+1)) + goto detached; - cr->tlen = cr->aend - p->apos; - cram_stats_add(c->TS_stats, cr->tlen); + if (MAX(bam_mate_pos(b)+1, 0) != p->apos) + goto detached; - cr->mate_flags = - ((p->flags & BAM_FMUNMAP) == BAM_FMUNMAP) * CRAM_M_UNMAP + - ((p->flags & BAM_FMREVERSE) == BAM_FMREVERSE) * CRAM_M_REVERSE; - cram_stats_add(c->MF_stats, cr->mate_flags); + if (((bam_flag(b) & BAM_FMUNMAP) != 0) != + ((p->flags & BAM_FUNMAP) != 0)) + goto detached; + + if (((bam_flag(b) & BAM_FMREVERSE) != 0) != + ((p->flags & BAM_FREVERSE) != 0)) + goto detached; + + + // p vs this: tlen, matepos, flags + if (p->tlen != -sign*(aright-aleft+1)) + goto detached; + + if (p->mate_pos != cr->apos) + goto detached; - // copy from cr to p - cram_stats_del(c->NP_stats, p->mate_pos); - p->mate_pos = cr->apos; - cram_stats_add(c->NP_stats, p->mate_pos); + if (((p->flags & BAM_FMUNMAP) != 0) != + 
((p->mate_flags & CRAM_M_UNMAP) != 0)) + goto detached; - cram_stats_del(c->MF_stats, p->mate_flags); - p->mate_flags = - ((cr->flags & BAM_FMUNMAP) == BAM_FMUNMAP) * CRAM_M_UNMAP + - ((cr->flags & BAM_FMREVERSE) == BAM_FMREVERSE)* CRAM_M_REVERSE; - cram_stats_add(c->MF_stats, p->mate_flags); + if (((p->flags & BAM_FMREVERSE) != 0) != + ((p->mate_flags & CRAM_M_REVERSE) != 0)) + goto detached; - cram_stats_del(c->TS_stats, p->tlen); - p->tlen = p->apos - cr->aend; - cram_stats_add(c->TS_stats, p->tlen); + // Supplementary reads are just too ill defined + if ((cr->flags & BAM_FSUPPLEMENTARY) || + (p->flags & BAM_FSUPPLEMENTARY)) + goto detached; + + /* + * The fields below are unused when encoding this read as it is + * no longer detached. In theory they may get referred to when + * processing a 3rd or 4th read in this template?, so we set them + * here just to be sure. + * + * They do not need cram_stats_add() calls those as they are + * not emitted. + */ + cr->mate_pos = p->apos; + cr->tlen = sign*(aright-aleft+1); + cr->mate_flags = + ((p->flags & BAM_FMUNMAP) == BAM_FMUNMAP) * CRAM_M_UNMAP + + ((p->flags & BAM_FMREVERSE) == BAM_FMREVERSE) * CRAM_M_REVERSE; + + // Decrement statistics aggregated earlier + cram_stats_del(c->stats[DS_NP], p->mate_pos); + cram_stats_del(c->stats[DS_MF], p->mate_flags); + cram_stats_del(c->stats[DS_TS], p->tlen); + cram_stats_del(c->stats[DS_NS], p->mate_ref_id); + + /* Similarly we could correct the p-> values too, but these will no + * longer have any code that refers back to them as the new 'p' + * for this template is our current 'cr'. 
+ */ + //p->mate_pos = cr->apos; + //p->mate_flags = + // ((cr->flags & BAM_FMUNMAP) == BAM_FMUNMAP) * CRAM_M_UNMAP + + // ((cr->flags & BAM_FMREVERSE) == BAM_FMREVERSE)* CRAM_M_REVERSE; + //p->tlen = p->apos - cr->aend; // Clear detached from cr flags - //cram_stats_del(c->CF_stats, cr->cram_flags); cr->cram_flags &= ~CRAM_FLAG_DETACHED; - cram_stats_add(c->CF_stats, cr->cram_flags); + cram_stats_add(c->stats[DS_CF], cr->cram_flags); // Clear detached from p flags and set downstream - cram_stats_del(c->CF_stats, p->cram_flags); + cram_stats_del(c->stats[DS_CF], p->cram_flags); p->cram_flags &= ~CRAM_FLAG_DETACHED; p->cram_flags |= CRAM_FLAG_MATE_DOWNSTREAM; - cram_stats_add(c->CF_stats, p->cram_flags); + cram_stats_add(c->stats[DS_CF], p->cram_flags); - p->mate_line = rnum - (kh_val(s->pair, k) + 1); - cram_stats_add(c->NF_stats, p->mate_line); + p->mate_line = rnum - (kh_val(s->pair[sec], k) + 1); + cram_stats_add(c->stats[DS_NF], p->mate_line); - kh_val(s->pair, k) = rnum; + kh_val(s->pair[sec], k) = rnum; } else { + detached: //fprintf(stderr, "unpaired\n"); /* Derive mate flags from this flag */ @@ -2477,24 +2908,24 @@ static int process_one_read(cram_fd *fd, cram_container *c, if (bam_flag(b) & BAM_FMREVERSE) cr->mate_flags |= CRAM_M_REVERSE; - cram_stats_add(c->MF_stats, cr->mate_flags); + cram_stats_add(c->stats[DS_MF], cr->mate_flags); cr->mate_pos = MAX(bam_mate_pos(b)+1, 0); - cram_stats_add(c->NP_stats, cr->mate_pos); + cram_stats_add(c->stats[DS_NP], cr->mate_pos); cr->tlen = bam_ins_size(b); - cram_stats_add(c->TS_stats, cr->tlen); + cram_stats_add(c->stats[DS_TS], cr->tlen); cr->cram_flags |= CRAM_FLAG_DETACHED; - cram_stats_add(c->CF_stats, cr->cram_flags); + cram_stats_add(c->stats[DS_CF], cr->cram_flags); + cram_stats_add(c->stats[DS_NS], bam_mate_ref(b)); } } cr->mqual = bam_map_qual(b); - cram_stats_add(c->MQ_stats, cr->mqual); + cram_stats_add(c->stats[DS_MQ], cr->mqual); cr->mate_ref_id = bam_mate_ref(b); - cram_stats_add(c->NS_stats, 
cr->mate_ref_id); if (!(bam_flag(b) & BAM_FUNMAP)) { if (c->first_base > cr->apos) @@ -2549,10 +2980,17 @@ int cram_put_bam_seq(cram_fd *fd, bam_seq_t *b) { slice_rec = c->slice_rec; curr_rec = c->curr_rec; - if (fd->version == CRAM_1_VERS || - c->curr_rec == c->max_rec || fd->multi_seq != 1 || !c->slice) - if (NULL == (c = cram_next_container(fd, b))) + if (CRAM_MAJOR_VERS(fd->version) == 1 || + c->curr_rec == c->max_rec || fd->multi_seq != 1 || !c->slice) { + if (NULL == (c = cram_next_container(fd, b))) { + if (fd->ctr) { + // prevent cram_close attempting to flush + cram_free_container(fd->ctr); + fd->ctr = NULL; + } return -1; + } + } /* * Due to our processing order, some things we've already done we @@ -2579,7 +3017,7 @@ int cram_put_bam_seq(cram_fd *fd, bam_seq_t *b) { // Have we seen this reference before? if (bam_ref(b) >= 0 && bam_ref(b) != curr_ref && !fd->embed_ref && - !fd->unsorted) { + !fd->unsorted && multi_seq) { if (!c->refs_used) { pthread_mutex_lock(&fd->ref_lock); @@ -2618,7 +3056,7 @@ int cram_put_bam_seq(cram_fd *fd, bam_seq_t *b) { /* Copy or alloc+copy the bam record, for later encoding */ if (c->bams[c->curr_c_rec]) - bam_copy(&c->bams[c->curr_c_rec], b); + bam_copy1(c->bams[c->curr_c_rec], b); else c->bams[c->curr_c_rec] = bam_dup(b); diff --git a/htslib/cram/cram_index.c b/htslib/cram/cram_index.c index d16f6018..86672231 100644 --- a/htslib/cram/cram_index.c +++ b/htslib/cram/cram_index.c @@ -86,6 +86,52 @@ static void dump_index(cram_fd *fd) { } #endif +static int kget_int32(kstring_t *k, size_t *pos, int32_t *val_p) { + int sign = 1; + int32_t val = 0; + size_t p = *pos; + + while (p < k->l && (k->s[p] == ' ' || k->s[p] == '\t')) + p++; + + if (p < k->l && k->s[p] == '-') + sign = -1, p++; + + if (p >= k->l || !(k->s[p] >= '0' && k->s[p] <= '9')) + return -1; + + while (p < k->l && k->s[p] >= '0' && k->s[p] <= '9') + val = val*10 + k->s[p++]-'0'; + + *pos = p; + *val_p = sign*val; + + return 0; +} + +static int kget_int64(kstring_t 
*k, size_t *pos, int64_t *val_p) { + int sign = 1; + int64_t val = 0; + size_t p = *pos; + + while (p < k->l && (k->s[p] == ' ' || k->s[p] == '\t')) + p++; + + if (p < k->l && k->s[p] == '-') + sign = -1, p++; + + if (p >= k->l || !(k->s[p] >= '0' && k->s[p] <= '9')) + return -1; + + while (p < k->l && k->s[p] >= '0' && k->s[p] <= '9') + val = val*10 + k->s[p++]-'0'; + + *pos = p; + *val_p = sign*val; + + return 0; +} + /* * Loads a CRAM .crai index into memory. * @@ -162,21 +208,24 @@ int cram_index_load(cram_fd *fd, const char *fn) { // Parse it line at a time do { - int nchars; - char *line = &kstr.s[pos]; - /* 1.1 layout */ - if (sscanf(line, "%d\t%d\t%d\t%"PRId64"\t%d\t%d%n", - &e.refid, - &e.start, - &e.end, - &e.offset, - &e.slice, - &e.len, - &nchars) != 6) { - free(kstr.s); - free(idx_stack); - return -1; + if (kget_int32(&kstr, &pos, &e.refid) == -1) { + free(kstr.s); free(idx_stack); return -1; + } + if (kget_int32(&kstr, &pos, &e.start) == -1) { + free(kstr.s); free(idx_stack); return -1; + } + if (kget_int32(&kstr, &pos, &e.end) == -1) { + free(kstr.s); free(idx_stack); return -1; + } + if (kget_int64(&kstr, &pos, &e.offset) == -1) { + free(kstr.s); free(idx_stack); return -1; + } + if (kget_int32(&kstr, &pos, &e.slice) == -1) { + free(kstr.s); free(idx_stack); return -1; + } + if (kget_int32(&kstr, &pos, &e.len) == -1) { + free(kstr.s); free(idx_stack); return -1; } e.end += e.start-1; @@ -227,7 +276,6 @@ int cram_index_load(cram_fd *fd, const char *fn) { } idx_stack[idx_stack_ptr] = idx; - pos += nchars; while (pos < kstr.l && kstr.s[pos] != '\n') pos++; pos++; @@ -313,6 +361,9 @@ cram_index *cram_index_query(cram_fd *fd, int refid, int pos, continue; } } + // i==j or i==j-1. Check if j is better. 
+ if (from->e[j].start < pos && from->e[j].refid == refid) + i = j; /* The above found *a* bin overlapping, but not necessarily the first */ while (i > 0 && from->e[i-1].end >= pos) @@ -359,6 +410,7 @@ int cram_seek_to_refpos(cram_fd *fd, cram_range *r) { if (fd->ctr) { cram_free_container(fd->ctr); fd->ctr = NULL; + fd->ooc = 0; } return 0; diff --git a/htslib/cram/cram_io.c b/htslib/cram/cram_io.c index c5a4c4eb..5efc92d9 100644 --- a/htslib/cram/cram_io.c +++ b/htslib/cram/cram_io.c @@ -57,6 +57,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifdef HAVE_LIBBZ2 #include #endif +#ifdef HAVE_LIBLZMA +#include +#endif #include #include #include @@ -66,6 +69,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "cram/os.h" #include "cram/md5.h" #include "cram/open_trace_file.h" +#include "cram/rANS_static.h" //#define REF_DEBUG @@ -78,19 +82,13 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define RP(...) #endif -#ifdef SAMTOOLS #include "htslib/hfile.h" -#define paranoid_hclose(fp) (hclose(fp)) -#else -#define hclose_abruptly(fp) (fclose(fp)) -#define hflush(fp) (fflush(fp)) -#define hgetc(fp) (getc(fp)) -#define hputc(c, fp) (putc((c), (fp))) -#define hread(fp, buffer, nbytes) (fread((buffer), 1, (nbytes), (fp))) -#define hseek(fp, offset, whence) (fseeko((fp), (offset), (whence))) -#define hwrite(fp, buffer, nbytes) (fwrite((buffer), 1, (nbytes), (fp))) -#define paranoid_hclose(fp) (paranoid_fclose(fp)) -#endif +#include "htslib/bgzf.h" +#include "htslib/faidx.h" + +#define TRIAL_SPAN 50 +#define NTRIALS 3 + /* ---------------------------------------------------------------------- * ITF8 encoding and decoding. 
@@ -644,6 +642,90 @@ static char *zlib_mem_deflate(char *data, size_t size, size_t *cdata_size, return (char *)cdata; } +#ifdef HAVE_LIBLZMA +/* ------------------------------------------------------------------------ */ +/* + * Data compression routines using liblzma (xz) + * + * On a test set this shrunk the main db from 136157104 bytes to 114796168, but + * caused tg_index to grow from 2m43.707s to 15m3.961s. Exporting as bfastq + * went from 18.3s to 36.3s. So decompression suffers too, but not as bad + * as compression times. + * + * For now we disable this functionality. If it's to be reenabled make sure you + * improve the mem_inflate implementation as it's just a test hack at the + * moment. + */ + +static char *lzma_mem_deflate(char *data, size_t size, size_t *cdata_size, + int level) { + char *out; + size_t out_size = lzma_stream_buffer_bound(size); + *cdata_size = 0; + + out = malloc(out_size); + + /* Single call compression */ + if (LZMA_OK != lzma_easy_buffer_encode(level, LZMA_CHECK_CRC32, NULL, + (uint8_t *)data, size, + (uint8_t *)out, cdata_size, + out_size)) + return NULL; + + return out; +} + +static char *lzma_mem_inflate(char *cdata, size_t csize, size_t *size) { + lzma_stream strm = LZMA_STREAM_INIT; + size_t out_size = 0, out_pos = 0; + char *out = NULL; + int r; + + /* Initiate the decoder */ + if (LZMA_OK != lzma_stream_decoder(&strm, 50000000, 0)) + return NULL; + + /* Decode loop */ + strm.avail_in = csize; + strm.next_in = (uint8_t *)cdata; + + for (;strm.avail_in;) { + if (strm.avail_in > out_size - out_pos) { + out_size += strm.avail_in * 4 + 32768; + out = realloc(out, out_size); + } + strm.avail_out = out_size - out_pos; + strm.next_out = (uint8_t *)&out[out_pos]; + + r = lzma_code(&strm, LZMA_RUN); + if (LZMA_OK != r && LZMA_STREAM_END != r) { + fprintf(stderr, "r=%d\n", r); + fprintf(stderr, "mem=%"PRId64"d\n", (int64_t)lzma_memusage(&strm)); + return NULL; + } + + out_pos = strm.total_out; + + if (r == LZMA_STREAM_END) + break; + 
} + + /* finish up any unflushed data; necessary? */ + r = lzma_code(&strm, LZMA_FINISH); + if (r != LZMA_OK && r != LZMA_STREAM_END) { + fprintf(stderr, "r=%d\n", r); + return NULL; + } + + out = realloc(out, strm.total_out); + *size = strm.total_out; + + lzma_end(&strm); + + return out; +} +#endif + /* ---------------------------------------------------------------------- * CRAM blocks - the dynamically growable data block. We have code to * create, update, (un)compress and read/write. @@ -716,6 +798,32 @@ cram_block *cram_read_block(cram_fd *fd) { } } + if (CRAM_MAJOR_VERS(fd->version) >= 3) { + unsigned char dat[100], *cp = dat;; + uint32_t crc; + + + if (-1 == int32_decode(fd, (int32_t *)&b->crc32)) { + free(b); + return NULL; + } + + *cp++ = b->method; + *cp++ = b->content_type; + cp += itf8_put(cp, b->content_id); + cp += itf8_put(cp, b->comp_size); + cp += itf8_put(cp, b->uncomp_size); + crc = crc32(0L, dat, cp-dat); + crc = crc32(crc, b->data ? b->data : (uc *)"", b->alloc); + + if (crc != b->crc32) { + fprintf(stderr, "Block CRC32 failure\n"); + free(b->data); + free(b); + return NULL; + } + } + b->orig_method = b->method; b->idx = 0; b->byte = 0; @@ -746,6 +854,27 @@ int cram_write_block(cram_fd *fd, cram_block *b) { return -1; } + if (CRAM_MAJOR_VERS(fd->version) >= 3) { + unsigned char dat[100], *cp = dat;; + uint32_t crc; + + *cp++ = b->method; + *cp++ = b->content_type; + cp += itf8_put(cp, b->content_id); + cp += itf8_put(cp, b->comp_size); + cp += itf8_put(cp, b->uncomp_size); + crc = crc32(0L, dat, cp-dat); + + if (b->method == RAW) { + b->crc32 = crc32(crc, b->data ? b->data : (uc*)"", b->uncomp_size); + } else { + b->crc32 = crc32(crc, b->data ? 
b->data : (uc*)"", b->comp_size); + } + + if (-1 == int32_encode(fd, b->crc32)) + return -1; + } + return 0; } @@ -775,15 +904,16 @@ int cram_uncompress_block(cram_block *b) { switch (b->method) { case RAW: - b->uncomp_size = b->comp_size; return 0; case GZIP: uncomp = zlib_mem_inflate((char *)b->data, b->comp_size, &uncomp_size); if (!uncomp) return -1; - if ((int)uncomp_size != b->uncomp_size) + if ((int)uncomp_size != b->uncomp_size) { + free(uncomp); return -1; + } free(b->data); b->data = (unsigned char *)uncomp; b->alloc = uncomp_size; @@ -801,6 +931,7 @@ int cram_uncompress_block(cram_block *b) { free(uncomp); return -1; } + free(b->data); b->data = (unsigned char *)uncomp; b->alloc = usize; b->method = RAW; @@ -814,7 +945,39 @@ int cram_uncompress_block(cram_block *b) { return -1; #endif - case BM_ERROR: +#ifdef HAVE_LIBLZMA + case LZMA: + uncomp = lzma_mem_inflate((char *)b->data, b->comp_size, &uncomp_size); + if (!uncomp) + return -1; + if ((int)uncomp_size != b->uncomp_size) + return -1; + free(b->data); + b->data = (unsigned char *)uncomp; + b->alloc = uncomp_size; + b->method = RAW; + break; +#else + case LZMA: + fprintf(stderr, "Lzma compression is not compiled into this " + "version.\nPlease rebuild and try again.\n"); + return -1; + break; +#endif + + case RANS: { + unsigned int usize = b->uncomp_size, usize2; + uncomp = (char *)rans_uncompress(b->data, b->comp_size, &usize2); + assert(usize == usize2); + free(b->data); + b->data = (unsigned char *)uncomp; + b->alloc = usize2; + b->method = RAW; + b->uncomp_size = usize2; // Just incase it differs + //fprintf(stderr, "Expanded %d to %d\n", b->comp_size, b->uncomp_size); + break; + } + default: return -1; } @@ -822,38 +985,68 @@ int cram_uncompress_block(cram_block *b) { return 0; } +static char *cram_compress_by_method(char *in, size_t in_size, + size_t *out_size, + enum cram_block_method method, + int level, int strat) { + switch (method) { + case GZIP: + return zlib_mem_deflate(in, in_size, 
out_size, level, strat); + + case BZIP2: { #ifdef HAVE_LIBBZ2 -static int cram_compress_block_bzip2(cram_fd *fd, cram_block *b, - cram_metrics *metrics, int level) { - unsigned int comp_size = b->uncomp_size*1.01 + 600; - char *comp = malloc(comp_size); - char *data = (char *)b->data; + unsigned int comp_size = in_size*1.01 + 600; + char *comp = malloc(comp_size); + if (!comp) + return NULL; - if (!comp) - return -1; + if (BZ_OK != BZ2_bzBuffToBuffCompress(comp, &comp_size, + in, in_size, + level, 0, 30)) { + free(comp); + return NULL; + } + *out_size = comp_size; + return comp; +#else + return NULL; +#endif + } - if (!data) - data = ""; + case LZMA: +#ifdef HAVE_LIBLZMA + return lzma_mem_deflate(in, in_size, out_size, level); +#else + return NULL; +#endif - if (BZ_OK != BZ2_bzBuffToBuffCompress(comp, &comp_size, - data, b->uncomp_size, - level, 0, 30)) { - free(comp); - return -1; + case RANS0: { + unsigned int out_size_i; + unsigned char *cp; + cp = rans_compress((unsigned char *)in, in_size, &out_size_i, 0); + *out_size = out_size_i; + return (char *)cp; } - free(b->data); - b->data = (unsigned char *)comp; - b->method = BZIP2; - b->comp_size = comp_size; + case RANS1: { + unsigned int out_size_i; + unsigned char *cp; + + cp = rans_compress((unsigned char *)in, in_size, &out_size_i, 1); + *out_size = out_size_i; + return (char *)cp; + } - if (fd->verbose) - fprintf(stderr, "Compressed block ID %d from %d to %d\n", - b->content_id, b->uncomp_size, b->comp_size); + case RAW: + break; - return 0; + default: + return NULL; + } + + return NULL; } -#endif + /* * Compresses a block using one of two different zlib strategies. If we only @@ -864,114 +1057,347 @@ static int cram_compress_block_bzip2(cram_fd *fd, cram_block *b, * significantly faster. 
*/ int cram_compress_block(cram_fd *fd, cram_block *b, cram_metrics *metrics, - int level, int strat, - int level2, int strat2) { + int method, int level) { + char *comp = NULL; size_t comp_size = 0; + int strat; + + //fprintf(stderr, "IN: block %d, sz %d\n", b->content_id, b->uncomp_size); - if (level == 0) { + if (method == RAW || level == 0 || b->uncomp_size == 0) { b->method = RAW; b->comp_size = b->uncomp_size; + //fprintf(stderr, "Skip block id %d\n", b->content_id); return 0; } - if (b->method != RAW) { - fprintf(stderr, "Attempt to compress an already compressed block.\n"); - return 0; - } + if (metrics) { + pthread_mutex_lock(&fd->metrics_lock); + if (metrics->trial > 0 || --metrics->next_trial <= 0) { + size_t sz_best = INT_MAX; + size_t sz_gz_rle = 0; + size_t sz_gz_def = 0; + size_t sz_rans0 = 0; + size_t sz_rans1 = 0; + size_t sz_bzip2 = 0; + size_t sz_lzma = 0; + int method_best = 0; + char *c_best = NULL, *c = NULL; + + if (metrics->revised_method) + method = metrics->revised_method; + else + metrics->revised_method = method; + + if (metrics->next_trial == 0) { + metrics->next_trial = TRIAL_SPAN; + metrics->trial = NTRIALS; + metrics->sz_gz_rle /= 2; + metrics->sz_gz_def /= 2; + metrics->sz_rans0 /= 2; + metrics->sz_rans1 /= 2; + metrics->sz_bzip2 /= 2; + metrics->sz_lzma /= 2; + } -#ifdef HAVE_LIBBZ2 - if (fd->use_bz2) - // metrics ignored for bzip2 - return cram_compress_block_bzip2(fd, b, metrics, level); -#endif + pthread_mutex_unlock(&fd->metrics_lock); + + if (method & (1<data, b->uncomp_size, + &sz_gz_rle, GZIP, 1, Z_RLE); + if (c && sz_best > sz_gz_rle) { + sz_best = sz_gz_rle; + method_best = GZIP_RLE; + if (c_best) + free(c_best); + c_best = c; + } else if (c) { + free(c); + } else { + sz_gz_rle = b->uncomp_size*2+1000; + } - pthread_mutex_lock(&fd->metrics_lock); - if (strat2 >= 0) - if (fd->verbose > 1) - fprintf(stderr, "metrics trial %d, next_trial %d, m1 %d, m2 %d\n", - metrics->trial, metrics->next_trial, - metrics->m1, metrics->m2); 
- - if (strat2 >= 0 && (metrics->trial > 0 || --metrics->next_trial <= 0)) { - char *c1, *c2; - size_t s1, s2; - - if (metrics->next_trial == 0) { - metrics->next_trial = 100; - metrics->trial = 3; - metrics->m1 = metrics->m2 = 0; - } - pthread_mutex_unlock(&fd->metrics_lock); - - c1 = zlib_mem_deflate((char *)b->data, b->uncomp_size, - &s1, level, strat); - c2 = zlib_mem_deflate((char *)b->data, b->uncomp_size, - &s2, level2, strat2); - if (!c1 || !c2) - return -1; - - //fprintf(stderr, "1: %6d 2: %6d %5.1f\n", s1, s2, 100.0*s1/s2); + //fprintf(stderr, "Block %d; %d->%d\n", b->content_id, b->uncomp_size, sz_gz_rle); + } - pthread_mutex_lock(&fd->metrics_lock); - if (s1 < 0.98 * s2) { // 2nd one should be faster alternative - if (fd->verbose > 1) - fprintf(stderr, "M1 wins %d vs %d\n", (int)s1, (int)s2); - comp = c1; comp_size = s1; - free(c2); - metrics->m1++; + if (method & (1<data, b->uncomp_size, + &sz_gz_def, GZIP, level, + Z_FILTERED); + if (c && sz_best > sz_gz_def) { + sz_best = sz_gz_def; + method_best = GZIP; + if (c_best) + free(c_best); + c_best = c; + } else if (c) { + free(c); + } else { + sz_gz_def = b->uncomp_size*2+1000; + } + + //fprintf(stderr, "Block %d; %d->%d\n", b->content_id, b->uncomp_size, sz_gz_def); + } + + if (method & (1<data, b->uncomp_size, + &sz_rans0, RANS0, 0, 0); + if (c && sz_best > sz_rans0) { + sz_best = sz_rans0; + method_best = RANS0; + if (c_best) + free(c_best); + c_best = c; + } else if (c) { + free(c); + } else { + sz_rans0 = b->uncomp_size*2+1000; + } + } + + if (method & (1<data, b->uncomp_size, + &sz_rans1, RANS1, 0, 0); + if (c && sz_best > sz_rans1) { + sz_best = sz_rans1; + method_best = RANS1; + if (c_best) + free(c_best); + c_best = c; + } else if (c) { + free(c); + } else { + sz_rans1 = b->uncomp_size*2+1000; + } + } + + if (method & (1<data, b->uncomp_size, + &sz_bzip2, BZIP2, level, 0); + if (c && sz_best > sz_bzip2) { + sz_best = sz_bzip2; + method_best = BZIP2; + if (c_best) + free(c_best); + c_best = c; + } 
else if (c) { + free(c); + } else { + sz_bzip2 = b->uncomp_size*2+1000; + } + } + + if (method & (1<data, b->uncomp_size, + &sz_lzma, LZMA, level, 0); + if (c && sz_best > sz_lzma) { + sz_best = sz_lzma; + method_best = LZMA; + if (c_best) + free(c_best); + c_best = c; + } else if (c) { + free(c); + } else { + sz_lzma = b->uncomp_size*2+1000; + } + } + + //fprintf(stderr, "sz_best = %d\n", sz_best); + + free(b->data); + b->data = (unsigned char *)c_best; + //printf("method_best = %s\n", cram_block_method2str(method_best)); + b->method = method_best == GZIP_RLE ? GZIP : method_best; + b->comp_size = sz_best; + + pthread_mutex_lock(&fd->metrics_lock); + metrics->sz_gz_rle += sz_gz_rle; + metrics->sz_gz_def += sz_gz_def; + metrics->sz_rans0 += sz_rans0; + metrics->sz_rans1 += sz_rans1; + metrics->sz_bzip2 += sz_bzip2; + metrics->sz_lzma += sz_lzma; + if (--metrics->trial == 0) { + int best_method = RAW; + int best_sz = INT_MAX; + + // Scale methods by cost + if (fd->level <= 3) { + metrics->sz_rans1 *= 1.02; + metrics->sz_gz_def *= 1.04; + metrics->sz_bzip2 *= 1.08; + metrics->sz_lzma *= 1.10; + } else if (fd->level <= 6) { + metrics->sz_rans1 *= 1.01; + metrics->sz_gz_def *= 1.02; + metrics->sz_bzip2 *= 1.03; + metrics->sz_lzma *= 1.05; + } + + if (method & (1< metrics->sz_gz_rle) + best_sz = metrics->sz_gz_rle, best_method = GZIP_RLE; + + if (method & (1< metrics->sz_gz_def) + best_sz = metrics->sz_gz_def, best_method = GZIP; + + if (method & (1< metrics->sz_rans0) + best_sz = metrics->sz_rans0, best_method = RANS0; + + if (method & (1< metrics->sz_rans1) + best_sz = metrics->sz_rans1, best_method = RANS1; + + if (method & (1< metrics->sz_bzip2) + best_sz = metrics->sz_bzip2, best_method = BZIP2; + + if (method & (1< metrics->sz_lzma) + best_sz = metrics->sz_lzma, best_method = LZMA; + + if (best_method == GZIP_RLE) { + metrics->method = GZIP; + metrics->strat = Z_RLE; + } else { + metrics->method = best_method; + metrics->strat = Z_FILTERED; + } + + // If we see at 
least MAXFAIL trials in a row for a specific + // compression method with more than MAXDELTA aggregate + // size then we drop this from the list of methods used + // for this block type. +#define MAXDELTA 0.20 +#define MAXFAILS 4 + if (best_method == GZIP_RLE) { + metrics->gz_rle_cnt = 0; + metrics->gz_rle_extra = 0; + } else if (best_sz < metrics->sz_gz_rle) { + double r = (double)metrics->sz_gz_rle / best_sz - 1; + if (++metrics->gz_rle_cnt >= MAXFAILS && + (metrics->gz_rle_extra += r) >= MAXDELTA) + method &= ~(1<gz_def_cnt = 0; + metrics->gz_def_extra = 0; + } else if (best_sz < metrics->sz_gz_def) { + double r = (double)metrics->sz_gz_def / best_sz - 1; + if (++metrics->gz_def_cnt >= MAXFAILS && + (metrics->gz_def_extra += r) >= MAXDELTA) + method &= ~(1<rans0_cnt = 0; + metrics->rans0_extra = 0; + } else if (best_sz < metrics->sz_rans0) { + double r = (double)metrics->sz_rans0 / best_sz - 1; + if (++metrics->rans0_cnt >= MAXFAILS && + (metrics->rans0_extra += r) >= MAXDELTA) + method &= ~(1<rans1_cnt = 0; + metrics->rans1_extra = 0; + } else if (best_sz < metrics->sz_rans1) { + double r = (double)metrics->sz_rans1 / best_sz - 1; + if (++metrics->rans1_cnt >= MAXFAILS && + (metrics->rans1_extra += r) >= MAXDELTA) + method &= ~(1<bzip2_cnt = 0; + metrics->bzip2_extra = 0; + } else if (best_sz < metrics->sz_bzip2) { + double r = (double)metrics->sz_bzip2 / best_sz - 1; + if (++metrics->bzip2_cnt >= MAXFAILS && + (metrics->bzip2_extra += r) >= MAXDELTA) + method &= ~(1<lzma_cnt = 0; + metrics->lzma_extra = 0; + } else if (best_sz < metrics->sz_lzma) { + double r = (double)metrics->sz_lzma / best_sz - 1; + if (++metrics->lzma_cnt >= MAXFAILS && + (metrics->lzma_extra += r) >= MAXDELTA) + method &= ~(1<revised_method) + // fprintf(stderr, "%d: method from %x to %x\n", + // b->content_id, metrics->revised_method, method); + metrics->revised_method = method; + } + pthread_mutex_unlock(&fd->metrics_lock); } else { - if (fd->verbose > 1) - fprintf(stderr, "M2 wins %d 
vs %d\n", (int)s1, (int)s2); - comp = c2; comp_size = s2; - free(c1); - metrics->m2++; - } - metrics->trial--; - pthread_mutex_unlock(&fd->metrics_lock); - } else if (strat2 >= 0) { - int xlevel = metrics->m1 > metrics->m2 ? level : level2; - int xstrat = metrics->m1 > metrics->m2 ? strat : strat2; - pthread_mutex_unlock(&fd->metrics_lock); - comp = zlib_mem_deflate((char *)b->data, b->uncomp_size, &comp_size, - xlevel, xstrat); + strat = metrics->strat; + method = metrics->method; + + pthread_mutex_unlock(&fd->metrics_lock); + comp = cram_compress_by_method((char *)b->data, b->uncomp_size, + &comp_size, method, + level, strat); + if (!comp) + return -1; + free(b->data); + b->data = (unsigned char *)comp; + b->comp_size = comp_size; + b->method = method; + } + } else { - pthread_mutex_unlock(&fd->metrics_lock); - comp = zlib_mem_deflate((char *)b->data, b->uncomp_size, &comp_size, - level, strat); + // no cached metrics, so just do zlib? + comp = cram_compress_by_method((char *)b->data, b->uncomp_size, + &comp_size, GZIP, level, Z_FILTERED); + if (!comp) { + fprintf(stderr, "Compression failed!\n"); + return -1; + } + free(b->data); + b->data = (unsigned char *)comp; + b->comp_size = comp_size; + b->method = GZIP; } - if (!comp) - return -1; - - free(b->data); - b->data = (unsigned char *)comp; - b->method = GZIP; - b->comp_size = comp_size; - if (fd->verbose) - fprintf(stderr, "Compressed block ID %d from %d to %d\n", - b->content_id, b->uncomp_size, b->comp_size); + fprintf(stderr, "Compressed block ID %d from %d to %d by method %s\n", + b->content_id, b->uncomp_size, b->comp_size, + cram_block_method2str(b->method)); + + if (b->method == RANS1) + b->method = RANS0; // Spec just has RANS (not 0/1) with auto-sensing return 0; } cram_metrics *cram_new_metrics(void) { - cram_metrics *m = malloc(sizeof(*m)); + cram_metrics *m = calloc(1, sizeof(*m)); if (!m) return NULL; - m->m1 = m->m2 = 0; - m->trial = 2; - m->next_trial = 100; + m->trial = NTRIALS-1; + 
m->next_trial = TRIAL_SPAN; + m->method = RAW; + m->strat = 0; + m->revised_method = 0; + return m; } char *cram_block_method2str(enum cram_block_method m) { switch(m) { - case RAW: return "RAW"; - case GZIP: return "GZIP"; - case BZIP2: return "BZIP2"; - case BM_ERROR: break; + case RAW: return "RAW"; + case GZIP: return "GZIP"; + case BZIP2: return "BZIP2"; + case LZMA: return "LZMA"; + case RANS0: return "RANS0"; + case RANS1: return "RANS1"; + case GZIP_RLE: return "GZIP_RLE"; + case ERROR: break; } return "?"; } @@ -1069,7 +1495,7 @@ void refs_free(refs_t *r) { free(r->ref_id); if (r->fp) - fclose(r->fp); + bgzf_close(r->fp); pthread_mutex_destroy(&r->lock); @@ -1104,6 +1530,37 @@ static refs_t *refs_create(void) { return NULL; } +/* + * Opens a reference fasta file as a BGZF stream, allowing for + * compressed files. It automatically builds a .fai file if + * required and if compressed a .gzi bgzf index too. + * + * Returns a BGZF handle on success; + * NULL on failure. + */ +static BGZF *bgzf_open_ref(char *fn, char *mode) { + BGZF *fp; + char fai_file[PATH_MAX]; + + snprintf(fai_file, PATH_MAX, "%s.fai", fn); + if (access(fai_file, R_OK) != 0) + if (fai_build(fn) != 0) + return NULL; + + if (!(fp = bgzf_open(fn, mode))) { + perror(fn); + return NULL; + } + + if (fp->is_compressed == 1 && bgzf_index_load(fp, fn, ".gzi") < 0) { + fprintf(stderr, "Unable to load .gzi index '%s.gzi'\n", fn); + bgzf_close(fp); + return NULL; + } + + return fp; +} + /* * Loads a FAI file for a reference.fasta. 
* "is_err" indicates whether failure to load is worthy of emitting an @@ -1120,6 +1577,7 @@ static refs_t *refs_load_fai(refs_t *r_orig, char *fn, int is_err) { char line[8192]; refs_t *r = r_orig; size_t fn_l = strlen(fn); + int id = 0, id_alloc = 0; RP("refs_load_fai %s\n", fn); @@ -1135,7 +1593,8 @@ static refs_t *refs_load_fai(refs_t *r_orig, char *fn, int is_err) { } if (r->fp) - fclose(r->fp); + if (bgzf_close(r->fp) != 0) + goto err; r->fp = NULL; if (!(r->fn = string_dup(r->pool, fn))) @@ -1144,11 +1603,8 @@ static refs_t *refs_load_fai(refs_t *r_orig, char *fn, int is_err) { if (fn_l > 4 && strcmp(&fn[fn_l-4], ".fai") == 0) r->fn[fn_l-4] = 0; - if (!(r->fp = fopen(r->fn, "r"))) { - if (is_err) - perror(fn); + if (!(r->fp = bgzf_open_ref(r->fn, "r"))) goto err; - } /* Parse .fai file and load meta-data */ sprintf(fai_fn, "%.*s.fai", PATH_MAX-5, r->fn); @@ -1224,6 +1680,18 @@ static refs_t *refs_load_fai(refs_t *r_orig, char *fn, int is_err) { kh_val(r->h_meta, k) = e; } } + + if (id >= id_alloc) { + int x; + + id_alloc = id_alloc ?id_alloc*2 : 16; + r->ref_id = realloc(r->ref_id, id_alloc * sizeof(*r->ref_id)); + + for (x = id; x < id_alloc; x++) + r->ref_id[x] = NULL; + } + r->ref_id[id] = e; + r->nref = ++id; } return r; @@ -1277,7 +1745,7 @@ int refs2id(refs_t *r, SAM_hdr *h) { * -1 on failure */ static int refs_from_header(refs_t *r, cram_fd *fd, SAM_hdr *h) { - int i; + int i, j; if (!h || h->nref == 0) return 0; @@ -1285,48 +1753,46 @@ static int refs_from_header(refs_t *r, cram_fd *fd, SAM_hdr *h) { //fprintf(stderr, "refs_from_header for %p mode %c\n", fd, fd->mode); /* Existing refs are fine, as long as they're compatible with the hdr. 
*/ - i = r->nref; - if (r->nref < h->nref) - r->nref = h->nref; - - if (!(r->ref_id = realloc(r->ref_id, r->nref * sizeof(*r->ref_id)))) + if (!(r->ref_id = realloc(r->ref_id, (r->nref + h->nref) * sizeof(*r->ref_id)))) return -1; - for (; i < r->nref; i++) - r->ref_id[i] = NULL; - /* Copy info from h->ref[i] over to r */ - for (i = 0; i < h->nref; i++) { + for (i = 0, j = r->nref; i < h->nref; i++) { SAM_hdr_type *ty; SAM_hdr_tag *tag; khint_t k; int n; - if (r->ref_id[i] && 0 == strcmp(r->ref_id[i]->name, h->ref[i].name)) + k = kh_get(refs, r->h_meta, h->ref[i].name); + if (k != kh_end(r->h_meta)) + // Ref already known about continue; - if (!(r->ref_id[i] = calloc(1, sizeof(ref_entry)))) + if (!(r->ref_id[j] = calloc(1, sizeof(ref_entry)))) return -1; - if (!h->ref[i].name) + if (!h->ref[j].name) return -1; - r->ref_id[i]->name = string_dup(r->pool, h->ref[i].name); - r->ref_id[i]->length = 0; // marker for not yet loaded + r->ref_id[j]->name = string_dup(r->pool, h->ref[i].name); + r->ref_id[j]->length = 0; // marker for not yet loaded /* Initialise likely filename if known */ if ((ty = sam_hdr_find(h, "SQ", "SN", h->ref[i].name))) { if ((tag = sam_hdr_find_key(h, ty, "M5", NULL))) { - r->ref_id[i]->fn = string_dup(r->pool, tag->str+3); - //fprintf(stderr, "Tagging @SQ %s / %s\n", r->ref_id[i]->name, r->ref_id[i]->fn); + r->ref_id[j]->fn = string_dup(r->pool, tag->str+3); + //fprintf(stderr, "Tagging @SQ %s / %s\n", r->ref_id[h]->name, r->ref_id[h]->fn); } } - k = kh_put(refs, r->h_meta, r->ref_id[i]->name, &n); + k = kh_put(refs, r->h_meta, r->ref_id[j]->name, &n); if (n <= 0) // already exists or error return -1; - kh_val(r->h_meta, k) = r->ref_id[i]; + kh_val(r->h_meta, k) = r->ref_id[j]; + + j++; } + r->nref = j; return 0; } @@ -1339,6 +1805,8 @@ static int refs_from_header(refs_t *r, cram_fd *fd, SAM_hdr *h) { * in). 
*/ int cram_set_header(cram_fd *fd, SAM_hdr *hdr) { + if (fd->header) + sam_hdr_free(fd->header); fd->header = hdr; return refs_from_header(fd->refs, fd, hdr); } @@ -1415,6 +1883,30 @@ void mkdir_prefix(char *path, int mode) { *cp = '/'; } +/* + * Return the cache directory to use, based on the first of these + * environment variables to be set to a non-empty value. + */ +static const char *get_cache_basedir(const char **extra) { + char *base; + + *extra = ""; + + base = getenv("XDG_CACHE_HOME"); + if (base && *base) return base; + + base = getenv("HOME"); + if (base && *base) { *extra = "/.cache"; return base; } + + base = getenv("TMPDIR"); + if (base && *base) return base; + + base = getenv("TEMP"); + if (base && *base) return base; + + return "/tmp"; +} + /* * Queries the M5 string from the header and attempts to populate the * reference from this using the REF_PATH environment. @@ -1426,15 +1918,28 @@ static int cram_populate_ref(cram_fd *fd, int id, ref_entry *r) { char *ref_path = getenv("REF_PATH"); SAM_hdr_type *ty; SAM_hdr_tag *tag; - char path[PATH_MAX], path_tmp[PATH_MAX]; + char path[PATH_MAX], path_tmp[PATH_MAX], cache[PATH_MAX]; char *local_cache = getenv("REF_CACHE"); mFILE *mf; if (fd->verbose) fprintf(stderr, "cram_populate_ref on fd %p, id %d\n", fd, id); - if (!ref_path || *ref_path == 0) + if (!ref_path || *ref_path == '\0') { + /* + * If we have no ref path, we use the EBI server. + * However to avoid spamming it we require a local ref cache too. 
+ */ ref_path = "http://www.ebi.ac.uk:80/ena/cram/md5/%s"; + if (!local_cache || *local_cache == '\0') { + const char *extra; + const char *base = get_cache_basedir(&extra); + snprintf(cache,PATH_MAX, "%s%s/hts-ref/%%2s/%%2s/%%s", base, extra); + local_cache = cache; + if (fd->verbose) + fprintf(stderr, "Populating local cache: %s\n", local_cache); + } + } if (!r->name) return -1; @@ -1451,18 +1956,19 @@ static int cram_populate_ref(cram_fd *fd, int id, ref_entry *r) { /* Use cache if available */ if (local_cache && *local_cache) { struct stat sb; - FILE *fp; + BGZF *fp; expand_cache_path(path, local_cache, tag->str+3); - if (0 == stat(path, &sb) && (fp = fopen(path, "r"))) { + if (0 == stat(path, &sb) && (fp = bgzf_open(path, "r"))) { r->length = sb.st_size; r->offset = r->line_length = r->bases_per_line = 0; r->fn = string_dup(fd->refs->pool, path); if (fd->refs->fp) - fclose(fd->refs->fp); + if (bgzf_close(fd->refs->fp) != 0) + return -1; fd->refs->fp = fp; fd->refs->fn = r->fn; @@ -1491,14 +1997,16 @@ static int cram_populate_ref(cram_fd *fd, int id, ref_entry *r) { : tag->str+3; if (fd->refs->fp) { - fclose(fd->refs->fp); + if (bgzf_close(fd->refs->fp) != 0) + return -1; fd->refs->fp = NULL; } if (!(refs = refs_load_fai(fd->refs, fn, 0))) return -1; fd->refs = refs; if (fd->refs->fp) { - fclose(fd->refs->fp); + if (bgzf_close(fd->refs->fp) != 0) + return -1; fd->refs->fp = NULL; } @@ -1590,10 +2098,8 @@ static void cram_ref_decr_locked(refs_t *r, int id) { r->ref_id[r->last_id]->seq = NULL; r->ref_id[r->last_id]->length = 0; } - r->last_id = -1; - } else { - r->last_id = id; } + r->last_id = id; } } @@ -1612,7 +2118,7 @@ void cram_ref_decr(refs_t *r, int id) { * Returns all or part of a reference sequence on success (malloced); * NULL on failure. 
*/ -static char *load_ref_portion(FILE *fp, ref_entry *e, int start, int end) { +static char *load_ref_portion(BGZF *fp, ref_entry *e, int start, int end) { off_t offset, len; char *seq; @@ -1633,8 +2139,8 @@ static char *load_ref_portion(FILE *fp, ref_entry *e, int start, int end) { (end-1) % e->bases_per_line : end-1) - offset + 1; - if (0 != fseeko(fp, offset, SEEK_SET)) { - perror("fseeko() on reference file"); + if (bgzf_useek(fp, offset, SEEK_SET) < 0) { + perror("bgzf_useek() on reference file"); return NULL; } @@ -1642,8 +2148,8 @@ static char *load_ref_portion(FILE *fp, ref_entry *e, int start, int end) { return NULL; } - if (len != fread(seq, 1, len, fp)) { - perror("fread() on reference file"); + if (len != bgzf_read(fp, seq, len)) { + perror("bgzf_read() on reference file"); free(seq); return NULL; } @@ -1714,12 +2220,11 @@ ref_entry *cram_ref_load(refs_t *r, int id) { /* Open file if it's not already the current open reference */ if (strcmp(r->fn, e->fn) || r->fp == NULL) { if (r->fp) - fclose(r->fp); + if (bgzf_close(r->fp) != 0) + return NULL; r->fn = e->fn; - if (!(r->fp = fopen(r->fn, "r"))) { - perror(r->fn); + if (!(r->fp = bgzf_open_ref(r->fn, "r"))) return NULL; - } } RP("%d Loading ref %d (%d..%d)\n", gettid(), id, start, end); @@ -1834,6 +2339,8 @@ char *cram_get_ref(cram_fd *fd, int id, int start, int end) { return NULL; } r = fd->refs->ref_id[id]; + if (fd->unsorted) + cram_ref_incr_locked(fd->refs, id); } @@ -1924,10 +2431,10 @@ char *cram_get_ref(cram_fd *fd, int id, int start, int end) { /* Open file if it's not already the current open reference */ if (strcmp(fd->refs->fn, r->fn) || fd->refs->fp == NULL) { if (fd->refs->fp) - fclose(fd->refs->fp); + if (bgzf_close(fd->refs->fp) != 0) + return NULL; fd->refs->fn = r->fn; - if (!(fd->refs->fp = fopen(fd->refs->fn, "r"))) { - perror(fd->refs->fn); + if (!(fd->refs->fp = bgzf_open_ref(fd->refs->fn, "r"))) { pthread_mutex_unlock(&fd->refs->lock); pthread_mutex_unlock(&fd->ref_lock); return 
NULL; @@ -1969,14 +2476,17 @@ int cram_load_reference(cram_fd *fd, char *fn) { fd->ref_fn = fn; if ((!fd->refs || (fd->refs->nref == 0 && !fn)) && fd->header) { + if (fd->refs) + refs_free(fd->refs); if (!(fd->refs = refs_create())) return -1; if (-1 == refs_from_header(fd->refs, fd, fd->header)) return -1; } - if (-1 == refs2id(fd->refs, fd->header)) - return -1; + if (fd->header) + if (-1 == refs2id(fd->refs, fd->header)) + return -1; return fn ? 0 : -1; } @@ -1994,6 +2504,8 @@ int cram_load_reference(cram_fd *fd, char *fn) { */ cram_container *cram_new_container(int nrec, int nslice) { cram_container *c = calloc(1, sizeof(*c)); + enum cram_DS_ID id; + if (!c) return NULL; @@ -2023,32 +2535,8 @@ cram_container *cram_new_container(int nrec, int nslice) { goto err; c->comp_hdr_block = NULL; - if (!(c->BF_stats = cram_stats_create())) goto err; - if (!(c->CF_stats = cram_stats_create())) goto err; - if (!(c->RN_stats = cram_stats_create())) goto err; - if (!(c->AP_stats = cram_stats_create())) goto err; - if (!(c->RG_stats = cram_stats_create())) goto err; - if (!(c->MQ_stats = cram_stats_create())) goto err; - if (!(c->NS_stats = cram_stats_create())) goto err; - if (!(c->NP_stats = cram_stats_create())) goto err; - if (!(c->TS_stats = cram_stats_create())) goto err; - if (!(c->MF_stats = cram_stats_create())) goto err; - if (!(c->NF_stats = cram_stats_create())) goto err; - if (!(c->RL_stats = cram_stats_create())) goto err; - if (!(c->FN_stats = cram_stats_create())) goto err; - if (!(c->FC_stats = cram_stats_create())) goto err; - if (!(c->FP_stats = cram_stats_create())) goto err; - if (!(c->DL_stats = cram_stats_create())) goto err; - if (!(c->BA_stats = cram_stats_create())) goto err; - if (!(c->QS_stats = cram_stats_create())) goto err; - if (!(c->BS_stats = cram_stats_create())) goto err; - if (!(c->TC_stats = cram_stats_create())) goto err; - if (!(c->TN_stats = cram_stats_create())) goto err; - if (!(c->TL_stats = cram_stats_create())) goto err; - if 
(!(c->RI_stats = cram_stats_create())) goto err; - if (!(c->RS_stats = cram_stats_create())) goto err; - if (!(c->PD_stats = cram_stats_create())) goto err; - if (!(c->HC_stats = cram_stats_create())) goto err; + for (id = DS_RN; id < DS_TN; id++) + if (!(c->stats[id] = cram_stats_create())) goto err; //c->aux_B_stats = cram_stats_create(); @@ -2068,6 +2556,7 @@ cram_container *cram_new_container(int nrec, int nslice) { } void cram_free_container(cram_container *c) { + enum cram_DS_ID id; int i; if (!c) @@ -2092,34 +2581,8 @@ void cram_free_container(cram_container *c) { free(c->slices); } - if (c->TS_stats) cram_stats_free(c->TS_stats); - if (c->RG_stats) cram_stats_free(c->RG_stats); - if (c->FP_stats) cram_stats_free(c->FP_stats); - if (c->NS_stats) cram_stats_free(c->NS_stats); - if (c->RN_stats) cram_stats_free(c->RN_stats); - if (c->CF_stats) cram_stats_free(c->CF_stats); - if (c->TN_stats) cram_stats_free(c->TN_stats); - if (c->BA_stats) cram_stats_free(c->BA_stats); - if (c->TV_stats) cram_stats_free(c->TV_stats); - if (c->BS_stats) cram_stats_free(c->BS_stats); - if (c->FC_stats) cram_stats_free(c->FC_stats); - if (c->BF_stats) cram_stats_free(c->BF_stats); - if (c->AP_stats) cram_stats_free(c->AP_stats); - if (c->NF_stats) cram_stats_free(c->NF_stats); - if (c->MF_stats) cram_stats_free(c->MF_stats); - if (c->FN_stats) cram_stats_free(c->FN_stats); - if (c->RL_stats) cram_stats_free(c->RL_stats); - if (c->DL_stats) cram_stats_free(c->DL_stats); - if (c->TC_stats) cram_stats_free(c->TC_stats); - if (c->TL_stats) cram_stats_free(c->TL_stats); - if (c->MQ_stats) cram_stats_free(c->MQ_stats); - if (c->TM_stats) cram_stats_free(c->TM_stats); - if (c->QS_stats) cram_stats_free(c->QS_stats); - if (c->NP_stats) cram_stats_free(c->NP_stats); - if (c->RI_stats) cram_stats_free(c->RI_stats); - if (c->RS_stats) cram_stats_free(c->RS_stats); - if (c->PD_stats) cram_stats_free(c->PD_stats); - if (c->HC_stats) cram_stats_free(c->HC_stats); + for (id = DS_RN; id < DS_TN; 
id++) + if (c->stats[id]) cram_stats_free(c->stats[id]); //if (c->aux_B_stats) cram_stats_free(c->aux_B_stats); @@ -2140,9 +2603,10 @@ cram_container *cram_read_container(cram_fd *fd) { size_t rd = 0; fd->err = 0; + fd->eof = 0; memset(&c2, 0, sizeof(c2)); - if (fd->version == CRAM_1_VERS) { + if (CRAM_MAJOR_VERS(fd->version) == 1) { if ((s = itf8_decode(fd, &c2.length)) == -1) { fd->eof = fd->empty_container ? 1 : 2; return NULL; @@ -2151,7 +2615,11 @@ cram_container *cram_read_container(cram_fd *fd) { } } else { if ((s = int32_decode(fd, &c2.length)) == -1) { - fd->eof = fd->empty_container ? 1 : 2; + if (CRAM_MAJOR_VERS(fd->version) == 2 && + CRAM_MINOR_VERS(fd->version) == 0) + fd->eof = 1; // EOF blocks arrived in v2.1 + else + fd->eof = fd->empty_container ? 1 : 2; return NULL; } else { rd+=s; @@ -2162,14 +2630,23 @@ cram_container *cram_read_container(cram_fd *fd) { if ((s = itf8_decode(fd, &c2.ref_seq_span)) == -1) return NULL; else rd+=s; if ((s = itf8_decode(fd, &c2.num_records)) == -1) return NULL; else rd+=s; - if (fd->version == CRAM_1_VERS) { + if (CRAM_MAJOR_VERS(fd->version) == 1) { c2.record_counter = 0; c2.num_bases = 0; } else { - if ((s = itf8_decode(fd, &c2.record_counter)) == -1) - return NULL; - else - rd += s; + if (CRAM_MAJOR_VERS(fd->version) >= 3) { + if ((s = ltf8_decode(fd, &c2.record_counter)) == -1) + return NULL; + else + rd += s; + } else { + int32_t i32; + if ((s = itf8_decode(fd, &i32)) == -1) + return NULL; + else + rd += s; + c2.record_counter = i32; + } if ((s = ltf8_decode(fd, &c2.num_bases))== -1) return NULL; @@ -2198,8 +2675,52 @@ cram_container *cram_read_container(cram_fd *fd) { rd += s; } } - c->offset = rd; + if (CRAM_MAJOR_VERS(fd->version) >= 3) { + uint32_t crc, i; + unsigned char *dat = malloc(50 + 5*(c->num_landmarks)), *cp = dat; + if (!dat) { + cram_free_container(c); + return NULL; + } + if (-1 == int32_decode(fd, (int32_t *)&c->crc32)) + return NULL; + else + rd+=4; + + /* Reencode first as we can't easily 
access the original byte stream. + * + * FIXME: Technically this means this may not be fool proof. We could + * create a CRAM file using a 2 byte ITF8 value that can fit in a + * 1 byte field, meaning the encoding is different to the original + * form and so has a different CRC. + * + * The correct implementation would be to have an alternative form + * of itf8_decode which also squirrels away the raw byte stream + * during decoding so we can then CRC that. + */ + *(unsigned int *)cp = le_int4(c->length); cp += 4; + cp += itf8_put(cp, c->ref_seq_id); + cp += itf8_put(cp, c->ref_seq_start); + cp += itf8_put(cp, c->ref_seq_span); + cp += itf8_put(cp, c->num_records); + cp += ltf8_put((char *)cp, c->record_counter); + cp += itf8_put(cp, c->num_bases); + cp += itf8_put(cp, c->num_blocks); + cp += itf8_put(cp, c->num_landmarks); + for (i = 0; i < c->num_landmarks; i++) { + cp += itf8_put(cp, c->landmark[i]); + } + + crc = crc32(0L, dat, cp-dat); + if (crc != c->crc32) { + fprintf(stderr, "Container header CRC32 failure\n"); + cram_free_container(c); + return NULL; + } + } + + c->offset = rd; c->slices = NULL; c->curr_slice = 0; c->max_slice = c->num_landmarks; @@ -2230,11 +2751,11 @@ int cram_write_container(cram_fd *fd, cram_container *c) { char buf_a[1024], *buf = buf_a, *cp; int i; - if (50 + c->num_landmarks * 5 >= 1024) - buf = malloc(50 + c->num_landmarks * 5); + if (55 + c->num_landmarks * 5 >= 1024) + buf = malloc(55 + c->num_landmarks * 5); cp = buf; - if (fd->version == CRAM_1_VERS) { + if (CRAM_MAJOR_VERS(fd->version) == 1) { cp += itf8_put(cp, c->length); } else { *(int32_t *)cp = le_int4(c->length); @@ -2250,14 +2771,28 @@ int cram_write_container(cram_fd *fd, cram_container *c) { cp += itf8_put(cp, c->ref_seq_span); } cp += itf8_put(cp, c->num_records); - if (fd->version != CRAM_1_VERS) { + if (CRAM_MAJOR_VERS(fd->version) == 2) { cp += itf8_put(cp, c->record_counter); cp += ltf8_put(cp, c->num_bases); + } else if (CRAM_MAJOR_VERS(fd->version) >= 3) { + cp 
+= ltf8_put(cp, c->record_counter); + cp += ltf8_put(cp, c->num_bases); } + cp += itf8_put(cp, c->num_blocks); cp += itf8_put(cp, c->num_landmarks); for (i = 0; i < c->num_landmarks; i++) cp += itf8_put(cp, c->landmark[i]); + + if (CRAM_MAJOR_VERS(fd->version) >= 3) { + c->crc32 = crc32(0L, (uc *)buf, cp-buf); + cp[0] = c->crc32 & 0xff; + cp[1] = (c->crc32 >> 8) & 0xff; + cp[2] = (c->crc32 >> 16) & 0xff; + cp[3] = (c->crc32 >> 24) & 0xff; + cp += 4; + } + if (cp-buf != hwrite(fd->fp, buf, cp-buf)) { if (buf != buf_a) free(buf); @@ -2452,35 +2987,10 @@ void cram_free_compression_header(cram_block_compression_hdr *hdr) { } } - if (hdr->BF_codec) hdr->BF_codec->free(hdr->BF_codec); - if (hdr->CF_codec) hdr->CF_codec->free(hdr->CF_codec); - if (hdr->RL_codec) hdr->RL_codec->free(hdr->RL_codec); - if (hdr->AP_codec) hdr->AP_codec->free(hdr->AP_codec); - if (hdr->RG_codec) hdr->RG_codec->free(hdr->RG_codec); - if (hdr->MF_codec) hdr->MF_codec->free(hdr->MF_codec); - if (hdr->NS_codec) hdr->NS_codec->free(hdr->NS_codec); - if (hdr->NP_codec) hdr->NP_codec->free(hdr->NP_codec); - if (hdr->TS_codec) hdr->TS_codec->free(hdr->TS_codec); - if (hdr->NF_codec) hdr->NF_codec->free(hdr->NF_codec); - if (hdr->TC_codec) hdr->TC_codec->free(hdr->TC_codec); - if (hdr->TN_codec) hdr->TN_codec->free(hdr->TN_codec); - if (hdr->TL_codec) hdr->TL_codec->free(hdr->TL_codec); - if (hdr->FN_codec) hdr->FN_codec->free(hdr->FN_codec); - if (hdr->FC_codec) hdr->FC_codec->free(hdr->FC_codec); - if (hdr->FP_codec) hdr->FP_codec->free(hdr->FP_codec); - if (hdr->BS_codec) hdr->BS_codec->free(hdr->BS_codec); - if (hdr->IN_codec) hdr->IN_codec->free(hdr->IN_codec); - if (hdr->SC_codec) hdr->SC_codec->free(hdr->SC_codec); - if (hdr->DL_codec) hdr->DL_codec->free(hdr->DL_codec); - if (hdr->BA_codec) hdr->BA_codec->free(hdr->BA_codec); - if (hdr->MQ_codec) hdr->MQ_codec->free(hdr->MQ_codec); - if (hdr->RN_codec) hdr->RN_codec->free(hdr->RN_codec); - if (hdr->QS_codec) hdr->QS_codec->free(hdr->QS_codec); 
- if (hdr->Qs_codec) hdr->Qs_codec->free(hdr->Qs_codec); - if (hdr->RI_codec) hdr->RI_codec->free(hdr->RI_codec); - if (hdr->RS_codec) hdr->RS_codec->free(hdr->RS_codec); - if (hdr->PD_codec) hdr->PD_codec->free(hdr->PD_codec); - if (hdr->HC_codec) hdr->HC_codec->free(hdr->HC_codec); + for (i = 0; i < DS_END; i++) { + if (hdr->codecs[i]) + hdr->codecs[i]->free(hdr->codecs[i]); + } if (hdr->TL) free(hdr->TL); @@ -2547,17 +3057,30 @@ void cram_free_slice(cram_slice *s) { if (s->aux_blk) cram_free_block(s->aux_blk); + if (s->aux_OQ_blk) + cram_free_block(s->aux_OQ_blk); + + if (s->aux_BQ_blk) + cram_free_block(s->aux_BQ_blk); + + if (s->aux_FZ_blk) + cram_free_block(s->aux_FZ_blk); + + if (s->aux_oq_blk) + cram_free_block(s->aux_oq_blk); + + if (s->aux_os_blk) + cram_free_block(s->aux_os_blk); + + if (s->aux_oz_blk) + cram_free_block(s->aux_oz_blk); + if (s->base_blk) cram_free_block(s->base_blk); if (s->soft_blk) cram_free_block(s->soft_blk); -#ifdef TN_external - if (s->tn_blk) - cram_free_block(s->tn_blk); -#endif - if (s->cigar) free(s->cigar); @@ -2567,16 +3090,16 @@ void cram_free_slice(cram_slice *s) { if (s->features) free(s->features); -#ifndef TN_external if (s->TN) free(s->TN); -#endif - + if (s->pair_keys) string_pool_destroy(s->pair_keys); - if (s->pair) - kh_destroy(m_s2i, s->pair); + if (s->pair[0]) + kh_destroy(m_s2i, s->pair[0]); + if (s->pair[1]) + kh_destroy(m_s2i, s->pair[1]); free(s); } @@ -2601,21 +3124,17 @@ cram_slice *cram_new_slice(enum cram_content_type type, int nrecs) { s->block = NULL; s->block_by_id = NULL; s->last_apos = 0; - s->id = 0; - if (!(s->crecs = malloc(nrecs * sizeof(cram_record)))) goto err; + if (!(s->crecs = malloc(nrecs * sizeof(cram_record)))) goto err; s->cigar = NULL; s->cigar_alloc = 0; s->ncigar = 0; - if (!(s->seqs_blk = cram_new_block(EXTERNAL, 0))) goto err; - if (!(s->qual_blk = cram_new_block(EXTERNAL, CRAM_EXT_QUAL))) goto err; - if (!(s->name_blk = cram_new_block(EXTERNAL, CRAM_EXT_NAME))) goto err; - if 
(!(s->aux_blk = cram_new_block(EXTERNAL, CRAM_EXT_TAG))) goto err; - if (!(s->base_blk = cram_new_block(EXTERNAL, CRAM_EXT_IN))) goto err; - if (!(s->soft_blk = cram_new_block(EXTERNAL, CRAM_EXT_SC))) goto err; -#ifdef TN_external - if (!(s->tn_blk = cram_new_block(EXTERNAL, CRAM_EXT_TN))) goto err; -#endif + if (!(s->seqs_blk = cram_new_block(EXTERNAL, 0))) goto err; + if (!(s->qual_blk = cram_new_block(EXTERNAL, DS_QS))) goto err; + if (!(s->name_blk = cram_new_block(EXTERNAL, DS_RN))) goto err; + if (!(s->aux_blk = cram_new_block(EXTERNAL, DS_aux))) goto err; + if (!(s->base_blk = cram_new_block(EXTERNAL, DS_IN))) goto err; + if (!(s->soft_blk = cram_new_block(EXTERNAL, DS_SC))) goto err; s->features = NULL; s->nfeatures = s->afeatures = 0; @@ -2627,7 +3146,8 @@ cram_slice *cram_new_slice(enum cram_content_type type, int nrecs) { // Volatile keys as we do realloc in dstring if (!(s->pair_keys = string_pool_create(8192))) goto err; - if (!(s->pair = kh_init(m_s2i))) goto err; + if (!(s->pair[0] = kh_init(m_s2i))) goto err; + if (!(s->pair[1] = kh_init(m_s2i))) goto err; #ifdef BA_external s->BA_len = 0; @@ -2706,23 +3226,17 @@ cram_slice *cram_read_slice(cram_fd *fd) { s->cigar_alloc = 0; s->ncigar = 0; - if (!(s->seqs_blk = cram_new_block(EXTERNAL, 0))) goto err; - if (!(s->qual_blk = cram_new_block(EXTERNAL, CRAM_EXT_QUAL))) goto err; - if (!(s->name_blk = cram_new_block(EXTERNAL, CRAM_EXT_NAME))) goto err; - if (!(s->aux_blk = cram_new_block(EXTERNAL, CRAM_EXT_TAG))) goto err; - if (!(s->base_blk = cram_new_block(EXTERNAL, CRAM_EXT_IN))) goto err; - if (!(s->soft_blk = cram_new_block(EXTERNAL, CRAM_EXT_SC))) goto err; -#ifdef TN_external - if (!(s->tn_blk = cram_new_block(EXTERNAL, CRAM_EXT_TN))) goto err; -#endif - + if (!(s->seqs_blk = cram_new_block(EXTERNAL, 0))) goto err; + if (!(s->qual_blk = cram_new_block(EXTERNAL, DS_QS))) goto err; + if (!(s->name_blk = cram_new_block(EXTERNAL, DS_RN))) goto err; + if (!(s->aux_blk = cram_new_block(EXTERNAL, 
DS_aux))) goto err; + if (!(s->base_blk = cram_new_block(EXTERNAL, DS_IN))) goto err; + if (!(s->soft_blk = cram_new_block(EXTERNAL, DS_SC))) goto err; s->crecs = NULL; s->last_apos = s->hdr->ref_seq_start; - s->id = fd->slice_num++; - return s; err: @@ -2760,9 +3274,9 @@ cram_file_def *cram_read_file_def(cram_fd *fd) { return NULL; } - if (def->major_version > 2) { + if (def->major_version > 3) { fprintf(stderr, "CRAM version number mismatch\n" - "Expected 1.x or 2.x, got %d.%d\n", + "Expected 1.x, 2.x or 3.x, got %d.%d\n", def->major_version, def->minor_version); free(def); return NULL; @@ -2806,7 +3320,7 @@ SAM_hdr *cram_read_SAM_hdr(cram_fd *fd) { SAM_hdr *hdr; /* 1.1 onwards stores the header in the first block of a container */ - if (fd->version == CRAM_1_VERS) { + if (CRAM_MAJOR_VERS(fd->version) == 1) { /* Length */ if (-1 == int32_decode(fd, &header_len)) return NULL; @@ -2837,8 +3351,9 @@ SAM_hdr *cram_read_SAM_hdr(cram_fd *fd) { cram_free_container(c); return NULL; } + cram_uncompress_block(b); - len = b->comp_size + 2 + + len = b->comp_size + 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) + itf8_size(b->content_id) + itf8_size(b->uncomp_size) + itf8_size(b->comp_size); @@ -2850,12 +3365,13 @@ SAM_hdr *cram_read_SAM_hdr(cram_fd *fd) { cram_free_block(b); return NULL; } - if (NULL == (header = malloc(header_len))) { + if (NULL == (header = malloc(header_len+1))) { cram_free_container(c); cram_free_block(b); return NULL; } memcpy(header, BLOCK_END(b), header_len); + header[header_len]='\0'; cram_free_block(b); /* Consume any remaining blocks */ @@ -2864,7 +3380,7 @@ SAM_hdr *cram_read_SAM_hdr(cram_fd *fd) { cram_free_container(c); return NULL; } - len += b->comp_size + 2 + + len += b->comp_size + 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) + itf8_size(b->content_id) + itf8_size(b->uncomp_size) + itf8_size(b->comp_size); @@ -2890,11 +3406,7 @@ SAM_hdr *cram_read_SAM_hdr(cram_fd *fd) { } /* Parse */ -#ifdef SAMTOOLS hdr = sam_hdr_parse_(header, header_len); -#else 
- hdr = sam_hdr_parse(header, header_len); -#endif free(header); return hdr; @@ -2930,14 +3442,20 @@ static void full_path(char *out, char *in) { * Returns 0 on success * -1 on failure */ -//#define BLANK_BLOCK -//#define PADDED_CONTAINER -#define PADDED_BLOCK int cram_write_SAM_hdr(cram_fd *fd, SAM_hdr *hdr) { int header_len; + int blank_block = (CRAM_MAJOR_VERS(fd->version) >= 3); + + /* Write CRAM MAGIC if not yet written. */ + if (fd->file_def->major_version == 0) { + fd->file_def->major_version = CRAM_MAJOR_VERS(fd->version); + fd->file_def->minor_version = CRAM_MINOR_VERS(fd->version); + if (0 != cram_write_file_def(fd, fd->file_def)) + return -1; + } /* 1.0 requires and UNKNOWN read-group */ - if (fd->version == CRAM_1_VERS) { + if (CRAM_MAJOR_VERS(fd->version) == 1) { if (!sam_hdr_find_rg(hdr, "UNKNOWN")) if (sam_hdr_add(hdr, "RG", "ID", "UNKNOWN", "SM", "UNKNOWN", NULL)) @@ -2996,7 +3514,7 @@ int cram_write_SAM_hdr(cram_fd *fd, SAM_hdr *hdr) { /* Length */ header_len = sam_hdr_length(hdr); - if (fd->version == CRAM_1_VERS) { + if (CRAM_MAJOR_VERS(fd->version) == 1) { if (-1 == int32_encode(fd, header_len)) return -1; @@ -3004,11 +3522,12 @@ int cram_write_SAM_hdr(cram_fd *fd, SAM_hdr *hdr) { if (header_len != hwrite(fd->fp, sam_hdr_str(hdr), header_len)) return -1; } else { - /* Create a block inside a container */ + /* Create block(s) inside a container */ cram_block *b = cram_new_block(FILE_HEADER, 0); cram_container *c = cram_new_container(0, 0); int padded_length; char *pads; + int is_cram_3 = (CRAM_MAJOR_VERS(fd->version) >= 3); if (!b || !c) { if (b) cram_free_block(b); @@ -3020,53 +3539,62 @@ int cram_write_SAM_hdr(cram_fd *fd, SAM_hdr *hdr) { BLOCK_APPEND(b, sam_hdr_str(hdr), header_len); BLOCK_UPLEN(b); -#ifndef BLANK_BLOCK - c->num_blocks = 1; - c->num_landmarks = 1; - if (!(c->landmark = malloc(sizeof(*c->landmark)))) { - cram_free_block(b); - cram_free_container(c); - return -1; - } - c->landmark[0] = 0; + // Compress header block if V3.0 and 
above + if (CRAM_MAJOR_VERS(fd->version) >= 3 && fd->level > 0) { + int method = 1<use_bz2) + method |= 1<use_lzma) + method |= 1<level); + } + + if (blank_block) { + c->length = b->comp_size + 2 + 4*is_cram_3 + + itf8_size(b->content_id) + + itf8_size(b->uncomp_size) + + itf8_size(b->comp_size); - c->length = b->uncomp_size + 2 + - itf8_size(b->content_id) + - itf8_size(b->uncomp_size) + - itf8_size(b->comp_size); -#else - c->length = b->uncomp_size + 2 + - itf8_size(b->content_id) + - itf8_size(b->uncomp_size) + - itf8_size(b->comp_size); + c->num_blocks = 2; + c->num_landmarks = 2; + if (!(c->landmark = malloc(2*sizeof(*c->landmark)))) { + cram_free_block(b); + cram_free_container(c); + return -1; + } + c->landmark[0] = 0; + c->landmark[1] = c->length; - c->num_blocks = 2; - c->num_landmarks = 2; - if (!(c->landmark = malloc(2*sizeof(*c->landmark)))) - return -1; - c->landmark[0] = 0; - c->landmark[1] = c->length; + // Plus extra storage for uncompressed secondary blank block + padded_length = MIN(c->length*.5, 10000); + c->length += padded_length + 2 + 4*is_cram_3 + + itf8_size(b->content_id) + + itf8_size(padded_length)*2; + } else { + // Pad the block instead. 
+ c->num_blocks = 1; + c->num_landmarks = 1; + if (!(c->landmark = malloc(sizeof(*c->landmark)))) + return -1; + c->landmark[0] = 0; - c->length *= 2; -#endif + padded_length = MAX(c->length*1.5, 10000) - c->length; -#ifdef PADDED_BLOCK - padded_length = MAX(c->length*1.5, 10000) - c->length; - c->length += padded_length; - if (NULL == (pads = calloc(1, padded_length))) { - cram_free_block(b); - cram_free_container(c); - return -1; - } - BLOCK_APPEND(b, pads, padded_length); - BLOCK_UPLEN(b); - free(pads); -#endif + c->length = b->comp_size + padded_length + + 2 + 4*is_cram_3 + + itf8_size(b->content_id) + + itf8_size(b->uncomp_size) + + itf8_size(b->comp_size); -#ifdef PADDED_CONTAINER - padded_length = MAX(c->length*2, 10000) - c->length; - c->length += padded_length; -#endif + if (NULL == (pads = calloc(1, padded_length))) { + cram_free_block(b); + cram_free_container(c); + return -1; + } + BLOCK_APPEND(b, pads, padded_length); + BLOCK_UPLEN(b); + free(pads); + } if (-1 == cram_write_container(fd, c)) { cram_free_block(b); @@ -3074,32 +3602,27 @@ int cram_write_SAM_hdr(cram_fd *fd, SAM_hdr *hdr) { return -1; } - // Keep it uncompressed if (-1 == cram_write_block(fd, b)) { cram_free_block(b); cram_free_container(c); return -1; } -#ifdef BLANK_BLOCK - if (-1 == cram_write_block(fd, b)) { - cram_free_block(b); - cram_free_container(c); - return -1; + if (blank_block) { + BLOCK_RESIZE(b, padded_length); + memset(BLOCK_DATA(b), 0, padded_length); + BLOCK_SIZE(b) = padded_length; + BLOCK_UPLEN(b); + b->method = RAW; + if (-1 == cram_write_block(fd, b)) { + cram_free_block(b); + cram_free_container(c); + return -1; + } } -#endif cram_free_block(b); cram_free_container(c); - -#ifdef PADDED_CONTAINER - // Write out padding to allow for in-line SAM header editing - if (NULL == (pads = calloc(1, padded_length))) - return -1; - if (padded_length != hwrite(fd->fp, pads, padded_length)) - return -1; - free(pads); -#endif } if (-1 == refs_from_header(fd->refs, fd, fd->header)) 
@@ -3140,7 +3663,7 @@ static void cram_init_tables(cram_fd *fd) { fd->L2['T'] = 3; fd->L2['t'] = 3; fd->L2['N'] = 4; fd->L2['n'] = 4; - if (fd->version == CRAM_1_VERS) { + if (CRAM_MAJOR_VERS(fd->version) == 1) { for (i = 0; i < 0x200; i++) { int f = 0; @@ -3215,7 +3738,7 @@ static int minor_version = 1; * NULL on failure. */ cram_fd *cram_open(const char *filename, const char *mode) { - cram_FILE *fp; + hFILE *fp; cram_fd *fd; char fmode[3]= { mode[0], '\0', '\0' }; @@ -3223,15 +3746,7 @@ cram_fd *cram_open(const char *filename, const char *mode) { fmode[1] = 'b'; } -#ifdef SAMTOOLS fp = hopen(filename, fmode); -#else - if (strcmp(filename, "-") == 0) { - fp = (*fmode == 'r') ? stdin : stdout; - } else { - fp = fopen(filename, fmode); - } -#endif if (!fp) return NULL; @@ -3246,11 +3761,8 @@ cram_fd *cram_open(const char *filename, const char *mode) { * * Returns file handle on success; * NULL on failure. - * - * cram_FILE is either htslib's hFILE or stdio's FILE, depending on how - * cram_structs.h has been configured. 
*/ -cram_fd *cram_dopen(cram_FILE *fp, const char *filename, const char *mode) { +cram_fd *cram_dopen(hFILE *fp, const char *filename, const char *mode) { int i; char *cp; cram_fd *fd = calloc(1, sizeof(*fd)); @@ -3258,8 +3770,12 @@ cram_fd *cram_dopen(cram_FILE *fp, const char *filename, const char *mode) { return NULL; fd->level = 5; - if (strlen(mode) > 2 && mode[2] >= '0' && mode[2] <= '9') - fd->level = mode[2] - '0'; + for (i = 0; mode[i]; i++) { + if (mode[i] >= '0' && mode[i] <= '9') { + fd->level = mode[i] - '0'; + break; + } + } fd->fp = fp; fd->mode = *mode; @@ -3271,7 +3787,7 @@ cram_fd *cram_dopen(cram_FILE *fp, const char *filename, const char *mode) { if (!(fd->file_def = cram_read_file_def(fd))) goto err; - fd->version = fd->file_def->major_version * 100 + + fd->version = fd->file_def->major_version * 256 + fd->file_def->minor_version; if (!(fd->header = cram_read_SAM_hdr(fd))) @@ -3279,22 +3795,24 @@ cram_fd *cram_dopen(cram_FILE *fp, const char *filename, const char *mode) { } else { /* Writer */ - cram_file_def def; - - def.magic[0] = 'C'; - def.magic[1] = 'R'; - def.magic[2] = 'A'; - def.magic[3] = 'M'; - def.major_version = major_version; - def.minor_version = minor_version; - memset(def.file_id, 0, 20); - strncpy(def.file_id, filename, 20); - if (0 != cram_write_file_def(fd, &def)) - goto err; + cram_file_def *def = calloc(1, sizeof(*def)); + if (!def) + return NULL; - fd->version = def.major_version * 100 + def.minor_version; + fd->file_def = def; - /* SAM header written later */ + def->magic[0] = 'C'; + def->magic[1] = 'R'; + def->magic[2] = 'A'; + def->magic[3] = 'M'; + def->major_version = 0; // Indicator to write file def later. 
+ def->minor_version = 0; + memset(def->file_id, 0, 20); + strncpy(def->file_id, filename, 20); + + fd->version = major_version * 256 + minor_version; + + /* SAM header written later along with this file_def */ } cram_init_tables(fd); @@ -3302,7 +3820,6 @@ cram_fd *cram_dopen(cram_FILE *fp, const char *filename, const char *mode) { fd->prefix = strdup((cp = strrchr(filename, '/')) ? cp+1 : filename); if (!fd->prefix) goto err; - fd->slice_num = 0; fd->first_base = fd->last_base = -1; fd->record_counter = 0; @@ -3321,7 +3838,9 @@ cram_fd *cram_dopen(cram_FILE *fp, const char *filename, const char *mode) { fd->no_ref = 0; fd->ignore_md5 = 0; fd->use_bz2 = 0; - fd->multi_seq = 0; + fd->use_rans = (CRAM_MAJOR_VERS(fd->version) >= 3); + fd->use_lzma = 0; + fd->multi_seq = -1; fd->unsorted = 0; fd->shared_ref = 0; @@ -3331,8 +3850,9 @@ cram_fd *cram_dopen(cram_FILE *fp, const char *filename, const char *mode) { fd->rqueue = NULL; fd->job_pending = NULL; fd->ooc = 0; + fd->required_fields = INT_MAX; - for (i = 0; i < 7; i++) + for (i = 0; i < DS_END; i++) fd->m[i] = cram_new_metrics(); fd->range.refid = -2; // no ref. 
@@ -3363,6 +3883,8 @@ cram_fd *cram_dopen(cram_FILE *fp, const char *filename, const char *mode) { int cram_seek(cram_fd *fd, off_t offset, int whence) { char buf[65536]; + fd->ooc = 0; + if (hseek(fd->fp, offset, whence) >= 0) return 0; @@ -3439,16 +3961,25 @@ int cram_close(cram_fd *fd) { if (fd->mode == 'w') { /* Write EOF block */ - if (30 != hwrite(fd->fp, "\x0b\x00\x00\x00\xff\xff\xff\xff" - "\xff\xe0\x45\x4f\x46\x00\x00\x00" - "\x00\x01\x00\x00\x01\x00\x06\x06" - "\x01\x00\x01\x00\x01\x00", 30)) - return -1; - -// if (1 != fwrite("\x00\x00\x00\x00\xff\xff\xff\xff" -// "\xff\xe0\x45\x4f\x46\x00\x00\x00" -// "\x00\x00\x00", 19, 1, fd->fp)) -// return -1; + if (CRAM_MAJOR_VERS(fd->version) == 3) { + if (38 != hwrite(fd->fp, + "\x0f\x00\x00\x00\xff\xff\xff\xff" // Cont HDR + "\x0f\xe0\x45\x4f\x46\x00\x00\x00" // Cont HDR + "\x00\x01\x00" // Cont HDR + "\x05\xbd\xd9\x4f" // CRC32 + "\x00\x01\x00\x06\x06" // Comp.HDR blk + "\x01\x00\x01\x00\x01\x00" // Comp.HDR blk + "\xee\x63\x01\x4b", // CRC32 + 38)) + return -1; + } else { + if (30 != hwrite(fd->fp, + "\x0b\x00\x00\x00\xff\xff\xff\xff" + "\x0f\xe0\x45\x4f\x46\x00\x00\x00" + "\x00\x01\x00\x00\x01\x00\x06\x06" + "\x01\x00\x01\x00\x01\x00", 30)) + return -1; + } } for (bl = fd->bl; bl; bl = next) { @@ -3463,7 +3994,7 @@ int cram_close(cram_fd *fd) { free(bl); } - if (paranoid_hclose(fd->fp) != 0) + if (hclose(fd->fp) != 0) return -1; if (fd->file_def) @@ -3482,7 +4013,7 @@ int cram_close(cram_fd *fd) { if (fd->ref_free) free(fd->ref_free); - for (i = 0; i < 7; i++) + for (i = 0; i < DS_END; i++) if (fd->m[i]) free(fd->m[i]); @@ -3532,6 +4063,9 @@ int cram_set_option(cram_fd *fd, enum cram_option opt, ...) 
{ int cram_set_voption(cram_fd *fd, enum cram_option opt, va_list args) { refs_t *refs; + if (!fd) + return -1; + switch (opt) { case CRAM_OPT_DECODE_MD: fd->decode_md = va_arg(args, int); @@ -3572,6 +4106,14 @@ int cram_set_voption(cram_fd *fd, enum cram_option opt, va_list args) { fd->use_bz2 = va_arg(args, int); break; + case CRAM_OPT_USE_RANS: + fd->use_rans = va_arg(args, int); + break; + + case CRAM_OPT_USE_LZMA: + fd->use_lzma = va_arg(args, int); + break; + case CRAM_OPT_SHARED_REF: fd->shared_ref = 1; refs = va_arg(args, refs_t *); @@ -3604,6 +4146,10 @@ int cram_set_voption(cram_fd *fd, enum cram_option opt, va_list args) { "use 1.0, 2.0, 2.1 or 3.0\n"); return -1; } + fd->version = major*256 + minor; + + if (CRAM_MAJOR_VERS(fd->version) >= 3) + fd->use_rans = 1; break; } @@ -3643,6 +4189,10 @@ int cram_set_voption(cram_fd *fd, enum cram_option opt, va_list args) { //t_pool_dispatch(fd->pool, cram_decoder_thread, fd); break; + case CRAM_OPT_REQUIRED_FIELDS: + fd->required_fields = va_arg(args, int); + break; + default: fprintf(stderr, "Unknown CRAM option code %d\n", opt); return -1; diff --git a/htslib/cram/cram_io.h b/htslib/cram/cram_io.h index 49073f76..43344c2d 100644 --- a/htslib/cram/cram_io.h +++ b/htslib/cram/cram_io.h @@ -100,6 +100,9 @@ int itf8_put(char *cp, int32_t val); #endif +int ltf8_get(char *cp, int64_t *val_p); +int ltf8_put(char *cp, int64_t val); + /*! Pushes a value in ITF8 format onto the end of a block. 
* * This shouldn't be used for high-volume data as it is not the fastest @@ -179,8 +182,7 @@ int cram_uncompress_block(cram_block *b); * -1 on failure */ int cram_compress_block(cram_fd *fd, cram_block *b, cram_metrics *metrics, - int level, int strat, - int level2, int strat2); + int method, int level); cram_metrics *cram_new_metrics(void); char *cram_block_method2str(enum cram_block_method m); @@ -222,19 +224,84 @@ char *cram_content_type2str(enum cram_content_type t); (b)->data[(b)->byte++] = (c); \ } while (0) -/* Append via sprintf with 1 arg */ -#define BLOCK_APPENDF_1(b,buf,fmt, a1) \ - do { \ - int l = sprintf((buf), (fmt), (a1)); \ - BLOCK_APPEND((b), (buf), l); \ +/* Append a single unsigned integer */ +#define BLOCK_APPEND_UINT(b,i) \ + do { \ + unsigned char *cp; \ + BLOCK_GROW((b),11); \ + cp = &(b)->data[(b)->byte]; \ + (b)->byte += append_uint32(cp, (i)) - cp; \ } while (0) -/* Append via sprintf with 2 args */ -#define BLOCK_APPENDF_2(b,buf,fmt, a1,a2) \ - do { \ - int l = sprintf((buf), (fmt), (a1), (a2)); \ - BLOCK_APPEND((b), (buf), l); \ - } while (0) +static inline unsigned char *append_uint32(unsigned char *cp, uint32_t i) { + uint32_t j; + + if (i == 0) { + *cp++ = '0'; + return cp; + } + + if (i < 100) goto b1; + if (i < 10000) goto b3; + if (i < 1000000) goto b5; + if (i < 100000000) goto b7; + + if ((j = i / 1000000000)) {*cp++ = j + '0'; i -= j*1000000000; goto x8;} + if ((j = i / 100000000)) {*cp++ = j + '0'; i -= j*100000000; goto x7;} + b7:if ((j = i / 10000000)) {*cp++ = j + '0'; i -= j*10000000; goto x6;} + if ((j = i / 1000000)) {*cp++ = j + '0', i -= j*1000000; goto x5;} + b5:if ((j = i / 100000)) {*cp++ = j + '0', i -= j*100000; goto x4;} + if ((j = i / 10000)) {*cp++ = j + '0', i -= j*10000; goto x3;} + b3:if ((j = i / 1000)) {*cp++ = j + '0', i -= j*1000; goto x2;} + if ((j = i / 100)) {*cp++ = j + '0', i -= j*100; goto x1;} + b1:if ((j = i / 10)) {*cp++ = j + '0', i -= j*10; goto x0;} + if (i) *cp++ = i + '0'; + return cp; + + 
x8: *cp++ = i / 100000000 + '0', i %= 100000000; + x7: *cp++ = i / 10000000 + '0', i %= 10000000; + x6: *cp++ = i / 1000000 + '0', i %= 1000000; + x5: *cp++ = i / 100000 + '0', i %= 100000; + x4: *cp++ = i / 10000 + '0', i %= 10000; + x3: *cp++ = i / 1000 + '0', i %= 1000; + x2: *cp++ = i / 100 + '0', i %= 100; + x1: *cp++ = i / 10 + '0', i %= 10; + x0: *cp++ = i + '0'; + + return cp; +} + +static inline unsigned char *append_sub32(unsigned char *cp, uint32_t i) { + *cp++ = i / 100000000 + '0', i %= 100000000; + *cp++ = i / 10000000 + '0', i %= 10000000; + *cp++ = i / 1000000 + '0', i %= 1000000; + *cp++ = i / 100000 + '0', i %= 100000; + *cp++ = i / 10000 + '0', i %= 10000; + *cp++ = i / 1000 + '0', i %= 1000; + *cp++ = i / 100 + '0', i %= 100; + *cp++ = i / 10 + '0', i %= 10; + *cp++ = i + '0'; + + return cp; +} + +static inline unsigned char *append_uint64(unsigned char *cp, uint64_t i) { + uint64_t j; + + if (i <= 0xffffffff) + return append_uint32(cp, i); + + if ((j = i/1000000000) > 1000000000) { + cp = append_uint32(cp, j/1000000000); + j %= 1000000000; + cp = append_sub32(cp, j); + } else { + cp = append_uint32(cp, i / 1000000000); + } + cp = append_sub32(cp, i % 1000000000); + + return cp; +} #define BLOCK_UPLEN(b) \ (b)->comp_size = (b)->uncomp_size = BLOCK_SIZE((b)) @@ -449,11 +516,8 @@ cram_fd *cram_open(const char *filename, const char *mode); * @return * Returns file handle on success; * NULL on failure. - * - * cram_FILE is either htslib's hFILE or stdio's FILE, depending on how - * cram_structs.h has been configured. */ -cram_fd *cram_dopen(cram_FILE *fp, const char *filename, const char *mode); +cram_fd *cram_dopen(struct hFILE *fp, const char *filename, const char *mode); /*! Closes a CRAM file. 
* diff --git a/htslib/cram/cram_samtools.c b/htslib/cram/cram_samtools.c index 66f2efa4..27c54e55 100644 --- a/htslib/cram/cram_samtools.c +++ b/htslib/cram/cram_samtools.c @@ -112,7 +112,10 @@ int bam_construct_seq(bam_seq_t **bp, size_t extra_len, if (i < len) *cp++ = L[(uc)seq[i]]<<4; - memcpy(cp, qual, len); + if (qual) + memcpy(cp, qual, len); + else + memset(cp, '\xff', len); return 0; } diff --git a/htslib/cram/cram_stats.c b/htslib/cram/cram_stats.c index 18d06056..9551f00d 100644 --- a/htslib/cram/cram_stats.c +++ b/htslib/cram/cram_stats.c @@ -209,31 +209,124 @@ enum cram_encoding cram_stats_encoding(cram_fd *fd, cram_stats *st) { return E_HUFFMAN; } + if (fd->verbose > 1) + fprintf(stderr, "Range = %d..%d, nvals=%d, ntot=%d\n", + min_val, max_val, nvals, ntot); + + /* Theoretical entropy */ +// if (fd->verbose > 1) { +// double dbits = 0; +// for (i = 0; i < nvals; i++) { +// dbits += freqs[i] * log((double)freqs[i]/ntot); +// } +// dbits /= -log(2); +// if (fd->verbose > 1) +// fprintf(stderr, "Entropy = %f\n", dbits); +// } + + if (nvals > 1 && ntot > 256) { +#if 0 + /* + * CRUDE huffman estimator. Round to closest and round up from 0 + * to 1 bit. + * + * With and without ITF8 in case we have a few discrete values but with + * large magnitude. + * + * Note rans0/arith0 and Z_HUFFMAN_ONLY vs internal huffman can be + * compared in this way, but order-1 (eg rans1) or maybe LZ77 modes + * may detect the correlation of high bytes to low bytes in multi- + * byte values. So this predictor breaks down. + */ + double dbits = 0; // entropy + ~huffman + double dbitsH = 0; + double dbitsE = 0; // external entropy + ~huffman + double dbitsEH = 0; + int F[256] = {0}, n = 0; + double e = 0; // accumulated error bits + for (i = 0; i < nvals; i++) { + double x; int X; + unsigned int v = vals[i]; + + //Better encoding would cope with sign. 
+ //v = ABS(vals[i])*2+(vals[i]<0); + + if (!(v & ~0x7f)) { + F[v] += freqs[i], n+=freqs[i]; + } else if (!(v & ~0x3fff)) { + F[(v>>8) |0x80] += freqs[i]; + F[ v &0xff] += freqs[i], n+=2*freqs[i]; + } else if (!(v & ~0x1fffff)) { + F[(v>>16)|0xc0] += freqs[i]; + F[(v>>8 )&0xff] += freqs[i]; + F[ v &0xff] += freqs[i], n+=3*freqs[i]; + } else if (!(v & ~0x0fffffff)) { + F[(v>>24)|0xe0] += freqs[i]; + F[(v>>16)&0xff] += freqs[i]; + F[(v>>8 )&0xff] += freqs[i]; + F[ v &0xff] += freqs[i], n+=4*freqs[i]; + } else { + F[(v>>28)|0xf0] += freqs[i]; + F[(v>>20)&0xff] += freqs[i]; + F[(v>>12)&0xff] += freqs[i]; + F[(v>>4 )&0xff] += freqs[i]; + F[ v &0x0f] += freqs[i], n+=5*freqs[i]; + } + + x = -log((double)freqs[i]/ntot)/.69314718055994530941; + X = x+0.5; + if ((int)(x+((double)e/freqs[i])+.5)>X) { + X++; + } else if ((int)(x+((double)e/freqs[i])+.5) 1.1) { + //fprintf(stderr, "=> %d < 200 ? E_HUFFMAN : E_BETA\n", nvals); + free(vals); free(freqs); + return nvals < 200 ? E_HUFFMAN : E_BETA; + } +#endif + free(vals); free(freqs); + return E_EXTERNAL; + } + /* * Avoid complex stats for now, just do heuristic of HUFFMAN for small * alphabets and BETA for anything large. */ free(vals); free(freqs); return nvals < 200 ? E_HUFFMAN : E_BETA; + //return E_HUFFMAN; + //return E_EXTERNAL; + /* We only support huffman now anyway... 
*/ //free(vals); free(freqs); return E_HUFFMAN; - if (fd->verbose > 1) - fprintf(stderr, "Range = %d..%d, nvals=%d, ntot=%d\n", - min_val, max_val, nvals, ntot); - - /* Theoretical entropy */ - { - double dbits = 0; - for (i = 0; i < nvals; i++) { - dbits += freqs[i] * log((double)freqs[i]/ntot); - } - dbits /= -log(2); - if (fd->verbose > 1) - fprintf(stderr, "Entropy = %f\n", dbits); - } - /* Beta */ bits = nbits(max_val - min_val) * ntot; if (fd->verbose > 1) diff --git a/htslib/cram/cram_structs.h b/htslib/cram/cram_structs.h index 6d3f1a13..ab9f5bf0 100644 --- a/htslib/cram/cram_structs.h +++ b/htslib/cram/cram_structs.h @@ -53,11 +53,8 @@ extern "C" { #include #include "cram/thread_pool.h" - -#ifdef SAMTOOLS -// From within samtools/HTSlib -# include "cram/string_alloc.h" -# include "htslib/khash.h" +#include "cram/string_alloc.h" +#include "htslib/khash.h" // Generic hash-map integer -> integer KHASH_MAP_INIT_INT(m_i2i, int) @@ -82,24 +79,12 @@ typedef union { KHASH_MAP_INIT_STR(map, pmap_t) struct hFILE; -typedef struct hFILE cram_FILE; - -#else -// From within io_lib -# include "cram/bam.h" // For BAM header parsing -typedef FILE cram_FILE; -#endif #define SEQS_PER_SLICE 10000 #define SLICE_PER_CNT 1 #define CRAM_SUBST_MATRIX "CGTNAGTNACTNACGNACGT" -#define TN_external -//#define NS_external -#define TS_external -//#define BA_external - #define MAX_STAT_VAL 1024 //#define MAX_STAT_VAL 16 typedef struct { @@ -131,6 +116,63 @@ enum cram_external_type { E_BYTE_ARRAY_BLOCK = 5, }; +/* External IDs used by this implementation (only assumed during writing) */ +enum cram_DS_ID { + DS_CORE = 0, + DS_aux = 1, // aux_blk + DS_aux_OQ = 2, + DS_aux_BQ = 3, + DS_aux_BD = 4, + DS_aux_BI = 5, + DS_aux_FZ = 6, // also ZM:B + DS_aux_oq = 7, // other qualities + DS_aux_os = 8, // other sequences + DS_aux_oz = 9, // other strings + DS_ref, + DS_RN, // name_blk + DS_QS, // qual_blk + DS_IN, // base_blk + DS_SC, // soft_blk + + DS_BF, // start loop + DS_CF, + DS_AP, + DS_RG, + 
DS_MQ, + DS_NS, + DS_MF, + DS_TS, + DS_NP, + DS_NF, + DS_RL, + DS_FN, + DS_FC, + DS_FP, + DS_DL, + DS_BA, + DS_BS, + DS_TL, + DS_RI, + DS_RS, + DS_PD, + DS_HC, + DS_BB, + DS_QQ, + + DS_TN, // end loop + + DS_RN_len, + DS_SC_len, + DS_BB_len, + DS_QQ_len, + + DS_TC, // CRAM v1.0 tags + DS_TM, // test + DS_TV, // test + + DS_END, +}; + /* "File Definition Structure" */ typedef struct { char magic[4]; @@ -139,16 +181,21 @@ typedef struct { char file_id[20]; // Filename or SHA1 checksum } cram_file_def; -#define CRAM_1_VERS 100 // 1.0 -#define CRAM_2_VERS 200 // 1.1, or 2.0? +#define CRAM_MAJOR_VERS(v) ((v) >> 8) +#define CRAM_MINOR_VERS(v) ((v) & 0xff) struct cram_slice; enum cram_block_method { - BM_ERROR = -1, - RAW = 0, - GZIP = 1, - BZIP2 = 2, + ERROR = -1, + RAW = 0, + GZIP = 1, + BZIP2 = 2, + LZMA = 3, + RANS = 4, // Generic; either order + RANS0 = 4, + RANS1 = 10, // Not externalised; stored as RANS (generic) + GZIP_RLE = 11, // NB: not externalised in CRAM }; enum cram_content_type { @@ -156,17 +203,44 @@ enum cram_content_type { FILE_HEADER = 0, COMPRESSION_HEADER = 1, MAPPED_SLICE = 2, - UNMAPPED_SLICE = 3, // CRAM_1_VERS only + UNMAPPED_SLICE = 3, // CRAM V1.0 only EXTERNAL = 4, CORE = 5, }; /* Compression metrics */ typedef struct { - int m1; - int m2; + // number of trials and time to next trial int trial; int next_trial; + + // aggregate sizes during trials + int sz_gz_rle; + int sz_gz_def; + int sz_rans0; + int sz_rans1; + int sz_bzip2; + int sz_lzma; + + // resultant method from trials + int method; + int strat; + + // Revisions of method, to allow culling of continually failing ones. 
+ int gz_rle_cnt; + int gz_def_cnt; + int rans0_cnt; + int rans1_cnt; + int bzip2_cnt; + int lzma_cnt; + int revised_method; + + double gz_rle_extra; + double gz_def_extra; + double rans0_extra; + double rans1_extra; + double bzip2_extra; + double lzma_extra; } cram_metrics; /* Block */ @@ -176,6 +250,7 @@ typedef struct { int32_t content_id; int32_t comp_size; int32_t uncomp_size; + uint32_t crc32; int32_t idx; /* offset into data */ unsigned char *data; @@ -221,40 +296,12 @@ typedef struct { struct cram_map *rec_encoding_map[CRAM_MAP_HASH]; struct cram_map *tag_encoding_map[CRAM_MAP_HASH]; - struct cram_codec *BF_codec; // bam bit flags - struct cram_codec *CF_codec; // compression flags - struct cram_codec *RL_codec; // read length - struct cram_codec *AP_codec; // alignment pos - struct cram_codec *RG_codec; // read group - struct cram_codec *MF_codec; // mate flags - struct cram_codec *NS_codec; // next frag ref ID - struct cram_codec *NP_codec; // next frag pos - struct cram_codec *TS_codec; // template size - struct cram_codec *NF_codec; // next frag distance - struct cram_codec *TC_codec; // tag count CRAM_1_VERS - struct cram_codec *TN_codec; // tag name/type CRAM_1_VERS - struct cram_codec *TL_codec; // tag line CRAM_2_VERS - struct cram_codec *FN_codec; // no. 
features - struct cram_codec *FC_codec; // feature code - struct cram_codec *FP_codec; // feature pos - struct cram_codec *BS_codec; // base subst feature - struct cram_codec *IN_codec; // insertion feature - struct cram_codec *SC_codec; // soft-clip feature - struct cram_codec *DL_codec; // deletion len feature - struct cram_codec *BA_codec; // base feature - struct cram_codec *RS_codec; // ref skip length feature - struct cram_codec *PD_codec; // padding length feature - struct cram_codec *HC_codec; // hard clip length feature - struct cram_codec *MQ_codec; // mapping quality - struct cram_codec *RN_codec; // read names - struct cram_codec *QS_codec; // quality value (single) - struct cram_codec *Qs_codec; // quality values (string) - struct cram_codec *RI_codec; // ref ID - struct cram_codec *TM_codec; // ? - struct cram_codec *TV_codec; // ? + struct cram_codec *codecs[DS_END]; char *uncomp; // A single block of uncompressed data size_t uncomp_size, uncomp_alloc; + + unsigned int data_series; // See cram_fields enum below } cram_block_compression_hdr; typedef struct cram_map { @@ -273,7 +320,7 @@ typedef struct { int32_t ref_seq_start; /* if content_type == MAPPED_SLICE */ int32_t ref_seq_span; /* if content_type == MAPPED_SLICE */ int32_t num_records; - int32_t record_counter; + int64_t record_counter; int32_t num_blocks; int32_t num_content_ids; int32_t *block_content_ids; @@ -297,7 +344,7 @@ typedef struct { int32_t ref_seq_id; int32_t ref_seq_start; int32_t ref_seq_span; - int32_t record_counter; + int64_t record_counter; int64_t num_bases; int32_t num_records; int32_t num_blocks; @@ -334,37 +381,12 @@ typedef struct { bam_seq_t **bams; /* Statistics for encoding */ - cram_stats *TS_stats; - cram_stats *RG_stats; - cram_stats *FP_stats; - cram_stats *NS_stats; - cram_stats *RN_stats; - cram_stats *CF_stats; - cram_stats *TN_stats; - cram_stats *BA_stats; - cram_stats *TV_stats; - cram_stats *BS_stats; - cram_stats *FC_stats; - cram_stats *BF_stats; - 
cram_stats *AP_stats; - cram_stats *NF_stats; - cram_stats *MF_stats; - cram_stats *FN_stats; - cram_stats *RL_stats; - cram_stats *DL_stats; - cram_stats *TC_stats; - cram_stats *TL_stats; - cram_stats *MQ_stats; - cram_stats *TM_stats; - cram_stats *QS_stats; - cram_stats *NP_stats; - cram_stats *RI_stats; - cram_stats *RS_stats; - cram_stats *PD_stats; - cram_stats *HC_stats; + cram_stats *stats[DS_END]; khash_t(s_i2i) *tags_used; // set of tag types in use, for tag encoding map int *refs_used; // array of frequency of ref seq IDs + + uint32_t crc32; // CRC32 } cram_container; /* @@ -437,6 +459,12 @@ typedef struct { int base; // actual base & qual int qual; } B; + struct { + int pos; + int code; + int seq_idx; // index to s->seqs_blk + int len; + } b; struct { int pos; int code; @@ -496,9 +524,6 @@ typedef struct cram_slice { /* State used during encoding/decoding */ int last_apos, max_apos; - /* Identifier used for auto-assigning read names */ - uint64_t id; - /* Array of decoded cram records */ cram_record *crecs; @@ -508,12 +533,6 @@ typedef struct cram_slice { uint32_t *cigar; uint32_t cigar_alloc; uint32_t ncigar; - cram_block *name_blk; - cram_block *seqs_blk; - cram_block *qual_blk; - cram_block *aux_blk; - cram_block *base_blk; // substitutions (soft-clips for 1.0) - cram_block *soft_blk; // soft-clips cram_feature *features; int nfeatures; @@ -528,17 +547,28 @@ typedef struct cram_slice { int tn_id; #endif + // For variable sized elements which are always external blocks. + cram_block *name_blk; + cram_block *seqs_blk; + cram_block *qual_blk; + cram_block *base_blk; + cram_block *soft_blk; + cram_block *aux_blk; + cram_block *aux_OQ_blk; + cram_block *aux_BQ_blk; + cram_block *aux_BD_blk; + cram_block *aux_BI_blk; + cram_block *aux_FZ_blk; + cram_block *aux_oq_blk; + cram_block *aux_os_blk; + cram_block *aux_oz_blk; + string_alloc_t *pair_keys; // Pooled keys for pair hash. - khash_t(m_s2i) *pair; // for identifying read-pairs in this slice. 
+ khash_t(m_s2i) *pair[2]; // for identifying read-pairs in this slice. char *ref; // slice of current reference int ref_start; // start position of current reference; int ref_end; // end position of current reference; - -#ifdef BA_external - int BA_len; - int ba_id; -#endif int ref_id; } cram_slice; @@ -568,7 +598,7 @@ typedef struct { int nref; // number of ref_entry char *fn; // current file opened - FILE *fp; // and the FILE* to go with it. + BGZF *fp; // and the BGZF* to go with it. int count; // how many cram_fd sharing this refs struct @@ -620,15 +650,14 @@ typedef struct spare_bams { } spare_bams; typedef struct cram_fd { - cram_FILE *fp; + struct hFILE *fp; int mode; // 'r' or 'w' int version; cram_file_def *file_def; SAM_hdr *header; char *prefix; - int record_counter; - int slice_num; + int64_t record_counter; int err; // Most recent compression header decoded @@ -651,7 +680,7 @@ typedef struct cram_fd { // compression level and metrics int level; - cram_metrics *m[7]; + cram_metrics *m[DS_END]; // options int decode_md; // Whether to export MD and NM tags @@ -662,7 +691,10 @@ typedef struct cram_fd { int no_ref; int ignore_md5; int use_bz2; + int use_rans; + int use_lzma; int shared_ref; + unsigned int required_fields; cram_range range; // lookup tables, stored here so we can be trivially multi-threaded @@ -693,25 +725,52 @@ typedef struct cram_fd { int ooc; // out of containers. 
} cram_fd; -enum cram_option { - CRAM_OPT_DECODE_MD, - CRAM_OPT_PREFIX, - CRAM_OPT_VERBOSITY, - CRAM_OPT_SEQS_PER_SLICE, - CRAM_OPT_SLICES_PER_CONTAINER, - CRAM_OPT_RANGE, - CRAM_OPT_VERSION, - CRAM_OPT_EMBED_REF, - CRAM_OPT_IGNORE_MD5, - CRAM_OPT_REFERENCE, - CRAM_OPT_MULTI_SEQ_PER_SLICE, - CRAM_OPT_NO_REF, - CRAM_OPT_USE_BZIP2, - CRAM_OPT_SHARED_REF, - CRAM_OPT_NTHREADS, - CRAM_OPT_THREAD_POOL, +// Translation of required fields to cram data series +enum cram_fields { + CRAM_BF = 0x00000001, + CRAM_AP = 0x00000002, + CRAM_FP = 0x00000004, + CRAM_RL = 0x00000008, + CRAM_DL = 0x00000010, + CRAM_NF = 0x00000020, + CRAM_BA = 0x00000040, + CRAM_QS = 0x00000080, + CRAM_FC = 0x00000100, + CRAM_FN = 0x00000200, + CRAM_BS = 0x00000400, + CRAM_IN = 0x00000800, + CRAM_RG = 0x00001000, + CRAM_MQ = 0x00002000, + CRAM_TL = 0x00004000, + CRAM_RN = 0x00008000, + CRAM_NS = 0x00010000, + CRAM_NP = 0x00020000, + CRAM_TS = 0x00040000, + CRAM_MF = 0x00080000, + CRAM_CF = 0x00100000, + CRAM_RI = 0x00200000, + CRAM_RS = 0x00400000, + CRAM_PD = 0x00800000, + CRAM_HC = 0x01000000, + CRAM_SC = 0x02000000, + CRAM_BB = 0x04000000, + CRAM_BB_len = 0x08000000, + CRAM_QQ = 0x10000000, + CRAM_QQ_len = 0x20000000, + CRAM_aux= 0x40000000, + CRAM_ALL= 0x7fffffff, }; +// A CIGAR opcode, but not necessarily the implications of it. Eg FC/FP may +// encode a base difference, but we don't need to know what it is for CIGAR. +// If we have a soft-clip or insertion, we do need SC/IN though to know how +// long that array is. +#define CRAM_CIGAR (CRAM_FN | CRAM_FP | CRAM_FC | CRAM_DL | CRAM_IN | \ + CRAM_SC | CRAM_HC | CRAM_PD | CRAM_RS | CRAM_RL | CRAM_BF) + +#define CRAM_SEQ (CRAM_CIGAR | CRAM_BA | CRAM_QS | CRAM_BS | \ + CRAM_RL | CRAM_AP | CRAM_BB | CRAM_QQ) + /* BF bitfields */ /* Corrected in 1.1. 
Use bam_flag_swap[bf] and BAM_* macros for 1.0 & 1.1 */ #define CRAM_FPAIRED 256 @@ -724,6 +783,16 @@ enum cram_option { #define CRAM_FQCFAIL 2 #define CRAM_FDUP 1 +#define DS_aux_S "\001" +#define DS_aux_OQ_S "\002" +#define DS_aux_BQ_S "\003" +#define DS_aux_BD_S "\004" +#define DS_aux_BI_S "\005" +#define DS_aux_FZ_S "\006" +#define DS_aux_oq_S "\007" +#define DS_aux_os_S "\010" +#define DS_aux_oz_S "\011" + #define CRAM_M_REVERSE 1 #define CRAM_M_UNMAP 2 @@ -733,18 +802,6 @@ enum cram_option { #define CRAM_FLAG_DETACHED (1<<1) #define CRAM_FLAG_MATE_DOWNSTREAM (1<<2) -/* External IDs used by this implementation (only assumed during writing) */ -#define CRAM_EXT_IN 0 -#define CRAM_EXT_QUAL 1 -#define CRAM_EXT_NAME 2 -#define CRAM_EXT_TS_NP 3 -#define CRAM_EXT_TAG 4 -#define CRAM_EXT_TAG_S "\004" -#define CRAM_EXT_BA 5 -#define CRAM_EXT_TN 6 -#define CRAM_EXT_SC 7 -#define CRAM_EXT_REF 8 - #ifdef __cplusplus } #endif diff --git a/htslib/cram/os.h b/htslib/cram/os.h index b2affe0b..22d80964 100644 --- a/htslib/cram/os.h +++ b/htslib/cram/os.h @@ -225,10 +225,12 @@ extern "C" { */ #ifdef SP_BIG_ENDIAN #define le_int4(x) iswap_int4((x)) +#define le_int2(x) iswap_int2((x)) #endif #ifdef SP_LITTLE_ENDIAN #define le_int4(x) (x) +#define le_int2(x) (x) #endif /*----------------------------------------------------------------------------- diff --git a/htslib/cram/rANS_byte.h b/htslib/cram/rANS_byte.h new file mode 100644 index 00000000..c61ed9d1 --- /dev/null +++ b/htslib/cram/rANS_byte.h @@ -0,0 +1,336 @@ +/* rans_byte.h originally from https://github.com/rygorous/ryg_rans + * + * This is a public-domain implementation of several rANS variants. rANS is an + * entropy coder from the ANS family, as described in Jarek Duda's paper + * "Asymmetric numeral systems" (http://arxiv.org/abs/1311.2540). 
+ */ + +/*-------------------------------------------------------------------------- */ + +// Simple byte-aligned rANS encoder/decoder - public domain - Fabian 'ryg' Giesen 2014 +// +// Not intended to be "industrial strength"; just meant to illustrate the general +// idea. + +#ifndef RANS_BYTE_HEADER +#define RANS_BYTE_HEADER + +#include + +#ifdef assert +#define RansAssert assert +#else +#define RansAssert(x) +#endif + +// READ ME FIRST: +// +// This is designed like a typical arithmetic coder API, but there's three +// twists you absolutely should be aware of before you start hacking: +// +// 1. You need to encode data in *reverse* - last symbol first. rANS works +// like a stack: last in, first out. +// 2. Likewise, the encoder outputs bytes *in reverse* - that is, you give +// it a pointer to the *end* of your buffer (exclusive), and it will +// slowly move towards the beginning as more bytes are emitted. +// 3. Unlike basically any other entropy coder implementation you might +// have used, you can interleave data from multiple independent rANS +// encoders into the same bytestream without any extra signaling; +// you can also just write some bytes by yourself in the middle if +// you want to. This is in addition to the usual arithmetic encoder +// property of being able to switch models on the fly. Writing raw +// bytes can be useful when you have some data that you know is +// incompressible, and is cheaper than going through the rANS encode +// function. Using multiple rANS coders on the same byte stream wastes +// a few bytes compared to using just one, but execution of two +// independent encoders can happen in parallel on superscalar and +// Out-of-Order CPUs, so this can be *much* faster in tight decoding +// loops. +// +// This is why all the rANS functions take the write pointer as an +// argument instead of just storing it in some context struct. 
+ +// -------------------------------------------------------------------------- + +// L ('l' in the paper) is the lower bound of our normalization interval. +// Between this and our byte-aligned emission, we use 31 (not 32!) bits. +// This is done intentionally because exact reciprocals for 31-bit uints +// fit in 32-bit uints: this permits some optimizations during encoding. +#define RANS_BYTE_L (1u << 23) // lower bound of our normalization interval + +// State for a rANS encoder. Yep, that's all there is to it. +typedef uint32_t RansState; + +// Initialize a rANS encoder. +static inline void RansEncInit(RansState* r) +{ + *r = RANS_BYTE_L; +} + +// Renormalize the encoder. Internal function. +static inline RansState RansEncRenorm(RansState x, uint8_t** pptr, uint32_t freq, uint32_t scale_bits) +{ + uint32_t x_max = ((RANS_BYTE_L >> scale_bits) << 8) * freq; // this turns into a shift. + if (x >= x_max) { + uint8_t* ptr = *pptr; + do { + *--ptr = (uint8_t) (x & 0xff); + x >>= 8; + } while (x >= x_max); + *pptr = ptr; + } + return x; +} + +// Encodes a single symbol with range start "start" and frequency "freq". +// All frequencies are assumed to sum to "1 << scale_bits", and the +// resulting bytes get written to ptr (which is updated). +// +// NOTE: With rANS, you need to encode symbols in *reverse order*, i.e. from +// end to beginning! Likewise, the output bytestream is written *backwards*: +// ptr starts pointing at the end of the output buffer and keeps decrementing. +static inline void RansEncPut(RansState* r, uint8_t** pptr, uint32_t start, uint32_t freq, uint32_t scale_bits) +{ + // renormalize + RansState x = RansEncRenorm(*r, pptr, freq, scale_bits); + + // x = C(s,x) + *r = ((x / freq) << scale_bits) + (x % freq) + start; +} + +// Flushes the rANS encoder. 
+static inline void RansEncFlush(RansState* r, uint8_t** pptr) +{ + uint32_t x = *r; + uint8_t* ptr = *pptr; + + ptr -= 4; + ptr[0] = (uint8_t) (x >> 0); + ptr[1] = (uint8_t) (x >> 8); + ptr[2] = (uint8_t) (x >> 16); + ptr[3] = (uint8_t) (x >> 24); + + *pptr = ptr; +} + +// Initializes a rANS decoder. +// Unlike the encoder, the decoder works forwards as you'd expect. +static inline void RansDecInit(RansState* r, uint8_t** pptr) +{ + uint32_t x; + uint8_t* ptr = *pptr; + + x = ptr[0] << 0; + x |= ptr[1] << 8; + x |= ptr[2] << 16; + x |= ptr[3] << 24; + ptr += 4; + + *pptr = ptr; + *r = x; +} + +// Returns the current cumulative frequency (map it to a symbol yourself!) +static inline uint32_t RansDecGet(RansState* r, uint32_t scale_bits) +{ + return *r & ((1u << scale_bits) - 1); +} + +// Advances in the bit stream by "popping" a single symbol with range start +// "start" and frequency "freq". All frequencies are assumed to sum to "1 << scale_bits", +// and any bytes needed to renormalize are read from ptr (which is updated). +static inline void RansDecAdvance(RansState* r, uint8_t** pptr, uint32_t start, uint32_t freq, uint32_t scale_bits) +{ + uint32_t mask = (1u << scale_bits) - 1; + + // s, x = D(x) + uint32_t x = *r; + x = freq * (x >> scale_bits) + (x & mask) - start; + + // renormalize + if (x < RANS_BYTE_L) { + uint8_t* ptr = *pptr; + do x = (x << 8) | *ptr++; while (x < RANS_BYTE_L); + *pptr = ptr; + } + + *r = x; +} + +// -------------------------------------------------------------------------- + +// That's all you need for a full encoder; below here are some utility +// functions with extra convenience or optimizations. + +// Encoder symbol description +// This (admittedly odd) selection of parameters was chosen to make +// RansEncPutSymbol as cheap as possible. 
+typedef struct { + uint32_t x_max; // (Exclusive) upper bound of pre-normalization interval + uint32_t rcp_freq; // Fixed-point reciprocal frequency + uint32_t bias; // Bias + uint16_t cmpl_freq; // Complement of frequency: (1 << scale_bits) - freq + uint16_t rcp_shift; // Reciprocal shift +} RansEncSymbol; + +// Decoder symbols are straightforward. +typedef struct { + uint16_t start; // Start of range. + uint16_t freq; // Symbol frequency. +} RansDecSymbol; + +// Initializes an encoder symbol to start "start" and frequency "freq" +static inline void RansEncSymbolInit(RansEncSymbol* s, uint32_t start, uint32_t freq, uint32_t scale_bits) +{ + RansAssert(scale_bits <= 16); + RansAssert(start <= (1u << scale_bits)); + RansAssert(freq <= (1u << scale_bits) - start); + + // Say M := 1 << scale_bits. + // + // The original encoder does: + // x_new = (x/freq)*M + start + (x%freq) + // + // The fast encoder does (schematically): + // q = mul_hi(x, rcp_freq) >> rcp_shift (division) + // r = x - q*freq (remainder) + // x_new = q*M + bias + r (new x) + // plugging in r into x_new yields: + // x_new = bias + x + q*(M - freq) + // =: bias + x + q*cmpl_freq (*) + // + // and we can just precompute cmpl_freq. Now we just need to + // set up our parameters such that the original encoder and + // the fast encoder agree. + + s->x_max = ((RANS_BYTE_L >> scale_bits) << 8) * freq; + s->cmpl_freq = (uint16_t) ((1 << scale_bits) - freq); + if (freq < 2) { + // freq=0 symbols are never valid to encode, so it doesn't matter what + // we set our values to. + // + // freq=1 is tricky, since the reciprocal of 1 is 1; unfortunately, + // our fixed-point reciprocal approximation can only multiply by values + // smaller than 1. + // + // So we use the "next best thing": rcp_freq=0xffffffff, rcp_shift=0. 
+ // This gives: + // q = mul_hi(x, rcp_freq) >> rcp_shift + // = mul_hi(x, (1<<32) - 1)) >> 0 + // = floor(x - x/(2^32)) + // = x - 1 if 1 <= x < 2^32 + // and we know that x>0 (x=0 is never in a valid normalization interval). + // + // So we now need to choose the other parameters such that + // x_new = x*M + start + // plug it in: + // x*M + start (desired result) + // = bias + x + q*cmpl_freq (*) + // = bias + x + (x - 1)*(M - 1) (plug in q=x-1, cmpl_freq) + // = bias + 1 + (x - 1)*M + // = x*M + (bias + 1 - M) + // + // so we have start = bias + 1 - M, or equivalently + // bias = start + M - 1. + s->rcp_freq = ~0u; + s->rcp_shift = 0; + s->bias = start + (1 << scale_bits) - 1; + } else { + // Alverson, "Integer Division using reciprocals" + // shift=ceil(log2(freq)) + uint32_t shift = 0; + while (freq > (1u << shift)) + shift++; + + s->rcp_freq = (uint32_t) (((1ull << (shift + 31)) + freq-1) / freq); + s->rcp_shift = shift - 1; + + // With these values, 'q' is the correct quotient, so we + // have bias=start. + s->bias = start; + } + + s->rcp_shift += 32; // Avoid the extra >>32 in RansEncPutSymbol +} + +// Initialize a decoder symbol to start "start" and frequency "freq" +static inline void RansDecSymbolInit(RansDecSymbol* s, uint32_t start, uint32_t freq) +{ + RansAssert(start <= (1 << 16)); + RansAssert(freq <= (1 << 16) - start); + s->start = (uint16_t) start; + s->freq = (uint16_t) freq; +} + +// Encodes a given symbol. This is faster than straight RansEnc since we can do +// multiplications instead of a divide. +// +// See RansEncSymbolInit for a description of how this works. 
+static inline void RansEncPutSymbol(RansState* r, uint8_t** pptr, RansEncSymbol const* sym) +{ + RansAssert(sym->x_max != 0); // can't encode symbol with freq=0 + + // renormalize + uint32_t x = *r; + uint32_t x_max = sym->x_max; + + if (x >= x_max) { + uint8_t* ptr = *pptr; + do { + *--ptr = (uint8_t) (x & 0xff); + x >>= 8; + } while (x >= x_max); + *pptr = ptr; + } + + // x = C(s,x) + // NOTE: written this way so we get a 32-bit "multiply high" when + // available. If you're on a 64-bit platform with cheap multiplies + // (e.g. x64), just bake the +32 into rcp_shift. + //uint32_t q = (uint32_t) (((uint64_t)x * sym->rcp_freq) >> 32) >> sym->rcp_shift; + + // The extra >>32 has already been added to RansEncSymbolInit + uint32_t q = (uint32_t) (((uint64_t)x * sym->rcp_freq) >> sym->rcp_shift); + *r = x + sym->bias + q * sym->cmpl_freq; +} + +// Equivalent to RansDecAdvance that takes a symbol. +static inline void RansDecAdvanceSymbol(RansState* r, uint8_t** pptr, RansDecSymbol const* sym, uint32_t scale_bits) +{ + RansDecAdvance(r, pptr, sym->start, sym->freq, scale_bits); +} + +// Advances in the bit stream by "popping" a single symbol with range start +// "start" and frequency "freq". All frequencies are assumed to sum to "1 << scale_bits". +// No renormalization or output happens. +static inline void RansDecAdvanceStep(RansState* r, uint32_t start, uint32_t freq, uint32_t scale_bits) +{ + uint32_t mask = (1u << scale_bits) - 1; + + // s, x = D(x) + uint32_t x = *r; + *r = freq * (x >> scale_bits) + (x & mask) - start; +} + +// Equivalent to RansDecAdvanceStep that takes a symbol. +static inline void RansDecAdvanceSymbolStep(RansState* r, RansDecSymbol const* sym, uint32_t scale_bits) +{ + RansDecAdvanceStep(r, sym->start, sym->freq, scale_bits); +} + +// Renormalize. 
+static inline void RansDecRenorm(RansState* r, uint8_t** pptr) +{ + // renormalize + uint32_t x = *r; + + if (x < RANS_BYTE_L) { + uint8_t* ptr = *pptr; + do x = (x << 8) | *ptr++; while (x < RANS_BYTE_L); + *pptr = ptr; + } + + *r = x; +} + +#endif // RANS_BYTE_HEADER diff --git a/htslib/cram/rANS_static.c b/htslib/cram/rANS_static.c new file mode 100644 index 00000000..19c26f31 --- /dev/null +++ b/htslib/cram/rANS_static.c @@ -0,0 +1,841 @@ +/* + * Copyright (c) 2014 Genome Research Ltd. + * Author(s): James Bonfield + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger + * Institute nor the names of its contributors may be used to endorse + * or promote products derived from this software without specific + * prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + * PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL GENOME RESEARCH + * LTD OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Author: James Bonfield, Wellcome Trust Sanger Institute. 2014 + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "cram/rANS_static.h" +#include "cram/rANS_byte.h" + +#define TF_SHIFT 12 +#define TOTFREQ (1<0?(a):-(a)) +#ifndef BLK_SIZE +# define BLK_SIZE 1024*1024 +#endif + +// Room to allow for expanded BLK_SIZE on worst case compression. +#define BLK_SIZE2 ((int)(1.05*BLK_SIZE)) + +/*----------------------------------------------------------------------------- + * Memory to memory compression functions. + * + * These are original versions without any manual loop unrolling. They + * are easier to understand, but can be up to 2x slower. 
+ */ + +unsigned char *rans_compress_O0(unsigned char *in, unsigned int in_size, + unsigned int *out_size) { + unsigned char *out_buf = malloc(1.05*in_size + 257*257*3 + 9); + unsigned char *cp, *out_end; + RansEncSymbol syms[256]; + RansState rans0, rans1, rans2, rans3; + uint8_t* ptr; + int F[256] = {0}, i, j, tab_size, rle, x, fsum = 0; + int m = 0, M = 0; + uint64_t tr; + + if (!out_buf) + return NULL; + + ptr = out_end = out_buf + (int)(1.05*in_size) + 257*257*3 + 9; + + // Compute statistics + for (i = 0; i < in_size; i++) { + F[in[i]]++; + } + tr = ((uint64_t)TOTFREQ<<31)/in_size + (1<<30)/in_size; + + // Normalise so T[i] == TOTFREQ + for (m = M = j = 0; j < 256; j++) { + if (!F[j]) + continue; + + if (m < F[j]) + m = F[j], M = j; + + if ((F[j] = (F[j]*tr)>>31) == 0) + F[j] = 1; + fsum += F[j]; + } + + fsum++; + if (fsum < TOTFREQ) + F[M] += TOTFREQ-fsum; + else + F[M] -= fsum-TOTFREQ; + + //printf("F[%d]=%d\n", M, F[M]); + assert(F[M]>0); + + // Encode statistics. + cp = out_buf+9; + + for (x = rle = j = 0; j < 256; j++) { + if (F[j]) { + // j + if (rle) { + rle--; + } else { + *cp++ = j; + if (!rle && j && F[j-1]) { + for(rle=j+1; rle<256 && F[rle]; rle++) + ; + rle -= j+1; + *cp++ = rle; + } + //fprintf(stderr, "%d: %d %d\n", j, rle, N[j]); + } + + // F[j] + if (F[j]<128) { + *cp++ = F[j]; + } else { + *cp++ = 128 | (F[j]>>8); + *cp++ = F[j]&0xff; + } + RansEncSymbolInit(&syms[j], x, F[j], TF_SHIFT); + x += F[j]; + } + } + *cp++ = 0; + + //write(1, out_buf+4, cp-(out_buf+4)); + tab_size = cp-out_buf; + + RansEncInit(&rans0); + RansEncInit(&rans1); + RansEncInit(&rans2); + RansEncInit(&rans3); + + switch (i=(in_size&3)) { + case 3: RansEncPutSymbol(&rans2, &ptr, &syms[in[in_size-(i-2)]]); + case 2: RansEncPutSymbol(&rans1, &ptr, &syms[in[in_size-(i-1)]]); + case 1: RansEncPutSymbol(&rans0, &ptr, &syms[in[in_size-(i-0)]]); + case 0: + break; + } + for (i=(in_size &~3); i>0; i-=4) { + RansEncSymbol *s3 = &syms[in[i-1]]; + RansEncSymbol *s2 = &syms[in[i-2]]; 
+ RansEncSymbol *s1 = &syms[in[i-3]]; + RansEncSymbol *s0 = &syms[in[i-4]]; + + RansEncPutSymbol(&rans3, &ptr, s3); + RansEncPutSymbol(&rans2, &ptr, s2); + RansEncPutSymbol(&rans1, &ptr, s1); + RansEncPutSymbol(&rans0, &ptr, s0); + } + + RansEncFlush(&rans3, &ptr); + RansEncFlush(&rans2, &ptr); + RansEncFlush(&rans1, &ptr); + RansEncFlush(&rans0, &ptr); + + // Finalise block size and return it + *out_size = (out_end - ptr) + tab_size; + + cp = out_buf; + + *cp++ = 0; // order + *cp++ = ((*out_size-9)>> 0) & 0xff; + *cp++ = ((*out_size-9)>> 8) & 0xff; + *cp++ = ((*out_size-9)>>16) & 0xff; + *cp++ = ((*out_size-9)>>24) & 0xff; + + *cp++ = (in_size>> 0) & 0xff; + *cp++ = (in_size>> 8) & 0xff; + *cp++ = (in_size>>16) & 0xff; + *cp++ = (in_size>>24) & 0xff; + + memmove(out_buf + tab_size, ptr, out_end-ptr); + + return out_buf; +} + +typedef struct { + struct { + int F; + int C; + } fc[256]; + unsigned char *R; +} ari_decoder; + +unsigned char *rans_uncompress_O0(unsigned char *in, unsigned int in_size, + unsigned int *out_size) { + /* Load in the static tables */ + unsigned char *cp = in + 9; + int i, j, x, out_sz, in_sz, rle; + char *out_buf; + ari_decoder D; + RansDecSymbol syms[256]; + + memset(&D, 0, sizeof(D)); + + if (*in++ != 0) // Order-0 check + return NULL; + + in_sz = ((in[0])<<0) | ((in[1])<<8) | ((in[2])<<16) | ((in[3])<<24); + out_sz = ((in[4])<<0) | ((in[5])<<8) | ((in[6])<<16) | ((in[7])<<24); + if (in_sz != in_size-9) + return NULL; + + out_buf = malloc(out_sz); + if (!out_buf) + return NULL; + + //fprintf(stderr, "out_sz=%d\n", out_sz); + + // Precompute reverse lookup of frequency. 
+ rle = x = 0; + j = *cp++; + do { + if ((D.fc[j].F = *cp++) >= 128) { + D.fc[j].F &= ~128; + D.fc[j].F = ((D.fc[j].F & 127) << 8) | *cp++; + } + D.fc[j].C = x; + + RansDecSymbolInit(&syms[j], D.fc[j].C, D.fc[j].F); + + /* Build reverse lookup table */ + if (!D.R) D.R = (unsigned char *)malloc(TOTFREQ); + memset(&D.R[x], j, D.fc[j].F); + + x += D.fc[j].F; + + if (!rle && j+1 == *cp) { + j = *cp++; + rle = *cp++; + } else if (rle) { + rle--; + j++; + } else { + j = *cp++; + } + } while(j); + + assert(x < TOTFREQ); + + RansState rans0, rans1, rans2, rans3; + uint8_t *ptr = cp; + RansDecInit(&rans0, &ptr); + RansDecInit(&rans1, &ptr); + RansDecInit(&rans2, &ptr); + RansDecInit(&rans3, &ptr); + + int out_end = (out_sz&~3); + + RansState R[4]; + R[0] = rans0; + R[1] = rans1; + R[2] = rans2; + R[3] = rans3; + uint32_t mask = (1u << TF_SHIFT)-1; + + for (i=0; i < out_end; i+=4) { + uint32_t m[4] = {R[0] & mask, + R[1] & mask, + R[2] & mask, + R[3] & mask}; + uint8_t c[4] = {D.R[m[0]], + D.R[m[1]], + D.R[m[2]], + D.R[m[3]]}; + out_buf[i+0] = c[0]; + out_buf[i+1] = c[1]; + out_buf[i+2] = c[2]; + out_buf[i+3] = c[3]; + + // RansDecAdvanceSymbolStep(&R[0], &syms[c[0]], TF_SHIFT); + // RansDecAdvanceSymbolStep(&R[1], &syms[c[1]], TF_SHIFT); + // RansDecAdvanceSymbolStep(&R[2], &syms[c[2]], TF_SHIFT); + // RansDecAdvanceSymbolStep(&R[3], &syms[c[3]], TF_SHIFT); + R[0] = syms[c[0]].freq * (R[0]>>TF_SHIFT); + R[1] = syms[c[1]].freq * (R[1]>>TF_SHIFT); + R[2] = syms[c[2]].freq * (R[2]>>TF_SHIFT); + R[3] = syms[c[3]].freq * (R[3]>>TF_SHIFT); + + R[0] += m[0] - syms[c[0]].start; + R[1] += m[1] - syms[c[1]].start; + R[2] += m[2] - syms[c[2]].start; + R[3] += m[3] - syms[c[3]].start; + + RansDecRenorm(&R[0], &ptr); + RansDecRenorm(&R[1], &ptr); + RansDecRenorm(&R[2], &ptr); + RansDecRenorm(&R[3], &ptr); + } + + rans0 = R[0]; + rans1 = R[1]; + rans2 = R[2]; + rans3 = R[3]; + + switch(out_sz&3) { + unsigned char c; + case 0: + break; + case 1: + c = D.R[RansDecGet(&rans0, TF_SHIFT)]; + 
RansDecAdvanceSymbol(&rans0, &ptr, &syms[c], TF_SHIFT); + out_buf[out_end] = c; + break; + + case 2: + c = D.R[RansDecGet(&rans0, TF_SHIFT)]; + RansDecAdvanceSymbol(&rans0, &ptr, &syms[c], TF_SHIFT); + out_buf[out_end] = c; + + c = D.R[RansDecGet(&rans1, TF_SHIFT)]; + RansDecAdvanceSymbol(&rans1, &ptr, &syms[c], TF_SHIFT); + out_buf[out_end+1] = c; + break; + + case 3: + c = D.R[RansDecGet(&rans0, TF_SHIFT)]; + RansDecAdvanceSymbol(&rans0, &ptr, &syms[c], TF_SHIFT); + out_buf[out_end] = c; + + c = D.R[RansDecGet(&rans1, TF_SHIFT)]; + RansDecAdvanceSymbol(&rans1, &ptr, &syms[c], TF_SHIFT); + out_buf[out_end+1] = c; + + c = D.R[RansDecGet(&rans2, TF_SHIFT)]; + RansDecAdvanceSymbol(&rans2, &ptr, &syms[c], TF_SHIFT); + out_buf[out_end+2] = c; + break; + } + + *out_size = out_sz; + + if (D.R) free(D.R); + + return (unsigned char *)out_buf; +} + +unsigned char *rans_compress_O1(unsigned char *in, unsigned int in_size, + unsigned int *out_size) { + unsigned char *out_buf, *out_end, *cp; + unsigned int last_i, tab_size, rle_i, rle_j; + RansEncSymbol syms[256][256]; + + if (in_size < 4) + return rans_compress_O0(in, in_size, out_size); + + out_buf = malloc(1.05*in_size + 257*257*3 + 9); + if (!out_buf) + return NULL; + + out_end = out_buf + (int)(1.05*in_size) + 257*257*3 + 9; + cp = out_buf+9; + + int F[256][256], T[256], i, j; + unsigned char c; + + memset(F, 0, 256*256*sizeof(int)); + memset(T, 0, 256*sizeof(int)); + //for (last = 0, i=in_size-1; i>=0; i--) { + // F[last][c = in[i]]++; + // T[last]++; + // last = c; + //} + + for (last_i=i=0; i>2)]]++; + F[0][in[2*(in_size>>2)]]++; + F[0][in[3*(in_size>>2)]]++; + T[0]+=3; + + // Normalise so T[i] == TOTFREQ + for (rle_i = i = 0; i < 256; i++) { + int t2, m, M; + unsigned int x; + + if (T[i] == 0) + continue; + + //uint64_t p = (TOTFREQ * TOTFREQ) / t; + double p = ((double)TOTFREQ)/T[i]; + for (t2 = m = M = j = 0; j < 256; j++) { + if (!F[i][j]) + continue; + + if (m < F[i][j]) + m = F[i][j], M = j; + + //if ((F[i][j] = 
(F[i][j] * p) / TOTFREQ) == 0) + if ((F[i][j] *= p) == 0) + F[i][j] = 1; + t2 += F[i][j]; + } + + t2++; + if (t2 < TOTFREQ) + F[i][M] += TOTFREQ-t2; + else + F[i][M] -= t2-TOTFREQ; + + // Store frequency table + // i + if (rle_i) { + rle_i--; + } else { + *cp++ = i; + // FIXME: could use order-0 statistics to observe which alphabet + // symbols are present and base RLE on that ordering instead. + if (i && T[i-1]) { + for(rle_i=i+1; rle_i<256 && T[rle_i]; rle_i++) + ; + rle_i -= i+1; + *cp++ = rle_i; + } + } + + int *F_i_ = F[i]; + x = 0; + rle_j = 0; + for (j = 0; j < 256; j++) { + if (F_i_[j]) { + //fprintf(stderr, "F[%d][%d]=%d, x=%d\n", i, j, F_i_[j], x); + + // j + if (rle_j) { + rle_j--; + } else { + *cp++ = j; + if (!rle_j && j && F_i_[j-1]) { + for(rle_j=j+1; rle_j<256 && F_i_[rle_j]; rle_j++) + ; + rle_j -= j+1; + *cp++ = rle_j; + } + } + + // F_i_[j] + if (F_i_[j]<128) { + *cp++ = F_i_[j]; + } else { + *cp++ = 128 | (F_i_[j]>>8); + *cp++ = F_i_[j]&0xff; + } + + RansEncSymbolInit(&syms[i][j], x, F_i_[j], TF_SHIFT); + x += F_i_[j]; + } + } + *cp++ = 0; + } + *cp++ = 0; + + //write(1, out_buf+4, cp-(out_buf+4)); + tab_size = cp - out_buf; + assert(tab_size < 257*257*3); + + RansState rans0, rans1, rans2, rans3; + RansEncInit(&rans0); + RansEncInit(&rans1); + RansEncInit(&rans2); + RansEncInit(&rans3); + + uint8_t* ptr = out_end; + + int isz4 = in_size>>2; + int i0 = 1*isz4-2; + int i1 = 2*isz4-2; + int i2 = 3*isz4-2; + int i3 = 4*isz4-2; + + unsigned char l0 = in[i0+1]; + unsigned char l1 = in[i1+1]; + unsigned char l2 = in[i2+1]; + unsigned char l3 = in[i3+1]; + + // Deal with the remainder + l3 = in[in_size-1]; + for (i3 = in_size-2; i3 > 4*isz4-2; i3--) { + unsigned char c3 = in[i3]; + RansEncPutSymbol(&rans3, &ptr, &syms[c3][l3]); + l3 = c3; + } + + for (; i0 >= 0; i0--, i1--, i2--, i3--) { + unsigned char c0, c1, c2, c3; + RansEncSymbol *s3 = &syms[c3 = in[i3]][l3]; + RansEncSymbol *s2 = &syms[c2 = in[i2]][l2]; + RansEncSymbol *s1 = &syms[c1 = 
in[i1]][l1]; + RansEncSymbol *s0 = &syms[c0 = in[i0]][l0]; + + RansEncPutSymbol(&rans3, &ptr, s3); + RansEncPutSymbol(&rans2, &ptr, s2); + RansEncPutSymbol(&rans1, &ptr, s1); + RansEncPutSymbol(&rans0, &ptr, s0); + + l0 = c0; + l1 = c1; + l2 = c2; + l3 = c3; + } + + RansEncPutSymbol(&rans3, &ptr, &syms[0][l3]); + RansEncPutSymbol(&rans2, &ptr, &syms[0][l2]); + RansEncPutSymbol(&rans1, &ptr, &syms[0][l1]); + RansEncPutSymbol(&rans0, &ptr, &syms[0][l0]); + + RansEncFlush(&rans3, &ptr); + RansEncFlush(&rans2, &ptr); + RansEncFlush(&rans1, &ptr); + RansEncFlush(&rans0, &ptr); + + *out_size = (out_end - ptr) + tab_size; + + cp = out_buf; + *cp++ = 1; // order + + *cp++ = ((*out_size-9)>> 0) & 0xff; + *cp++ = ((*out_size-9)>> 8) & 0xff; + *cp++ = ((*out_size-9)>>16) & 0xff; + *cp++ = ((*out_size-9)>>24) & 0xff; + + *cp++ = (in_size>> 0) & 0xff; + *cp++ = (in_size>> 8) & 0xff; + *cp++ = (in_size>>16) & 0xff; + *cp++ = (in_size>>24) & 0xff; + + memmove(out_buf + tab_size, ptr, out_end-ptr); + + return out_buf; +} + +unsigned char *rans_uncompress_O1(unsigned char *in, unsigned int in_size, + unsigned int *out_size) { + /* Load in the static tables */ + unsigned char *cp = in + 9; + int i, j = -999, x, out_sz, in_sz, rle_i, rle_j; + char *out_buf; + ari_decoder D[256]; + RansDecSymbol syms[256][256]; + + memset(D, 0, 256*sizeof(*D)); + + if (*in++ != 1) // Order-1 check + return NULL; + + in_sz = ((in[0])<<0) | ((in[1])<<8) | ((in[2])<<16) | ((in[3])<<24); + out_sz = ((in[4])<<0) | ((in[5])<<8) | ((in[6])<<16) | ((in[7])<<24); + if (in_sz != in_size-9) + return NULL; + + out_buf = malloc(out_sz); + if (!out_buf) + return NULL; + + //fprintf(stderr, "out_sz=%d\n", out_sz); + + //i = *cp++; + rle_i = 0; + i = *cp++; + do { + rle_j = x = 0; + j = *cp++; + do { + if ((D[i].fc[j].F = *cp++) >= 128) { + D[i].fc[j].F &= ~128; + D[i].fc[j].F = ((D[i].fc[j].F & 127) << 8) | *cp++; + } + D[i].fc[j].C = x; + + //fprintf(stderr, "i=%d j=%d F=%d C=%d\n", i, j, D[i].fc[j].F, 
D[i].fc[j].C); + + if (!D[i].fc[j].F) + D[i].fc[j].F = TOTFREQ; + + RansDecSymbolInit(&syms[i][j], D[i].fc[j].C, D[i].fc[j].F); + + /* Build reverse lookup table */ + if (!D[i].R) D[i].R = (unsigned char *)malloc(TOTFREQ); + memset(&D[i].R[x], j, D[i].fc[j].F); + + x += D[i].fc[j].F; + assert(x <= TOTFREQ); + + if (!rle_j && j+1 == *cp) { + j = *cp++; + rle_j = *cp++; + } else if (rle_j) { + rle_j--; + j++; + } else { + j = *cp++; + } + } while(j); + + if (!rle_i && i+1 == *cp) { + i = *cp++; + rle_i = *cp++; + } else if (rle_i) { + rle_i--; + i++; + } else { + i = *cp++; + } + } while (i); + + // Precompute reverse lookup of frequency. + + RansState rans0, rans1, rans2, rans3; + uint8_t *ptr = cp; + RansDecInit(&rans0, &ptr); + RansDecInit(&rans1, &ptr); + RansDecInit(&rans2, &ptr); + RansDecInit(&rans3, &ptr); + + int isz4 = out_sz>>2; + int l0 = 0; + int l1 = 0; + int l2 = 0; + int l3 = 0; + int i4[] = {0*isz4, 1*isz4, 2*isz4, 3*isz4}; + + RansState R[4]; + R[0] = rans0; + R[1] = rans1; + R[2] = rans2; + R[3] = rans3; + + for (; i4[0] < isz4; i4[0]++, i4[1]++, i4[2]++, i4[3]++) { + uint32_t m[4] = {R[0] & ((1u << TF_SHIFT)-1), + R[1] & ((1u << TF_SHIFT)-1), + R[2] & ((1u << TF_SHIFT)-1), + R[3] & ((1u << TF_SHIFT)-1)}; + + uint8_t c[4] = {D[l0].R[m[0]], + D[l1].R[m[1]], + D[l2].R[m[2]], + D[l3].R[m[3]]}; + + out_buf[i4[0]] = c[0]; + out_buf[i4[1]] = c[1]; + out_buf[i4[2]] = c[2]; + out_buf[i4[3]] = c[3]; + + //RansDecAdvanceSymbolStep(&R[0], &syms[l0][c[0]], TF_SHIFT); + //RansDecAdvanceSymbolStep(&R[1], &syms[l1][c[1]], TF_SHIFT); + //RansDecAdvanceSymbolStep(&R[2], &syms[l2][c[2]], TF_SHIFT); + //RansDecAdvanceSymbolStep(&R[3], &syms[l3][c[3]], TF_SHIFT); + + R[0] = syms[l0][c[0]].freq * (R[0]>>TF_SHIFT); + R[1] = syms[l1][c[1]].freq * (R[1]>>TF_SHIFT); + R[2] = syms[l2][c[2]].freq * (R[2]>>TF_SHIFT); + R[3] = syms[l3][c[3]].freq * (R[3]>>TF_SHIFT); + + R[0] += m[0] - syms[l0][c[0]].start; + R[1] += m[1] - syms[l1][c[1]].start; + R[2] += m[2] - 
syms[l2][c[2]].start; + R[3] += m[3] - syms[l3][c[3]].start; + + RansDecRenorm(&R[0], &ptr); + RansDecRenorm(&R[1], &ptr); + RansDecRenorm(&R[2], &ptr); + RansDecRenorm(&R[3], &ptr); + + l0 = c[0]; + l1 = c[1]; + l2 = c[2]; + l3 = c[3]; + } + + rans0 = R[0]; + rans1 = R[1]; + rans2 = R[2]; + rans3 = R[3]; + + // Remainder + for (; i4[3] < out_sz; i4[3]++) { + unsigned char c3 = D[l3].R[RansDecGet(&rans3, TF_SHIFT)]; + out_buf[i4[3]] = c3; + RansDecAdvanceSymbol(&rans3, &ptr, &syms[l3][c3], TF_SHIFT); + l3 = c3; + } + + *out_size = out_sz; + + for (i = 0; i < 256; i++) + if (D[i].R) free(D[i].R); + + return (unsigned char *)out_buf; +} + +/*----------------------------------------------------------------------------- + * Simple interface to the order-0 vs order-1 encoders and decoders. + */ +unsigned char *rans_compress(unsigned char *in, unsigned int in_size, + unsigned int *out_size, int order) { + return order + ? rans_compress_O1(in, in_size, out_size) + : rans_compress_O0(in, in_size, out_size); +} + +unsigned char *rans_uncompress(unsigned char *in, unsigned int in_size, + unsigned int *out_size) { + return in[0] + ? rans_uncompress_O1(in, in_size, out_size) + : rans_uncompress_O0(in, in_size, out_size); +} + + +#ifdef TEST_MAIN +/*----------------------------------------------------------------------------- + * Main. + * + * This is a simple command line tool for testing order-0 and order-1 + * compression using the rANS codec. Simply compile with + * + * gcc -DTEST_MAIN -O3 -I. 
cram/rANS_static.c -o cram/rANS_static + * + * Usage: cram/rANS_static -o0 < file > file.o0 + * cram/rANS_static -d < file.o0 > file2 + * + * cram/rANS_static -o1 < file > file.o1 + * cram/rANS_static -d < file.o1 > file2 + */ +int main(int argc, char **argv) { + int opt, order = 0; + unsigned char in_buf[BLK_SIZE2+257*257*3]; + int decode = 0; + FILE *infp = stdin, *outfp = stdout; + struct timeval tv1, tv2; + size_t bytes = 0; + + extern char *optarg; + extern int optind; + + while ((opt = getopt(argc, argv, "o:d")) != -1) { + switch (opt) { + case 'o': + order = atoi(optarg); + break; + + case 'd': + decode = 1; + break; + } + } + + order = order ? 1 : 0; // Only support O(0) and O(1) + + if (optind < argc) { + if (!(infp = fopen(argv[optind], "rb"))) { + perror(argv[optind]); + return 1; + } + optind++; + } + + if (optind < argc) { + if (!(outfp = fopen(argv[optind], "wb"))) { + perror(argv[optind]); + return 1; + } + optind++; + } + + gettimeofday(&tv1, NULL); + + if (decode) { + // Only used in some test implementations of RC_GetFreq() + //RC_init(); + //RC_init2(); + + for (;;) { + uint32_t in_size, out_size; + unsigned char *out; + + if (4 != fread(&in_size, 1, 4, infp)) + break; + if (in_size != fread(in_buf, 1, in_size, infp)) { + fprintf(stderr, "Truncated input\n"); + exit(1); + } + out = rans_uncompress(in_buf, in_size, &out_size); + if (!out) + abort(); + + fwrite(out, 1, out_size, outfp); + free(out); + + bytes += out_size; + } + } else { + for (;;) { + uint32_t in_size, out_size; + unsigned char *out; + + in_size = fread(in_buf, 1, BLK_SIZE, infp); + if (in_size <= 0) + break; + + out = rans_compress(in_buf, in_size, &out_size, order); + + fwrite(&out_size, 1, 4, outfp); + fwrite(out, 1, out_size, outfp); + free(out); + + bytes += in_size; + } + } + + gettimeofday(&tv2, NULL); + + fprintf(stderr, "Took %ld microseconds, %5.1f MB/s\n", + (long)(tv2.tv_sec - tv1.tv_sec)*1000000 + + tv2.tv_usec - tv1.tv_usec, + (double)bytes / ((long)(tv2.tv_sec - 
tv1.tv_sec)*1000000 + + tv2.tv_usec - tv1.tv_usec)); + return 0; +} +#endif diff --git a/htslib/cram/rANS_static.h b/htslib/cram/rANS_static.h new file mode 100644 index 00000000..971099cc --- /dev/null +++ b/htslib/cram/rANS_static.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2014 Genome Research Ltd. + * Author(s): James Bonfield + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger + * Institute nor the names of its contributors may be used to endorse + * or promote products derived from this software without specific + * prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH + * LTD OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + + +#ifndef RANS_STATIC_H +#define RANS_STATIC_H + +unsigned char *rans_compress(unsigned char *in, unsigned int in_size, + unsigned int *out_size, int order); +unsigned char *rans_uncompress(unsigned char *in, unsigned int in_size, + unsigned int *out_size); + + +#endif /* RANS_STATIC_H */ diff --git a/htslib/cram/sam_header.c b/htslib/cram/sam_header.c index 2a8110cb..3367f19b 100644 --- a/htslib/cram/sam_header.c +++ b/htslib/cram/sam_header.c @@ -38,10 +38,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "cram/sam_header.h" #include "cram/string_alloc.h" -#ifdef SAMTOOLS -#define sam_hdr_parse sam_hdr_parse_ -#endif - static void sam_hdr_error(char *msg, char *line, int len, int lno) { int j; @@ -888,7 +884,7 @@ SAM_hdr *sam_hdr_new() { * Returns a SAM_hdr struct on success (free with sam_hdr_free()) * NULL on failure */ -SAM_hdr *sam_hdr_parse(const char *hdr, int len) { +SAM_hdr *sam_hdr_parse_(const char *hdr, int len) { /* Make an empty SAM_hdr */ SAM_hdr *sh; @@ -925,7 +921,7 @@ SAM_hdr *sam_hdr_dup(SAM_hdr *hdr) { if (-1 == sam_hdr_rebuild(hdr)) return NULL; - return sam_hdr_parse(sam_hdr_str(hdr), sam_hdr_length(hdr)); + return sam_hdr_parse_(sam_hdr_str(hdr), sam_hdr_length(hdr)); } /*! Increments a reference count on hdr. diff --git a/htslib/cram/sam_header.h b/htslib/cram/sam_header.h index b9ea2983..8e0929e7 100644 --- a/htslib/cram/sam_header.h +++ b/htslib/cram/sam_header.h @@ -34,10 +34,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * These functions can be shared between SAM, BAM and CRAM file * formats as all three internally use the same string encoding for * header fields. - * - * Consider using the scram() generic API and calling - * scram_get_header() to obtain the format-specific pointer to the - * SAM_hdr struct. 
*/ /* @@ -228,11 +224,7 @@ SAM_hdr *sam_hdr_new(void); * Returns a SAM_hdr struct on success (free with sam_hdr_free()); * NULL on failure */ -#ifdef SAMTOOLS SAM_hdr *sam_hdr_parse_(const char *hdr, int len); -#else -SAM_hdr *sam_hdr_parse(const char *hdr, int len); -#endif /*! Produces a duplicate copy of hdr and returns it. diff --git a/htslib/cram/thread_pool.c b/htslib/cram/thread_pool.c index 90652a76..dea9e909 100644 --- a/htslib/cram/thread_pool.c +++ b/htslib/cram/thread_pool.c @@ -35,18 +35,21 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include "cram/thread_pool.h" //#define DEBUG -#define DEBUG_TIME +//#define DEBUG_TIME + +#define IN_ORDER #ifdef DEBUG static int worker_id(t_pool *p) { int i; pthread_t s = pthread_self(); for (i = 0; i < p->tsize; i++) { - if (pthread_equal(s, p->t[i])) + if (pthread_equal(s, p->t[i].tid)) return i; } return -1; @@ -103,7 +106,7 @@ static int t_pool_add_result(t_pool_job *j, void *data) { fprintf(stderr, "%d: Broadcasting result_avail (id %d)\n", worker_id(j->p), r->serial); #endif - pthread_cond_broadcast(&q->result_avail_c); + pthread_cond_signal(&q->result_avail_c); #ifdef DEBUG fprintf(stderr, "%d: Broadcast complete\n", worker_id(j->p)); #endif @@ -301,7 +304,8 @@ void t_results_queue_destroy(t_results_queue *q) { * and then executes the job. */ static void *t_pool_worker(void *arg) { - t_pool *p = (t_pool *)arg; + t_pool_worker_t *w = (t_pool_worker_t *)arg; + t_pool *p = w->p; t_pool_job *j; #ifdef DEBUG_TIME struct timeval t1, t2, t3; @@ -318,28 +322,62 @@ static void *t_pool_worker(void *arg) { #ifdef DEBUG_TIME gettimeofday(&t2, NULL); p->wait_time += TDIFF(t2,t1); + w->wait_time += TDIFF(t2,t1); #endif - p->nwaiting++; + // If there is something on the job list and a higher priority + // thread waiting, let it handle this instead. 
+// while (p->head && p->t_stack_top != -1 && p->t_stack_top < w->idx) { +// pthread_mutex_unlock(&p->pool_m); +// pthread_cond_signal(&p->t[p->t_stack_top].pending_c); +// pthread_mutex_lock(&p->pool_m); +// } + while (!p->head && !p->shutdown) { + p->nwaiting++; + if (p->njobs == 0) pthread_cond_signal(&p->empty_c); #ifdef DEBUG_TIME gettimeofday(&t2, NULL); #endif +#ifdef IN_ORDER + // Push this thread to the top of the waiting stack + if (p->t_stack_top == -1 || p->t_stack_top > w->idx) + p->t_stack_top = w->idx; + + p->t_stack[w->idx] = 1; + pthread_cond_wait(&w->pending_c, &p->pool_m); + p->t_stack[w->idx] = 0; + + /* Find new t_stack_top */ + { + int i; + p->t_stack_top = -1; + for (i = 0; i < p->tsize; i++) { + if (p->t_stack[i]) { + p->t_stack_top = i; + break; + } + } + } +#else pthread_cond_wait(&p->pending_c, &p->pool_m); +#endif #ifdef DEBUG_TIME gettimeofday(&t3, NULL); p->wait_time += TDIFF(t3,t2); + w->wait_time += TDIFF(t3,t2); #endif + p->nwaiting--; } - p->nwaiting--; - if (p->shutdown) { +#ifdef DEBUG_TIME p->total_time += TDIFF(t3,t1); +#endif #ifdef DEBUG fprintf(stderr, "%d: Shutting down\n", worker_id(p)); #endif @@ -351,7 +389,7 @@ static void *t_pool_worker(void *arg) { if (!(p->head = j->next)) p->tail = NULL; - if (p->njobs-- == p->qsize) + if (p->njobs-- >= p->qsize) pthread_cond_signal(&p->full_c); if (p->njobs == 0) @@ -389,6 +427,7 @@ t_pool *t_pool_init(int qsize, int tsize) { p->nwaiting = 0; p->shutdown = 0; p->head = p->tail = NULL; + p->t_stack = NULL; #ifdef DEBUG_TIME p->total_time = p->wait_time = 0; #endif @@ -397,14 +436,40 @@ t_pool *t_pool_init(int qsize, int tsize) { pthread_mutex_init(&p->pool_m, NULL); pthread_cond_init(&p->empty_c, NULL); - pthread_cond_init(&p->pending_c, NULL); pthread_cond_init(&p->full_c, NULL); + pthread_mutex_lock(&p->pool_m); + +#ifdef IN_ORDER + if (!(p->t_stack = malloc(tsize * sizeof(*p->t_stack)))) + return NULL; + p->t_stack_top = -1; + for (i = 0; i < tsize; i++) { - if (0 != 
pthread_create(&p->t[i], NULL, t_pool_worker, p)) + t_pool_worker_t *w = &p->t[i]; + p->t_stack[i] = 0; + w->p = p; + w->idx = i; + w->wait_time = 0; + pthread_cond_init(&w->pending_c, NULL); + if (0 != pthread_create(&w->tid, NULL, t_pool_worker, w)) return NULL; } - +#else + pthread_cond_init(&p->pending_c, NULL); + + for (i = 0; i < tsize; i++) { + t_pool_worker_t *w = &p->t[i]; + w->p = p; + w->idx = i; + pthread_cond_init(&w->pending_c, NULL); + if (0 != pthread_create(&w->tid, NULL, t_pool_worker, w)) + return NULL; + } +#endif + + pthread_mutex_unlock(&p->pool_m); + return p; } @@ -447,7 +512,7 @@ int t_pool_dispatch(t_pool *p, t_results_queue *q, pthread_mutex_lock(&p->pool_m); // Check if queue is full - while (p->njobs == p->qsize) + while (p->njobs >= p->qsize) pthread_cond_wait(&p->full_c, &p->pool_m); p->njobs++; @@ -459,11 +524,13 @@ int t_pool_dispatch(t_pool *p, t_results_queue *q, p->head = p->tail = j; } - if (p->njobs == 1) { - // First job => tell all worker threads to start up - pthread_cond_broadcast(&p->pending_c); - } - + // Let a worker know we have data. 
+#ifdef IN_ORDER + if (p->t_stack_top >= 0 && p->njobs > p->tsize - p->nwaiting) + pthread_cond_signal(&p->t[p->t_stack_top].pending_c); +#else + pthread_cond_signal(&p->pending_c); +#endif pthread_mutex_unlock(&p->pool_m); #ifdef DEBUG @@ -482,9 +549,21 @@ int t_pool_dispatch(t_pool *p, t_results_queue *q, */ int t_pool_dispatch2(t_pool *p, t_results_queue *q, void *(*func)(void *arg), void *arg, int nonblock) { - t_pool_job *j = malloc(sizeof(*j)); + t_pool_job *j; - if (!j) +#ifdef DEBUG + fprintf(stderr, "Dispatching job for queue %p, serial %d\n", q, q->curr_serial); +#endif + + pthread_mutex_lock(&p->pool_m); + + if (p->njobs >= p->qsize && nonblock == 1) { + pthread_mutex_unlock(&p->pool_m); + errno = EAGAIN; + return -1; + } + + if (!(j = malloc(sizeof(*j)))) return -1; j->func = func; j->arg = arg; @@ -499,19 +578,6 @@ int t_pool_dispatch2(t_pool *p, t_results_queue *q, j->serial = 0; } -#ifdef DEBUG - fprintf(stderr, "Dispatching job for queue %p, serial %d\n", q, j->serial); -#endif - - pthread_mutex_lock(&p->pool_m); - - if (p->njobs == p->qsize && nonblock == 1) { - pthread_mutex_unlock(&p->pool_m); - errno = EAGAIN; - free(j); - return -1; - } - if (q) { pthread_mutex_lock(&q->result_m); q->curr_serial++; @@ -521,7 +587,7 @@ int t_pool_dispatch2(t_pool *p, t_results_queue *q, // Check if queue is full if (nonblock == 0) - while (p->njobs == p->qsize) + while (p->njobs >= p->qsize) pthread_cond_wait(&p->full_c, &p->pool_m); p->njobs++; @@ -540,10 +606,18 @@ int t_pool_dispatch2(t_pool *p, t_results_queue *q, fprintf(stderr, "Dispatched (serial %d)\n", j->serial); #endif - if (p->njobs == 1) { - // First job => tell all worker threads to start up - pthread_cond_broadcast(&p->pending_c); - } + // Let a worker know we have data. +#ifdef IN_ORDER + // Keep incoming queue at 1 per running thread, so there is always + // something waiting when they end their current task. If we go above + // this signal to start more threads (if available). 
This has the effect + // of concentrating jobs to fewer cores when we are I/O bound, which in + // turn benefits systems with auto CPU frequency scaling. + if (p->t_stack_top >= 0 && p->njobs > p->tsize - p->nwaiting) + pthread_cond_signal(&p->t[p->t_stack_top].pending_c); +#else + pthread_cond_signal(&p->pending_c); +#endif pthread_mutex_unlock(&p->pool_m); @@ -558,12 +632,20 @@ int t_pool_dispatch2(t_pool *p, t_results_queue *q, * -1 on failure */ int t_pool_flush(t_pool *p) { + int i; + #ifdef DEBUG fprintf(stderr, "Flushing pool %p\n", p); #endif // Drains the queue pthread_mutex_lock(&p->pool_m); + + // Wake up everything for the final sprint! + for (i = 0; i < p->tsize; i++) + if (p->t_stack[i]) + pthread_cond_signal(&p->t[i].pending_c); + while (p->njobs || p->nwaiting != p->tsize) pthread_cond_wait(&p->empty_c, &p->pool_m); @@ -601,31 +683,47 @@ void t_pool_destroy(t_pool *p, int kill) { fprintf(stderr, "Sending shutdown request\n"); #endif +#ifdef IN_ORDER + for (i = 0; i < p->tsize; i++) + pthread_cond_signal(&p->t[i].pending_c); +#else pthread_cond_broadcast(&p->pending_c); +#endif pthread_mutex_unlock(&p->pool_m); #ifdef DEBUG fprintf(stderr, "Shutdown complete\n"); #endif for (i = 0; i < p->tsize; i++) - pthread_join(p->t[i], NULL); + pthread_join(p->t[i].tid, NULL); } else { for (i = 0; i < p->tsize; i++) - pthread_kill(p->t[i], SIGINT); + pthread_kill(p->t[i].tid, SIGINT); } pthread_mutex_destroy(&p->pool_m); pthread_cond_destroy(&p->empty_c); - pthread_cond_destroy(&p->pending_c); pthread_cond_destroy(&p->full_c); +#ifdef IN_ORDER + for (i = 0; i < p->tsize; i++) + pthread_cond_destroy(&p->t[i].pending_c); +#else + pthread_cond_destroy(&p->pending_c); +#endif #ifdef DEBUG_TIME fprintf(stderr, "Total time=%f\n", p->total_time / 1000000.0); fprintf(stderr, "Wait time=%f\n", p->wait_time / 1000000.0); fprintf(stderr, "%d%% utilisation\n", (int)(100 - ((100.0 * p->wait_time) / p->total_time + 0.5))); + for (i = 0; i < p->tsize; i++) + fprintf(stderr, 
"%d: Wait time=%f\n", i, + p->t[i].wait_time / 1000000.0); #endif + if (p->t_stack) + free(p->t_stack); + free(p->t); free(p); diff --git a/htslib/cram/thread_pool.h b/htslib/cram/thread_pool.h index 18e8b427..d26c5d9d 100644 --- a/htslib/cram/thread_pool.h +++ b/htslib/cram/thread_pool.h @@ -68,6 +68,16 @@ typedef struct t_res { void *data; // result itself } t_pool_result; +struct t_pool; + +typedef struct { + struct t_pool *p; + int idx; + pthread_t tid; + pthread_cond_t pending_c; + long long wait_time; +} t_pool_worker_t; + typedef struct t_pool { int qsize; // size of queue int njobs; // pending job count @@ -79,7 +89,7 @@ typedef struct t_pool { // threads int tsize; // maximum number of jobs - pthread_t *t; + t_pool_worker_t *t; // Mutexes pthread_mutex_t pool_m; // used when updating head/tail @@ -88,6 +98,9 @@ typedef struct t_pool { pthread_cond_t pending_c; // not empty pthread_cond_t full_c; + // array of worker IDs free + int *t_stack, t_stack_top; + // Debugging to check wait time long long total_time, wait_time; } t_pool; diff --git a/htslib/cram/vlen.c b/htslib/cram/vlen.c index bc7e7d4b..e451bbd5 100644 --- a/htslib/cram/vlen.c +++ b/htslib/cram/vlen.c @@ -238,7 +238,7 @@ int vflen(char *fmt, va_list ap) * Note that %10c and %.10c act differently. * Besides, I think precision is not really allowed for %c. */ - len += MAX(conv_len1, 1); + len += MAX(conv_len1, i>=0x80 ?MB_CUR_MAX :1); break; case 'f': diff --git a/htslib/faidx.c b/htslib/faidx.c index 75ec84cc..b48fce98 100644 --- a/htslib/faidx.c +++ b/htslib/faidx.c @@ -1,6 +1,6 @@ /* faidx.c -- FASTA random access. - Copyright (C) 2008, 2009, 2013, 2014 Genome Research Ltd. + Copyright (C) 2008, 2009, 2013-2015 Genome Research Ltd. Portions copyright (C) 2011 Broad Institute. Author: Heng Li @@ -23,8 +23,6 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ -#include "config.h" - #include #include #include @@ -33,10 +31,8 @@ DEALINGS IN THE SOFTWARE. */ #include "htslib/bgzf.h" #include "htslib/faidx.h" +#include "htslib/hfile.h" #include "htslib/khash.h" -#ifdef _USE_KNETFILE -#include "htslib/knetfile.h" -#endif typedef struct { int32_t line_len, line_blen; @@ -74,7 +70,8 @@ static inline void fai_insert_index(faidx_t *idx, const char *name, int len, int faidx_t *fai_build_core(BGZF *bgzf) { - char c, *name; + char *name; + int c; int l_name, m_name; int line_len, line_blen, state; int l1, l2; @@ -221,6 +218,7 @@ int fai_build(const char *fn) if ( !fai ) { if ( bgzf->is_compressed && bgzf->is_gzip ) fprintf(stderr,"Cannot index files compressed with gzip, please use bgzip\n"); + free(str); return -1; } if ( bgzf->is_compressed ) bgzf_index_dump(bgzf, fn, ".gzi"); @@ -238,13 +236,12 @@ int fai_build(const char *fn) return 0; } -#ifdef _USE_KNETFILE -FILE *download_and_open(const char *fn) +static FILE *download_and_open(const char *fn) { const int buf_size = 1 * 1024 * 1024; uint8_t *buf; FILE *fp; - knetFile *fp_remote; + hFILE *fp_remote; const char *url = fn; const char *p; int l = strlen(fn); @@ -258,26 +255,26 @@ FILE *download_and_open(const char *fn) return fp; // If failed, download from remote and open - fp_remote = knet_open(url, "rb"); + fp_remote = hopen(url, "rb"); if (fp_remote == 0) { fprintf(stderr, "[download_from_remote] fail to open remote file %s\n",url); return NULL; } if ((fp = fopen(fn, "wb")) == 0) { fprintf(stderr, "[download_from_remote] fail to create file in the working directory %s\n",fn); - knet_close(fp_remote); + hclose_abruptly(fp_remote); return NULL; } buf = (uint8_t*)calloc(buf_size, 1); - while ((l = knet_read(fp_remote, buf, buf_size)) != 0) + while ((l = hread(fp_remote, buf, buf_size)) > 0) fwrite(buf, 1, l, fp); free(buf); fclose(fp); - knet_close(fp_remote); + if (hclose(fp_remote) != 0) + fprintf(stderr, "[download_from_remote] fail to close remote file %s\n", url); 
return fopen(fn, "r"); } -#endif faidx_t *fai_load(const char *fn) { @@ -287,8 +284,7 @@ faidx_t *fai_load(const char *fn) str = (char*)calloc(strlen(fn) + 5, 1); sprintf(str, "%s.fai", fn); -#ifdef _USE_KNETFILE - if (strstr(fn, "ftp://") == fn || strstr(fn, "http://") == fn) + if (hisremote(str)) { fp = download_and_open(str); if ( !fp ) @@ -299,8 +295,8 @@ faidx_t *fai_load(const char *fn) } } else -#endif fp = fopen(str, "rb"); + if (fp == 0) { fprintf(stderr, "[fai_load] build FASTA index.\n"); fai_build(fn); @@ -335,8 +331,8 @@ faidx_t *fai_load(const char *fn) char *fai_fetch(const faidx_t *fai, const char *str, int *len) { - char *s, c; - int i, l, k, name_end; + char *s; + int c, i, l, k, name_end; khiter_t iter; faidx1_t val; khash_t(s) *h; @@ -409,14 +405,21 @@ char *fai_fetch(const faidx_t *fai, const char *str, int *len) return s; } +int faidx_fetch_nseq(const faidx_t *fai) +{ + return fai->n; +} + int faidx_nseq(const faidx_t *fai) { return fai->n; } + const char *faidx_iseq(const faidx_t *fai, int i) { return fai->name[i]; } + int faidx_seq_len(const faidx_t *fai, const char *seq) { khint_t k = kh_get(s, fai->hash, seq); @@ -426,8 +429,7 @@ int faidx_seq_len(const faidx_t *fai, const char *seq) char *faidx_fetch_seq(const faidx_t *fai, const char *c_name, int p_beg_i, int p_end_i, int *len) { - int l; - char c; + int l, c; khiter_t iter; faidx1_t val; char *seq=NULL; diff --git a/htslib/hfile.c b/htslib/hfile.c index 3f33bf65..d722c133 100644 --- a/htslib/hfile.c +++ b/htslib/hfile.c @@ -1,6 +1,6 @@ /* hfile.c -- buffered low-level input/output streams. - Copyright (C) 2013-2014 Genome Research Ltd. + Copyright (C) 2013-2015 Genome Research Ltd. 
Author: John Marshall @@ -254,6 +254,12 @@ off_t hseek(hFILE *fp, off_t offset, int whence) int ret = flush_buffer(fp); if (ret < 0) return ret; } + else { + // Convert relative offsets from being relative to the hFILE's stream + // position (at begin) to being relative to the backend's physical + // stream position (at end, due to the buffering read-ahead). + if (whence == SEEK_CUR) offset -= fp->end - fp->begin; + } pos = fp->backend->seek(fp, offset, whence); if (pos < 0) { fp->has_errno = errno; return pos; } @@ -520,7 +526,22 @@ hFILE *hopen(const char *fname, const char *mode) { if (strncmp(fname, "http://", 7) == 0 || strncmp(fname, "ftp://", 6) == 0) return hopen_net(fname, mode); +#ifdef HAVE_IRODS + else if (strncmp(fname, "irods:", 6) == 0) return hopen_irods(fname, mode); +#endif else if (strncmp(fname, "data:", 5) == 0) return hopen_mem(fname + 5, mode); else if (strcmp(fname, "-") == 0) return hopen_fd_stdinout(mode); else return hopen_fd(fname, mode); } + +int hisremote(const char *fname) +{ + // FIXME Make a new backend entry to return this + if (strncmp(fname, "http://", 7) == 0 || + strncmp(fname, "https://", 8) == 0 || + strncmp(fname, "ftp://", 6) == 0) return 1; +#ifdef HAVE_IRODS + else if (strncmp(fname, "irods:", 6) == 0) return 1; +#endif + else return 0; +} diff --git a/htslib/hfile_internal.h b/htslib/hfile_internal.h index 7ac06ba4..bfce2f64 100644 --- a/htslib/hfile_internal.h +++ b/htslib/hfile_internal.h @@ -1,6 +1,6 @@ /* hfile_internal.h -- internal parts of low-level input/output streams. - Copyright (C) 2013-2014 Genome Research Ltd. + Copyright (C) 2013-2015 Genome Research Ltd. Author: John Marshall @@ -56,6 +56,7 @@ struct hFILE_backend { /* These are called from the hopen() dispatcher, and should call hfile_init() to malloc a struct "derived" from hFILE and initialise it appropriately, including setting base.backend to their own backend vector. 
*/ +hFILE *hopen_irods(const char *filename, const char *mode); hFILE *hopen_net(const char *filename, const char *mode); /* May be called by hopen_*() functions to decode a fopen()-style mode into diff --git a/htslib/hfile_irods.c b/htslib/hfile_irods.c new file mode 100644 index 00000000..6bdbf212 --- /dev/null +++ b/htslib/hfile_irods.c @@ -0,0 +1,243 @@ +/* hfile_irods.c -- iRODS backend for low-level file streams. + + Copyright (C) 2013, 2015 Genome Research Ltd. + + Author: John Marshall + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. 
*/ + +#include +#include +#include + +#include "hfile_internal.h" + +#include +#include +#include +#include +#include +#include +#include + +typedef struct { + hFILE base; + int descriptor; +} hFILE_irods; + +static int status_errno(int status) +{ + switch (status) { + case SYS_NO_API_PRIV: return EACCES; + case SYS_MALLOC_ERR: return ENOMEM; + case SYS_OUT_OF_FILE_DESC: return ENFILE; + case SYS_BAD_FILE_DESCRIPTOR: return EBADF; + case CAT_NO_ROWS_FOUND: return ENOENT; + case CATALOG_ALREADY_HAS_ITEM_BY_THAT_NAME: return EEXIST; + default: return EIO; + } +} + +static void set_errno(int status) +{ + int err = abs(status) % 1000; + errno = err? err : status_errno(status); +} + +static struct { + rcComm_t *conn; + rodsEnv env; +} irods = { NULL }; + +static void irods_exit() +{ + (void) rcDisconnect(irods.conn); + irods.conn = NULL; +} + +static int irods_init() +{ + rErrMsg_t err; + int ret; + + ret = getRodsEnv(&irods.env); + if (ret < 0) goto error; + + irods.conn = rcConnect(irods.env.rodsHost, irods.env.rodsPort, + irods.env.rodsUserName, irods.env.rodsZone, + NO_RECONN, &err); + if (irods.conn == NULL) { ret = err.status; goto error; } + + if (strcmp(irods.env.rodsUserName, PUBLIC_USER_NAME) != 0) { + ret = clientLogin(irods.conn); + if (ret != 0) goto error; + } + + // In the unlikely event atexit() fails, it's better to succeed here and + // carry on and do the I/O; then eventually when the program exits, we'll + // merely disconnect from the server uncleanly, as if we had aborted. 
+ (void) atexit(irods_exit); + + return 0; + +error: + if (irods.conn) { (void) rcDisconnect(irods.conn); } + irods.conn = NULL; + set_errno(ret); + return -1; +} + +static ssize_t irods_read(hFILE *fpv, void *buffer, size_t nbytes) +{ + hFILE_irods *fp = (hFILE_irods *) fpv; + openedDataObjInp_t args; + bytesBuf_t buf; + int ret; + + memset(&args, 0, sizeof args); + args.l1descInx = fp->descriptor; + args.len = nbytes; + + buf.buf = buffer; + buf.len = nbytes; + + ret = rcDataObjRead(irods.conn, &args, &buf); + if (ret < 0) set_errno(ret); + return ret; +} + +static ssize_t irods_write(hFILE *fpv, const void *buffer, size_t nbytes) +{ + hFILE_irods *fp = (hFILE_irods *) fpv; + openedDataObjInp_t args; + bytesBuf_t buf; + int ret; + + memset(&args, 0, sizeof args); + args.l1descInx = fp->descriptor; + args.len = nbytes; + + buf.buf = (void *) buffer; // ...the iRODS API is not const-correct here + buf.len = nbytes; + + ret = rcDataObjWrite(irods.conn, &args, &buf); + if (ret < 0) set_errno(ret); + return ret; +} + +static off_t irods_seek(hFILE *fpv, off_t offset, int whence) +{ + hFILE_irods *fp = (hFILE_irods *) fpv; + openedDataObjInp_t args; + fileLseekOut_t *out = NULL; + int ret; + + memset(&args, 0, sizeof args); + args.l1descInx = fp->descriptor; + args.offset = offset; + args.whence = whence; + + ret = rcDataObjLseek(irods.conn, &args, &out); + + if (out) { offset = out->offset; free(out); } + else offset = -1; + if (ret < 0) { set_errno(ret); return -1; } + return offset; +} + +static int irods_flush(hFILE *fpv) +{ +// FIXME rcDataObjFsync() doesn't seem to function as expected. 
+// For now, flush is a no-op: see https://github.com/samtools/htslib/issues/168 +#if 0 + hFILE_irods *fp = (hFILE_irods *) fpv; + openedDataObjInp_t args; + int ret; + + memset(&args, 0, sizeof args); + args.l1descInx = fp->descriptor; + + ret = rcDataObjFsync(irods.conn, &args); + if (ret < 0) set_errno(ret); + return ret; +#endif + return 0; +} + +static int irods_close(hFILE *fpv) +{ + hFILE_irods *fp = (hFILE_irods *) fpv; + openedDataObjInp_t args; + int ret; + + memset(&args, 0, sizeof args); + args.l1descInx = fp->descriptor; + + ret = rcDataObjClose(irods.conn, &args); + if (ret < 0) set_errno(ret); + return ret; +} + +static const struct hFILE_backend irods_backend = +{ + irods_read, irods_write, irods_seek, irods_flush, irods_close +}; + +hFILE *hopen_irods(const char *filename, const char *mode) +{ + hFILE_irods *fp; + rodsPath_t path; + dataObjInp_t args; + int ret; + + // Initialise the iRODS connection if this is the first use. + if (irods.conn == NULL) { if (irods_init() < 0) return NULL; } + + if (strncmp(filename, "irods:", 6) == 0) filename += 6; + else { errno = EINVAL; return NULL; } + + fp = (hFILE_irods *) hfile_init(sizeof (hFILE_irods), mode, 0); + if (fp == NULL) return NULL; + + strncpy(path.inPath, filename, MAX_NAME_LEN-1); + path.inPath[MAX_NAME_LEN-1] = '\0'; + + ret = parseRodsPath(&path, &irods.env); + if (ret < 0) goto error; + + memset(&args, 0, sizeof args); + strcpy(args.objPath, path.outPath); + args.openFlags = hfile_oflags(mode); + if (args.openFlags & O_CREAT) { + args.createMode = 0666; + addKeyVal(&args.condInput, DEST_RESC_NAME_KW,irods.env.rodsDefResource); + } + + ret = rcDataObjOpen(irods.conn, &args); + if (ret < 0) goto error; + fp->descriptor = ret; + + fp->base.backend = &irods_backend; + return &fp->base; + +error: + hfile_destroy((hFILE *) fp); + set_errno(ret); + return NULL; +} diff --git a/htslib/hts.c b/htslib/hts.c index 5fab4baa..5f4d677e 100644 --- a/htslib/hts.c +++ b/htslib/hts.c @@ -1,6 +1,6 @@ /* hts.c 
-- format-neutral I/O, indexing, and iterator API functions. - Copyright (C) 2008, 2009, 2012-2014 Genome Research Ltd. + Copyright (C) 2008, 2009, 2012-2015 Genome Research Ltd. Copyright (C) 2012, 2013 Broad Institute. Author: Heng Li @@ -28,6 +28,7 @@ DEALINGS IN THE SOFTWARE. */ #include #include #include +#include #include #include #include @@ -40,7 +41,7 @@ DEALINGS IN THE SOFTWARE. */ #include "htslib/kseq.h" #define KS_BGZF 1 #if KS_BGZF - // bgzf now supports gzip-compressed files + // bgzf now supports gzip-compressed files, the gzFile branch can be removed KSTREAM_INIT2(, BGZF*, bgzf_read, 65536) #else KSTREAM_INIT2(, gzFile, gzread, 16384) @@ -78,10 +79,44 @@ const unsigned char seq_nt16_table[256] = { const char seq_nt16_str[] = "=ACMGRSVTWYHKDBN"; +const int seq_nt16_int[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 }; + /********************** *** Basic file I/O *** **********************/ +static enum htsFormatCategory format_category(enum htsExactFormat fmt) +{ + switch (fmt) { + case bam: + case sam: + case cram: + return sequence_data; + + case vcf: + case bcf: + return variant_data; + + case bai: + case crai: + case csi: + case gzi: + case tbi: + return index_file; + + case bed: + return region_list; + + case unknown_format: + case binary_format: + case text_format: + case format_maximum: + break; + } + + return unknown_category; +} + // Decompress up to ten or so bytes by peeking at the file, which must be // positioned at the start of a GZIP block. static size_t decompress_peek(hFILE *fp, unsigned char *dest, size_t destsize) @@ -112,91 +147,322 @@ static size_t decompress_peek(hFILE *fp, unsigned char *dest, size_t destsize) return destsize; } -// Returns whether the block contains any control characters, i.e., -// characters less than SPACE other than whitespace etc (ASCII BEL..CR). 
-static int is_binary(unsigned char *s, size_t n) +// Parse "x.y" text, taking care because the string is not NUL-terminated +// and filling in major/minor only when the digits are followed by a delimiter, +// so we don't misread "1.10" as "1.1" due to reaching the end of the buffer. +static void +parse_version(htsFormat *fmt, const unsigned char *u, const unsigned char *ulim) { - size_t i; - for (i = 0; i < n; i++) - if (s[i] < 0x07 || (s[i] >= 0x0e && s[i] < 0x20)) return 1; + const char *str = (const char *) u; + const char *slim = (const char *) ulim; + const char *s; + + fmt->version.major = fmt->version.minor = -1; + + for (s = str; s < slim; s++) if (!isdigit(*s)) break; + if (s < slim) { + fmt->version.major = atoi(str); + if (*s == '.') { + str = &s[1]; + for (s = str; s < slim; s++) if (!isdigit(*s)) break; + if (s < slim) + fmt->version.minor = atoi(str); + } + else + fmt->version.minor = 0; + } +} + +int hts_detect_format(hFILE *hfile, htsFormat *fmt) +{ + unsigned char s[21]; + ssize_t len = hpeek(hfile, s, 18); + if (len < 0) return -1; + + if (len >= 2 && s[0] == 0x1f && s[1] == 0x8b) { + // The stream is either gzip-compressed or BGZF-compressed. + // Determine which, and decompress the first few bytes. + fmt->compression = (len >= 18 && (s[3] & 4) && + memcmp(&s[12], "BC\2\0", 4) == 0)? 
bgzf : gzip; + len = decompress_peek(hfile, s, sizeof s); + } + else { + fmt->compression = no_compression; + len = hpeek(hfile, s, sizeof s); + } + if (len < 0) return -1; + + fmt->compression_level = -1; + fmt->specific = NULL; + + if (len >= 6 && memcmp(s,"CRAM",4) == 0 && s[4]>=1 && s[4]<=3 && s[5]<=1) { + fmt->category = sequence_data; + fmt->format = cram; + fmt->version.major = s[4], fmt->version.minor = s[5]; + fmt->compression = custom; + return 0; + } + else if (len >= 4 && s[3] <= '\4') { + if (memcmp(s, "BAM\1", 4) == 0) { + fmt->category = sequence_data; + fmt->format = bam; + // TODO Decompress enough to pick version from @HD-VN header + fmt->version.major = 1, fmt->version.minor = -1; + return 0; + } + else if (memcmp(s, "BAI\1", 4) == 0) { + fmt->category = index_file; + fmt->format = bai; + fmt->version.major = -1, fmt->version.minor = -1; + return 0; + } + else if (memcmp(s, "BCF\4", 4) == 0) { + fmt->category = variant_data; + fmt->format = bcf; + fmt->version.major = 1, fmt->version.minor = -1; + return 0; + } + else if (memcmp(s, "BCF\2", 4) == 0) { + fmt->category = variant_data; + fmt->format = bcf; + fmt->version.major = s[3]; + fmt->version.minor = (len >= 5 && s[4] <= 2)? 
s[4] : 0; + return 0; + } + else if (memcmp(s, "CSI\1", 4) == 0) { + fmt->category = index_file; + fmt->format = csi; + fmt->version.major = 1, fmt->version.minor = -1; + return 0; + } + else if (memcmp(s, "TBI\1", 4) == 0) { + fmt->category = index_file; + fmt->format = tbi; + fmt->version.major = -1, fmt->version.minor = -1; + return 0; + } + } + else if (len >= 16 && memcmp(s, "##fileformat=VCF", 16) == 0) { + fmt->category = variant_data; + fmt->format = vcf; + if (len >= 21 && s[16] == 'v') + parse_version(fmt, &s[17], &s[len]); + else + fmt->version.major = fmt->version.minor = -1; + return 0; + } + else if (len >= 4 && s[0] == '@' && + (memcmp(s, "@HD\t", 4) == 0 || memcmp(s, "@SQ\t", 4) == 0 || + memcmp(s, "@RG\t", 4) == 0 || memcmp(s, "@PG\t", 4) == 0)) { + fmt->category = sequence_data; + fmt->format = sam; + // @HD-VN is not guaranteed to be the first tag, but then @HD is + // not guaranteed to be present at all... + if (len >= 9 && memcmp(s, "@HD\tVN:", 7) == 0) + parse_version(fmt, &s[7], &s[len]); + else + fmt->version.major = 1, fmt->version.minor = -1; + return 0; + } + else { + // Various possibilities for tab-delimited text: + // .crai (gzipped tab-delimited six columns: seqid 5*number) + // .bed ([3..12] tab-delimited columns) + // .bedpe (>= 10 tab-delimited columns) + // .sam (tab-delimited >= 11 columns: seqid number seqid...) 
+ // FIXME For now, assume it's SAM + fmt->category = sequence_data; + fmt->format = sam; + fmt->version.major = 1, fmt->version.minor = -1; + return 0; + } + + fmt->category = unknown_category; + fmt->format = unknown_format; + fmt->version.major = fmt->version.minor = -1; + fmt->compression = no_compression; return 0; } +char *hts_format_description(const htsFormat *format) +{ + kstring_t str = { 0, 0, NULL }; + + switch (format->format) { + case sam: kputs("SAM", &str); break; + case bam: kputs("BAM", &str); break; + case cram: kputs("CRAM", &str); break; + case vcf: kputs("VCF", &str); break; + case bcf: + if (format->version.major == 1) kputs("Legacy BCF", &str); + else kputs("BCF", &str); + break; + case bai: kputs("BAI", &str); break; + case crai: kputs("CRAI", &str); break; + case csi: kputs("CSI", &str); break; + case tbi: kputs("Tabix", &str); break; + default: kputs("unknown", &str); break; + } + + if (format->version.major >= 0) { + kputs(" version ", &str); + kputw(format->version.major, &str); + if (format->version.minor >= 0) { + kputc('.', &str); + kputw(format->version.minor, &str); + } + } + + switch (format->compression) { + case custom: kputs(" compressed", &str); break; + case gzip: kputs(" gzip-compressed", &str); break; + case bgzf: + switch (format->format) { + case bam: + case bcf: + case csi: + case tbi: + // These are by definition BGZF, so just use the generic term + kputs(" compressed", &str); + break; + default: + kputs(" BGZF-compressed", &str); + break; + } + break; + default: break; + } + + switch (format->category) { + case sequence_data: kputs(" sequence", &str); break; + case variant_data: kputs(" variant calling", &str); break; + case index_file: kputs(" index", &str); break; + case region_list: kputs(" genomic region", &str); break; + default: break; + } + + if (format->compression == no_compression) + switch (format->format) { + case sam: + case crai: + case vcf: + case bed: + kputs(" text", &str); + break; + + default: + 
kputs(" data", &str); + break; + } + else + kputs(" data", &str); + + return ks_release(&str); +} + htsFile *hts_open(const char *fn, const char *mode) { htsFile *fp = NULL; hFILE *hfile = hopen(fn, mode); if (hfile == NULL) goto error; - fp = (htsFile*)calloc(1, sizeof(htsFile)); + fp = hts_hopen(hfile, fn, mode); + if (fp == NULL) goto error; + + return fp; + +error: + if (hts_verbose >= 2) + fprintf(stderr, "[E::%s] fail to open file '%s'\n", __func__, fn); + + if (hfile) + hclose_abruptly(hfile); + + return NULL; +} + +htsFile *hts_hopen(struct hFILE *hfile, const char *fn, const char *mode) +{ + htsFile *fp = (htsFile*)calloc(1, sizeof(htsFile)); if (fp == NULL) goto error; fp->fn = strdup(fn); fp->is_be = ed_is_big(); if (strchr(mode, 'r')) { - unsigned char s[18]; - if (hpeek(hfile, s, 6) == 6 && memcmp(s, "CRAM", 4) == 0 && - s[4] >= 1 && s[4] <= 2 && s[5] <= 1) { - fp->is_cram = 1; - } - else if (hpeek(hfile, s, 18) == 18 && s[0] == 0x1f && s[1] == 0x8b && - (s[3] & 4) && memcmp(&s[12], "BC\2\0", 4) == 0) { - // The stream is BGZF-compressed. Decompress a few bytes to see - // whether it's in a binary format (e.g., BAM or BCF, starting - // with four bytes of magic including a control character) or is - // a bgzipped SAM or VCF text file. - fp->is_compressed = 1; - if (is_binary(s, decompress_peek(hfile, s, 4))) fp->is_bin = 1; - else fp->is_kstream = 1; - } - else if (hpeek(hfile, s, 2) == 2 && s[0] == 0x1f && s[1] == 0x8b) { - // Plain GZIP header... so a gzipped text file. - fp->is_compressed = 1; - fp->is_kstream = 1; - } - else if (hpeek(hfile, s, 4) == 4 && is_binary(s, 4)) { - // Binary format, but in a raw non-compressed form. 
- fp->is_bin = 1; - } - else { - fp->is_kstream = 1; - } + if (hts_detect_format(hfile, &fp->format) < 0) goto error; } else if (strchr(mode, 'w') || strchr(mode, 'a')) { + htsFormat *fmt = &fp->format; fp->is_write = 1; - if (strchr(mode, 'b')) fp->is_bin = 1; - if (strchr(mode, 'c')) fp->is_cram = 1; - if (strchr(mode, 'z')) fp->is_compressed = 1; - else if (strchr(mode, 'u')) fp->is_compressed = 0; - else fp->is_compressed = 2; // not set, default behaviour + + if (strchr(mode, 'b')) fmt->format = binary_format; + else if (strchr(mode, 'c')) fmt->format = cram; + else fmt->format = text_format; + + if (strchr(mode, 'z')) fmt->compression = bgzf; + else if (strchr(mode, 'g')) fmt->compression = gzip; + else if (strchr(mode, 'u')) fmt->compression = no_compression; + else { + // No compression mode specified, set to the default for the format + switch (fmt->format) { + case binary_format: fmt->compression = bgzf; break; + case cram: fmt->compression = custom; break; + case text_format: fmt->compression = no_compression; break; + default: abort(); + } + } + + // Fill in category (if determinable; e.g. 
'b' could be BAM or BCF) + fmt->category = format_category(fmt->format); + + fmt->version.major = fmt->version.minor = -1; + fmt->compression_level = -1; + fmt->specific = NULL; } else goto error; - if (fp->is_bin || (fp->is_write && fp->is_compressed==1)) { + switch (fp->format.format) { + case binary_format: + case bam: + case bcf: fp->fp.bgzf = bgzf_hopen(hfile, mode); if (fp->fp.bgzf == NULL) goto error; - } - else if (fp->is_cram) { + fp->is_bin = 1; + break; + + case cram: fp->fp.cram = cram_dopen(hfile, fn, mode); if (fp->fp.cram == NULL) goto error; if (!fp->is_write) cram_set_option(fp->fp.cram, CRAM_OPT_DECODE_MD, 1); + fp->is_cram = 1; + break; - } - else if (fp->is_kstream) { - #if KS_BGZF - BGZF *gzfp = bgzf_hopen(hfile, mode); - #else - // TODO Implement gzip hFILE adaptor - hclose(hfile); // This won't work, especially for stdin - gzFile gzfp = strcmp(fn, "-")? gzopen(fn, "rb") : gzdopen(fileno(stdin), "rb"); - #endif - if (gzfp) fp->fp.voidp = ks_init(gzfp); - else goto error; - } - else { - fp->fp.hfile = hfile; + case text_format: + case sam: + case vcf: + if (!fp->is_write) { + #if KS_BGZF + BGZF *gzfp = bgzf_hopen(hfile, mode); + #else + // TODO Implement gzip hFILE adaptor + hclose(hfile); // This won't work, especially for stdin + gzFile gzfp = strcmp(fn, "-")? 
gzopen(fn, "rb") : gzdopen(fileno(stdin), "rb"); + #endif + if (gzfp) fp->fp.voidp = ks_init(gzfp); + else goto error; + } + else if (fp->format.compression != no_compression) { + fp->fp.bgzf = bgzf_hopen(hfile, mode); + if (fp->fp.bgzf == NULL) goto error; + } + else + fp->fp.hfile = hfile; + break; + + default: + goto error; } return fp; @@ -205,9 +471,6 @@ htsFile *hts_open(const char *fn, const char *mode) if (hts_verbose >= 2) fprintf(stderr, "[E::%s] fail to open file '%s'\n", __func__, fn); - if (hfile) - hclose_abruptly(hfile); - if (fp) { free(fp->fn); free(fp->fn_aux); @@ -220,9 +483,14 @@ int hts_close(htsFile *fp) { int ret, save; - if (fp->is_bin || (fp->is_write && fp->is_compressed==1)) { + switch (fp->format.format) { + case binary_format: + case bam: + case bcf: ret = bgzf_close(fp->fp.bgzf); - } else if (fp->is_cram) { + break; + + case cram: if (!fp->is_write) { switch (cram_eof(fp->fp.cram)) { case 0: @@ -236,17 +504,30 @@ int hts_close(htsFile *fp) } } ret = cram_close(fp->fp.cram); - } else if (fp->is_kstream) { - #if KS_BGZF - BGZF *gzfp = ((kstream_t*)fp->fp.voidp)->f; - ret = bgzf_close(gzfp); - #else - gzFile gzfp = ((kstream_t*)fp->fp.voidp)->f; - ret = gzclose(gzfp); - #endif - ks_destroy((kstream_t*)fp->fp.voidp); - } else { - ret = hclose(fp->fp.hfile); + break; + + case text_format: + case sam: + case vcf: + if (!fp->is_write) { + #if KS_BGZF + BGZF *gzfp = ((kstream_t*)fp->fp.voidp)->f; + ret = bgzf_close(gzfp); + #else + gzFile gzfp = ((kstream_t*)fp->fp.voidp)->f; + ret = gzclose(gzfp); + #endif + ks_destroy((kstream_t*)fp->fp.voidp); + } + else if (fp->format.compression != no_compression) + ret = bgzf_close(fp->fp.bgzf); + else + ret = hclose(fp->fp.hfile); + break; + + default: + ret = -1; + break; } save = errno; @@ -258,11 +539,31 @@ int hts_close(htsFile *fp) return ret; } +const htsFormat *hts_get_format(htsFile *fp) +{ + return fp? &fp->format : NULL; +} + +int hts_set_opt(htsFile *fp, enum cram_option opt, ...) 
{ + int r; + va_list args; + + if (fp->format.format != cram) + return 0; + + va_start(args, opt); + r = cram_set_voption(fp->fp.cram, opt, args); + va_end(args); + + return r; +} + int hts_set_threads(htsFile *fp, int n) { - // TODO Plug in CRAM and other threading - if (fp->is_bin) { + if (fp->format.compression == bgzf) { return bgzf_mt(fp->fp.bgzf, n, 256); + } else if (fp->format.format == cram) { + return hts_set_opt(fp, CRAM_OPT_NTHREADS, n); } else return 0; } @@ -276,6 +577,9 @@ int hts_set_fai_filename(htsFile *fp, const char *fn_aux) } else fp->fn_aux = NULL; + if (fp->format.format == cram) + cram_set_option(fp->fp.cram, CRAM_OPT_REFERENCE, fp->fn_aux); + return 0; } @@ -418,6 +722,7 @@ char **hts_readlines(const char *fn, int *_n) return s; } +// DEPRECATED: To be removed in a future HTSlib release int hts_file_type(const char *fname) { int len = strlen(fname); @@ -425,27 +730,19 @@ int hts_file_type(const char *fname) if ( !strcasecmp(".vcf",fname+len-4) ) return FT_VCF; if ( !strcasecmp(".bcf",fname+len-4) ) return FT_BCF_GZ; if ( !strcmp("-",fname) ) return FT_STDIN; - // ... etc - int fd = open(fname, O_RDONLY); - if ( !fd ) return 0; + hFILE *f = hopen(fname, "r"); + if (f == NULL) return 0; - uint8_t magic[5]; - if ( read(fd,magic,2)!=2 ) { close(fd); return 0; } - if ( !strncmp((char*)magic,"##",2) ) { close(fd); return FT_VCF; } - if ( !strncmp((char*)magic,"BCF",3) ) { close(fd); return FT_BCF; } - close(fd); + htsFormat fmt; + if (hts_detect_format(f, &fmt) < 0) { hclose_abruptly(f); return 0; } + if (hclose(f) < 0) return 0; - if ( magic[0]==0x1f && magic[1]==0x8b ) // compressed - { - BGZF *fp = bgzf_open(fname, "r"); - if ( !fp ) return 0; - if ( bgzf_read(fp, magic, 3)!=3 ) { bgzf_close(fp); return 0; } - bgzf_close(fp); - if ( !strncmp((char*)magic,"##",2) ) return FT_VCF_GZ; - if ( !strncmp((char*)magic,"BCF",3) ) return FT_BCF_GZ; + switch (fmt.format) { + case vcf: return (fmt.compression == no_compression)? 
FT_VCF : FT_VCF_GZ; + case bcf: return (fmt.compression == no_compression)? FT_BCF : FT_BCF_GZ; + default: return 0; } - return 0; } /**************** @@ -504,11 +801,11 @@ static inline void insert_to_b(bidx_t *b, int bin, uint64_t beg, uint64_t end) l = &kh_value(b, k); if (absent) { l->m = 1; l->n = 0; - l->list = (hts_pair64_t*)calloc(l->m, 16); + l->list = (hts_pair64_t*)calloc(l->m, sizeof(hts_pair64_t)); } if (l->n == l->m) { l->m <<= 1; - l->list = (hts_pair64_t*)realloc(l->list, l->m * 16); + l->list = (hts_pair64_t*)realloc(l->list, l->m * sizeof(hts_pair64_t)); } l->list[l->n].u = beg; l->list[l->n++].v = end; @@ -523,7 +820,7 @@ static inline void insert_to_l(lidx_t *l, int64_t _beg, int64_t _end, uint64_t o int old_m = l->m; l->m = end + 1; kroundup32(l->m); - l->offset = (uint64_t*)realloc(l->offset, l->m * 8); + l->offset = (uint64_t*)realloc(l->offset, l->m * sizeof(uint64_t)); memset(l->offset + old_m, 0xff, 8 * (l->m - old_m)); // fill l->offset with (uint64_t)-1 } if (beg == end) { // to save a loop in this case @@ -616,9 +913,9 @@ static void compress_binning(hts_idx_t *idx, int i) if (q->n + p->n > q->m) { q->m = q->n + p->n; kroundup32(q->m); - q->list = (hts_pair64_t*)realloc(q->list, q->m * 16); + q->list = (hts_pair64_t*)realloc(q->list, q->m * sizeof(hts_pair64_t)); } - memcpy(q->list + q->n, p->list, p->n * 16); + memcpy(q->list + q->n, p->list, p->n * sizeof(hts_pair64_t)); q->n += p->n; free(p->list); kh_del(bin, bidx, k); @@ -660,6 +957,7 @@ void hts_idx_finish(hts_idx_t *idx, uint64_t final_offset) int hts_idx_push(hts_idx_t *idx, int tid, int beg, int end, uint64_t offset, int is_mapped) { int bin; + if (tid<0) beg = -1, end = 0; if (tid >= idx->m) { // enlarge the index int32_t oldm = idx->m; idx->m = idx->m? 
idx->m<<1 : 2; @@ -887,7 +1185,7 @@ static int hts_idx_load_core(hts_idx_t *idx, void *fp, int fmt) if (idx_read(is_bgzf, fp, &p->n, 4) != 4) return -1; if (is_be) ed_swap_4p(&p->n); p->m = p->n; - p->list = (hts_pair64_t*)malloc(p->m * 16); + p->list = (hts_pair64_t*)malloc(p->m * sizeof(hts_pair64_t)); if (p->list == NULL) return -2; if (idx_read(is_bgzf, fp, p->list, p->n<<4) != p->n<<4) return -1; if (is_be) swap_bins(p); @@ -897,7 +1195,7 @@ static int hts_idx_load_core(hts_idx_t *idx, void *fp, int fmt) if (idx_read(is_bgzf, fp, &l->n, 4) != 4) return -1; if (is_be) ed_swap_4p(&l->n); l->m = l->n; - l->offset = (uint64_t*)malloc(l->n << 3); + l->offset = (uint64_t*)malloc(l->n * sizeof(uint64_t)); if (l->offset == NULL) return -2; if (idx_read(is_bgzf, fp, l->offset, l->n << 3) != l->n << 3) return -1; if (is_be) for (j = 0; j < l->n; ++j) ed_swap_8p(&l->offset[j]); @@ -1130,7 +1428,7 @@ hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, int beg, int end, hts_re if (beg < 0) beg = 0; if (end < beg) return 0; - if ((bidx = idx->bidx[tid]) == 0) return 0; + if (tid >= idx->n || (bidx = idx->bidx[tid]) == NULL) return 0; iter = (hts_itr_t*)calloc(1, sizeof(hts_itr_t)); iter->tid = tid, iter->beg = beg, iter->end = end; iter->i = -1; @@ -1154,7 +1452,7 @@ hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, int beg, int end, hts_re if ((k = kh_get(bin, bidx, iter->bins.a[i])) != kh_end(bidx)) n_off += kh_value(bidx, k).n; if (n_off == 0) return iter; - off = (hts_pair64_t*)calloc(n_off, 16); + off = (hts_pair64_t*)calloc(n_off, sizeof(hts_pair64_t)); for (i = n_off = 0; i < iter->bins.n; ++i) { if ((k = kh_get(bin, bidx, iter->bins.a[i])) != kh_end(bidx)) { int j; @@ -1213,10 +1511,10 @@ const char *hts_parse_reg(const char *s, int *beg, int *end) if (s[i] != ',') tmp[k++] = s[i]; tmp[k] = 0; if ((*beg = strtol(tmp, &tmp, 10) - 1) < 0) *beg = 0; - *end = *tmp? strtol(tmp + 1, &tmp, 10) : 1<<29; + *end = *tmp? 
strtol(tmp + 1, &tmp, 10) : INT_MAX; if (*beg > *end) name_end = l; } - if (name_end == l) *beg = 0, *end = 1<<29; + if (name_end == l) *beg = 0, *end = INT_MAX; return s + name_end; } @@ -1225,7 +1523,7 @@ hts_itr_t *hts_itr_querys(const hts_idx_t *idx, const char *reg, hts_name2id_f g int tid, beg, end; char *q, *tmp; if (strcmp(reg, ".") == 0) - return itr_query(idx, HTS_IDX_START, 0, 1<<29, readrec); + return itr_query(idx, HTS_IDX_START, 0, 0, readrec); else if (strcmp(reg, "*") != 0) { q = (char*)hts_parse_reg(reg, &beg, &end); tmp = (char*)alloca(q - reg + 1); @@ -1249,6 +1547,9 @@ int hts_itr_next(BGZF *fp, hts_itr_t *iter, void *r, void *data) } ret = iter->readrec(fp, data, r, &tid, &beg, &end); if (ret < 0) iter->finished = 1; + iter->curr_tid = tid; + iter->curr_beg = beg; + iter->curr_end = end; return ret; } if (iter->off == 0) return -1; @@ -1265,7 +1566,12 @@ int hts_itr_next(BGZF *fp, hts_itr_t *iter, void *r, void *data) iter->curr_off = bgzf_tell(fp); if (tid != iter->tid || beg >= iter->end) { // no need to proceed ret = -1; break; - } else if (end > iter->beg && iter->end > beg) return ret; + } else if (end > iter->beg && iter->end > beg) { + iter->curr_tid = tid; + iter->curr_beg = beg; + iter->curr_end = end; + return ret; + } } else break; // end of file or error } iter->finished = 1; @@ -1279,8 +1585,7 @@ int hts_itr_next(BGZF *fp, hts_itr_t *iter, void *r, void *data) static char *test_and_fetch(const char *fn) { FILE *fp; - // FIXME Use is_remote_scheme() helper that's true for ftp/http/irods/etc - if (strstr(fn, "ftp://") == fn || strstr(fn, "http://") == fn) { + if (hisremote(fn)) { const int buf_size = 1 * 1024 * 1024; hFILE *fp_remote; uint8_t *buf; @@ -1289,10 +1594,14 @@ static char *test_and_fetch(const char *fn) for (p = fn + strlen(fn) - 1; p >= fn; --p) if (*p == '/') break; ++p; // p now points to the local file name - if ((fp_remote = hopen(fn, "r")) == 0) { - if (hts_verbose >= 1) fprintf(stderr, "[E::%s] fail to open remote 
file '%s'\n", __func__, fn); - return 0; + // Attempt to open local file first + if ((fp = fopen((char*)p, "rb")) != 0) + { + fclose(fp); + return (char*)p; } + // Attempt to open remote file. Stay quiet on failure, it is OK to fail when trying first .csi then .tbi index. + if ((fp_remote = hopen(fn, "r")) == 0) return 0; if ((fp = fopen(p, "w")) == 0) { if (hts_verbose >= 1) fprintf(stderr, "[E::%s] fail to create file '%s' in the working directory\n", __func__, p); hclose_abruptly(fp_remote); diff --git a/htslib/htsfile.1 b/htslib/htsfile.1 new file mode 100644 index 00000000..b55cafa1 --- /dev/null +++ b/htslib/htsfile.1 @@ -0,0 +1,71 @@ +.TH htsfile 1 "3 February 2015" "htslib-1.2.1" "Bioinformatics tools" +.SH NAME +htsfile \- identify high-throughput sequencing data files +.\" +.\" Copyright (C) 2015 Genome Research Ltd. +.\" +.\" Author: John Marshall +.\" +.\" Permission is hereby granted, free of charge, to any person obtaining a +.\" copy of this software and associated documentation files (the "Software"), +.\" to deal in the Software without restriction, including without limitation +.\" the rights to use, copy, modify, merge, publish, distribute, sublicense, +.\" and/or sell copies of the Software, and to permit persons to whom the +.\" Software is furnished to do so, subject to the following conditions: +.\" +.\" The above copyright notice and this permission notice shall be included in +.\" all copies or substantial portions of the Software. +.\" +.\" THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +.\" IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +.\" FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +.\" THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +.\" LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +.\" FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +.\" DEALINGS IN THE SOFTWARE. 
+.\" +.SH SYNOPSIS +.B htsfile +.RB [ -chH ] +.IR FILE ... +.SH DESCRIPTION +The \fBhtsfile\fR utility attempts to identify what kind of high-throughput +sequencing data files the specified files are, and provides minimal viewing +capabilities for some kinds of data file. +.P +It can identify sequencing data files such as SAM, BAM, and CRAM; +variant calling data files such as VCF and BCF; +index files used to index these data files; +and compressed versions of many of them. +.P +For each \fIFILE\fR given, \fBhtsfile\fP prints a description of the file +format determined, using similar keyword conventions to \fBfile\fP(1): +"text" indicates a textual file that can probably be viewed on a terminal; +"data" indicates binary data; +"sequence", "variant calling", and "index" indicate different categories of +data file. +When it can be identified, the name of the particular file format (such as +"BAM" or "VCF") is printed at the start of the description. +.P +When used to view file contents as text, \fBhtsfile\fP can optionally show +only headers or only data records, but has no other filtering capabilities. +Use \fBsamtools\fR or \fBbcftools\fR if you need more extensive viewing or +filtering capabilities. +.P +The following options are accepted: +.TP 4n +.BR -c ", " --view +Instead of identifying the specified files, display a textual representation +of their contents on standard output. +.TP +.BR -h ", " --header-only +Display data file headers only. +Implies \fB--view\fR. +.TP +.BR -H ", " --no-header +When viewing files, display data records only. +.PP +.SH SEE ALSO +.IR bcftools (1), +.IR file (1), +.IR samtools (1) diff --git a/htslib/htsfile.c b/htslib/htsfile.c new file mode 100644 index 00000000..fac943b2 --- /dev/null +++ b/htslib/htsfile.c @@ -0,0 +1,168 @@ +/* htsfile.c -- file identifier and minimal viewer. + + Copyright (C) 2014-2015 Genome Research Ltd. 
+ + Author: John Marshall + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. */ + +#include +#include +#include +#include +#include + +#include "htslib/hfile.h" +#include "htslib/hts.h" +#include "htslib/sam.h" +#include "htslib/vcf.h" + +enum { identify, view_headers, view_all } mode = identify; +int show_headers = 1; + +static htsFile *dup_stdout(const char *mode) +{ + int fd = dup(STDOUT_FILENO); + hFILE *hfp = (fd >= 0)? hdopen(fd, mode) : NULL; + return hfp? 
hts_hopen(hfp, "-", mode) : NULL; +} + +static int view_sam(hFILE *hfp, const char *filename) +{ + samFile *in = hts_hopen(hfp, filename, "r"); + if (in == NULL) return 0; + samFile *out = dup_stdout("w"); + bam_hdr_t *hdr = sam_hdr_read(in); + + if (show_headers) sam_hdr_write(out, hdr); + if (mode == view_all) { + bam1_t *b = bam_init1(); + while (sam_read1(in, hdr, b) >= 0) + sam_write1(out, hdr, b); + bam_destroy1(b); + } + + bam_hdr_destroy(hdr); + hts_close(out); + hts_close(in); + return 1; +} + +static int view_vcf(hFILE *hfp, const char *filename) +{ + vcfFile *in = hts_hopen(hfp, filename, "r"); + if (in == NULL) return 0; + vcfFile *out = dup_stdout("w"); + bcf_hdr_t *hdr = bcf_hdr_read(in); + + if (show_headers) bcf_hdr_write(out, hdr); + if (mode == view_all) { + bcf1_t *rec = bcf_init(); + while (bcf_read(in, hdr, rec) >= 0) + bcf_write(out, hdr, rec); + bcf_destroy(rec); + } + + bcf_hdr_destroy(hdr); + hts_close(out); + hts_close(in); + return 1; +} + +static void usage(FILE *fp, int status) +{ + fprintf(fp, +"Usage: htsfile [-chH] FILE...\n" +"Options:\n" +" -c, --view Write textual form of FILEs to standard output\n" +" -h, --header-only Display only headers in view mode, not records\n" +" -H, --no-header Suppress header display in view mode\n"); + exit(status); +} + +int main(int argc, char **argv) +{ + static const struct option options[] = { + { "header-only", no_argument, NULL, 'h' }, + { "no-header", no_argument, NULL, 'H' }, + { "view", no_argument, NULL, 'c' }, + { "help", no_argument, NULL, '?' 
}, + { "version", no_argument, NULL, 1 }, + { NULL, 0, NULL, 0 } + }; + + int status = EXIT_SUCCESS; + int c, i; + while ((c = getopt_long(argc, argv, "chH?", options, NULL)) >= 0) + switch (c) { + case 'c': mode = view_all; break; + case 'h': mode = view_headers; show_headers = 1; break; + case 'H': show_headers = 0; break; + case 1: + printf( +"htsfile (htslib) %s\n" +"Copyright (C) 2015 Genome Research Ltd.\n", + hts_version()); + exit(EXIT_SUCCESS); + break; + case '?': usage(stdout, EXIT_SUCCESS); break; + default: usage(stderr, EXIT_FAILURE); break; + } + + if (optind == argc) usage(stderr, EXIT_FAILURE); + + for (i = optind; i < argc; i++) { + htsFormat fmt; + hFILE *fp = hopen(argv[i], "r"); + if (fp == NULL) { + fprintf(stderr, "htsfile: can't open \"%s\": %s\n", argv[i], strerror(errno)); + status = EXIT_FAILURE; + continue; + } + + if (hts_detect_format(fp, &fmt) < 0) { + fprintf(stderr, "htsfile: detecting \"%s\" format failed: %s\n", argv[i], strerror(errno)); + hclose_abruptly(fp); + status = EXIT_FAILURE; + continue; + } + + if (mode == identify) { + char *description = hts_format_description(&fmt); + printf("%s:\t%s\n", argv[i], description); + free(description); + } + else + switch (fmt.category) { + case sequence_data: if (view_sam(fp, argv[i])) fp = NULL; break; + case variant_data: if (view_vcf(fp, argv[i])) fp = NULL; break; + default: + fprintf(stderr, "htsfile: can't view %s: unknown format\n", argv[i]); + status = EXIT_FAILURE; + break; + } + + if (fp && hclose(fp) < 0) { + fprintf(stderr, "htsfile: closing %s failed\n", argv[i]); + status = EXIT_FAILURE; + } + } + + return status; +} diff --git a/htslib/htslib.mk b/htslib/htslib.mk index 6c203ca8..14baea2c 100644 --- a/htslib/htslib.mk +++ b/htslib/htslib.mk @@ -1,6 +1,6 @@ # Makefile rules useful for third-party code using htslib's public API. # -# Copyright (C) 2013-2014 Genome Research Ltd. +# Copyright (C) 2013-2015 Genome Research Ltd. 
# # Author: John Marshall # @@ -60,6 +60,7 @@ HTSLIB_PUBLIC_HEADERS = \ $(HTSDIR)/htslib/kseq.h \ $(HTSDIR)/htslib/ksort.h \ $(HTSDIR)/htslib/kstring.h \ + $(HTSDIR)/htslib/regidx.h \ $(HTSDIR)/htslib/sam.h \ $(HTSDIR)/htslib/synced_bcf_reader.h \ $(HTSDIR)/htslib/tbx.h \ @@ -73,10 +74,12 @@ HTSLIB_ALL = \ $(HTSDIR)/faidx.c \ $(HTSDIR)/hfile_internal.h \ $(HTSDIR)/hfile.c \ + $(HTSDIR)/hfile_irods.c \ $(HTSDIR)/hfile_net.c \ $(HTSDIR)/hts.c \ $(HTSDIR)/knetfile.c \ $(HTSDIR)/kstring.c \ + $(HTSDIR)/regidx.c \ $(HTSDIR)/sam.c \ $(HTSDIR)/synced_bcf_reader.c \ $(HTSDIR)/tbx.c \ @@ -130,6 +133,9 @@ $(HTSDIR)/libhts.so $(HTSDIR)/libhts.dylib: $(HTSLIB_ALL) $(HTSDIR)/bgzip: $(HTSDIR)/bgzip.c $(HTSLIB_PUBLIC_HEADERS) +cd $(HTSDIR) && $(MAKE) bgzip +$(HTSDIR)/htsfile: $(HTSDIR)/htsfile.c $(HTSLIB_PUBLIC_HEADERS) + +cd $(HTSDIR) && $(MAKE) htsfile + $(HTSDIR)/tabix: $(HTSDIR)/tabix.c $(HTSLIB_PUBLIC_HEADERS) +cd $(HTSDIR) && $(MAKE) tabix diff --git a/htslib/htslib/bgzf.h b/htslib/htslib/bgzf.h index 31b8d5e4..cb8d4b94 100644 --- a/htslib/htslib/bgzf.h +++ b/htslib/htslib/bgzf.h @@ -84,9 +84,10 @@ extern "C" { * Open an existing file descriptor for reading or writing. * * @param fd file descriptor - * @param mode mode matching /[rwa][u0-9]+/: 'r' for reading, 'w' for - * writing, or 'a' for appending, while a digit specifies - * the zlib compression level. + * @param mode mode matching /[rwag][u0-9]+/: 'r' for reading, 'w' for + * writing, 'a' for appending, 'g' for gzip rather than BGZF + * compression (with 'w' only), and digit specifies the zlib + * compression level. * Note that there is a distinction between 'u' and '0': the * first yields plain uncompressed output whereas the latter * outputs uncompressed data wrapped in the zlib format. diff --git a/htslib/htslib/faidx.h b/htslib/htslib/faidx.h index 24a30e22..a32d3a9f 100644 --- a/htslib/htslib/faidx.h +++ b/htslib/htslib/faidx.h @@ -61,7 +61,7 @@ extern "C" { int fai_build(const char *fn); /*! 
- @abstract Distroy a faidx_t struct. + @abstract Destroy a faidx_t struct. @param fai Pointer to the struct to be destroyed */ void fai_destroy(faidx_t *fai); diff --git a/htslib/htslib/hfile.h b/htslib/htslib/hfile.h index 1b1a8a97..ea49c451 100644 --- a/htslib/htslib/hfile.h +++ b/htslib/htslib/hfile.h @@ -1,6 +1,6 @@ /* hfile.h -- buffered low-level input/output streams. - Copyright (C) 2013-2014 Genome Research Ltd. + Copyright (C) 2013-2015 Genome Research Ltd. Author: John Marshall @@ -60,6 +60,14 @@ hFILE *hopen(const char *filename, const char *mode) HTS_RESULT_USED; */ hFILE *hdopen(int fd, const char *mode) HTS_RESULT_USED; +/*! + @abstract Report whether the file name or URL denotes remote storage + @return 0 if local, 1 if remote. + @notes "Remote" means involving e.g. explicit network access, with the + implication that callers may wish to cache such files' contents locally. +*/ +int hisremote(const char *filename) HTS_RESULT_USED; + /*! @abstract Flush (for output streams) and close the stream @return 0 if successful, or EOF (with errno set) if an error occurred. diff --git a/htslib/htslib/hts.h b/htslib/htslib/hts.h index d0207518..084c1626 100644 --- a/htslib/htslib/hts.h +++ b/htslib/htslib/hts.h @@ -69,8 +69,48 @@ typedef struct __kstring_t { * File I/O * ************/ +// Add new entries only at the end (but before the *_maximum entry) +// of these enums, as their numbering is part of the htslib ABI. 
+ +enum htsFormatCategory { + unknown_category, + sequence_data, // Sequence data -- SAM, BAM, CRAM, etc + variant_data, // Variant calling data -- VCF, BCF, etc + index_file, // Index file associated with some data file + region_list, // Coordinate intervals or regions -- BED, etc + category_maximum = 32767 +}; + +enum htsExactFormat { + unknown_format, + binary_format, text_format, + sam, bam, bai, cram, crai, vcf, bcf, csi, gzi, tbi, bed, + format_maximum = 32767 +}; + +enum htsCompression { + no_compression, gzip, bgzf, custom, + compression_maximum = 32767 +}; + +typedef struct htsFormat { + enum htsFormatCategory category; + enum htsExactFormat format; + struct { short major, minor; } version; + enum htsCompression compression; + short compression_level; // currently unused + void *specific; // currently unused +} htsFormat; + +// Maintainers note htsFile cannot be an opaque structure because some of its +// fields are part of libhts.so's ABI (hence these fields must not be moved): +// - fp is used in the public sam_itr_next()/etc macros +// - is_bin is used directly in samtools <= 1.1 and bcftools <= 1.1 +// - is_write and is_cram are used directly in samtools <= 1.1 +// - fp is used directly in samtools (up to and including current develop) +// - line is used directly in bcftools (up to and including current develop) typedef struct { - uint32_t is_bin:1, is_write:1, is_be:1, is_cram:1, is_compressed:2, is_kstream:1, dummy:25; + uint32_t is_bin:1, is_write:1, is_be:1, is_cram:1, dummy:28; int64_t lineno; kstring_t line; char *fn, *fn_aux; @@ -80,20 +120,71 @@ typedef struct { struct hFILE *hfile; void *voidp; } fp; + htsFormat format; } htsFile; +// REQUIRED_FIELDS +enum sam_fields { + SAM_QNAME = 0x00000001, + SAM_FLAG = 0x00000002, + SAM_RNAME = 0x00000004, + SAM_POS = 0x00000008, + SAM_MAPQ = 0x00000010, + SAM_CIGAR = 0x00000020, + SAM_RNEXT = 0x00000040, + SAM_PNEXT = 0x00000080, + SAM_TLEN = 0x00000100, + SAM_SEQ = 0x00000200, + SAM_QUAL = 0x00000400, + 
SAM_AUX = 0x00000800, + SAM_RGAUX = 0x00001000, +}; + +enum cram_option { + CRAM_OPT_DECODE_MD, + CRAM_OPT_PREFIX, + CRAM_OPT_VERBOSITY, + CRAM_OPT_SEQS_PER_SLICE, + CRAM_OPT_SLICES_PER_CONTAINER, + CRAM_OPT_RANGE, + CRAM_OPT_VERSION, + CRAM_OPT_EMBED_REF, + CRAM_OPT_IGNORE_MD5, + CRAM_OPT_REFERENCE, + CRAM_OPT_MULTI_SEQ_PER_SLICE, + CRAM_OPT_NO_REF, + CRAM_OPT_USE_BZIP2, + CRAM_OPT_SHARED_REF, + CRAM_OPT_NTHREADS, + CRAM_OPT_THREAD_POOL, + CRAM_OPT_USE_LZMA, + CRAM_OPT_USE_RANS, + CRAM_OPT_REQUIRED_FIELDS, +}; + /********************** * Exported functions * **********************/ extern int hts_verbose; -/*! @abstract Table for converting a nucleotide character to the 4-bit encoding. */ +/*! @abstract Table for converting a nucleotide character to 4-bit encoding. +The input character may be either an IUPAC ambiguity code, '=' for 0, or +'0'/'1'/'2'/'3' for a result of 1/2/4/8. The result is encoded as 1/2/4/8 +for A/C/G/T or combinations of these bits for ambiguous bases. +*/ extern const unsigned char seq_nt16_table[256]; -/*! @abstract Table for converting a 4-bit encoded nucleotide to a letter. */ +/*! @abstract Table for converting a 4-bit encoded nucleotide to an IUPAC +ambiguity code letter (or '=' when given 0). +*/ extern const char seq_nt16_str[]; +/*! @abstract Table for converting a 4-bit encoded nucleotide to about 2 bits. +Returns 0/1/2/3 for 1/2/4/8 (i.e., A/C/G/T), or 4 otherwise (0 or ambiguous). +*/ +extern const int seq_nt16_int[]; + #ifdef __cplusplus extern "C" { #endif @@ -105,6 +196,20 @@ extern "C" { */ const char *hts_version(void); +/*! + @abstract Determine format by peeking at the start of a file + @param fp File opened for reading, positioned at the beginning + @param fmt Format structure that will be filled out on return + @return 0 for success, or negative if an error occurred. +*/ +int hts_detect_format(struct hFILE *fp, htsFormat *fmt); + +/*! 
+ @abstract Get a human-readable description of the file format + @return Description string, to be freed by the caller after use. +*/ +char *hts_format_description(const htsFormat *format); + /*! @abstract Open a SAM/BAM/CRAM/VCF/BCF/etc file @param fn The file name or "-" for stdin/stdout @@ -116,8 +221,9 @@ const char *hts_version(void); specifier letters: b binary format (BAM, BCF, etc) rather than text (SAM, VCF, etc) c CRAM format + g gzip compressed u uncompressed - z compressed + z bgzf compressed [0-9] zlib compression level Note that there is a distinction between 'u' and '0': the first yields plain uncompressed output whereas the latter outputs uncompressed data @@ -130,6 +236,13 @@ const char *hts_version(void); */ htsFile *hts_open(const char *fn, const char *mode); +/*! + @abstract Open an existing stream as a SAM/BAM/CRAM/VCF/BCF/etc file + @param fn The already-open file handle + @param mode Open mode, as per hts_open() +*/ +htsFile *hts_hopen(struct hFILE *fp, const char *fn, const char *mode); + /*! @abstract Close a file handle, flushing buffered data for output streams @param fp The file handle to be closed @@ -137,6 +250,22 @@ htsFile *hts_open(const char *fn, const char *mode); */ int hts_close(htsFile *fp); +/*! + @abstract Returns the file's format information + @param fp The file handle + @return Read-only pointer to the file's htsFormat. +*/ +const htsFormat *hts_get_format(htsFile *fp); + +/*! + @abstract Sets a specified CRAM option on the open file handle. + @param fp The file handle open the open file. + @param opt The CRAM_OPT_* option. + @param ... Optional arguments, dependent on the option used. + @return 0 for success, or negative if an error occurred. +*/ +int hts_set_opt(htsFile *fp, enum cram_option opt, ...); + int hts_getline(htsFile *fp, int delimiter, kstring_t *str); char **hts_readlines(const char *fn, int *_n); /*! 
@@ -207,6 +336,7 @@ typedef int hts_readrec_func(BGZF *fp, void *data, void *r, int *tid, int *beg, typedef struct { uint32_t read_rest:1, finished:1, dummy:29; int tid, beg, end, n_off, i; + int curr_tid, curr_beg, curr_end; uint64_t curr_off; hts_pair64_t *off; hts_readrec_func *readrec; @@ -251,12 +381,8 @@ extern "C" { /** * hts_file_type() - Convenience function to determine file type - * @fname: the file name - * - * Returns one of the FT_* defines. - * - * This function was added in order to avoid the need for excessive command - * line switches. + * DEPRECATED: This function has been replaced by hts_detect_format(). + * It and these FT_* macros will be removed in a future HTSlib release. */ #define FT_UNKN 0 #define FT_GZ 1 diff --git a/htslib/htslib/khash.h b/htslib/htslib/khash.h index 2d910dec..5e55088b 100644 --- a/htslib/htslib/khash.h +++ b/htslib/htslib/khash.h @@ -143,11 +143,13 @@ typedef unsigned long khint64_t; typedef unsigned long long khint64_t; #endif +#ifndef kh_inline #ifdef _MSC_VER #define kh_inline __inline #else #define kh_inline inline #endif +#endif /* kh_inline */ typedef khint32_t khint_t; typedef khint_t khiter_t; @@ -182,7 +184,7 @@ typedef khint_t khiter_t; static const double __ac_HASH_UPPER = 0.77; #define __KHASH_TYPE(name, khkey_t, khval_t) \ - typedef struct { \ + typedef struct kh_##name##_s { \ khint_t n_buckets, size, n_occupied, upper_bound; \ khint32_t *flags; \ khkey_t *keys; \ @@ -245,11 +247,11 @@ static const double __ac_HASH_UPPER = 0.77; memset(new_flags, 0xaa, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \ if (h->n_buckets < new_n_buckets) { /* expand */ \ khkey_t *new_keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \ - if (!new_keys) return -1; \ + if (!new_keys) { kfree(new_flags); return -1; } \ h->keys = new_keys; \ if (kh_is_map) { \ khval_t *new_vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \ - if (!new_vals) return -1; \ + if (!new_vals) { 
kfree(new_flags); return -1; } \ h->vals = new_vals; \ } \ } /* otherwise shrink */ \ diff --git a/htslib/htslib/khash_str2int.h b/htslib/htslib/khash_str2int.h index 8c4f5a6b..4bbc1008 100644 --- a/htslib/htslib/khash_str2int.h +++ b/htslib/htslib/khash_str2int.h @@ -121,4 +121,13 @@ static inline int khash_str2int_set(void *_hash, const char *str, int value) return k; } +/* + * Return the number of keys in the hash table. + */ +static inline int khash_str2int_size(void *_hash) +{ + khash_t(str2int) *hash = (khash_t(str2int)*)_hash; + return kh_size(hash); +} + #endif diff --git a/htslib/htslib/kseq.h b/htslib/htslib/kseq.h index 577cdc45..e1a3eaaa 100644 --- a/htslib/htslib/kseq.h +++ b/htslib/htslib/kseq.h @@ -71,8 +71,7 @@ if (ks->begin >= ks->end) { \ ks->begin = 0; \ ks->end = __read(ks->f, ks->buf, ks->bufsize); \ - if (ks->end < ks->bufsize) ks->is_eof = 1; \ - if (ks->end == 0) return -1; \ + if (ks->end == 0) { ks->is_eof = 1; return -1; } \ } \ ks->seek_pos++; \ return (int)ks->buf[ks->begin++]; \ @@ -95,18 +94,17 @@ typedef struct __kstring_t { #define __KS_GETUNTIL(SCOPE, __read) \ SCOPE int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \ { \ + int gotany = 0; \ if (dret) *dret = 0; \ str->l = append? 
str->l : 0; \ uint64_t seek_pos = str->l; \ - if (ks->begin >= ks->end && ks->is_eof) return -1; \ for (;;) { \ int i; \ if (ks->begin >= ks->end) { \ if (!ks->is_eof) { \ ks->begin = 0; \ ks->end = __read(ks->f, ks->buf, ks->bufsize); \ - if (ks->end < ks->bufsize) ks->is_eof = 1; \ - if (ks->end == 0) break; \ + if (ks->end == 0) { ks->is_eof = 1; break; } \ } else break; \ } \ if (delimiter == KS_SEP_LINE) { \ @@ -128,6 +126,7 @@ typedef struct __kstring_t { str->s = (char*)realloc(str->s, str->m); \ } \ seek_pos += i - ks->begin; if ( i < ks->end ) seek_pos++; \ + gotany = 1; \ memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \ str->l = str->l + (i - ks->begin); \ ks->begin = i + 1; \ @@ -136,6 +135,7 @@ typedef struct __kstring_t { break; \ } \ } \ + if (!gotany && ks_eof(ks)) return -1; \ ks->seek_pos += seek_pos; \ if (str->s == 0) { \ str->m = 1; \ diff --git a/htslib/htslib/regidx.h b/htslib/htslib/regidx.h new file mode 100644 index 00000000..39a795ee --- /dev/null +++ b/htslib/htslib/regidx.h @@ -0,0 +1,147 @@ +/* + Copyright (C) 2014 Genome Research Ltd. + + Author: Petr Danecek + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. +*/ + +/* + Regions indexing with an optional payload. Inspired by samtools/bedidx.c. + This code is intended as future replacement of bcf_sr_regions_t. + + Example of usage: + + // Init the parser and print regions. In this example the payload is a + // pointer to a string. For the description of parse_custom and + // free_custom functions, see regidx_parse_f and regidx_free_f below, + // and for working example see test/test-regidx.c. + regidx_t *idx = regidx_init(in_fname,parse_custom,free_custom,sizeof(char*),NULL); + + // Query overlap with chr:from-to + regitr_t itr; + if ( regidx_overlap(idx, chr,from,to, &itr) ) printf("There is an overlap!\n"); + + while ( REGITR_OVERLAP(itr,from,to) ) + { + printf("[%d,%d] overlaps with [%d,%d], payload=%s\n", from,to, + REGITR_START(itr), REGITR_END(itr), REGITR_PAYLOAD(itr,char*)); + itr.i++; + } + + regidx_destroy(regs); +*/ + +#ifndef HTSLIB_REGIDX_H +#define HTSLIB_REGIDX_H + +#include +#include + +typedef struct _regidx_t regidx_t; +typedef struct +{ + uint32_t start, end; +} +reg_t; +typedef struct +{ + int i, n; + reg_t *reg; + void *payload; +} +regitr_t; + +#define REGITR_START(itr) (itr).reg[(itr).i].start +#define REGITR_END(itr) (itr).reg[(itr).i].end +#define REGITR_PAYLOAD(itr,type_t) ((type_t*)(itr).payload)[(itr).i] +#define REGITR_OVERLAP(itr,from,to) (itr.i < itr.n && REGITR_START(itr)<=to && REGITR_END(itr)>=from ) + +/* + * regidx_parse_f - Function to parse one input line, such as regidx_parse_bed + * or regidx_parse_tab below. The function is expected to set `chr_from` and + * `chr_to` to point to first and last character of chromosome name and set + * coordinates `reg->start` and `reg->end` (0-based, inclusive). 
If + * regidx_init() was called with non-zero payload_size, the `payload` points + * to a memory location of the payload_size and `usr` is data passed to + * regidx_init(). Any memory allocated by the function will be freed by + * regidx_free_f on regidx_destroy(). + * + * Return value: 0 on success, -1 to skip a record, -2 on fatal error. + */ +typedef int (*regidx_parse_f)(const char *line, char **chr_beg, char **chr_end, reg_t *reg, void *payload, void *usr); +typedef void (*regidx_free_f)(void *payload); + +int regidx_parse_bed(const char*,char**,char**,reg_t*,void*,void*); // CHROM,FROM,TO (0-based,right-open) +int regidx_parse_tab(const char*,char**,char**,reg_t*,void*,void*); // CHROM,POS (1-based, inclusive) + +/* + * regidx_init() - creates new index + * @param fname: input file name or NULL if regions will be added one-by-one via regidx_insert() + * @param parsef: regidx_parse_bed, regidx_parse_tab or see description of regidx_parse_f. If NULL, + * the format will be autodetected, currently either regidx_parse_tab (the default) or + * regidx_parse_bed (file name must end in '.bed', '.bed.gz' or '.bed.bgz') will be used. Note that + * the exact autodetection algorithm will change. + * @param freef: NULL or see description of regidx_parse_f + * @param payload_size: 0 with regidx_parse_bed, regidx_parse_tab or see regidx_parse_f + * @param usr: optional user data passed to regidx_parse_f + * + * Returns index on success or NULL on error. + */ +regidx_t *regidx_init(const char *fname, regidx_parse_f parsef, regidx_free_f freef, size_t payload_size, void *usr); + +/* + * regidx_destroy() - free memory allocated by regidx_init + */ +void regidx_destroy(regidx_t *idx); + +/* + * regidx_overlap() - check overlap of the location chr:from-to with regions + * @param start,end: 0-based start, end coordinate (inclusive) + * @param itr: pointer to iterator, can be NULL if not needed + * + * Returns 0 if there is no overlap or 1 if overlap is found. 
The overlapping + * regions can be iterated as shown in the example above. + */ +int regidx_overlap(regidx_t *idx, const char *chr, uint32_t start, uint32_t end, regitr_t *itr); + +/* + * regidx_insert() - add a new region. + * + * After last region has been added, call regidx_insert(idx,NULL) to + * build the index. + * + * Returns 0 on success or -1 on error. + */ +int regidx_insert(regidx_t *idx, char *line); + +/* + * regidx_seq_names() - return list of all sequence names + */ +char **regidx_seq_names(regidx_t *idx, int *n); + +/* + * regidx_seq_nregs() - number of regions + * regidx_nregs() - total number of regions + */ +int regidx_seq_nregs(regidx_t *idx, const char *seq); +int regidx_nregs(regidx_t *idx); + +#endif + diff --git a/htslib/htslib/sam.h b/htslib/htslib/sam.h index 94c18f78..9e6d6a3b 100644 --- a/htslib/htslib/sam.h +++ b/htslib/htslib/sam.h @@ -235,7 +235,7 @@ typedef struct { #define bam_get_l_aux(b) ((b)->l_data - ((b)->core.n_cigar<<2) - (b)->core.l_qname - (b)->core.l_qseq - (((b)->core.l_qseq + 1)>>1)) /*! @function @abstract Get a base on read - @param s Query sequence returned by bam1_seq() + @param s Query sequence returned by bam_get_seq() @param i The i-th position, 0-based @return 4-bit integer representing the base. */ diff --git a/htslib/htslib/synced_bcf_reader.h b/htslib/htslib/synced_bcf_reader.h index 76d79d02..888fa1e3 100644 --- a/htslib/htslib/synced_bcf_reader.h +++ b/htslib/htslib/synced_bcf_reader.h @@ -106,11 +106,17 @@ typedef struct bcf1_t **buffer; // cached VCF records. 
First is the current record synced across the reader int nbuffer, mbuffer; // number of cached records (including the current record); number of allocated records int nfilter_ids, *filter_ids; // -1 for ".", otherwise filter id as returned by bcf_id2int - int type; int *samples, n_smpl; // list of columns in the order consistent with bcf_srs_t.samples } bcf_sr_t; +typedef enum +{ + open_failed, not_bgzf, idx_load_failed, file_type_error, api_usage_error, + header_error +} +bcf_sr_error; + typedef struct { // Parameters controlling the logic @@ -123,6 +129,7 @@ typedef struct int require_index; // Some tools do not need random access int max_unpack; // When reading VCFs and knowing some fields will not be needed, boost performance of vcf_parse1 int *has_line; // Corresponds to return value of bcf_sr_next_line but is not limited by sizeof(int). Use bcf_sr_has_line macro to query. + bcf_sr_error errnum; // Auxiliary data bcf_sr_t *readers; @@ -148,6 +155,9 @@ bcf_srs_t *bcf_sr_init(void); /** Destroy bcf_srs_t struct */ void bcf_sr_destroy(bcf_srs_t *readers); +char *bcf_sr_strerror(int errnum); + + /** * bcf_sr_add_reader() - open new reader * @readers: holder of the open readers @@ -161,7 +171,6 @@ void bcf_sr_destroy(bcf_srs_t *readers); int bcf_sr_add_reader(bcf_srs_t *readers, const char *fname); void bcf_sr_remove_reader(bcf_srs_t *files, int i); - /** * bcf_sr_next_line() - the iterator * @readers: holder of the open readers @@ -174,6 +183,8 @@ int bcf_sr_next_line(bcf_srs_t *readers); #define bcf_sr_has_line(readers, i) (readers)->has_line[i] #define bcf_sr_get_line(_readers, i) ((_readers)->has_line[i] ? ((_readers)->readers[i].buffer[0]) : NULL) #define bcf_sr_region_done(_readers,i) (!(_readers)->has_line[i] && !(_readers)->readers[i].nbuffer ? 
1 : 0) +#define bcf_sr_get_header(_readers, i) (_readers)->readers[i].header +#define bcf_sr_get_reader(_readers, i) &((_readers)->readers[i]) /** * bcf_sr_seek() - set all readers to selected position diff --git a/htslib/htslib/vcf.h b/htslib/htslib/vcf.h index 38d418c8..fde93941 100644 --- a/htslib/htslib/vcf.h +++ b/htslib/htslib/vcf.h @@ -87,7 +87,8 @@ typedef struct { } bcf_hrec_t; typedef struct { - uint32_t info[3]; // stores Number:20, var:4, Type:4, ColType:4 for BCF_HL_FLT,INFO,FMT + uint32_t info[3]; // stores Number:20, var:4, Type:4, ColType:4 in info[0..2] + // for BCF_HL_FLT,INFO,FMT and contig length in info[0] for BCF_HL_CTG bcf_hrec_t *hrec[3]; int id; } bcf_idinfo_t; @@ -103,7 +104,7 @@ typedef struct { void *dict[3]; // ID dictionary, contig dict and sample dict char **samples; bcf_hrec_t **hrec; - int nhrec; + int nhrec, dirty; int ntransl, *transl[2]; // for bcf_translate() int nsamples_ori; // for bcf_hdr_set_samples() uint8_t *keep_samples; @@ -306,7 +307,7 @@ extern "C" { /** Writes VCF or BCF header */ - int bcf_hdr_write(htsFile *fp, const bcf_hdr_t *h); + int bcf_hdr_write(htsFile *fp, bcf_hdr_t *h); /** Parse VCF line contained in kstring and populate the bcf1_t struct */ int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v); @@ -348,6 +349,7 @@ extern "C" { * internally to reflect any changes made by bcf_update_* functions. */ bcf1_t *bcf_dup(bcf1_t *src); + bcf1_t *bcf_copy(bcf1_t *dst, bcf1_t *src); /** * bcf_write() - write one VCF or BCF record. The type is determined at the open() call. @@ -385,8 +387,7 @@ extern "C" { /** * bcf_hdr_add_sample() - add a new sample. - * @param sample: Sample name to be added. After all samples have been added, NULL - * must be passed to update internal header structures. 
+ * @param sample: sample name to be added */ int bcf_hdr_add_sample(bcf_hdr_t *hdr, const char *sample); @@ -403,6 +404,7 @@ extern "C" { int bcf_hdr_append(bcf_hdr_t *h, const char *line); int bcf_hdr_printf(bcf_hdr_t *h, const char *format, ...); + /** VCF version, e.g. VCFv4.2 */ const char *bcf_hdr_get_version(const bcf_hdr_t *hdr); void bcf_hdr_set_version(bcf_hdr_t *hdr, const char *version); @@ -565,7 +567,8 @@ extern "C" { // from bcf_get_genotypes() below. #define bcf_gt_phased(idx) ((idx+1)<<1|1) #define bcf_gt_unphased(idx) ((idx+1)<<1) - #define bcf_gt_missing 0 + #define bcf_gt_missing 0 + #define bcf_gt_is_missing(val) ((val)>>1 ? 0 : 1) #define bcf_gt_is_phased(idx) ((idx)&1) #define bcf_gt_allele(val) (((val)>>1)-1) diff --git a/htslib/htslib_vars.mk b/htslib/htslib_vars.mk index 725e9eec..08f9a57c 100644 --- a/htslib/htslib_vars.mk +++ b/htslib/htslib_vars.mk @@ -30,6 +30,7 @@ htslib_faidx_h = $(HTSPREFIX)htslib/faidx.h htslib_hfile_h = $(HTSPREFIX)htslib/hfile.h $(htslib_hts_defs_h) htslib_hts_h = $(HTSPREFIX)htslib/hts.h htslib_hts_defs_h = $(HTSPREFIX)htslib/hts_defs.h +htslib_regidx_h = $(HTSPREFIX)htslib/regidx.h htslib_sam_h = $(HTSPREFIX)htslib/sam.h $(htslib_hts_h) htslib_synced_bcf_reader_h = $(HTSPREFIX)htslib/synced_bcf_reader.h $(htslib_hts_h) $(htslib_vcf_h) $(htslib_tbx_h) htslib_tbx_h = $(HTSPREFIX)htslib/tbx.h $(htslib_hts_h) diff --git a/htslib/knetfile.c b/htslib/knetfile.c index 400da4fc..28fe629b 100644 --- a/htslib/knetfile.c +++ b/htslib/knetfile.c @@ -327,11 +327,9 @@ int kftp_connect_file(knetFile *fp) kftp_pasv_prep(fp); kftp_send_cmd(fp, fp->size_cmd, 1); #ifndef _WIN32 - if ( sscanf(fp->response,"%*d %lld", &file_size) != 1 ) - { - fprintf(stderr,"[kftp_connect_file] %s\n", fp->response); - return -1; - } + // If the file does not exist, the response will be "550 Could not get file + // size". Be silent on failure, hts_idx_load can be trying the existence of .csi or .tbi. 
+ if ( sscanf(fp->response,"%*d %lld", &file_size) != 1 ) return -1; #else const char *p = fp->response; while (*p != ' ') ++p; @@ -413,7 +411,7 @@ int khttp_connect_file(knetFile *fp) l += sprintf(buf + l, "GET %s HTTP/1.0\r\nHost: %s\r\n", fp->path, fp->http_host); l += sprintf(buf + l, "Range: bytes=%lld-\r\n", (long long)fp->offset); l += sprintf(buf + l, "\r\n"); - if ( netwrite(fp->fd, buf, l) != l ) return -1; + if ( netwrite(fp->fd, buf, l) != l ) { free(buf); return -1; } l = 0; while (netread(fp->fd, buf + l, 1)) { // read HTTP header; FIXME: bad efficiency if (buf[l] == '\n' && l >= 3) @@ -422,6 +420,7 @@ int khttp_connect_file(knetFile *fp) } buf[l] = 0; if (l < 14) { // prematured header + free(buf); netclose(fp->fd); fp->fd = -1; return -1; diff --git a/htslib/regidx.c b/htslib/regidx.c new file mode 100644 index 00000000..291ba795 --- /dev/null +++ b/htslib/regidx.c @@ -0,0 +1,338 @@ +/* + Copyright (C) 2014 Genome Research Ltd. + + Author: Petr Danecek + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. +*/ + +#include "htslib/hts.h" +#include "htslib/kstring.h" +#include "htslib/kseq.h" +#include "htslib/khash_str2int.h" +#include "htslib/regidx.h" + +#define LIDX_SHIFT 13 // number of insignificant index bits + +// List of regions for one chromosome +typedef struct +{ + int *idx, nidx; + int nregs, mregs; // n:used, m:alloced + reg_t *regs; + void *payload; +} +reglist_t; + +// Container of all sequences +struct _regidx_t +{ + int nseq, mseq; // n:used, m:alloced + reglist_t *seq; // regions for each sequence + void *seq2regs; // hash for fast lookup from chr name to regions + char **seq_names; + regidx_free_f free; // function to free any data allocated by regidx_parse_f + regidx_parse_f parse; // parse one input line + void *usr; // user data to pass to regidx_parse_f + + // temporary data for index initialization + kstring_t str; + int rid_prev, start_prev, end_prev; + int payload_size; + void *payload; +}; + +int regidx_seq_nregs(regidx_t *idx, const char *seq) +{ + int iseq; + if ( khash_str2int_get(idx->seq2regs, seq, &iseq)!=0 ) return 0; // no such sequence + return idx->seq[iseq].nregs; +} + +int regidx_nregs(regidx_t *idx) +{ + int i, nregs = 0; + for (i=0; inseq; i++) nregs += idx->seq[i].nregs; + return nregs; +} + +char **regidx_seq_names(regidx_t *idx, int *n) +{ + *n = idx->nseq; + return idx->seq_names; +} + +int _regidx_build_index(regidx_t *idx) +{ + int iseq; + for (iseq=0; iseqnseq; iseq++) + { + reglist_t *list = &idx->seq[iseq]; + int j,k, imax = 0; // max index bin + for (j=0; jnregs; j++) + { + int ibeg = list->regs[j].start >> LIDX_SHIFT; + int iend = list->regs[j].end >> LIDX_SHIFT; + if ( imax < iend + 1 ) + { + int old_imax = imax; + imax = iend + 1; + kroundup32(imax); + 
list->idx = (int*) realloc(list->idx, imax*sizeof(int)); + for (k=old_imax; k<imax; k++) list->idx[k] = -1; + } + if ( ibeg==iend ) + { + if ( list->idx[ibeg]<0 ) list->idx[ibeg] = j; + } + else + { + for (k=ibeg; k<=iend; k++) + if ( list->idx[k]<0 ) list->idx[k] = j; + } + list->nidx = iend + 1; + } + } + return 0; +} + +int regidx_insert(regidx_t *idx, char *line) +{ + if ( !line ) + return _regidx_build_index(idx); + + char *chr_from, *chr_to; + reg_t reg; + int ret = idx->parse(line,&chr_from,&chr_to,&reg,idx->payload,idx->usr); + if ( ret==-2 ) return -1; // error + if ( ret==-1 ) return 0; // skip the line + + int rid; + idx->str.l = 0; + kputsn(chr_from, chr_to-chr_from+1, &idx->str); + if ( khash_str2int_get(idx->seq2regs, idx->str.s, &rid)!=0 ) + { + idx->nseq++; + int m_prev = idx->mseq; + hts_expand0(reglist_t,idx->nseq,idx->mseq,idx->seq); + hts_expand0(char*,idx->nseq,m_prev,idx->seq_names); + idx->seq_names[idx->nseq-1] = strdup(idx->str.s); + rid = khash_str2int_inc(idx->seq2regs, idx->seq_names[idx->nseq-1]); + } + + reglist_t *list = &idx->seq[rid]; + list->nregs++; + int m_prev = list->mregs; + hts_expand(reg_t,list->nregs,list->mregs,list->regs); + list->regs[list->nregs-1] = reg; + if ( idx->payload_size ) + { + if ( m_prev < list->mregs ) list->payload = realloc(list->payload,idx->payload_size*list->mregs); + memcpy(list->payload + idx->payload_size*(list->nregs-1), idx->payload, idx->payload_size); + } + + if ( idx->rid_prev==rid ) + { + if ( idx->start_prev > reg.start || (idx->start_prev==reg.start && idx->end_prev>reg.end) ) + { + fprintf(stderr,"The regions are not sorted: %s:%d-%d is before %s:%d-%d\n", + idx->str.s,idx->start_prev+1,idx->end_prev+1,idx->str.s,reg.start+1,reg.end+1); + return -1; + } + } + idx->rid_prev = rid; + idx->start_prev = reg.start; + idx->end_prev = reg.end; + return 0; +} + +regidx_t *regidx_init(const char *fname, regidx_parse_f parser, regidx_free_f free_f, size_t payload_size, void *usr_dat) +{ + if ( !parser ) + { + if ( !fname 
) parser = regidx_parse_tab; + else + { + int len = strlen(fname); + if ( len>=7 && !strcasecmp(".bed.gz",fname+len-7) ) + parser = regidx_parse_bed; + else if ( len>=8 && !strcasecmp(".bed.bgz",fname+len-8) ) + parser = regidx_parse_bed; + else if ( len>=4 && !strcasecmp(".bed",fname+len-4) ) + parser = regidx_parse_bed; + else + parser = regidx_parse_tab; + } + } + + regidx_t *idx = (regidx_t*) calloc(1,sizeof(regidx_t)); + idx->free = free_f; + idx->parse = parser; + idx->usr = usr_dat; + idx->seq2regs = khash_str2int_init(); + idx->rid_prev = -1; + idx->start_prev = -1; + idx->end_prev = -1; + idx->payload_size = payload_size; + if ( payload_size ) idx->payload = malloc(payload_size); + + if ( !fname ) return idx; + + kstring_t str = {0,0,0}; + + htsFile *fp = hts_open(fname,"r"); + if ( !fp ) goto error; + + while ( hts_getline(fp, KS_SEP_LINE, &str) > 0 ) + { + if ( regidx_insert(idx, str.s) ) goto error; + } + regidx_insert(idx, NULL); + + free(str.s); + hts_close(fp); + return idx; + +error: + free(str.s); + if ( fp ) hts_close(fp); + regidx_destroy(idx); + return NULL; +} + +void regidx_destroy(regidx_t *idx) +{ + int i, j; + for (i=0; inseq; i++) + { + reglist_t *list = &idx->seq[i]; + if ( idx->free ) + { + for (j=0; jnregs; j++) + idx->free(list->payload + idx->payload_size*j); + } + free(list->payload); + free(list->regs); + free(list->idx); + } + free(idx->seq_names); + free(idx->seq); + free(idx->str.s); + free(idx->payload); + khash_str2int_destroy_free(idx->seq2regs); + free(idx); +} + +int regidx_overlap(regidx_t *idx, const char *chr, uint32_t from, uint32_t to, regitr_t *itr) +{ + if ( itr ) itr->i = itr->n = 0; + + int iseq; + if ( khash_str2int_get(idx->seq2regs, chr, &iseq)!=0 ) return 0; // no such sequence + + reglist_t *list = &idx->seq[iseq]; + if ( !list->nregs ) return 0; + + int i, ibeg = from>>LIDX_SHIFT; + int ireg = ibeg < list->nidx ? 
list->idx[ibeg] : list->idx[ list->nidx - 1 ]; + if ( ireg < 0 ) + { + // linear search; if slow, replace with binary search + if ( ibeg > list->nidx ) ibeg = list->nidx; + for (i=ibeg - 1; i>=0; i--) + if ( list->idx[i] >=0 ) break; + ireg = i>=0 ? list->idx[i] : 0; + } + for (i=ireg; inregs; i++) + { + if ( list->regs[i].start > to ) return 0; // no match + if ( list->regs[i].end >= from && list->regs[i].start <= to ) break; // found + } + + if ( i>=list->nregs ) return 0; // no match + + if ( !itr ) return 1; + + itr->i = 0; + itr->n = list->nregs - i; + itr->reg = &idx->seq[iseq].regs[i]; + if ( idx->payload_size ) + itr->payload = idx->seq[iseq].payload + i*idx->payload_size; + else + itr->payload = NULL; + + return 1; +} + +int regidx_parse_bed(const char *line, char **chr_beg, char **chr_end, reg_t *reg, void *payload, void *usr) +{ + char *ss = (char*) line; + while ( *ss && isspace(*ss) ) ss++; + if ( !*ss ) return -1; // skip blank lines + if ( *ss=='#' ) return -1; // skip comments + + char *se = ss; + while ( *se && !isspace(*se) ) se++; + if ( !*se ) { fprintf(stderr,"Could not parse bed line: %s\n", line); return -2; } + + *chr_beg = ss; + *chr_end = se-1; + + ss = se+1; + reg->start = strtol(ss, &se, 10); + if ( ss==se ) { fprintf(stderr,"Could not parse bed line: %s\n", line); return -2; } + + ss = se+1; + reg->end = strtol(ss, &se, 10) - 1; + if ( ss==se ) { fprintf(stderr,"Could not parse bed line: %s\n", line); return -2; } + + return 0; +} + +int regidx_parse_tab(const char *line, char **chr_beg, char **chr_end, reg_t *reg, void *payload, void *usr) +{ + char *ss = (char*) line; + while ( *ss && isspace(*ss) ) ss++; + if ( !*ss ) return -1; // skip blank lines + if ( *ss=='#' ) return -1; // skip comments + + char *se = ss; + while ( *se && !isspace(*se) ) se++; + if ( !*se ) { fprintf(stderr,"Could not parse bed line: %s\n", line); return -2; } + + *chr_beg = ss; + *chr_end = se-1; + + ss = se+1; + reg->start = strtol(ss, &se, 10) - 1; + if ( 
ss==se ) { fprintf(stderr,"Could not parse bed line: %s\n", line); return -2; } + + if ( !se[0] || !se[1] ) + reg->end = reg->start; + else + { + ss = se+1; + reg->end = strtol(ss, &se, 10); + if ( ss==se ) reg->end = reg->start; + else reg->end--; + } + + return 0; +} + diff --git a/htslib/sam.c b/htslib/sam.c index d85b85ba..460cf33d 100644 --- a/htslib/sam.c +++ b/htslib/sam.c @@ -77,7 +77,7 @@ bam_hdr_t *bam_hdr_dup(const bam_hdr_t *h0) h->sdict = NULL; h->text = (char*)calloc(h->l_text + 1, 1); memcpy(h->text, h0->text, h->l_text); - h->target_len = (uint32_t*)calloc(h->n_targets, 4); + h->target_len = (uint32_t*)calloc(h->n_targets, sizeof(uint32_t)); h->target_name = (char**)calloc(h->n_targets, sizeof(char*)); int i; for (i = 0; i < h->n_targets; ++i) { @@ -95,7 +95,7 @@ static bam_hdr_t *hdr_from_dict(sdict_t *d) h = bam_hdr_init(); h->sdict = d; h->n_targets = kh_size(d); - h->target_len = (uint32_t*)malloc(4 * h->n_targets); + h->target_len = (uint32_t*)malloc(sizeof(uint32_t) * h->n_targets); h->target_name = (char**)malloc(sizeof(char*) * h->n_targets); for (k = kh_begin(d); k != kh_end(d); ++k) { if (!kh_exist(d, k)) continue; @@ -135,7 +135,7 @@ bam_hdr_t *bam_hdr_read(BGZF *fp) if (fp->is_be) ed_swap_4p(&h->n_targets); // read reference sequence names and lengths h->target_name = (char**)calloc(h->n_targets, sizeof(char*)); - h->target_len = (uint32_t*)calloc(h->n_targets, 4); + h->target_len = (uint32_t*)calloc(h->n_targets, sizeof(uint32_t)); for (i = 0; i != h->n_targets; ++i) { bgzf_read(fp, &name_len, 4); if (fp->is_be) ed_swap_4p(&name_len); @@ -432,18 +432,23 @@ int bam_index_build(const char *fn, int min_shift) int ret = 0; if ((fp = hts_open(fn, "r")) == 0) return -1; - if (fp->is_cram) { - ret = cram_index_build(fp->fp.cram, fn); - } else { - idx = bam_index(fp->fp.bgzf, min_shift); - if ( !idx ) - { - hts_close(fp); - return -1; - } - hts_idx_save(idx, fn, min_shift > 0 - ? 
HTS_FMT_CSI : HTS_FMT_BAI); - hts_idx_destroy(idx); + switch (fp->format.format) { + case cram: + ret = cram_index_build(fp->fp.cram, fn); + break; + + case bam: + idx = bam_index(fp->fp.bgzf, min_shift); + if (idx) { + hts_idx_save(idx, fn, (min_shift > 0)? HTS_FMT_CSI : HTS_FMT_BAI); + hts_idx_destroy(idx); + } + else ret = -1; + break; + + default: + ret = -1; + break; } hts_close(fp); @@ -474,9 +479,10 @@ static int sam_bam_cram_readrec(BGZF *bgzfp, void *fpv, void *bv, int *tid, int { htsFile *fp = fpv; bam1_t *b = bv; - if (fp->is_bin) return bam_read1(bgzfp, b); - else if (fp->is_cram) return cram_get_bam_seq(fp->fp.cram, &b); - else { + switch (fp->format.format) { + case bam: return bam_read1(bgzfp, b); + case cram: return cram_get_bam_seq(fp->fp.cram, &b); + default: // TODO Need headers available to implement this for SAM files fprintf(stderr, "[sam_bam_cram_readrec] Not implemented for SAM files -- Exiting\n"); abort(); @@ -494,8 +500,11 @@ typedef struct hts_cram_idx_t { hts_idx_t *sam_index_load(samFile *fp, const char *fn) { - if (fp->is_bin) return bam_index_load(fn); - else if (fp->is_cram) { + switch (fp->format.format) { + case bam: + return bam_index_load(fn); + + case cram: { if (cram_index_load(fp->fp.cram, fn) < 0) return NULL; // Cons up a fake "index" just pointing at the associated cram_fd: hts_cram_idx_t *idx = malloc(sizeof (hts_cram_idx_t)); @@ -503,8 +512,11 @@ hts_idx_t *sam_index_load(samFile *fp, const char *fn) idx->fmt = HTS_FMT_CRAI; idx->cram = fp->fp.cram; return (hts_idx_t *) idx; + } + + default: + return NULL; // TODO Would use tbx_index_load if it returned hts_idx_t } - else return NULL; // TODO Would use tbx_index_load if it returned hts_idx_t } static hts_itr_t *cram_itr_query(const hts_idx_t *idx, int tid, int beg, int end, hts_readrec_func *readrec) @@ -620,11 +632,14 @@ bam_hdr_t *sam_hdr_parse(int l_text, const char *text) bam_hdr_t *sam_hdr_read(htsFile *fp) { - if (fp->is_bin) { + switch (fp->format.format) { + case 
bam: return bam_hdr_read(fp->fp.bgzf); - } else if (fp->is_cram) { + + case cram: return cram_header_to_bam(fp->fp.cram->header); - } else { + + case sam: { kstring_t str; bam_hdr_t *h; int has_SQ = 0; @@ -650,20 +665,38 @@ bam_hdr_t *sam_hdr_read(htsFile *fp) h = sam_hdr_parse(str.l, str.s); h->l_text = str.l; h->text = str.s; return h; + } + + default: + abort(); } } int sam_hdr_write(htsFile *fp, const bam_hdr_t *h) { - if (fp->is_bin) { + switch (fp->format.format) { + case binary_format: + fp->format.category = sequence_data; + fp->format.format = bam; + /* fall-through */ + case bam: bam_hdr_write(fp->fp.bgzf, h); - } else if (fp->is_cram) { + break; + + case cram: { cram_fd *fd = fp->fp.cram; if (cram_set_header(fd, bam_header_to_cram((bam_hdr_t *)h)) < 0) return -1; if (fp->fn_aux) cram_load_reference(fd, fp->fn_aux); if (cram_write_SAM_hdr(fd, fd->header) < 0) return -1; - } else { + } + break; + + case text_format: + fp->format.category = sequence_data; + fp->format.format = sam; + /* fall-through */ + case sam: { char *p; hputs(h->text, fp->fp.hfile); p = strstr(h->text, "@SQ\t"); // FIXME: we need a loop to make sure "@SQ\t" does not match something unwanted!!! 
@@ -677,6 +710,11 @@ int sam_hdr_write(htsFile *fp, const bam_hdr_t *h) } } if ( hflush(fp->fp.hfile) != 0 ) return -1; + } + break; + + default: + abort(); } return 0; } @@ -806,9 +844,8 @@ int sam_parse1(kstring_t *s, bam_hdr_t *h, bam1_t *b) kputc_('A', &str); kputc_(*q, &str); } else if (type == 'i' || type == 'I') { - long x; - x = strtol(q, &q, 10); - if (x < 0) { + if (*q == '-') { + long x = strtol(q, &q, 10); if (x >= INT8_MIN) { kputc_('c', &str); kputc_(x, &str); } else if (x >= INT16_MIN) { @@ -819,6 +856,7 @@ int sam_parse1(kstring_t *s, bam_hdr_t *h, bam1_t *b) kputc_('i', &str); kputsn_(&y, 4, &str); } } else { + unsigned long x = strtoul(q, &q, 10); if (x <= UINT8_MAX) { kputc_('C', &str); kputc_(x, &str); } else if (x <= UINT16_MAX) { @@ -873,7 +911,8 @@ int sam_parse1(kstring_t *s, bam_hdr_t *h, bam1_t *b) int sam_read1(htsFile *fp, bam_hdr_t *h, bam1_t *b) { - if (fp->is_bin) { + switch (fp->format.format) { + case bam: { int r = bam_read1(fp->fp.bgzf, b); if (r >= 0) { if (b->core.tid >= h->n_targets || b->core.tid < -1 || @@ -881,9 +920,12 @@ int sam_read1(htsFile *fp, bam_hdr_t *h, bam1_t *b) return -3; } return r; - } else if (fp->is_cram) { + } + + case cram: return cram_get_bam_seq(fp->fp.cram, &b); - } else { + + case sam: { int ret; err_recover: if (fp->line.l == 0) { @@ -898,6 +940,10 @@ int sam_read1(htsFile *fp, bam_hdr_t *h, bam1_t *b) if (h->ignore_sam_err) goto err_recover; } return ret; + } + + default: + abort(); } } @@ -1024,15 +1070,29 @@ int sam_format1(const bam_hdr_t *h, const bam1_t *b, kstring_t *str) int sam_write1(htsFile *fp, const bam_hdr_t *h, const bam1_t *b) { - if (fp->is_bin) { + switch (fp->format.format) { + case binary_format: + fp->format.category = sequence_data; + fp->format.format = bam; + /* fall-through */ + case bam: return bam_write1(fp->fp.bgzf, b); - } else if (fp->is_cram) { + + case cram: return cram_put_bam_seq(fp->fp.cram, (bam1_t *)b); - } else { + + case text_format: + fp->format.category = 
sequence_data; + fp->format.format = sam; + /* fall-through */ + case sam: if (sam_format1(h, b, &fp->line) < 0) return -1; kputc('\n', &fp->line); if ( hwrite(fp->fp.hfile, fp->line.s, fp->line.l) != fp->line.l ) return -1; return fp->line.l; + + default: + abort(); } } @@ -1759,7 +1819,7 @@ bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data) int i; bam_mplp_t iter; iter = (bam_mplp_t)calloc(1, sizeof(struct __bam_mplp_t)); - iter->pos = (uint64_t*)calloc(n, 8); + iter->pos = (uint64_t*)calloc(n, sizeof(uint64_t)); iter->n_plp = (int*)calloc(n, sizeof(int)); iter->plp = (const bam_pileup1_t**)calloc(n, sizeof(bam_pileup1_t*)); iter->iter = (bam_plp_t*)calloc(n, sizeof(bam_plp_t)); diff --git a/htslib/synced_bcf_reader.c b/htslib/synced_bcf_reader.c index 19fa703b..3747c0e4 100644 --- a/htslib/synced_bcf_reader.c +++ b/htslib/synced_bcf_reader.c @@ -52,6 +52,26 @@ static void _regions_add(bcf_sr_regions_t *reg, const char *chr, int start, int static bcf_sr_regions_t *_regions_init_string(const char *str); static int _regions_match_alleles(bcf_sr_regions_t *reg, int als_idx, bcf1_t *rec); +char *bcf_sr_strerror(int errnum) +{ + switch (errnum) + { + case open_failed: + return strerror(errno); break; + case not_bgzf: + return "not compressed with bgzip"; break; + case idx_load_failed: + return "could not load index"; break; + case file_type_error: + return "unknown file type"; break; + case api_usage_error: + return "API usage error"; break; + case header_error: + return "could not parse header"; break; + default: return ""; + } +} + static int *init_filters(bcf_hdr_t *hdr, const char *filters, int *nfilters) { kstring_t str = {0,0,0}; @@ -61,7 +81,7 @@ static int *init_filters(bcf_hdr_t *hdr, const char *filters, int *nfilters) { if ( *tmp==',' || !*tmp ) { - out = (int*) realloc(out, sizeof(int)); + out = (int*) realloc(out, (nout+1)*sizeof(int)); if ( tmp-prev==1 && *prev=='.' 
) out[nout] = -1; else @@ -111,76 +131,94 @@ int bcf_sr_set_targets(bcf_srs_t *readers, const char *targets, int is_file, int int bcf_sr_add_reader(bcf_srs_t *files, const char *fname) { + htsFile* file_ptr = hts_open(fname, "r"); + if ( ! file_ptr ) { + files->errnum = open_failed; + return 0; + } + files->has_line = (int*) realloc(files->has_line, sizeof(int)*(files->nreaders+1)); files->has_line[files->nreaders] = 0; files->readers = (bcf_sr_t*) realloc(files->readers, sizeof(bcf_sr_t)*(files->nreaders+1)); bcf_sr_t *reader = &files->readers[files->nreaders++]; memset(reader,0,sizeof(bcf_sr_t)); - reader->file = hts_open(fname, "r"); - if ( !reader->file ) return 0; + reader->file = file_ptr; - reader->type = reader->file->is_bin? FT_BCF : FT_VCF; - if (reader->file->is_compressed) reader->type |= FT_GZ; + files->errnum = 0; if ( files->require_index ) { - if ( reader->type==FT_VCF_GZ ) + if ( reader->file->format.format==vcf ) { + if ( reader->file->format.compression!=bgzf ) + { + files->errnum = not_bgzf; + return 0; + } + reader->tbx_idx = tbx_index_load(fname); if ( !reader->tbx_idx ) { - fprintf(stderr,"[add_reader] Could not load the index of %s\n", fname); + files->errnum = idx_load_failed; return 0; } reader->header = bcf_hdr_read(reader->file); } - else if ( reader->type==FT_BCF_GZ ) + else if ( reader->file->format.format==bcf ) { + if ( reader->file->format.compression!=bgzf ) + { + files->errnum = not_bgzf; + return 0; + } + reader->header = bcf_hdr_read(reader->file); reader->bcf_idx = bcf_index_load(fname); if ( !reader->bcf_idx ) { - fprintf(stderr,"[add_reader] Could not load the index of %s\n", fname); - return 0; // not indexed..? 
+ files->errnum = idx_load_failed; + return 0; } } else { - fprintf(stderr,"Index required, expected .vcf.gz or .bcf file: %s\n", fname); + files->errnum = file_type_error; return 0; } } else { - if ( reader->type & FT_BCF ) - { - reader->header = bcf_hdr_read(reader->file); - } - else if ( reader->type & FT_VCF ) + if ( reader->file->format.format==bcf || reader->file->format.format==vcf ) { reader->header = bcf_hdr_read(reader->file); } else { - fprintf(stderr,"File type not recognised: %s\n", fname); + files->errnum = file_type_error; return 0; } files->streaming = 1; } if ( files->streaming && files->nreaders>1 ) { + files->errnum = api_usage_error; fprintf(stderr,"[%s:%d %s] Error: %d readers, yet require_index not set\n", __FILE__,__LINE__,__FUNCTION__,files->nreaders); return 0; } if ( files->streaming && files->regions ) { + files->errnum = api_usage_error; fprintf(stderr,"[%s:%d %s] Error: cannot tabix-jump in streaming mode\n", __FILE__,__LINE__,__FUNCTION__); return 0; } - if ( !reader->header ) return 0; + if ( !reader->header ) + { + files->errnum = header_error; + return 0; + } reader->fname = fname; if ( files->apply_filters ) @@ -423,13 +461,13 @@ static void _reader_fill_buffer(bcf_srs_t *files, bcf_sr_t *reader) } if ( files->streaming ) { - if ( reader->type & FT_VCF ) + if ( reader->file->format.format==vcf ) { if ( (ret=hts_getline(reader->file, KS_SEP_LINE, &files->tmps)) < 0 ) break; // no more lines int ret = vcf_parse1(&files->tmps, reader->header, reader->buffer[reader->nbuffer+1]); if ( ret<0 ) break; } - else if ( reader->type & FT_BCF ) + else if ( reader->file->format.format==bcf ) { if ( (ret=bcf_read1(reader->file, reader->header, reader->buffer[reader->nbuffer+1])) < 0 ) break; // no more lines } @@ -959,8 +997,8 @@ bcf_sr_regions_t *bcf_sr_regions_init(const char *regions, int is_file, int ichr int len = strlen(regions); int is_bed = strcasecmp(".bed",regions+len-4) ? 
0 : 1; if ( !is_bed && !strcasecmp(".bed.gz",regions+len-7) ) is_bed = 1; - int ft_type = hts_file_type(regions); - if ( ft_type & FT_VCF ) ito = 1; + + if ( reg->file->format.format==vcf ) ito = 1; // read the whole file, tabix index is not present while ( hts_getline(reg->file, KS_SEP_LINE, &reg->line) > 0 ) @@ -1034,7 +1072,11 @@ int bcf_sr_regions_seek(bcf_sr_regions_t *reg, const char *seq) if ( khash_str2int_get(reg->seq_hash, seq, &reg->iseq) < 0 ) return -1; // sequence seq not in regions // using in-memory regions - if ( reg->regs ) return 0; + if ( reg->regs ) + { + reg->regs[reg->iseq].creg = -1; + return 0; + } // reading regions from tabix if ( reg->itr ) tbx_itr_destroy(reg->itr); diff --git a/htslib/tabix.1 b/htslib/tabix.1 index 8fd1fe5b..55c0ebbe 100644 --- a/htslib/tabix.1 +++ b/htslib/tabix.1 @@ -1,9 +1,9 @@ -.TH tabix 1 "23 September 2014" "htslib-1.1" "Bioinformatics tools" +.TH tabix 1 "3 February 2015" "htslib-1.2.1" "Bioinformatics tools" .SH NAME .PP -bgzip - Block compression/decompression utility +bgzip \- Block compression/decompression utility .PP -tabix - Generic indexer for TAB-delimited genome position files +tabix \- Generic indexer for TAB-delimited genome position files .\" .\" Copyright (C) 2009-2011 Broad Institute. .\" @@ -30,26 +30,26 @@ tabix - Generic indexer for TAB-delimited genome position files .SH SYNOPSIS .PP .B bgzip -.RB [ \-cdhB ] -.RB [ \-b +.RB [ -cdhB ] +.RB [ -b .IR virtualOffset ] -.RB [ \-s +.RB [ -s .IR size ] .RI [ file ] .PP .B tabix -.RB [ \-0lf ] -.RB [ \-p -.R gff|bed|sam|vcf] -.RB [ \-s +.RB [ -0lf ] +.RB [ -p +gff|bed|sam|vcf] +.RB [ -s .IR seqCol ] -.RB [ \-b +.RB [ -b .IR begCol ] -.RB [ \-e +.RB [ -e .IR endCol ] -.RB [ \-S +.RB [ -S .IR lineSkip ] -.RB [ \-c +.RB [ -c .IR metaChar ] .I in.tab.bgz .RI [ "region1 " [ "region2 " [ ... 
"]]]" @@ -58,9 +58,11 @@ tabix - Generic indexer for TAB-delimited genome position files .PP Tabix indexes a TAB-delimited genome position file .I in.tab.bgz -and creates an index file +and creates an index file ( .I in.tab.bgz.tbi -when +or +.I in.tab.bgz.csi +) when .I region is absent from the command-line. The input data file must be position sorted and compressed by @@ -74,52 +76,75 @@ specified in the format "chr:beginPos-endPos". Fast data retrieval also works over network if URI is given as a file name and in this case the index file will be downloaded if it is not present locally. -.SH OPTIONS OF TABIX +.SH INDEXING OPTIONS .TP 10 -.BI "-p " STR -Input format for indexing. Valid values are: gff, bed, sam, vcf and -psltab. This option should not be applied together with any of -.BR \-s ", " \-b ", " \-e ", " \-c " and " \-0 ; -it is not used for data retrieval because this setting is stored in -the index file. [gff] -.TP -.BI "-s " INT -Column of sequence name. Option -.BR \-s ", " \-b ", " \-e ", " \-S ", " \-c " and " \-0 -are all stored in the index file and thus not used in data retrieval. [1] +.B -0, --zero-based +Specify that the position in the data file is 0-based (e.g. UCSC files) +rather than 1-based. .TP -.BI "-b " INT +.BI "-b, --begin " INT Column of start chromosomal position. [4] .TP -.BI "-e " INT +.BI "-c, --comment " CHAR +Skip lines started with character CHAR. [#] +.TP +.BI "-C, --csi" +Produce CSI format index instead of classical tabix or BAI style indices. +.TP +.BI "-e, --end " INT Column of end chromosomal position. The end column can be the same as the start column. [5] .TP -.BI "-S " INT -Skip first INT lines in the data file. [0] +.B "-f, --force " +Force to overwrite the index file if it is present. .TP -.BI "-c " CHAR -Skip lines started with character CHAR. [#] +.BI "-m, --min-shift " INT +Set minimal interval size for CSI indices to 2^INT [14] .TP -.B -0 -Specify that the position in the data file is 0-based (e.g. UCSC files) -rather than 1-based. 
+.BI "-p, --preset " STR +Input format for indexing. Valid values are: gff, bed, sam, vcf. +This option should not be applied together with any of +.BR -s ", " -b ", " -e ", " -c " and " -0 ; +it is not used for data retrieval because this setting is stored in +the index file. [gff] .TP -.B -h -Print the header/meta lines. +.BI "-s, --sequence " INT +Column of sequence name. Option +.BR -s ", " -b ", " -e ", " -S ", " -c " and " -0 +are all stored in the index file and thus not used in data retrieval. [1] .TP -.B -B -The second argument is a BED file. When this option is in use, the input -file may not be sorted or indexed. The entire input will be read sequentially. Nonetheless, -with this option, the format of the input must be specificed correctly on the command line. +.BI "-S, --skip-lines " INT +Skip first INT lines in the data file. [0] + +.SH QUERYING AND OTHER OPTIONS .TP -.B -f -Force to overwrite the index file if it is present. +.B "-h, --print-header " +Print also the header/meta lines. .TP -.B -l +.B "-H, --only-header " +Print only the header/meta lines. +.TP +.B "-i, --file-info " +Print file format info. +.TP +.B "-l, --list-chroms " List the sequence names stored in the index file. -.RE - +.TP +.B "-r, --reheader " FILE +Replace the header with the content of FILE +.TP +.B "-R, --regions " FILE +Restrict to regions listed in the FILE. The FILE can be BED file (requires .bed, .bed.gz, .bed.bgz +file name extension) or a TAB-delimited file with CHROM, POS, and, optionally, +POS_TO columns, where positions are 1-based and inclusive. When this option is in use, the input +file may not be sorted. +regions. +.TP +.B "-T, --targets" FILE +Similar to +.B -R +but the entire input will be read sequentially and regions not listed in FILE will be skipped. 
+.PP .SH EXAMPLE (grep ^"#" in.gff; grep -v ^"#" in.gff | sort -k1,1 -k4,4n) | bgzip > sorted.gff.gz; diff --git a/htslib/tabix.c b/htslib/tabix.c index b0af21d9..2f6cfea2 100644 --- a/htslib/tabix.c +++ b/htslib/tabix.c @@ -37,10 +37,12 @@ DEALINGS IN THE SOFTWARE. */ #include "htslib/kseq.h" #include "htslib/bgzf.h" #include "htslib/hts.h" +#include "htslib/regidx.h" typedef struct { - int min_shift; + char *regions_fname, *targets_fname; + int print_header, header_only; } args_t; @@ -53,14 +55,14 @@ static void error(const char *format, ...) exit(EXIT_FAILURE); } - -#define IS_GFF (1<<0) -#define IS_BED (1<<1) -#define IS_SAM (1<<2) -#define IS_VCF (1<<3) -#define IS_BCF (1<<4) -#define IS_BAM (1<<5) -#define IS_TXT (IS_GFF|IS_BED|IS_SAM|IS_VCF) +#define IS_GFF (1<<0) +#define IS_BED (1<<1) +#define IS_SAM (1<<2) +#define IS_VCF (1<<3) +#define IS_BCF (1<<4) +#define IS_BAM (1<<5) +#define IS_CRAM (1<<6) +#define IS_TXT (IS_GFF|IS_BED|IS_SAM|IS_VCF) int file_type(const char *fname) { @@ -72,78 +74,154 @@ int file_type(const char *fname) else if (l>=7 && strcasecmp(fname+l-7, ".vcf.gz") == 0) return IS_VCF; else if (l>=4 && strcasecmp(fname+l-4, ".bcf") == 0) return IS_BCF; else if (l>=4 && strcasecmp(fname+l-4, ".bam") == 0) return IS_BAM; + else if (l>=4 && strcasecmp(fname+l-5, ".cram") == 0) return IS_CRAM; + + htsFile *fp = hts_open(fname,"r"); + enum htsExactFormat format = fp->format.format; + hts_close(fp); + if ( format == bcf ) return IS_BCF; + if ( format == bam ) return IS_BAM; + if ( format == cram ) return IS_CRAM; + if ( format == vcf ) return IS_VCF; + return 0; } -#define PRINT_HEADER 1 -#define HEADER_ONLY 2 -static int query_regions(char **argv, int argc, int mode) +static char **parse_regions(char *regions_fname, char **argv, int argc, int *nregs) { - char *fname = argv[0]; - int i, ftype = file_type(fname); + kstring_t str = {0,0,0}; + int iseq = 0, ireg = 0; + char **regs = NULL; + *nregs = argc; - if ( ftype & IS_TXT || !ftype ) + if ( 
regions_fname ) { - htsFile *fp = hts_open(fname,"r"); - if ( !fp ) error("Could not read %s\n", fname); - tbx_t *tbx = tbx_index_load(fname); - if ( !tbx ) error("Could not load .tbi index of %s\n", fname); - kstring_t str = {0,0,0}; - if ( mode ) + // improve me: this is a too heavy machinery for parsing regions... + + regidx_t *idx = regidx_init(regions_fname, NULL, NULL, 0, NULL); + if ( !idx ) error("Could not read %s\n", regions_fname); + + (*nregs) += regidx_nregs(idx); + regs = (char**) malloc(sizeof(char*)*(*nregs)); + + int nseq; + char **seqs = regidx_seq_names(idx, &nseq); + for (iseq=0; iseq= 0 ) + regitr_t itr; + regidx_overlap(idx, seqs[iseq], 0, UINT32_MAX, &itr); + while ( itr.i < itr.n ) { - if ( !str.l || str.s[0]!=tbx->conf.meta_char ) break; - puts(str.s); + str.l = 0; + ksprintf(&str, "%s:%d-%d", seqs[iseq], REGITR_START(itr)+1, REGITR_END(itr)+1); + regs[ireg++] = strdup(str.s); + itr.i++; } } - if ( mode!=HEADER_ONLY ) + regidx_destroy(idx); + } + free(str.s); + + if ( !ireg ) + { + if ( argc ) + regs = (char**) malloc(sizeof(char*)*argc); + else { - for (i=1; i= 0) puts(str.s); - tbx_itr_destroy(itr); - } + regs = (char**) malloc(sizeof(char*)); + regs[0] = strdup("."); + *nregs = 1; } - free(str.s); - if ( hts_close(fp) ) error("hts_close returned non-zero status: %s\n", fname); - tbx_destroy(tbx); } - else if ( ftype==IS_BCF ) // output uncompressed VCF + + for (iseq=0; iseqformat; + + regidx_t *reg_idx = NULL; + if ( args->targets_fname ) + { + reg_idx = regidx_init(args->targets_fname, NULL, NULL, 0, NULL); + if ( !reg_idx ) error("Could not read %s\n", args->targets_fname); + } + + if ( format == bcf ) { - htsFile *fp = hts_open(fname,"r"); - if ( !fp ) error("Could not read %s\n", fname); htsFile *out = hts_open("-","w"); if ( !out ) error("Could not open stdout\n", fname); hts_idx_t *idx = bcf_index_load(fname); if ( !idx ) error("Could not load .csi index of %s\n", fname); bcf_hdr_t *hdr = bcf_hdr_read(fp); if ( !hdr ) error("Could 
not read the header: %s\n", fname); - if ( mode ) - { + if ( args->print_header ) bcf_hdr_write(out,hdr); - } - if ( mode!=HEADER_ONLY ) + if ( !args->header_only ) { bcf1_t *rec = bcf_init(); - for (i=1; i=0 ) bcf_write(out,hdr,rec); + hts_itr_t *itr = bcf_itr_querys(idx,hdr,regs[i]); + while ( bcf_itr_next(fp, itr, rec) >=0 ) + { + if ( reg_idx && !regidx_overlap(reg_idx, bcf_seqname(hdr,rec),rec->pos,rec->pos+rec->rlen-1, NULL) ) continue; + bcf_write(out,hdr,rec); + } tbx_itr_destroy(itr); } bcf_destroy(rec); } - if ( hts_close(fp) ) error("hts_close returned non-zero status: %s\n", fname); if ( hts_close(out) ) error("hts_close returned non-zero status for stdout\n"); bcf_hdr_destroy(hdr); hts_idx_destroy(idx); } - else if ( ftype==IS_BAM ) // todo: BAM + else if ( format==vcf || format==sam || format==unknown_format ) + { + tbx_t *tbx = tbx_index_load(fname); + if ( !tbx ) error("Could not load .tbi/.csi index of %s\n", fname); + kstring_t str = {0,0,0}; + if ( args->print_header ) + { + while ( hts_getline(fp, KS_SEP_LINE, &str) >= 0 ) + { + if ( !str.l || str.s[0]!=tbx->conf.meta_char ) break; + puts(str.s); + } + } + if ( !args->header_only ) + { + int nseq; + const char **seq = NULL; + if ( reg_idx ) seq = tbx_seqnames(tbx, &nseq); + for (i=0; i= 0) + { + if ( reg_idx && !regidx_overlap(reg_idx,seq[itr->curr_tid],itr->curr_beg,itr->curr_end, NULL) ) continue; + puts(str.s); + } + tbx_itr_destroy(itr); + } + free(seq); + } + free(str.s); + tbx_destroy(tbx); + } + else if ( format==bam ) error("Please use \"samtools view\" for querying BAM files.\n"); + + if ( reg_idx ) regidx_destroy(reg_idx); + if ( hts_close(fp) ) error("hts_close returned non-zero status: %s\n", fname); + + for (i=0; i= 0) + while ((c = getopt_long(argc, argv, "hH?0b:c:e:fm:p:s:S:lr:CR:T:", loptions,NULL)) >= 0) { switch (c) { + case 'R': args.regions_fname = optarg; break; + case 'T': args.targets_fname = optarg; break; + case 'C': do_csi = 1; break; case 'r': reheader = optarg; break; 
- case 'h': mode = PRINT_HEADER; break; - case 'H': mode = HEADER_ONLY; break; + case 'h': args.print_header = 1; break; + case 'H': args.header_only = 1; break; case 'l': list_chroms = 1; break; case '0': conf.preset |= TBX_UCSC; break; case 'b': conf.bc = atoi(optarg); break; @@ -320,6 +412,8 @@ int main(int argc, char *argv[]) else if (strcmp(optarg, "bed") == 0) conf_ptr = &tbx_conf_bed; else if (strcmp(optarg, "sam") == 0) conf_ptr = &tbx_conf_sam; else if (strcmp(optarg, "vcf") == 0) conf_ptr = &tbx_conf_vcf; + else if (strcmp(optarg, "bcf") == 0) ; // bcf is autodetected, preset is not needed + else if (strcmp(optarg, "bam") == 0) ; // same as bcf else error("The preset string not recognised: '%s'\n", optarg); break; case 's': conf.sc = atoi(optarg); break; @@ -333,8 +427,14 @@ int main(int argc, char *argv[]) if ( list_chroms ) return query_chroms(argv[optind]); - if ( argc > optind+1 || mode==HEADER_ONLY ) - return query_regions(&argv[optind], argc-optind, mode); + if ( argc > optind+1 || args.header_only || args.regions_fname || args.targets_fname ) + { + int nregs = 0; + char **regs = NULL; + if ( !args.header_only ) + regs = parse_regions(args.regions_fname, argv+optind+1, argc-optind-1, &nregs); + return query_regions(&args, argv[optind], regs, nregs); + } char *fname = argv[optind]; int ftype = file_type(fname); @@ -343,23 +443,38 @@ int main(int argc, char *argv[]) if ( ftype==IS_GFF ) conf_ptr = &tbx_conf_gff; else if ( ftype==IS_BED ) conf_ptr = &tbx_conf_bed; else if ( ftype==IS_SAM ) conf_ptr = &tbx_conf_sam; - else if ( ftype==IS_VCF ) conf_ptr = &tbx_conf_vcf; + else if ( ftype==IS_VCF ) + { + conf_ptr = &tbx_conf_vcf; + if ( !min_shift && do_csi ) min_shift = 14; + } else if ( ftype==IS_BCF ) { - if ( min_shift <= 0 ) min_shift = 14; + if ( !min_shift ) min_shift = 14; } else if ( ftype==IS_BAM ) { - if ( min_shift <= 0 ) min_shift = 14; + if ( !min_shift ) min_shift = 14; } } + if ( do_csi ) + { + if ( !min_shift ) min_shift = 14; + min_shift 
*= do_csi; // positive for CSIv2, negative for CSIv1 + } + if ( min_shift!=0 && !do_csi ) do_csi = 1; + if ( reheader ) return reheader_file(fname, reheader, ftype, conf_ptr); if ( conf_ptr ) conf = *conf_ptr; - char *suffix = min_shift <= 0 ? ".tbi" : (ftype==IS_BAM ? ".bai" : ".csi"); + char *suffix = ".tbi"; + if ( do_csi ) suffix = ".csi"; + else if ( ftype==IS_BAM ) suffix = ".bai"; + else if ( ftype==IS_CRAM ) suffix = ".crai"; + char *idx_fname = calloc(strlen(fname) + 5, 1); strcat(strcpy(idx_fname, fname), suffix); @@ -375,7 +490,12 @@ int main(int argc, char *argv[]) } free(idx_fname); - if ( min_shift > 0 ) // CSI index + if ( ftype==IS_CRAM ) + { + if ( bam_index_build(fname, min_shift)!=0 ) error("bam_index_build failed: %s\n", fname); + return 0; + } + else if ( do_csi ) { if ( ftype==IS_BCF ) { @@ -390,7 +510,7 @@ int main(int argc, char *argv[]) if ( tbx_index_build(fname, min_shift, &conf)!=0 ) error("tbx_index_build failed: %s\n", fname); return 0; } - else + else // TBI index { if ( tbx_index_build(fname, min_shift, &conf) ) error("tbx_index_build failed: %s\n", fname); return 0; diff --git a/htslib/tbx.c b/htslib/tbx.c index 4a5bdd2a..a82f195b 100644 --- a/htslib/tbx.c +++ b/htslib/tbx.c @@ -280,6 +280,11 @@ tbx_t *tbx_index_load(const char *fn) return NULL; } meta = hts_idx_get_meta(tbx->idx, &l_meta); + if ( !meta ) + { + free(tbx); + return NULL; + } memcpy(x, meta, 28); memcpy(&tbx->conf, x, 24); p = nm = (char*)meta + 28; diff --git a/htslib/test/aux#aux.sam b/htslib/test/auxf#values.sam similarity index 100% rename from htslib/test/aux#aux.sam rename to htslib/test/auxf#values.sam diff --git a/htslib/test/aux.fa b/htslib/test/auxf.fa similarity index 100% rename from htslib/test/aux.fa rename to htslib/test/auxf.fa diff --git a/htslib/test/aux.fa.fai b/htslib/test/auxf.fa.fai similarity index 100% rename from htslib/test/aux.fa.fai rename to htslib/test/auxf.fa.fai diff --git a/htslib/test/hfile.c b/htslib/test/hfile.c index 
987c8e0a..c4ba91c4 100644 --- a/htslib/test/hfile.c +++ b/htslib/test/hfile.c @@ -141,7 +141,7 @@ int main(void) check_offset(fin, 200, "input/first200"); check_offset(fout, 1000, "output/first200"); - if (hseek(fin, 1000, SEEK_SET) < 0) fail("hseek"); + if (hseek(fin, 800, SEEK_CUR) < 0) fail("hseek/cur"); check_offset(fin, 1000, "input/seek"); for (off = 1000; (n = hread(fin, buffer, sizeof buffer)) > 0; off += n) if (hwrite(fout, buffer, n) != n) fail("hwrite"); @@ -149,7 +149,7 @@ int main(void) check_offset(fin, off, "input/eof"); check_offset(fout, off, "output/eof"); - if (hseek(fin, 200, SEEK_SET) < 0) fail("hseek"); + if (hseek(fin, 200, SEEK_SET) < 0) fail("hseek/set"); if (hseek(fout, 200, SEEK_SET) < 0) fail("hseek(output)"); check_offset(fin, 200, "input/backto200"); check_offset(fout, 200, "output/backto200"); diff --git a/htslib/test/sam.c b/htslib/test/sam.c index 22f06dce..55398404 100644 --- a/htslib/test/sam.c +++ b/htslib/test/sam.c @@ -1,6 +1,6 @@ /* test/sam.c -- SAM/BAM/CRAM API test cases. - Copyright (C) 2014 Genome Research Ltd. + Copyright (C) 2014-2015 Genome Research Ltd. Author: John Marshall @@ -29,6 +29,7 @@ DEALINGS IN THE SOFTWARE. 
*/ #include #include "htslib/sam.h" +#include "htslib/faidx.h" #include "htslib/kstring.h" int status; @@ -71,10 +72,10 @@ static int aux_fields1(void) static const char sam[] = "data:" "@SQ\tSN:one\tLN:1000\n" "@SQ\tSN:two\tLN:500\n" -"r1\t0\tone\t500\t20\t8M\t*\t0\t0\tATGCATGC\tqqqqqqqq\tXA:A:k\tXi:i:37\tXf:f:" xstr(PI) "\tXd:d:" xstr(E) "\tXZ:Z:" HELLO "\tXH:H:" BEEF "\tXB:B:c,-2,0,+2\tZZ:i:1000000\n"; +"r1\t0\tone\t500\t20\t8M\t*\t0\t0\tATGCATGC\tqqqqqqqq\tXA:A:k\tXi:i:37\tXf:f:" xstr(PI) "\tXd:d:" xstr(E) "\tXZ:Z:" HELLO "\tXH:H:" BEEF "\tXB:B:c,-2,0,+2\tZZ:i:1000000\tY1:i:-2147483648\tY2:i:-2147483647\tY3:i:-1\tY4:i:0\tY5:i:1\tY6:i:2147483647\tY7:i:2147483648\tY8:i:4294967295\n"; // Canonical form of the alignment record above, as output by sam_format1() - static const char r1[] = "r1\t0\tone\t500\t20\t8M\t*\t0\t0\tATGCATGC\tqqqqqqqq\tXA:A:k\tXi:i:37\tXf:f:3.14159\tXd:d:2.71828\tXZ:Z:" HELLO "\tXH:H:" BEEF "\tXB:B:c,-2,0,2\tZZ:i:1000000"; + static const char r1[] = "r1\t0\tone\t500\t20\t8M\t*\t0\t0\tATGCATGC\tqqqqqqqq\tXA:A:k\tXi:i:37\tXf:f:3.14159\tXd:d:2.71828\tXZ:Z:" HELLO "\tXH:H:" BEEF "\tXB:B:c,-2,0,2\tZZ:i:1000000\tY1:i:-2147483648\tY2:i:-2147483647\tY3:i:-1\tY4:i:0\tY5:i:1\tY6:i:2147483647\tY7:i:2147483648\tY8:i:4294967295"; samFile *in = sam_open(sam, "r"); bam_hdr_t *header = sam_hdr_read(in); @@ -109,6 +110,33 @@ static int aux_fields1(void) if ((p = check_bam_aux_get(aln, "ZZ", 'I')) && bam_aux2i(p) != 1000000) fail("ZZ field is %d, expected 1000000", bam_aux2i(p)); + if ((p = bam_aux_get(aln, "Y1")) && bam_aux2i(p) != -2147483647-1) + fail("Y1 field is %d, expected -2^31", bam_aux2i(p)); + + if ((p = bam_aux_get(aln, "Y2")) && bam_aux2i(p) != -2147483647) + fail("Y2 field is %d, expected -2^31+1", bam_aux2i(p)); + + if ((p = bam_aux_get(aln, "Y3")) && bam_aux2i(p) != -1) + fail("Y3 field is %d, expected -1", bam_aux2i(p)); + + if ((p = bam_aux_get(aln, "Y4")) && bam_aux2i(p) != 0) + fail("Y4 field is %d, expected 0", bam_aux2i(p)); + + if ((p = 
bam_aux_get(aln, "Y5")) && bam_aux2i(p) != 1) + fail("Y5 field is %d, expected 1", bam_aux2i(p)); + + if ((p = bam_aux_get(aln, "Y6")) && bam_aux2i(p) != 2147483647) + fail("Y6 field is %d, expected 2^31-1", bam_aux2i(p)); + + // TODO Checking these perhaps requires inventing bam_aux2u() or so +#if 0 + if ((p = bam_aux_get(aln, "Y7")) && bam_aux2i(p) != 2147483648) + fail("Y7 field is %d, expected 2^31", bam_aux2i(p)); + + if ((p = bam_aux_get(aln, "Y8")) && bam_aux2i(p) != 4294967295) + fail("Y8 field is %d, expected 2^32-1", bam_aux2i(p)); +#endif + if (sam_format1(header, aln, &ks) < 0) fail("can't format record"); @@ -132,12 +160,28 @@ static void iterators1(void) hts_itr_destroy(sam_itr_queryi(NULL, HTS_IDX_NONE, 0, 0)); } -int main(void) +static void faidx1(const char *filename) +{ + int n; + faidx_t *fai = fai_load(filename); + if (fai == NULL) fail("can't load faidx file"); + + n = faidx_fetch_nseq(fai); + if (n != 7) fail("faidx_fetch_nseq returned %d, expected 7", n); + + n = faidx_nseq(fai); + if (n != 7) fail("faidx_nseq returned %d, expected 7", n); + + fai_destroy(fai); +} + +int main(int argc, char **argv) { status = EXIT_SUCCESS; aux_fields1(); iterators1(); + if (argc >= 2) faidx1(argv[1]); return status; } diff --git a/htslib/test/test-regidx.c b/htslib/test/test-regidx.c new file mode 100644 index 00000000..0aea6b88 --- /dev/null +++ b/htslib/test/test-regidx.c @@ -0,0 +1,116 @@ +/* test/test-regidx.c -- Regions index test harness. + + Copyright (C) 2014 Genome Research Ltd. 
+ + Author: Petr Danecek + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. +*/ + +#include +#include +#include +#include +#include +#include + +void error(const char *format, ...) 
+{ + va_list ap; + va_start(ap, format); + vfprintf(stderr, format, ap); + va_end(ap); + exit(-1); +} + +int custom_parse(const char *line, char **chr_beg, char **chr_end, reg_t *reg, void *payload, void *usr) +{ + // Use the standard parser for CHROM,FROM,TO + int i, ret = regidx_parse_tab(line,chr_beg,chr_end,reg,NULL,NULL); + if ( ret!=0 ) return ret; + + // Skip the fields that were parsed above + char *ss = (char*) line; + while ( *ss && isspace(*ss) ) ss++; + for (i=0; i<3; i++) + { + while ( *ss && !isspace(*ss) ) ss++; + if ( !*ss ) return -2; // wrong number of fields + while ( *ss && isspace(*ss) ) ss++; + } + if ( !*ss ) return -2; + + // Parse the payload + char *se = ss; + while ( *se && !isspace(*se) ) se++; + char **dat = (char**) payload; + *dat = (char*) malloc(se-ss+1); + memcpy(*dat,ss,se-ss+1); + (*dat)[se-ss] = 0; + return 0; +} +void custom_free(void *payload) +{ + char **dat = (char**)payload; + free(*dat); +} + +int main(int argc, char **argv) +{ + // Init index with no file name, we will insert the regions manually + regidx_t *idx = regidx_init(NULL,custom_parse,custom_free,sizeof(char*),NULL); + if ( !idx ) error("init failed\n"); + + // Insert regions + char *line; + line = "1 10000000 10000000 1:10000000-10000000"; if ( regidx_insert(idx,line)!=0 ) error("insert failed: %s\n", line); + line = "1 20000000 20000001 1:20000000-20000001"; if ( regidx_insert(idx,line)!=0 ) error("insert failed: %s\n", line); + line = "1 20000002 20000002 1:20000002-20000002"; if ( regidx_insert(idx,line)!=0 ) error("insert failed: %s\n", line); + line = "1 30000000 30000000 1:30000000-30000000"; if ( regidx_insert(idx,line)!=0 ) error("insert failed: %s\n", line); + + // Finish initialization + regidx_insert(idx,NULL); + + // Test + regitr_t itr; + int from, to; + + from = to = 10000000; + if ( !regidx_overlap(idx,"1",from-1,to-1,&itr) ) error("query failed: 1:%d-%d\n",from,to); + if ( strcmp("1:10000000-10000000",REGITR_PAYLOAD(itr,char*)) ) error("query 
failed: 1:%d-%d vs %s\n", from,to,REGITR_PAYLOAD(itr,char*)); + if ( !regidx_overlap(idx,"1",from-2,to-1,&itr) ) error("query failed: 1:%d-%d\n",from-1,to); + if ( !regidx_overlap(idx,"1",from-2,to+3,&itr) ) error("query failed: 1:%d-%d\n",from-1,to+2); + if ( regidx_overlap(idx,"1",from-2,to-2,&itr) ) error("query failed: 1:%d-%d\n",from-1,to-1); + + from = to = 20000000; + if ( !regidx_overlap(idx,"1",from-1,to-1,&itr) ) error("query failed: 1:%d-%d\n",from,to); + + from = to = 20000002; + if ( !regidx_overlap(idx,"1",from-1,to-1,&itr) ) error("query failed: 1:%d-%d\n",from,to); + + from = to = 30000000; + if ( !regidx_overlap(idx,"1",from-1,to-1,&itr) ) error("query failed: 1:%d-%d\n",from,to); + + // Clean up + regidx_destroy(idx); + + return 0; +} + + diff --git a/htslib/test/test-vcf-api.c b/htslib/test/test-vcf-api.c index 77a8fece..3e7623a8 100644 --- a/htslib/test/test-vcf-api.c +++ b/htslib/test/test-vcf-api.c @@ -26,6 +26,7 @@ DEALINGS IN THE SOFTWARE. */ #include #include #include +#include void write_bcf(char *fname) { @@ -153,7 +154,12 @@ void write_bcf(char *fname) free(str.s); bcf_destroy1(rec); bcf_hdr_destroy(hdr); - hts_close(fp); + int ret; + if ( (ret=hts_close(fp)) ) + { + fprintf(stderr,"hts_close(%s): non-zero status %d\n",fname,ret); + exit(ret); + } } void bcf_to_vcf(char *fname) @@ -161,7 +167,10 @@ void bcf_to_vcf(char *fname) htsFile *fp = hts_open(fname,"rb"); bcf_hdr_t *hdr = bcf_hdr_read(fp); bcf1_t *rec = bcf_init1(); - htsFile *out = hts_open("-","w"); + + char *gz_fname = (char*) malloc(strlen(fname)+4); + snprintf(gz_fname,strlen(fname)+4,"%s.gz",fname); + htsFile *out = hts_open(gz_fname,"wg"); bcf_hdr_t *hdr_out = bcf_hdr_dup(hdr); bcf_hdr_remove(hdr_out,BCF_HL_STR,"unused"); @@ -199,8 +208,41 @@ void bcf_to_vcf(char *fname) bcf_destroy1(rec); bcf_hdr_destroy(hdr); bcf_hdr_destroy(hdr_out); - hts_close(fp); - hts_close(out); + int ret; + if ( (ret=hts_close(fp)) ) + { + fprintf(stderr,"hts_close(%s): non-zero status 
%d\n",fname,ret); + exit(ret); + } + if ( (ret=hts_close(out)) ) + { + fprintf(stderr,"hts_close(%s): non-zero status %d\n",gz_fname,ret); + exit(ret); + } + + + // read gzip, write stdout + htsFile *gz_in = hts_open(gz_fname, "r"); + if ( !gz_in ) + { + fprintf(stderr,"Could not read: %s\n", gz_fname); + exit(1); + } + + kstring_t line = {0,0,0}; + while ( hts_getline(gz_in, KS_SEP_LINE, &line)>0 ) + { + kputc('\n',&line); + fwrite(line.s,1,line.l,stdout); + } + + if ( (ret=hts_close(gz_in)) ) + { + fprintf(stderr,"hts_close(%s): non-zero status %d\n",gz_fname,ret); + exit(ret); + } + free(line.s); + free(gz_fname); } void iterator(const char *fname) @@ -221,7 +263,12 @@ void iterator(const char *fname) hts_idx_destroy(idx); bcf_hdr_destroy(hdr); - hts_close(fp); + int ret; + if ( (ret=hts_close(fp)) ) + { + fprintf(stderr,"hts_close(%s): non-zero status %d\n",fname,ret); + exit(ret); + } } int main(int argc, char **argv) diff --git a/htslib/test/test_view.c b/htslib/test/test_view.c index 7f02708e..1f96ceac 100644 --- a/htslib/test/test_view.c +++ b/htslib/test/test_view.c @@ -32,6 +32,82 @@ DEALINGS IN THE SOFTWARE. */ #include "htslib/sam.h" +typedef struct hts_opt { + enum cram_option opt; + union { + int i; + char *s; + } val; + struct hts_opt *next; +} hts_opt; + +/* + * Parses arg and appends it to the option list. + * Returns 0 on success; + * -1 on failure. 
+ */ +int add_option(hts_opt **opts, char *arg) { + hts_opt *o, *t; + char *cp; + + if (!(cp = strchr(arg, '='))) + cp = "1"; // assume boolean + else + *cp++ = 0; + + if (!(o = malloc(sizeof(*o)))) + return -1; + + if (strcmp(arg, "DECODE_MD") == 0) + o->opt = CRAM_OPT_DECODE_MD, o->val.i = atoi(cp); + else if (strcmp(arg, "VERBOSITY") == 0) + o->opt = CRAM_OPT_VERBOSITY, o->val.i = atoi(cp); + else if (strcmp(arg, "SEQS_PER_SLICE") == 0) + o->opt = CRAM_OPT_SEQS_PER_SLICE, o->val.i = atoi(cp); + else if (strcmp(arg, "SLICES_PER_CONTAINER") == 0) + o->opt = CRAM_OPT_SLICES_PER_CONTAINER, o->val.i = atoi(cp); + else if (strcmp(arg, "EMBED_REF") == 0) + o->opt = CRAM_OPT_EMBED_REF, o->val.i = atoi(cp); + else if (strcmp(arg, "NO_REF") == 0) + o->opt = CRAM_OPT_NO_REF, o->val.i = atoi(cp); + else if (strcmp(arg, "IGNORE_MD5") == 0) + o->opt = CRAM_OPT_IGNORE_MD5, o->val.i = atoi(cp); + else if (strcmp(arg, "USE_BZIP2") == 0) + o->opt = CRAM_OPT_USE_BZIP2, o->val.i = atoi(cp); + else if (strcmp(arg, "USE_RANS") == 0) + o->opt = CRAM_OPT_USE_RANS, o->val.i = atoi(cp); + else if (strcmp(arg, "USE_LZMA") == 0) + o->opt = CRAM_OPT_USE_LZMA, o->val.i = atoi(cp); + else if (strcmp(arg, "REFERENCE") == 0) + o->opt = CRAM_OPT_REFERENCE, o->val.s = cp; + else if (strcmp(arg, "VERSION") == 0) + o->opt = CRAM_OPT_VERSION, o->val.s =cp; + else if (strcmp(arg, "MULTI_SEQ_PER_SLICE") == 0) + o->opt = CRAM_OPT_MULTI_SEQ_PER_SLICE, o->val.i = atoi(cp); + else if (strcmp(arg, "NTHREADS") == 0) + o->opt = CRAM_OPT_NTHREADS, o->val.i = atoi(cp); + else if (strcmp(arg, "REQUIRED_FIELDS") == 0) + o->opt = CRAM_OPT_REQUIRED_FIELDS, o->val.i = strtol(cp, NULL, 0); + else { + fprintf(stderr, "Unknown option '%s'\n", arg); + free(o); + return -1; + } + + o->next = NULL; + + if (*opts) { + t = *opts; + while (t->next) + t = t->next; + t->next = o; + } else { + *opts = o; + } + + return 0; +} + int main(int argc, char *argv[]) { samFile *in; @@ -43,8 +119,9 @@ int main(int argc, char *argv[]) 
htsFile *out; char modew[8]; int r = 0, exit_code = 0; + hts_opt *in_opts = NULL, *out_opts = NULL, *last = NULL; - while ((c = getopt(argc, argv, "IbDCSl:t:")) >= 0) { + while ((c = getopt(argc, argv, "IbDCSl:t:i:o:")) >= 0) { switch (c) { case 'S': flag |= 1; break; case 'b': flag |= 2; break; @@ -53,10 +130,12 @@ int main(int argc, char *argv[]) case 'l': clevel = atoi(optarg); flag |= 2; break; case 't': fn_ref = optarg; break; case 'I': ignore_sam_err = 1; break; + case 'i': if (add_option(&in_opts, optarg)) return 1; break; + case 'o': if (add_option(&out_opts, optarg)) return 1; break; } } if (argc == optind) { - fprintf(stderr, "Usage: samview [-bSCSI] [-l level] || [region]\n"); + fprintf(stderr, "Usage: samview [-bSCSI] [-l level] [-o option=value] || [region]\n"); return 1; } strcpy(moder, "r"); @@ -95,6 +174,15 @@ int main(int argc, char *argv[]) cram_set_option(out->fp.cram, CRAM_OPT_REFERENCE, NULL); } + // Process any options; currently cram only. + for (; in_opts; in_opts = (last=in_opts)->next, free(last)) { + hts_set_opt(in, in_opts->opt, in_opts->val); + if (in_opts->opt == CRAM_OPT_REFERENCE) + hts_set_opt(out, in_opts->opt, in_opts->val); + } + for (; out_opts; out_opts = (last=out_opts)->next, free(last)) + hts_set_opt(out, out_opts->opt, out_opts->val); + sam_hdr_write(out, h); if (optind + 1 < argc && !(flag&1)) { // BAM input and has a region int i; diff --git a/htslib/vcf.c b/htslib/vcf.c index fb44980f..0901ce1e 100644 --- a/htslib/vcf.c +++ b/htslib/vcf.c @@ -36,17 +36,18 @@ DEALINGS IN THE SOFTWARE. 
*/ #include "htslib/vcf.h" #include "htslib/tbx.h" #include "htslib/hfile.h" +#include "htslib/khash_str2int.h" #include "htslib/khash.h" KHASH_MAP_INIT_STR(vdict, bcf_idinfo_t) - typedef khash_t(vdict) vdict_t; +typedef khash_t(vdict) vdict_t; #include "htslib/kseq.h" KSTREAM_DECLARE(gzFile, gzread) - uint32_t bcf_float_missing = 0x7F800001; - uint32_t bcf_float_vector_end = 0x7F800002; - uint8_t bcf_type_shift[] = { 0, 0, 1, 2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; +uint32_t bcf_float_missing = 0x7F800001; +uint32_t bcf_float_vector_end = 0x7F800002; +uint8_t bcf_type_shift[] = { 0, 0, 1, 2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; static bcf_idinfo_t bcf_idinfo_def = { .info = { 15, 15, 15 }, .hrec = { NULL, NULL, NULL}, .id = -1 }; /************************* @@ -57,17 +58,13 @@ int bcf_hdr_sync(bcf_hdr_t *h); int bcf_hdr_add_sample(bcf_hdr_t *h, const char *s) { - if ( !s ) - { - bcf_hdr_sync(h); - return 0; - } + if ( !s ) return 0; const char *ss = s; while ( !*ss && isspace(*ss) ) ss++; if ( !*ss ) { - fprintf(stderr,"[W::%s] Empty sample name: trailing spaces/tabs in the header line?\n", __func__); + fprintf(stderr,"[E::%s] Empty sample name: trailing spaces/tabs in the header line?\n", __func__); abort(); } @@ -80,18 +77,23 @@ int bcf_hdr_add_sample(bcf_hdr_t *h, const char *s) kh_val(d, k).id = kh_size(d) - 1; } else { if (hts_verbose >= 2) - fprintf(stderr, "[W::%s] Duplicated sample name '%s'. 
Skipped.\n", __func__, s); + { + fprintf(stderr, "[E::%s] Duplicated sample name '%s'\n", __func__, s); + abort(); + } free(sdup); return -1; } int n = kh_size(d); h->samples = (char**) realloc(h->samples,sizeof(char*)*n); h->samples[n-1] = sdup; + h->dirty = 1; return 0; } -void bcf_hdr_parse_sample_line(bcf_hdr_t *h, const char *str) +int bcf_hdr_parse_sample_line(bcf_hdr_t *h, const char *str) { + int ret = 0; int i = 0; const char *p, *q; // add samples @@ -101,13 +103,14 @@ void bcf_hdr_parse_sample_line(bcf_hdr_t *h, const char *str) char *s = (char*)malloc(q - p + 1); strncpy(s, p, q - p); s[q - p] = 0; - bcf_hdr_add_sample(h,s); + if ( bcf_hdr_add_sample(h,s) < 0 ) ret = -1; free(s); } if (*q == 0 || *q == '\n') break; p = q + 1; } bcf_hdr_add_sample(h,NULL); + return ret; } int bcf_hdr_sync(bcf_hdr_t *h) @@ -142,6 +145,7 @@ int bcf_hdr_sync(bcf_hdr_t *h) h->id[i][kh_val(d,k).id].val = &kh_val(d,k); } } + h->dirty = 0; return 0; } @@ -178,7 +182,7 @@ bcf_hrec_t *bcf_hrec_dup(bcf_hrec_t *hrec) if ( hrec->vals[i] ) out->vals[j] = strdup(hrec->vals[i]); j++; } - if ( i!=j ) out->nkeys--; // IDX was omitted + if ( i!=j ) out->nkeys -= i-j; // IDX was omitted return out; } @@ -350,8 +354,8 @@ int bcf_hdr_register_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec) // Get the contig ID ($str) and length ($j) i = bcf_hrec_find_key(hrec,"length"); - if ( i<0 ) return 0; - if ( sscanf(hrec->vals[i],"%d",&j)!=1 ) return 0; + if ( i<0 ) j = 0; + else if ( sscanf(hrec->vals[i],"%d",&j)!=1 ) return 0; i = bcf_hrec_find_key(hrec,"ID"); if ( i<0 ) return 0; @@ -381,7 +385,7 @@ int bcf_hdr_register_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec) kh_val(d, k) = bcf_idinfo_def; kh_val(d, k).id = idx; - kh_val(d, k).info[0] = i; + kh_val(d, k).info[0] = j; kh_val(d, k).hrec[0] = hrec; return 1; @@ -414,6 +418,7 @@ int bcf_hdr_register_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec) if ( !strcmp(hrec->vals[i], "Integer") ) type = BCF_HT_INT; else if ( !strcmp(hrec->vals[i], "Float") ) type = BCF_HT_REAL; 
else if ( !strcmp(hrec->vals[i], "String") ) type = BCF_HT_STR; + else if ( !strcmp(hrec->vals[i], "Character") ) type = BCF_HT_STR; else if ( !strcmp(hrec->vals[i], "Flag") ) type = BCF_HT_FLAG; else { @@ -449,6 +454,7 @@ int bcf_hdr_register_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec) if ( kh_val(d, k).hrec[info&0xf] ) return 0; kh_val(d, k).info[info&0xf] = info; kh_val(d, k).hrec[info&0xf] = hrec; + if ( idx==-1 ) hrec_add_idx(hrec, kh_val(d, k).id); return 1; } kh_val(d, k) = bcf_idinfo_def; @@ -494,6 +500,7 @@ int bcf_hdr_add_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec) int n = ++hdr->nhrec; hdr->hrec = (bcf_hrec_t**) realloc(hdr->hrec, n*sizeof(bcf_hrec_t*)); hdr->hrec[n-1] = hrec; + hdr->dirty = 1; return hrec->type==BCF_HL_GEN ? 0 : 1; } @@ -579,9 +586,10 @@ int bcf_hdr_parse(bcf_hdr_t *hdr, char *htxt) needs_sync += bcf_hdr_add_hrec(hdr, hrec); p += len; } - bcf_hdr_parse_sample_line(hdr,p); // calls hdr_sync + int ret = bcf_hdr_parse_sample_line(hdr,p); + bcf_hdr_sync(hdr); bcf_hdr_check_sanity(hdr); - return 0; + return ret; } int bcf_hdr_append(bcf_hdr_t *hdr, const char *line) @@ -589,8 +597,7 @@ int bcf_hdr_append(bcf_hdr_t *hdr, const char *line) int len; bcf_hrec_t *hrec = bcf_hdr_parse_line(hdr, (char*) line, &len); if ( !hrec ) return -1; - if ( bcf_hdr_add_hrec(hdr, hrec) ) - bcf_hdr_sync(hdr); + bcf_hdr_add_hrec(hdr, hrec); return 0; } @@ -637,8 +644,7 @@ void bcf_hdr_remove(bcf_hdr_t *hdr, int type, const char *key) if ( i < hdr->nhrec ) memmove(&hdr->hrec[i],&hdr->hrec[i+1],(hdr->nhrec-i)*sizeof(bcf_hrec_t*)); bcf_hrec_destroy(hrec); - - bcf_hdr_sync(hdr); + hdr->dirty = 1; } } @@ -692,7 +698,7 @@ void bcf_hdr_set_version(bcf_hdr_t *hdr, const char *version) free(hrec->value); hrec->value = strdup(version); } - bcf_hdr_sync(hdr); + hdr->dirty = 1; } bcf_hdr_t *bcf_hdr_init(const char *mode) @@ -735,7 +741,7 @@ void bcf_hdr_destroy(bcf_hdr_t *h) bcf_hdr_t *bcf_hdr_read(htsFile *hfp) { - if (!hfp->is_bin) + if (hfp->format.format == vcf) return 
vcf_hdr_read(hfp); BGZF *fp = hfp->fp.bgzf; @@ -766,9 +772,11 @@ bcf_hdr_t *bcf_hdr_read(htsFile *hfp) return h; } -int bcf_hdr_write(htsFile *hfp, const bcf_hdr_t *h) +int bcf_hdr_write(htsFile *hfp, bcf_hdr_t *h) { - if (!hfp->is_bin) return vcf_hdr_write(hfp, h); + if ( h->dirty ) bcf_hdr_sync(h); + if (hfp->format.format == vcf || hfp->format.format == text_format) + return vcf_hdr_write(hfp, h); int hlen; char *htxt = bcf_hdr_fmt_text(h, 1, &hlen); @@ -916,7 +924,7 @@ int bcf_subset_format(const bcf_hdr_t *hdr, bcf1_t *rec) int bcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v) { - if (!fp->is_bin) return vcf_read(fp,h,v); + if (fp->format.format == vcf) return vcf_read(fp,h,v); int ret = bcf_read1_core(fp->fp.bgzf, v); if ( ret!=0 || !h->keep_samples ) return ret; return bcf_subset_format(h,v); @@ -1116,32 +1124,42 @@ static int bcf1_sync(bcf1_t *line) return 0; } -bcf1_t *bcf_dup(bcf1_t *src) +bcf1_t *bcf_copy(bcf1_t *dst, bcf1_t *src) { bcf1_sync(src); - bcf1_t *out = bcf_init1(); - - out->rid = src->rid; - out->pos = src->pos; - out->rlen = src->rlen; - out->qual = src->qual; - out->n_info = src->n_info; out->n_allele = src->n_allele; - out->n_fmt = src->n_fmt; out->n_sample = src->n_sample; + bcf_clear(dst); + dst->rid = src->rid; + dst->pos = src->pos; + dst->rlen = src->rlen; + dst->qual = src->qual; + dst->n_info = src->n_info; dst->n_allele = src->n_allele; + dst->n_fmt = src->n_fmt; dst->n_sample = src->n_sample; - out->shared.m = out->shared.l = src->shared.l; - out->shared.s = (char*) malloc(out->shared.l); - memcpy(out->shared.s,src->shared.s,out->shared.l); + dst->shared.m = dst->shared.l = src->shared.l; + dst->shared.s = (char*) malloc(dst->shared.l); + memcpy(dst->shared.s,src->shared.s,dst->shared.l); - out->indiv.m = out->indiv.l = src->indiv.l; - out->indiv.s = (char*) malloc(out->indiv.l); - memcpy(out->indiv.s,src->indiv.s,out->indiv.l); + dst->indiv.m = dst->indiv.l = src->indiv.l; + dst->indiv.s = (char*) malloc(dst->indiv.l); + 
memcpy(dst->indiv.s,src->indiv.s,dst->indiv.l); - return out; + return dst; +} +bcf1_t *bcf_dup(bcf1_t *src) +{ + bcf1_t *out = bcf_init1(); + return bcf_copy(out, src); } int bcf_write(htsFile *hfp, const bcf_hdr_t *h, bcf1_t *v) { + if ( h->dirty ) + { + // we could as well call bcf_hdr_sync here, not sure + fprintf(stderr,"FIXME: dirty header not synced\n"); + exit(1); + } if ( bcf_hdr_nsamples(h)!=v->n_sample ) { fprintf(stderr,"[%s:%d %s] Broken VCF record, the number of columns at %s:%d does not match the number of samples (%d vs %d).\n", @@ -1149,7 +1167,8 @@ int bcf_write(htsFile *hfp, const bcf_hdr_t *h, bcf1_t *v) return -1; } - if ( !hfp->is_bin ) return vcf_write(hfp,h,v); + if ( hfp->format.format == vcf || hfp->format.format == text_format ) + return vcf_write(hfp,h,v); if ( v->errcode ) { @@ -1240,8 +1259,6 @@ bcf_hdr_t *vcf_hdr_read(htsFile *fp) hrec->key = strdup("contig"); bcf_hrec_add_key(hrec, "ID", strlen("ID")); bcf_hrec_set_val(hrec, hrec->nkeys-1, (char*) names[i], strlen(names[i]), 0); - bcf_hrec_add_key(hrec, "length", strlen("length")); - bcf_hrec_set_val(hrec, hrec->nkeys-1, "2147483647", strlen("2147483647"), 0); bcf_hdr_add_hrec(h, hrec); need_sync = 1; } @@ -1343,7 +1360,7 @@ int vcf_hdr_write(htsFile *fp, const bcf_hdr_t *h) char *htxt = bcf_hdr_fmt_text(h, 0, &hlen); while (hlen && htxt[hlen-1] == 0) --hlen; // kill trailing zeros int ret; - if ( fp->is_compressed==1 ) + if ( fp->format.compression!=no_compression ) ret = bgzf_write(fp->fp.bgzf, htxt, hlen); else ret = hwrite(fp->fp.hfile, htxt, hlen); @@ -1546,7 +1563,15 @@ int _vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p, char if (fmt[j].max_l < l - 1) fmt[j].max_l = l - 1; if (fmt[j].is_gt && fmt[j].max_g < g) fmt[j].max_g = g; l = 0, m = g = 1; - if ( *r==':' ) j++; + if ( *r==':' ) + { + j++; + if ( j>=v->n_fmt ) + { + fprintf(stderr,"Incorrect number of FORMAT fields at %s:%d\n", h->id[BCF_DT_CTG][v->rid].key,v->pos+1); + exit(1); + } + } else break; 
} else if ( *r== ',' ) m++; @@ -1727,7 +1752,7 @@ int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v) fprintf(stderr, "[W::%s] contig '%s' is not defined in the header. (Quick workaround: index the file with tabix.)\n", __func__, p); kstring_t tmp = {0,0,0}; int l; - ksprintf(&tmp, "##contig=", p); + ksprintf(&tmp, "##contig=", p); bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l); free(tmp.s); if ( bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) ) bcf_hdr_sync((bcf_hdr_t*)h); @@ -1768,7 +1793,7 @@ int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v) if (*(q-1) == ';') *(q-1) = 0; for (r = p; *r; ++r) if (*r == ';') ++n_flt; - a = (int32_t*)alloca(n_flt * 4); + a = (int32_t*)alloca(n_flt * sizeof(int32_t)); // add filters for (t = kstrtok(p, ";", &aux1), i = 0; t; t = kstrtok(0, 0, &aux1)) { *(char*)aux1.p = 0; @@ -1810,6 +1835,7 @@ int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v) for (end = val; *end != ';' && *end != 0; ++end); c = *end; *end = 0; } else end = r; + if ( !*key ) { if (c==0) break; r = end; key = r + 1; continue; } // faulty VCF, ";;" in the INFO k = kh_get(vdict, d, key); if (k == kh_end(d) || kh_val(d, k).info[BCF_HL_INFO] == 15) { @@ -1837,7 +1863,7 @@ int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v) if (*t == ',') ++n_val; if ((y>>4&0xf) == BCF_HT_INT) { int32_t *z; - z = (int32_t*)alloca(n_val<<2); + z = (int32_t*)alloca(n_val * sizeof(int32_t)); for (i = 0, t = val; i < n_val; ++i, ++t) { z[i] = strtol(t, &te, 10); @@ -1852,7 +1878,7 @@ int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v) if (strcmp(key, "END") == 0) v->rlen = z[0] - v->pos; } else if ((y>>4&0xf) == BCF_HT_REAL) { float *z; - z = (float*)alloca(n_val<<2); + z = (float*)alloca(n_val * sizeof(float)); for (i = 0, t = val; i < n_val; ++i, ++t) { z[i] = strtod(t, &te); @@ -2090,7 +2116,7 @@ int vcf_write_line(htsFile *fp, kstring_t *line) { int ret; if ( line->s[line->l-1]!='\n' ) kputc('\n',line); - if ( fp->is_compressed==1 ) + if ( 
fp->format.compression!=no_compression ) ret = bgzf_write(fp->fp.bgzf, line->s, line->l); else ret = hwrite(fp->fp.hfile, line->s, line->l); @@ -2102,7 +2128,7 @@ int vcf_write(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v) int ret; fp->line.l = 0; vcf_format1(h, v, &fp->line); - if ( fp->is_compressed==1 ) + if ( fp->format.compression!=no_compression ) ret = bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l); else ret = hwrite(fp->fp.hfile, fp->line.s, fp->line.l); @@ -2168,7 +2194,7 @@ int bcf_index_build(const char *fn, int min_shift) htsFile *fp; hts_idx_t *idx; if ((fp = hts_open(fn, "rb")) == 0) return -1; - if ( !fp->fp.bgzf->is_compressed ) { hts_close(fp); return -1; } + if ( fp->format.compression!=bgzf ) { hts_close(fp); return -1; } idx = bcf_index(fp, min_shift); hts_close(fp); if ( !idx ) return -1; @@ -2233,6 +2259,11 @@ int bcf_hdr_combine(bcf_hdr_t *dst, const bcf_hdr_t *src) fprintf(stderr,"Warning: trying to combine \"%s\" tag definitions of different lengths\n", src->hrec[i]->vals[0]); ret |= 1; } + if ( (kh_val(d_src,k_src).info[rec->type]>>4 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>4 & 0xf) ) + { + fprintf(stderr,"Warning: trying to combine \"%s\" tag definitions of different types\n", src->hrec[i]->vals[0]); + ret |= 1; + } } } } @@ -2256,7 +2287,9 @@ int bcf_translate(const bcf_hdr_t *dst_hdr, bcf_hdr_t *src_hdr, bcf1_t *line) src_hdr->transl[dict] = (int*) malloc(src_hdr->n[dict]*sizeof(int)); for (i=0; in[dict]; i++) { - if ( i>=dst_hdr->n[dict] || strcmp(src_hdr->id[dict][i].key,dst_hdr->id[dict][i].key) ) + if ( !src_hdr->id[dict][i].key || !dst_hdr->id[dict][i].key ) // gap left after removed BCF header lines + src_hdr->transl[dict][i] = -1; + else if ( i>=dst_hdr->n[dict] || strcmp(src_hdr->id[dict][i].key,dst_hdr->id[dict][i].key) ) { src_hdr->transl[dict][i] = bcf_hdr_id2int(dst_hdr,dict,src_hdr->id[dict][i].key); src_hdr->ntransl++; @@ -2365,6 +2398,7 @@ bcf_hdr_t *bcf_hdr_dup(const bcf_hdr_t *hdr) bcf_hdr_t *bcf_hdr_subset(const 
bcf_hdr_t *h0, int n, char *const* samples, int *imap) { int hlen; + void *names_hash = khash_str2int_init(); char *htxt = bcf_hdr_fmt_text(h0, 1, &hlen); kstring_t str; bcf_hdr_t *h; @@ -2385,10 +2419,20 @@ bcf_hdr_t *bcf_hdr_subset(const bcf_hdr_t *h0, int n, char *const* samples, int } kputsn(htxt, p - htxt, &str); for (i = 0; i < n; ++i) { + if ( khash_str2int_has_key(names_hash,samples[i]) ) + { + fprintf(stderr,"[E::bcf_hdr_subset] Duplicate sample name \"%s\".\n", samples[i]); + free(str.s); + free(htxt); + khash_str2int_destroy(names_hash); + bcf_hdr_destroy(h); + return NULL; + } imap[i] = bcf_hdr_id2int(h0, BCF_DT_SAMPLE, samples[i]); if (imap[i] < 0) continue; kputc('\t', &str); kputs(samples[i], &str); + khash_str2int_inc(names_hash,samples[i]); } } else kputsn(htxt, hlen, &str); while (str.l && (!str.s[str.l-1] || str.s[str.l-1]=='\n') ) str.l--; // kill trailing zeros and newlines @@ -2396,6 +2440,7 @@ bcf_hdr_t *bcf_hdr_subset(const bcf_hdr_t *h0, int n, char *const* samples, int bcf_hdr_parse(h, str.s); free(str.s); free(htxt); + khash_str2int_destroy(names_hash); return h; } @@ -2849,7 +2894,7 @@ int bcf_remove_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id, int pass) if ( flt_id==line->d.flt[i] ) break; if ( i==line->d.n_flt ) return 0; // the filter is not present line->d.shared_dirty |= BCF1_DIRTY_FLT; - if ( i!=line->d.n_flt-1 ) memmove(line->d.flt+i,line->d.flt+i+1,line->d.n_flt-i); + if ( i!=line->d.n_flt-1 ) memmove(line->d.flt+i,line->d.flt+i+1,(line->d.n_flt-i-1)*sizeof(*line->d.flt)); line->d.n_flt--; if ( !line->d.n_flt && pass ) bcf_add_filter(hdr,line,0); return 0; @@ -3138,30 +3183,30 @@ int bcf_get_format_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, v if ( !dst ) return -4; // could not alloc } -#define BRANCH(type_t, is_missing, is_vector_end, set_missing, set_vector_end, out_type_t) { \ - out_type_t *tmp = (out_type_t *) *dst; \ - type_t *p = (type_t*) fmt->p; \ - for (i=0; in; j++) \ + #define BRANCH(type_t, 
is_missing, is_vector_end, set_missing, set_vector_end, out_type_t) { \ + out_type_t *tmp = (out_type_t *) *dst; \ + type_t *p = (type_t*) fmt->p; \ + for (i=0; in; j++) \ + { \ + if ( is_missing ) set_missing; \ + else if ( is_vector_end ) { set_vector_end; break; } \ + else *tmp = p[j]; \ + tmp++; \ + } \ + for (; jn; j++) { set_vector_end; tmp++; } \ + p = (type_t *)((char *)p + fmt->size); \ } \ - for (; jn; j++) { set_vector_end; tmp++; } \ - p = (type_t *)((char *)p + fmt->size); \ - } \ -} -switch (fmt->type) { - case BCF_BT_INT8: BRANCH(int8_t, p[j]==bcf_int8_missing, p[j]==bcf_int8_vector_end, *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, int32_t); break; - case BCF_BT_INT16: BRANCH(int16_t, p[j]==bcf_int16_missing, p[j]==bcf_int16_vector_end, *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, int32_t); break; - case BCF_BT_INT32: BRANCH(int32_t, p[j]==bcf_int32_missing, p[j]==bcf_int32_vector_end, *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, int32_t); break; - case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(p[j]), bcf_float_is_vector_end(p[j]), bcf_float_set_missing(*tmp), bcf_float_set_vector_end(*tmp), float); break; - default: fprintf(stderr,"TODO: %s:%d .. 
fmt->type=%d\n", __FILE__,__LINE__, fmt->type); exit(1); -} -#undef BRANCH -return nsmpl*fmt->n; + } + switch (fmt->type) { + case BCF_BT_INT8: BRANCH(int8_t, p[j]==bcf_int8_missing, p[j]==bcf_int8_vector_end, *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, int32_t); break; + case BCF_BT_INT16: BRANCH(int16_t, p[j]==bcf_int16_missing, p[j]==bcf_int16_vector_end, *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, int32_t); break; + case BCF_BT_INT32: BRANCH(int32_t, p[j]==bcf_int32_missing, p[j]==bcf_int32_vector_end, *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, int32_t); break; + case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(p[j]), bcf_float_is_vector_end(p[j]), bcf_float_set_missing(*tmp), bcf_float_set_vector_end(*tmp), float); break; + default: fprintf(stderr,"TODO: %s:%d .. fmt->type=%d\n", __FILE__,__LINE__, fmt->type); exit(1); + } + #undef BRANCH + return nsmpl*fmt->n; } diff --git a/htslib/vcfutils.c b/htslib/vcfutils.c index 3f648368..91118e42 100644 --- a/htslib/vcfutils.c +++ b/htslib/vcfutils.c @@ -64,7 +64,11 @@ int bcf_calc_ac(const bcf_hdr_t *header, bcf1_t *line, int *ac, int which) default: fprintf(stderr, "[E::%s] todo: %d at %s:%d\n", __func__, ac_type, header->id[BCF_DT_CTG][line->rid].key, line->pos+1); exit(1); break; } #undef BRANCH_INT - assert( an>=nac ); // sanity check for missing values + if ( anid[BCF_DT_CTG][line->rid].key, line->pos+1); + exit(1); + } ac[0] = an - nac; return 1; } @@ -80,7 +84,7 @@ int bcf_calc_ac(const bcf_hdr_t *header, bcf1_t *line, int *ac, int which) for (i=0; i<(int)line->n_fmt; i++) if ( line->d.fmt[i].id==gt_id ) { fmt_gt = &line->d.fmt[i]; break; } if ( !fmt_gt ) return 0; - #define BRANCH_INT(type_t,missing,vector_end) { \ + #define BRANCH_INT(type_t,vector_end) { \ for (i=0; in_sample; i++) \ { \ type_t *p = (type_t*) (fmt_gt->p + i*fmt_gt->size); \ @@ -88,15 +92,20 @@ int bcf_calc_ac(const bcf_hdr_t *header, bcf1_t *line, int *ac, int which) for (ial=0; ialn; ial++) \ { \ if ( 
p[ial]==vector_end ) break; /* smaller ploidy */ \ - if ( !(p[ial]>>1) || p[ial]==missing ) continue; /* missing allele */ \ + if ( bcf_gt_is_missing(p[ial]) ) continue; /* missing allele */ \ + if ( p[ial]>>1 > line->n_allele ) \ + { \ + fprintf(stderr,"[E::%s] Incorrect allele (\"%d\") in %s at %s:%d\n", __func__,(p[ial]>>1)-1, header->samples[i],header->id[BCF_DT_CTG][line->rid].key, line->pos+1); \ + exit(1); \ + } \ ac[(p[ial]>>1)-1]++; \ } \ } \ } switch (fmt_gt->type) { - case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_missing, bcf_int8_vector_end); break; - case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_missing, bcf_int16_vector_end); break; - case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_missing, bcf_int32_vector_end); break; + case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_vector_end); break; + case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_vector_end); break; + case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_vector_end); break; default: fprintf(stderr, "[E::%s] todo: %d at %s:%d\n", __func__, fmt_gt->type, header->id[BCF_DT_CTG][line->rid].key, line->pos+1); exit(1); break; } #undef BRANCH_INT @@ -108,12 +117,12 @@ int bcf_calc_ac(const bcf_hdr_t *header, bcf1_t *line, int *ac, int which) int bcf_gt_type(bcf_fmt_t *fmt_ptr, int isample, int *_ial, int *_jal) { int i, nals = 0, has_ref = 0, has_alt = 0, ial = 0, jal = 0; - #define BRANCH_INT(type_t,missing,vector_end) { \ + #define BRANCH_INT(type_t,vector_end) { \ type_t *p = (type_t*) (fmt_ptr->p + isample*fmt_ptr->size); \ for (i=0; in; i++) \ { \ if ( p[i] == vector_end ) break; /* smaller ploidy */ \ - if ( !p[i] || p[i] == missing ) continue; /* missing allele */ \ + if ( bcf_gt_is_missing(p[i]) ) continue; /* missing allele */ \ int tmp = p[i]>>1; \ if ( tmp>1 ) \ { \ @@ -137,9 +146,9 @@ int bcf_gt_type(bcf_fmt_t *fmt_ptr, int isample, int *_ial, int *_jal) } \ } switch (fmt_ptr->type) { - case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_missing, bcf_int8_vector_end); break; - case BCF_BT_INT16: 
BRANCH_INT(int16_t, bcf_int16_missing, bcf_int16_vector_end); break; - case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_missing, bcf_int32_vector_end); break; + case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_vector_end); break; + case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_vector_end); break; + case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_vector_end); break; default: fprintf(stderr, "[E::%s] todo: fmt_type %d\n", __func__, fmt_ptr->type); exit(1); break; } #undef BRANCH_INT @@ -165,7 +174,7 @@ int bcf_trim_alleles(const bcf_hdr_t *header, bcf1_t *line) int *ac = (int*) calloc(line->n_allele,sizeof(int)); // check if all alleles are populated - #define BRANCH(type_t,missing,vector_end) { \ + #define BRANCH(type_t,vector_end) { \ for (i=0; in_sample; i++) \ { \ type_t *p = (type_t*) (gt->p + i*gt->size); \ @@ -173,16 +182,16 @@ int bcf_trim_alleles(const bcf_hdr_t *header, bcf1_t *line) for (ial=0; ialn; ial++) \ { \ if ( p[ial]==vector_end ) break; /* smaller ploidy */ \ - if ( !(p[ial]>>1) || p[ial]==missing ) continue; /* missing allele */ \ - if ( (p[ial]>>1)-1 >= line->n_allele ) return -1; \ + if ( bcf_gt_is_missing(p[ial]) ) continue; /* missing allele */ \ + if ( (p[ial]>>1)-1 >= line->n_allele ) { free(ac); return -1; } \ ac[(p[ial]>>1)-1]++; \ } \ } \ } switch (gt->type) { - case BCF_BT_INT8: BRANCH(int8_t, bcf_int8_missing, bcf_int8_vector_end); break; - case BCF_BT_INT16: BRANCH(int16_t, bcf_int16_missing, bcf_int16_vector_end); break; - case BCF_BT_INT32: BRANCH(int32_t, bcf_int32_missing, bcf_int32_vector_end); break; + case BCF_BT_INT8: BRANCH(int8_t, bcf_int8_vector_end); break; + case BCF_BT_INT16: BRANCH(int16_t, bcf_int16_vector_end); break; + case BCF_BT_INT32: BRANCH(int32_t, bcf_int32_vector_end); break; default: fprintf(stderr, "[E::%s] todo: %d at %s:%d\n", __func__, gt->type, header->id[BCF_DT_CTG][line->rid].key, line->pos+1); exit(1); break; } #undef BRANCH @@ -416,7 +425,7 @@ void bcf_remove_alleles(const bcf_hdr_t *header, bcf1_t 
*line, int rm_mask) { for (j=0; j=0 ); @@ -563,19 +572,19 @@ void bcf_remove_alleles(const bcf_hdr_t *header, bcf1_t *line, int rm_mask) int nori = nret / line->n_sample; if ( vlen==BCF_VL_A || vlen==BCF_VL_R || (vlen==BCF_VL_G && nori==nR_ori) ) // Number=A, R or haploid Number=G { - int ntop, inc = 0; + int inc = 0, nnew; if ( vlen==BCF_VL_A ) { assert( nori==nA_ori ); // todo: will fail if all values are missing - ntop = nA_ori; ndat = nA_new*line->n_sample; + nnew = nA_new; inc = 1; } else { assert( nori==nR_ori ); // todo: will fail if all values are missing - ntop = nR_ori; ndat = nR_new*line->n_sample; + nnew = nR_new; } #define BRANCH(type_t,is_vector_end) \ @@ -583,14 +592,14 @@ void bcf_remove_alleles(const bcf_hdr_t *header, bcf1_t *line, int rm_mask) for (j=0; jn_sample; j++) \ { \ type_t *ptr_src = ((type_t*)dat) + j*nori; \ - type_t *ptr_dst = ((type_t*)dat) + j*nA_new; \ + type_t *ptr_dst = ((type_t*)dat) + j*nnew; \ int size = sizeof(type_t); \ int k_src, k_dst = 0; \ - for (k_src=0; k_src samtools-1.1.tar.bz2 -tar xjvf samtools-1.1.tar.bz2 -cd samtools-1.1 +curl -L http://downloads.sourceforge.net/project/samtools/samtools/1.2/samtools-1.2.tar.bz2 > samtools-1.2.tar.bz2 +tar xjvf samtools-1.2.tar.bz2 +cd samtools-1.2 make -PATH=$PATH:$HOME/CGAT/external-tools/samtools-1.1 +PATH=$PATH:$HOME/CGAT/external-tools/samtools-1.2 popd diff --git a/pysam/__init__.py b/pysam/__init__.py index e38427a5..febc31a2 100644 --- a/pysam/__init__.py +++ b/pysam/__init__.py @@ -10,6 +10,8 @@ from pysam.cfaidx import * import pysam.cvcf as cvcf from pysam.cvcf import * +import pysam.cbcf as cbcf +from pysam.cbcf import * import pysam.csamtools as csamtools import pysam.Pileup as Pileup @@ -145,6 +147,7 @@ def usage(self): libchtslib.__all__ + \ ctabix.__all__ + \ cvcf.__all__ +\ + cbcf.__all__ +\ cfaidx.__all__ +\ calignmentfile.__all__ +\ csamfile.__all__ +\ @@ -180,4 +183,5 @@ def get_libraries(): 'cfaidx.so', 'csamfile.so', 'cvcf.so', + 'cbcf.so', 'ctabix.so')] 
diff --git a/pysam/calignmentfile.pxd b/pysam/calignmentfile.pxd index 5b4698f4..6e11d9d2 100644 --- a/pysam/calignmentfile.pxd +++ b/pysam/calignmentfile.pxd @@ -69,7 +69,14 @@ cdef class AlignedSegment: # add an alignment tag with value to the AlignedSegment # an existing tag of the same name will be replaced. - cpdef setTag( self, tag, value, value_type = ?, replace = ? ) + cpdef set_tag(self, tag, value, value_type=?, replace=?) + + # add an alignment tag with value to the AlignedSegment + # an existing tag of the same name will be replaced. + cpdef get_tag(self, tag) + + # return true if tag exists + cpdef has_tag(self, tag) cdef class AlignmentFile: @@ -78,19 +85,18 @@ cdef class AlignmentFile: # pointer to htsFile structure cdef htsFile * htsfile - # pointer to compressed file - cdef BGZF * fp - # pointer to index cdef hts_idx_t *index # header structure cdef bam_hdr_t * header - # true if file is a bam file - cdef int isbam + # true if file is bam format + cdef readonly bint is_bam + # true if file is bam format + cdef readonly bint is_cram # true if not a file but a stream - cdef int isstream + cdef readonly bint is_stream # true if file is not on the local filesystem - cdef int isremote + cdef readonly bint is_remote # current read within iteration cdef bam1_t * b # file opening mode @@ -156,7 +162,6 @@ cdef class IteratorRowSelection(IteratorRow): cdef positions cdef bam1_t * getCurrent( self ) cdef int cnext(self) - cdef BGZF * fp cdef class IteratorColumn: @@ -199,5 +204,4 @@ cdef class IndexedReads: cdef htsFile * htsfile cdef index cdef int owns_samfile - cdef BGZF * fp cdef bam_hdr_t * header diff --git a/pysam/calignmentfile.pyx b/pysam/calignmentfile.pyx index d6bec5b5..e4eedbdf 100644 --- a/pysam/calignmentfile.pyx +++ b/pysam/calignmentfile.pyx @@ -235,7 +235,7 @@ VALID_HEADER_ORDER = {"HD" : ("VN", "SO", "GO"), cdef class AlignmentFile: '''*(filename, mode=None, template = None, - referencenames=None, referencelengths = None, + 
reference_names=None, reference_lengths = None, text=NULL, header=None, add_sq_text=False, check_header=True, check_sq=True)* @@ -283,12 +283,15 @@ cdef class AlignmentFile: 3. If *text* is given, new header text is copied from raw text. - 4. The names (*referencenames*) and lengths - (*referencelengths*) are supplied directly as lists. By + 4. The names (*reference_names*) and lengths + (*reference_lengths*) are supplied directly as lists. By default, 'SQ' and 'LN' tags will be added to the header text. This option can be changed by unsetting the flag *add_sq_text*. + For writing a CRAM file, the filename of the reference can be + added through a fasta formatted file (*reference_filename*) + By default, if a file is opened in mode 'r', it is checked for a valid header (*check_header* = True) and a definition of chromosome names (*check_sq* = True). @@ -298,18 +301,21 @@ cdef class AlignmentFile: def __cinit__(self, *args, **kwargs ): self.htsfile = NULL self._filename = None - self.isbam = False - self.isstream = False + self.is_bam = False + self.is_stream = False + self.is_cram = False + self.is_remote = False + self._open(*args, **kwargs) # allocate memory for iterator self.b = calloc(1, sizeof(bam1_t)) - def _isOpen( self ): + def _isOpen(self): '''return true if htsfile has been opened.''' return self.htsfile != NULL - def _hasIndex( self ): + def _hasIndex(self): '''return true if htsfile has an existing (and opened) index.''' return self.index != NULL @@ -317,27 +323,36 @@ cdef class AlignmentFile: filename, mode=None, AlignmentFile template=None, - referencenames=None, - referencelengths=None, + reference_names=None, + reference_lengths=None, + reference_filename=None, text=None, header=None, port=None, add_sq_text=True, check_header=True, - check_sq=True): - '''open a sam/bam file. + check_sq=True, + referencenames=None, + referencelengths=None): + '''open a sam, bam or cram formatted file. 
- If _open is called on an existing bamfile, the current file will be - closed and a new file will be opened. + If _open is called on an existing file, the current file + will be closed and a new file will be opened. ''' + # for backwards compatibility: + if referencenames is not None: + reference_names = referencenames + if referencelengths is not None: + reference_lengths = referencelengths # read mode autodetection if mode is None: try: self._open(filename, 'rb', template=template, - referencenames=referencenames, - referencelengths=referencelengths, + reference_names=reference_names, + reference_lengths=reference_lengths, + reference_filename=reference_filename, text=text, header=header, port=port, @@ -349,8 +364,9 @@ cdef class AlignmentFile: self._open(filename, 'r', template=template, - referencenames=referencenames, - referencelengths=referencelengths, + reference_names=reference_names, + reference_lengths=reference_lengths, + reference_filename=reference_filename, text=text, header=header, port=port, @@ -358,7 +374,9 @@ cdef class AlignmentFile: check_sq=check_sq) return - assert mode in ("r","w","rb","wb", "wh", "wbu", "rU", "wb0"), \ + assert mode in ("r","w","rb","wb", "wh", + "wbu", "rU", "wb0", + "rc", "wc"), \ "invalid file opening mode `%s`" % mode # close a previously opened file @@ -371,12 +389,13 @@ cdef class AlignmentFile: cdef bytes bmode = mode.encode('ascii') self._filename = filename = _encodeFilename(filename) - self.isstream = filename == b"-" - - self.isbam = len(mode) > 1 and mode[1] == 'b' - self.isremote = filename.startswith(b"http:") or \ - filename.startswith(b"ftp:") + # FIXME: Use htsFormat when it is available + self.is_bam = len(mode) > 1 and mode[1] == 'b' + self.is_cram = len(mode) > 1 and mode[1] == 'c' + self.is_stream = filename == b"-" + self.is_remote = filename.startswith(b"http:") or \ + filename.startswith(b"ftp:") cdef char * ctext ctext = NULL @@ -391,27 +410,27 @@ cdef class AlignmentFile: self.header = 
self._buildHeader(header) else: # build header from a target names and lengths - assert referencenames and referencelengths, \ + assert reference_names and reference_lengths, \ ("either supply options `template`, `header` " - "or both `referencenames` and `referencelengths` " + "or both `reference_names` and `reference_lengths` " "for writing") - assert len(referencenames) == len(referencelengths), \ + assert len(reference_names) == len(reference_lengths), \ "unequal names and lengths of reference sequences" # allocate and fill header - referencenames = [_forceBytes(ref) for ref in referencenames] + reference_names = [_forceBytes(ref) for ref in reference_names] self.header = bam_hdr_init() - self.header.n_targets = len(referencenames) + self.header.n_targets = len(reference_names) n = 0 - for x in referencenames: + for x in reference_names: n += len(x) + 1 self.header.target_name = calloc( n, sizeof(char*)) self.header.target_len = calloc( n, sizeof(uint32_t)) for x from 0 <= x < self.header.n_targets: - self.header.target_len[x] = referencelengths[x] - name = referencenames[x] + self.header.target_len[x] = reference_lengths[x] + name = reference_names[x] self.header.target_name[x] = calloc( len(name) + 1, sizeof(char)) strncpy(self.header.target_name[x], name, len(name)) @@ -422,8 +441,8 @@ cdef class AlignmentFile: text = [] for x from 0 <= x < self.header.n_targets: text.append("@SQ\tSN:%s\tLN:%s\n" % \ - (_forceStr(referencenames[x]), - referencelengths[x])) + (_forceStr(reference_names[x]), + reference_lengths[x])) text = ''.join(text) if text is not None: @@ -435,43 +454,46 @@ cdef class AlignmentFile: strlen(ctext), sizeof(char)) memcpy(self.header.text, ctext, strlen(ctext)) - # open file. 
Header gets written to file at the same time for bam files - # and sam files (in the latter case, the mode needs to be wh) + # open file (hts_open is synonym with sam_open) self.htsfile = hts_open(filename, bmode) - - # for compatibility - "w" writes sam file without header - if self.isbam or "h" in mode: - # write header to htsfile + + # set filename with reference sequences. If no filename + # is given, the CRAM reference arrays will be built from + # the @SQ header in the header + if self.is_cram and reference_filename: + # note that fn_aux takes ownership, so create + # a copy + fn = _encodeFilename(reference_filename) + self.htsfile.fn_aux = strdup(fn) + + # write header to htsfile + if self.is_bam or self.is_cram or "h" in mode: sam_hdr_write(self.htsfile, self.header) - + elif mode[0] == "r": # open file for reading if (filename != b"-" - and not self.isremote + and not self.is_remote and not os.path.exists(filename)): raise IOError("file `%s` not found" % filename) - # try to detect errors + # open file (hts_open is synonym with sam_open) self.htsfile = hts_open(filename, bmode) if self.htsfile == NULL: raise ValueError( "could not open file (mode='%s') - " "is it SAM/BAM format?" % mode) - # get file pointer - # TODO: this is specific to BAM files - # refactor to make generalizable - self.fp = self.htsfile.fp.bgzf - # bam files require a valid header - if self.isbam: + if self.is_bam or self.is_cram: self.header = sam_hdr_read(self.htsfile) if self.header == NULL: raise ValueError( "file does not have valid header (mode='%s') " "- is it BAM format?" 
% mode ) else: - # in sam files it is optional (htsfile full of unmapped reads) + # in sam files it is optional (htsfile full of + # unmapped reads) if check_header: self.header = sam_hdr_read(self.htsfile) if self.header == NULL: @@ -491,24 +513,42 @@ cdef class AlignmentFile: raise IOError("could not open file `%s`" % filename ) # check for index and open if present - if mode[0] == "r" and self.isbam: + cdef int format_index = -1 + if self.is_bam: + format_index = HTS_FMT_BAI + elif self.is_cram: + format_index = HTS_FMT_CRAI + + if mode[0] == "r" and (self.is_bam or self.is_cram): - if not self.isremote: - if not os.path.exists(filename + b".bai") \ - and not os.path.exists( filename[:-4] + b".bai"): + # open index for remote files + if self.is_remote: + self.index = hts_idx_load(filename, format_index) + if self.index == NULL: + warnings.warn( + "unable to open remote index for '%s'" % filename) + else: + if self.is_bam \ + and not os.path.exists(filename + b".bai") \ + and not os.path.exists(filename[:-4] + b".bai"): + self.index = NULL + elif self.is_cram \ + and not os.path.exists(filename + b".crai") \ + and not os.path.exists(filename[:-4] + b".crai"): self.index = NULL else: - # returns NULL if there is no index or index could not be opened - self.index = hts_idx_load(filename, HTS_FMT_BAI) + # returns NULL if there is no index or index could + # not be opened + self.index = sam_index_load(self.htsfile, + filename) if self.index == NULL: - raise IOError("error while opening index `%s` " % filename ) - else: - self.index = hts_idx_load(filename, HTS_FMT_BAI) - if self.index == NULL: - warnings.warn("unable to open index for `%s` " % filename) + raise IOError( + "error while opening index for '%s'" % + filename) - if not self.isstream: - self.start_offset = bgzf_tell(self.fp) + # save start of data section + if not self.is_stream: + self.start_offset = self.tell() def gettid(self, reference): ''' @@ -614,18 +654,19 @@ cdef class AlignmentFile: return 
self.seek(self.start_offset, 0) def seek(self, uint64_t offset, int where = 0): - ''' - move file pointer to position *offset*, see :meth:`pysam.AlignmentFile.tell`. + '''move file pointer to position *offset*, see + :meth:`pysam.AlignmentFile.tell`. ''' if not self._isOpen(): - raise ValueError( "I/O operation on closed file" ) - if not self.isbam: - raise NotImplementedError("seek only available in bam files") - if self.isstream: + raise ValueError("I/O operation on closed file") + if not self.is_bam: + raise NotImplementedError( + "seek only available in bam files") + if self.is_stream: raise OSError("seek no available in streams") - return bgzf_seek(self.fp, offset, where) + return bgzf_seek(hts_get_bgzfp(self.htsfile), offset, where) def tell(self): ''' @@ -633,10 +674,11 @@ cdef class AlignmentFile: ''' if not self._isOpen(): raise ValueError("I/O operation on closed file") - if not self.isbam: - raise NotImplementedError("seek only available in bam files") + if not (self.is_bam or self.is_cram): + raise NotImplementedError( + "seek only available in bam files") - return bgzf_tell(self.fp) + return bgzf_tell(hts_get_bgzfp(self.htsfile)) def fetch(self, reference=None, @@ -647,15 +689,16 @@ cdef class AlignmentFile: callback=None, until_eof=False, multiple_iterators=False): - '''fetch aligned reads in a :term:`region` using 0-based indexing. The - region is specified by :term:`reference`, *start* and - *end*. Alternatively, a samtools :term:`region` string can be - supplied. + '''fetch aligned, i.e. mapped, reads in a :term:`region` + using 0-based + indexing. The region is specified by :term:`reference`, + *start* and *end*. Alternatively, a samtools :term:`region` + string can be supplied. Without *reference* or *region* all mapped reads will be fetched. The reads will be returned ordered by reference sequence, which will not necessarily be the order within the - file. + file. 
If *until_eof* is given, all reads from the current file position will be returned in order as they are within the @@ -686,14 +729,14 @@ cdef class AlignmentFile: tid) # Turn of re-opening if htsfile is a stream - if self.isstream: + if self.is_stream: multiple_iterators = False - if self.isbam: - if not until_eof and not self._hasIndex() \ - and not self.isremote: - raise ValueError( - "fetch called on bamfile without index") + if self.is_bam or self.is_cram: + if not until_eof and not self.is_remote: + if not self._hasIndex(): + raise ValueError( + "fetch called on bamfile without index") if has_coord: return IteratorRowRegion( @@ -902,7 +945,7 @@ cdef class AlignmentFile: has_coord, rtid, rstart, rend = self._parseRegion( reference, start, end, region ) - if self.isbam: + if self.is_bam or self.is_cram: if not self._hasIndex(): raise ValueError("no index available for pileup") @@ -918,7 +961,7 @@ cdef class AlignmentFile: else: raise NotImplementedError( "pileup of samfiles not implemented yet" ) - def close( self ): + def close(self): ''' closes the :class:`pysam.AlignmentFile`.''' if self.htsfile != NULL: @@ -926,10 +969,16 @@ cdef class AlignmentFile: hts_idx_destroy(self.index); self.htsfile = NULL - def __dealloc__( self ): + def __dealloc__(self): # remember: dealloc cannot call other methods # note: no doc string # note: __del__ is not called. + + # FIXME[kbj]: isn't self.close a method? I've been duplicating + # close within __dealloc__ (see BCFFile.__dealloc__). Not a pretty + # solution and perhaps unnecessary given that calling self.close has + # been working for years. + self.close() bam_destroy1(self.b) if self.header != NULL: @@ -944,11 +993,17 @@ cdef class AlignmentFile: if not self._isOpen(): return 0 - x = sam_write1(self.htsfile, - self.header, - read._delegate) + cdef int ret = sam_write1(self.htsfile, + self.header, + read._delegate) + + # kbj: Still need to raise an exception with except -1. 
Otherwise + # when ret == -1 we get a "SystemError: error return without + # exception set". + if ret < 0: + raise ValueError('sam write failed') - return x + return ret def __enter__(self): return self @@ -963,7 +1018,7 @@ cdef class AlignmentFile: ## properties ############################################################### property filename: - '''number of :term:`filename` associated with this object.''' + ''':term:`filename` associated with this object.''' def __get__(self): return self._filename @@ -989,10 +1044,11 @@ cdef class AlignmentFile: """ def __get__(self): - if not self._isOpen(): raise ValueError( "I/O operation on closed file" ) + if not self._isOpen(): + raise ValueError("I/O operation on closed file") t = [] for x from 0 <= x < self.header.n_targets: - t.append( self.header.target_len[x] ) + t.append(self.header.target_len[x]) return tuple(t) property mapped: @@ -1013,11 +1069,13 @@ cdef class AlignmentFile: an error.''' if not self._isOpen(): raise ValueError("I/O operation on closed file") - if not self.isbam: - raise AttributeError("AlignmentFile.mapped only available in bam files") + if not self.is_bam and not self.is_cram: + raise AttributeError( + "AlignmentFile.mapped only available in bam files") if self.index == NULL: - raise ValueError("mapping information not recorded in index " - "or index not available") + raise ValueError( + "mapping information not recorded in index " + "or index not available") property unmapped: @@ -1261,7 +1319,7 @@ cdef class AlignmentFile: if not self._isOpen(): raise ValueError( "I/O operation on closed file" ) - if not self.isbam and self.header.n_targets == 0: + if not self.is_bam and self.header.n_targets == 0: raise NotImplementedError( "can not iterate over samfile without header") return self @@ -1388,10 +1446,10 @@ cdef class IteratorRowRegion(IteratorRow): cdef int cnext(self): '''cversion of iterator. 
Used by IteratorColumn''' - self.retval = hts_itr_next(self.htsfile.fp.bgzf, + self.retval = hts_itr_next(hts_get_bgzfp(self.htsfile), self.iter, self.b, - NULL) + self.htsfile) def __next__(self): """python version of next(). @@ -1410,6 +1468,7 @@ cdef class IteratorRowRegion(IteratorRow): def __dealloc__(self): hts_itr_destroy(self.iter) + cdef class IteratorRowHead(IteratorRow): """*(AlignmentFile samfile, n, int multiple_iterators=False)* @@ -1592,8 +1651,6 @@ cdef class IteratorRowSelection(IteratorRow): self.positions = positions self.current_pos = 0 - self.fp = self.htsfile.fp.bgzf - def __iter__(self): return self @@ -1606,7 +1663,7 @@ cdef class IteratorRowSelection(IteratorRow): # end iteration if out of positions if self.current_pos >= len(self.positions): return -1 - bgzf_seek(self.fp, + bgzf_seek(hts_get_bgzfp(self.htsfile), self.positions[self.current_pos], 0) self.current_pos += 1 @@ -2092,7 +2149,7 @@ def fromQualityString(quality_string): return array.array('B', [ord(x)-33 for x in quality_string]) -cdef inline uint8_t _getTypeCode(value, value_type = None): +cdef inline uint8_t _get_value_code(value, value_type=None): '''guess type code for a *value*. If *value_type* is None, the type code will be inferred based on the Python type of *value*''' @@ -2113,92 +2170,134 @@ cdef inline uint8_t _getTypeCode(value, value_type = None): else: if value_type not in 'Zidf': return 0 - value_type = _forceBytes( value_type ) + value_type = _forceBytes(value_type) _char_type = value_type type_code = (_char_type)[0] return type_code -cdef inline convert_python_tag(pytag, value, fmts, args): - - if not type(pytag) is bytes: - pytag = pytag.encode('ascii') - t = type(value) - if t is tuple or t is list: - # binary tags - treat separately - pytype = 'B' - # get data type - first value determines type. If there is a - # mix of types, the result is undefined. 
- if type(value[0]) is float: - datafmt, datatype = "f", "f" - else: - mi, ma = min(value), max(value) - # signed ints - if mi < 0: - if mi >= -128 and ma < 128: - datafmt, datatype = "b", 'c' - elif mi >= -32768 and ma < 32768: - datafmt, datatype = "h", 's' - elif mi < -2147483648 or ma >= 2147483648: - raise ValueError( - "at least one signed integer out of range of " - "BAM/SAM specification") - else: datafmt, datatype = "i", 'i' +cdef inline _get_value_type(value, maximum_value=None): + '''returns the value type of a value. - # unsigned ints - else: - if ma < 256: - datafmt, datatype = "B", 'C' - elif ma < 65536: - datafmt, datatype = "H", 'S' - elif ma >= 4294967296: - raise ValueError( - "at least one integer out of range of BAM/SAM specification") - else: - datafmt, datatype = "I", 'I' + If max is specified, the approprite type is + returned for a range where value is the minimum. + ''' + + if maximum_value is None: + maximum_value = value - datafmt = "2sccI%i%s" % (len(value), datafmt) - args.extend([pytag[:2], - pytype.encode('ascii'), - datatype.encode('ascii'), - len(value)] + list(value)) - fmts.append( datafmt ) - return + t = type(value) if t is float: - fmt, pytype = "2scf", 'f' + valuetype = b'f' elif t is int: - # negative values - if value < 0: - if value >= -127: fmt, pytype = "2scb", 'c' - elif value >= -32767: fmt, pytype = "2sch", 's' - elif value < -2147483648: raise ValueError( "integer %i out of range of BAM/SAM specification" % value ) - else: fmt, pytype = "2sci", 'i' - # positive values + # signed ints + if value < 0: + if value >= -128 and maximum_value < 128: + valuetype = b'c' + elif value >= -32768 and maximum_value < 32768: + valuetype = b's' + elif value < -2147483648 or maximum_value >= 2147483648: + raise ValueError( + "at least one signed integer out of range of " + "BAM/SAM specification") + else: + valuetype = b'i' + # unsigned ints else: - if value <= 255: fmt, pytype = "2scB", 'C' - elif value <= 65535: fmt, pytype = 
"2scH", 'S' - elif value > 4294967295: raise ValueError( "integer %i out of range of BAM/SAM specification" % value ) - else: fmt, pytype = "2scI", 'I' + if maximum_value < 256: + valuetype = b'C' + elif maximum_value < 65536: + valuetype = b'S' + elif maximum_value >= 4294967296: + raise ValueError( + "at least one integer out of range of BAM/SAM specification") + else: + valuetype = b'I' else: # Note: hex strings (H) are not supported yet if t is not bytes: value = value.encode('ascii') if len(value) == 1: - fmt, pytype = "2scc", 'A' + valuetype = b"A" else: - fmt, pytype = "2sc%is" % (len(value)+1), 'Z' + valuetype = b'Z' + + return valuetype + + +cdef inline _pack_tags(tags): + """pack a list of tags. Each tag is a tuple of (tag, tuple). + + Values are packed into the most space efficient data structure + possible unless the tag contains a third field with the type code. + + Returns a fmt string and the associated list of arguments + to used in a call to struct.pack_into. + """ + fmts, args = ["<"], [] + + for tag in tags: + + if len(tag) == 2: + pytag, value = tag + valuetype = None + elif len(tag) == 3: + pytag, value, valuetype = tag + else: + raise ValueError("malformatted tag: %s" % str(tag)) + + if not type(pytag) is bytes: + pytag = pytag.encode('ascii') + + datatype2format = {'c': 'b', + 's': 'h', + 'i': 'i', + 'C': 'B', + 'S': 'H', + 'I': 'I', + 'f': 'f', + 'A': 'c',} + + t = type(value) + if t is tuple or t is list: + # binary tags are treated separately + if valuetype is None: + # automatically determine value type - first value + # determines type. If there is a mix of types, the + # result is undefined. 
+ valuetype = _get_value_type(min(value), max(value)) + + if valuetype not in datatype2format: + raise ValueError("invalid value type '%s'" % valuetype) + datafmt = "2sccI%i%s" % (len(value), datatype2format[valuetype]) + + args.extend([pytag[:2], + b"B", + valuetype, + len(value)] + list(value)) + fmts.append(datafmt) + + else: + + if valuetype is None: + valuetype = _get_value_type(value) + + if valuetype == b"Z": + fmt = "2sc%is" % (len(value)+1) + else: + fmt = "2sc%s" % datatype2format[valuetype] + + args.extend([pytag[:2], + valuetype, + value]) - args.extend([pytag[:2], - pytype.encode('ascii'), - value]) + fmts.append(fmt) - fmts.append(fmt) + return "".join(fmts), args -########################################################### -########################################################### -########################################################### + cdef class AlignedSegment: '''Class representing an aligned segment. @@ -2597,120 +2696,7 @@ cdef class AlignedSegment: # copy data memcpy(p, result.data.as_voidptr, l) - # TODO: opts object with mapping-like interface - property tags: - """the tags in the AUX field. - - This property permits convenience access to - the tags. Changes it the returned list will - not update the tags automatically. Instead, - the following is required for adding a - new tag:: - - read.tags = read.tags + [("RG",0)] - - This method will happily write the same tag - multiple times. 
- """ - def __get__(self): - cdef char * ctag - cdef bam1_t * src - cdef uint8_t * s - cdef char auxtag[3] - cdef char auxtype - cdef uint8_t byte_size - cdef int32_t nvalues - - src = self._delegate - if src.l_data == 0: - return [] - s = pysam_bam_get_aux(src) - result = [] - auxtag[2] = 0 - while s < (src.data + src.l_data): - # get tag - auxtag[0] = s[0] - auxtag[1] = s[1] - s += 2 - auxtype = s[0] - if auxtype in ('c', 'C'): - value = bam_aux2i(s) - s += 1 - elif auxtype in ('s', 'S'): - value = bam_aux2i(s) - s += 2 - elif auxtype in ('i', 'I'): - value = bam_aux2i(s) - s += 4 - elif auxtype == 'f': - value = bam_aux2f(s) - s += 4 - elif auxtype == 'd': - value = bam_aux2f(s) - s += 8 - elif auxtype == 'A': - value = "%c" % bam_aux2A(s) - s += 1 - elif auxtype in ('Z', 'H'): - value = _charptr_to_str(bam_aux2Z(s)) - # +1 for NULL terminated string - s += len(value) + 1 - elif auxtype == 'B': - s += 1 - byte_size, nvalues, value = convertBinaryTagToList( s ) - # 5 for 1 char and 1 int - s += 5 + ( nvalues * byte_size) - 1 - else: - raise KeyError("unknown type '%s'" % auxtype) - - s += 1 - - result.append((_charptr_to_str(auxtag), value)) - - return result - - def __set__(self, tags): - cdef bam1_t * src - cdef uint8_t * s - cdef char * temp - cdef int new_size = 0 - cdef int old_size - src = self._delegate - fmts, args = ["<"], [] - - if tags is not None and len(tags) > 0: - for pytag, value in tags: - convert_python_tag(pytag, value, fmts, args) - fmt = "".join(fmts) - new_size = struct.calcsize(fmt) - buffer = ctypes.create_string_buffer(new_size) - struct.pack_into(fmt, - buffer, - 0, - *args) - - # delete the old data and allocate new space. 
- # If total_size == 0, the aux field will be - # empty - old_size = pysam_bam_get_l_aux(src) - pysam_bam_update(src, - old_size, - new_size, - pysam_bam_get_aux(src)) - # copy data only if there is any - if new_size > 0: - - # get location of new data - s = pysam_bam_get_aux(src) - - # check if there is direct path from buffer.raw to tmp - p = buffer.raw - # create handle to make sure buffer stays alive long - # enough for memcpy, see issue 129 - temp = p - memcpy(s, temp, new_size) - property bin: """properties bin""" def __get__(self): @@ -3095,8 +3081,6 @@ cdef class AlignedSegment: ##################################################### ## Unsorted as yet - - # TODO: capture in CIGAR object property cigartuples: """the :term:`cigar` alignment. The alignment @@ -3200,17 +3184,25 @@ cdef class AlignedSegment: 5)) + cpdef set_tag(self, + tag, + value, + value_type=None, + replace=True): + """sets a particular field *tag* to *value* in the optional alignment + section. - cpdef setTag(self, tag, value, - value_type = None, - replace = True): - ''' - Set optional field of alignment *tag* to *value*. *value_type* may be specified, - but if not the type will be inferred based on the Python type of *value* + *value_type* describes the type of *value* that is to entered + into the alignment record.. It can be set explicitely to one + of the valid one-letter type codes. If unset, an appropriate + type will be chosen automatically. - An existing value of the same tag will be overwritten unless - *replace* is set to False. - ''' + An existing value of the same *tag* will be overwritten unless + replace is set to False. This is usually not recommened as a + tag may only appear once in the optional alignment section. + + If *value* is None, the tag will be deleted. 
+ """ cdef int value_size cdef uint8_t * value_ptr @@ -3224,14 +3216,24 @@ cdef class AlignedSegment: if len(tag) != 2: raise ValueError('Invalid tag: %s' % tag) + + tag = _forceBytes(tag) + if replace: + existing_ptr = bam_aux_get(src, tag) + if existing_ptr: + bam_aux_del(src, existing_ptr) + + # setting value to None deletes a tag + if value is None: + return - type_code = _getTypeCode(value, value_type) + type_code = _get_value_code(value, value_type) if type_code == 0: raise ValueError("can't guess type or invalid type code specified") # Not Endian-safe, but then again neither is samtools! if type_code == 'Z': - value = _forceBytes( value ) + value = _forceBytes(value) value_ptr = value value_size = len(value)+1 elif type_code == 'i': @@ -3249,11 +3251,6 @@ cdef class AlignedSegment: else: raise ValueError('Unsupported value_type in set_option') - tag = _forceBytes( tag ) - if replace: - existing_ptr = bam_aux_get(src, tag) - if existing_ptr: - bam_aux_del(src, existing_ptr) bam_aux_append(src, tag, @@ -3261,20 +3258,32 @@ cdef class AlignedSegment: value_size, value_ptr) + cpdef has_tag(self, tag): + """returns true if the optional alignment section + contains a given *tag*.""" + cdef uint8_t * v + cdef int nvalues + btag = _forceBytes(tag) + v = bam_aux_get(self._delegate, btag) + return v != NULL - ####################################################################### - ####################################################################### - ## Derived properties - ####################################################################### + cpdef get_tag(self, tag): + """retrieves data from the optional alignment section + given a two-letter *tag* denoting the field. - def opt(self, tag): - """retrieves optional data given a two-letter *tag*""" - #see bam_aux.c: bam_aux_get() and bam_aux2i() etc + If *tag* is not present, a KeyError is raised. + + The returned value is cast into an appropriate python type. 
+ + This method is the fastest way to access the optional + alignment section if only few tags need to be retrieved. + """ cdef uint8_t * v cdef int nvalues btag = _forceBytes(tag) v = bam_aux_get(self._delegate, btag) - if v == NULL: raise KeyError( "tag '%s' not present" % tag ) + if v == NULL: + raise KeyError("tag '%s' not present" % tag) auxtype = chr(v[0]) if auxtype == 'c' or auxtype == 'C' or auxtype == 's' or auxtype == 'S': return bam_aux2i(v) @@ -3291,11 +3300,140 @@ cdef class AlignedSegment: elif auxtype == 'Z': return _charptr_to_str(bam_aux2Z(v)) elif auxtype == 'B': - bytesize, nvalues, values = convertBinaryTagToList( v + 1 ) + bytesize, nvalues, values = convertBinaryTagToList(v + 1) return values else: raise ValueError("unknown auxilliary type '%s'" % auxtype) + def get_tags(self, with_value_type=False): + """the fields in the optional aligment section. + + Returns a list of all fields in the optional + alignment section. Values are converted to appropriate python + values. For example: + + [(NM, 2), (RG, "GJP00TM04")] + + If *with_value_type* is set, the value type as encode in + the AlignedSegment record will be returned as well: + + [(NM, 2, "i"), (RG, "GJP00TM04", "Z")] + + This method will convert all values in the optional alignment + section. When getting only one or few tags, please see + :meth:`get_tag` for a quicker way to achieve this. 
+ + """ + + cdef char * ctag + cdef bam1_t * src + cdef uint8_t * s + cdef char auxtag[3] + cdef char auxtype + cdef uint8_t byte_size + cdef int32_t nvalues + + src = self._delegate + if src.l_data == 0: + return [] + s = pysam_bam_get_aux(src) + result = [] + auxtag[2] = 0 + while s < (src.data + src.l_data): + # get tag + auxtag[0] = s[0] + auxtag[1] = s[1] + s += 2 + auxtype = s[0] + if auxtype in ('c', 'C'): + value = bam_aux2i(s) + s += 1 + elif auxtype in ('s', 'S'): + value = bam_aux2i(s) + s += 2 + elif auxtype in ('i', 'I'): + value = bam_aux2i(s) + s += 4 + elif auxtype == 'f': + value = bam_aux2f(s) + s += 4 + elif auxtype == 'd': + value = bam_aux2f(s) + s += 8 + elif auxtype == 'A': + value = "%c" % bam_aux2A(s) + s += 1 + elif auxtype in ('Z', 'H'): + value = _charptr_to_str(bam_aux2Z(s)) + # +1 for NULL terminated string + s += len(value) + 1 + elif auxtype == 'B': + s += 1 + byte_size, nvalues, value = convertBinaryTagToList(s) + # 5 for 1 char and 1 int + s += 5 + (nvalues * byte_size) - 1 + else: + raise KeyError("unknown type '%s'" % auxtype) + + s += 1 + + result.append((_charptr_to_str(auxtag), value)) + + return result + + def set_tags(self, tags): + """sets the fields in the optional alignmest section with + a list of (tag, value) tuples. + + The :term:`value type` of the values is determined from the + python type. Optionally, a type may be given explicitely as + a third value in the tuple, For example: + + x.set_tags([(NM, 2, "i"), (RG, "GJP00TM04", "Z")] + + This method will not enforce the rule that the same tag may appear + only once in the optional alignment section. 
+ """ + + cdef bam1_t * src + cdef uint8_t * s + cdef char * temp + cdef int new_size = 0 + cdef int old_size + src = self._delegate + + # convert and pack the data + if tags is not None and len(tags) > 0: + fmt, args =_pack_tags(tags) + new_size = struct.calcsize(fmt) + buffer = ctypes.create_string_buffer(new_size) + struct.pack_into(fmt, + buffer, + 0, + *args) + + # delete the old data and allocate new space. + # If total_size == 0, the aux field will be + # empty + old_size = pysam_bam_get_l_aux(src) + pysam_bam_update(src, + old_size, + new_size, + pysam_bam_get_aux(src)) + + # copy data only if there is any + if new_size > 0: + + # get location of new data + s = pysam_bam_get_aux(src) + + # check if there is direct path from buffer.raw to tmp + p = buffer.raw + # create handle to make sure buffer stays alive long + # enough for memcpy, see issue 129 + temp = p + memcpy(s, temp, new_size) + ######################################################## # Compatibility Accessors @@ -3422,9 +3560,18 @@ cdef class AlignedSegment: property positions: def __get__(self): return self.get_reference_positions() + property tags: + def __get__(self): + return self.get_tags() + def __set__(self, tags): + self.set_tags(tags) def overlap(self): return self.get_overlap() - + def opt(self, tag): + return self.get_tag(tag) + def setTag(self, tag, value, value_type=None, replace=True): + return self.set_tag(tag, value, value_type, replace) + cdef class PileupColumn: '''A pileup of reads at a particular reference sequence postion @@ -3509,7 +3656,8 @@ cdef class PileupRead: ''' def __init__(self): - raise TypeError("this class cannot be instantiated from Python") + raise TypeError( + "this class cannot be instantiated from Python") def __str__(self): return "\t".join( @@ -3533,20 +3681,25 @@ cdef class PileupRead: """indel length; 0 for no indel, positive for ins and negative for del""" def __get__(self): return self._indel + property level: """the level of the read in the "viewer" 
mode""" def __get__(self): return self._level + property is_del: """1 iff the base on the padded read is a deletion""" def __get__(self): return self._is_del + property is_head: def __get__(self): return self._is_head + property is_tail: def __get__(self): return self._is_tail + property is_refskip: def __get__(self): return self._is_refskip @@ -3627,7 +3780,7 @@ cdef class IndexedReads: # object is alive. self.samfile = samfile - assert samfile.isbam, "can only IndexReads on bam files" + assert samfile.is_bam, "can only IndexReads on bam files" # multiple_iterators the file - note that this makes the iterator # slow and causes pileup to slow down significantly. @@ -3642,23 +3795,20 @@ cdef class IndexedReads: self.header = self.samfile.header self.owns_samfile = False - # TODO: BAM file specific - self.fp = self.htsfile.fp.bgzf - def build(self): '''build index.''' self.index = collections.defaultdict(list) - # this method will start indexing from the current file position - # if you decide + # this method will start indexing from the current file + # position if you decide cdef int ret = 1 cdef bam1_t * b = calloc(1, sizeof( bam1_t)) cdef uint64_t pos while ret > 0: - pos = bgzf_tell(self.fp) + pos = bgzf_tell(hts_get_bgzfp(self.htsfile)) ret = sam_read1(self.htsfile, self.samfile.header, b) diff --git a/pysam/cbcf.pxd b/pysam/cbcf.pxd new file mode 100644 index 00000000..83e628a4 --- /dev/null +++ b/pysam/cbcf.pxd @@ -0,0 +1,158 @@ +############################################################################### +############################################################################### +## Cython wrapper for htslib VCF/BCF reader/writer +############################################################################### +# +# NOTICE: This code is incomplete and preliminary. It is nearly complete as +# an immutable interface, but has no capability (yet) to mutate the +# resulting data (beyond dropping all samples). 
Documentation still +# needs to be written and a unit test suite is in the works. The +# code is also specific to Python 2 and will require a bit of work +# to properly adapt to Python 3. +# +############################################################################### +# +# The MIT License +# +# Copyright (c) 2015 Kevin Jacobs (jacobs@bioinformed.com) +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. 
+# +############################################################################### + +from libc.stdint cimport int8_t, int16_t, int32_t, int64_t +from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t +from libc.stdlib cimport malloc, calloc, realloc, free +from libc.string cimport memcpy, memcmp, strncpy, strlen, strdup + +from chtslib cimport * + + +cdef class VariantHeader(object): + cdef bcf_hdr_t *ptr + + cdef _subset_samples(self, include_samples) + + +cdef class VariantHeaderRecord(object): + cdef VariantHeader header + cdef bcf_hrec_t *ptr + + +cdef class VariantHeaderRecords(object): + cdef VariantHeader header + + +cdef class VariantHeaderContigs(object): + cdef VariantHeader header + + +cdef class VariantHeaderSamples(object): + cdef VariantHeader header + + +cdef class VariantContig(object): + cdef VariantHeader header + cdef int id + + +cdef class VariantMetadata(object): + cdef VariantHeader header + cdef int type + cdef int id + + +cdef class VariantHeaderMetadata(object): + cdef VariantHeader header + cdef int32_t type + + +cdef class VariantRecord(object): + cdef VariantHeader header + cdef bcf1_t *ptr + + +cdef class VariantRecordFilter(object): + cdef VariantRecord record + + +cdef class VariantRecordFormat(object): + cdef VariantRecord record + + +cdef class VariantRecordInfo(object): + cdef VariantRecord record + + +cdef class VariantRecordSamples(object): + cdef VariantRecord record + + +cdef class VariantRecordSample(object): + cdef VariantRecord record + cdef readonly int32_t index + + +cdef class BaseIndex(object): + cdef tuple refs + cdef dict refmap + + +cdef class BCFIndex(BaseIndex): + cdef VariantHeader header + cdef hts_idx_t *ptr + + +cdef class TabixIndex(BaseIndex): + cdef tbx_t *ptr + + +cdef class BaseIterator(object): + cdef VariantFile bcf + cdef hts_itr_t *iter + + +cdef class BCFIterator(BaseIterator): + cdef BCFIndex index + + +cdef class TabixIterator(BaseIterator): + cdef TabixIndex index + cdef kstring_t 
line_buffer + + +cdef class VariantFile(object): + cdef htsFile *htsfile # pointer to htsFile structure + cdef int64_t start_offset # BGZF offset of first record + + cdef readonly object filename # filename as supplied by user + cdef readonly object mode # file opening mode + + cdef readonly VariantHeader header + cdef readonly BaseIndex index + + cdef readonly bint drop_samples # true if sample information is to be ignored + + # FIXME: Temporary, use htsFormat when it is available + cdef readonly bint is_bcf # true if file is a bcf file + cdef readonly bint is_stream # true if not a seekable file but a stream + cdef readonly bint is_remote # true if file is not on the local filesystem + cdef readonly bint is_reading # true if file has begun reading records + + cpdef int write(self, VariantRecord record) except -1 diff --git a/pysam/cbcf.pyx b/pysam/cbcf.pyx new file mode 100644 index 00000000..8f8e315c --- /dev/null +++ b/pysam/cbcf.pyx @@ -0,0 +1,2421 @@ +# cython: embedsignature=True +# cython: profile=True +############################################################################### +############################################################################### +## Cython wrapper for htslib VCF/BCF reader/writer +############################################################################### +# +# NOTICE: This code is incomplete and preliminary. It does offer a nearly +# complete immutable Pythonic interface to VCF/BCF metadata and data +# with reading and writing capability, but has no capability (yet) +# to mutate the resulting data (beyond dropping all samples). +# Documentation still needs to be written and a unit test suite is +# in the works. The code is also superficially specific to Python 2 +# and will require a bit of work to properly adapt to Python 3. 
+# +# Here is a minimal example of how to use the API: +# +# $ cat bcfview.py +# import sys +# from pysam import VariantFile +# +# bcf_in = VariantFile(sys.argv[1]) # auto-detect input format +# bcf_out = VariantFile('-', 'w', header=bcf_in.header) +# +# for rec in bcf_in: +# bcf_out.write(rec) +# +# Performance is fairly close to that of bcftools view. Here is an example +# using some 1k Genomes data: +# +# $ time python bcfview.py ALL.chr22.phase3_shapeit2_mvncall_integrated_v5.20130502.genotypes.bcf |wc -l +# 1103799 +# +# real 0m56.114s +# user 1m4.489s +# sys 0m3.102s +# +# $ time bcftools view ALL.chr22.phase3_shapeit2_mvncall_integrated_v5.20130502.genotypes.bcf |wc -l +# 1103800 # bcftools adds an extra header +# +# real 0m55.126s +# user 1m3.502s +# sys 0m3.459s +# +# Here is a quick tour through the API:: +# +# VariantFile(filename, mode=None, header=None, drop_samples=False) +# +# Attributes / Properties +# +# htsfile: htsFile* [private] +# start_offset: BGZF offset of first record [private] +# filename: filename [read only] +# mode: mode [read only] +# header: VariantHeader object [read only] +# index: TabixIndex, BCFIndex or None [read only] +# drop_samples: sample information is to be ignored [read only] +# +# is_stream: file is stdin/stdout [read only] +# is_remote: file is not on the local filesystem [read only] +# is_reading: file has begun reading records [read only] +# category: file format general category [read only] +# format: file format [read only] +# version: tuple of (major, minor) format version [read only] +# compression: file compression +# description: vaguely human readable description of [read only] +# file format. 
+# +# Methods: +# copy() +# close() +# open(filename, mode=None, header=None, drop_samples=False) +# reset() +# seek(offset) +# tell() +# fetch(contig=None, start=None, stop=None, region=None, reopen=False) +# subset_samples(include_samples) +# +# VariantHeader(mode) # mode='r' for reading, mode='w' for writing +# +# version: VCF version +# samples: sequence-like access to samples +# records: sequence-like access to partially parsed headers +# contigs: mapping-like object for contig name -> VariantContig +# +# filters: mapping-like object for filter name -> VariantMetadata +# info: mapping-like object for info name -> VariantMetadata +# formats: mapping-like object for formats name -> VariantMetadata +# +# VariantRecord(...) +# +# header: VariantHeader object +# rid: reference id (i.e. tid) +# chrom: chromosome/contig string +# contig: synonym for chrom +# pos: 1-based start position (inclusive) +# start: 0-based start position (inclusive) +# stop: 0-based stop position (exclusive) +# rlen: reference length (stop - start) +# id: record identifier +# ref: reference allele +# alleles: alleles (ref followed by alts) +# alts: alt alleles +# qual: quality (float) +# filter: mapping-like object for filter name -> type info +# info: mapping-like object for info name -> value +# format: mapping-like object for format name -> type info +# samples: mapping-like object of sample genotypes & attrs +# +# VariantRecordSample(...) +# +# name: sample name +# index: sample index +# allele_indices: tuple of allele indices (ref=0, alt=1..len(alts), missing=-1) +# alleles: tuple of alleles (missing=None) +# +# VariantRecordSample is also a mapping object from formats to values +# +# VariantContig(...) +# +# id: reference id (i.e. tid) +# name: chromosome/contig string +# length: contig length if provided, else None +# header: defining VariantHeaderRecord +# +# VariantMetadata(...) 
# for FILTER, INFO and FORMAT metadata +# +# id: internal id +# name: metadata name +# type: value data type +# number: number of values +# header: defining VariantHeaderRecord +# +# VariantHeaderRecord(...) # replace with single tuple of key/value pairs? +# +# type: record type +# key: first record key +# value: first record value +# attrs: remaining key/value pairs +# +############################################################################### +# +# TODO list for next major sprint: +# +# * more genotype methods +# * unit test suite (perhaps py.test based) +# * documentation +# * htslib 1.2 format info +# +# For later sprints: +# +# * ability to create indices +# * mutable header and record data +# * pickle support +# * Python 3 support +# * left/right locus normalization +# * parallel iteration (like synced_bcf_reader) +# * fix reopen to re-use fd +# +############################################################################### +# +# The MIT License +# +# Copyright (c) 2015 Kevin Jacobs (jacobs@bioinformed.com) +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. +# +############################################################################### + +from __future__ import division, print_function + +import os +import sys + +from libc.string cimport strcmp + +cimport cython + +from cpython cimport PyBytes_Check, PyUnicode_Check +from cpython.version cimport PY_MAJOR_VERSION + + +__all__ = ['VariantFile', 'VariantHeader'] + + +######################################################################## +######################################################################## +## Constants +######################################################################## + +cdef int MAX_POS = 2 << 29 +cdef tuple VALUE_TYPES = ('Flag', 'Integer', 'Float', 'String') +cdef tuple METADATA_TYPES = ('FILTER', 'INFO', 'FORMAT', 'CONTIG', 'STRUCTURED', 'GENERIC') +cdef tuple METADATA_LENGTHS = ('FIXED', 'VARIABLE', 'A', 'G', 'R') + +cdef tuple FORMAT_CATEGORIES = ('UNKNOWN', 'ALIGNMENTS', 'VARIANTS', 'INDEX', 'REGIONS') +cdef tuple FORMATS = ('UNKNOWN', 'BINARY_FORMAT', 'TEXT_FORMAT', 'SAM', 'BAM', 'BAI', 'CRAM', 'CRAI', + 'VCF', 'BCF', 'CSI', 'GZI', 'TBI', 'BED') +cdef tuple COMPRESSION = ('NONE', 'GZIP', 'BGZF', 'CUSTOM') + +######################################################################## +######################################################################## +## Python 3 compatibility functions +######################################################################## + +IS_PYTHON3 = PY_MAJOR_VERSION >= 3 + + +# filename encoding (copied from lxml.etree.pyx) +cdef str FILENAME_ENCODING +FILENAME_ENCODING = sys.getfilesystemencoding() +if FILENAME_ENCODING is None: + FILENAME_ENCODING = sys.getdefaultencoding() +if FILENAME_ENCODING is None: + FILENAME_ENCODING = 'ascii' 
+ + +cdef bytes encode_filename(object filename): + """Make sure a filename is 8-bit encoded (or None).""" + if filename is None: + return None + elif PyBytes_Check(filename): + return filename + elif PyUnicode_Check(filename): + return filename.encode(FILENAME_ENCODING) + else: + raise TypeError('Argument must be string or unicode.') + + +cdef force_str(object s): + """Return s converted to str type of current Python (bytes in Py2, unicode in Py3)""" + if s is None: + return None + if PY_MAJOR_VERSION < 3: + return s + elif PyBytes_Check(s): + return s.decode('ascii') + else: + # assume unicode + return s + + +cdef bytes force_bytes(object s): + """convert string or unicode object to bytes, assuming ascii encoding.""" + if PY_MAJOR_VERSION < 3: + return s + elif s is None: + return None + elif PyBytes_Check(s): + return s + elif PyUnicode_Check(s): + return s.encode('ascii') + else: + raise TypeError('Argument must be string, bytes or unicode.') + + +cdef charptr_to_str(const char* s): + if PY_MAJOR_VERSION < 3: + return s + else: + return s.decode('ascii') + + +######################################################################## +######################################################################## +## Low level type conversion helpers +######################################################################## + + +cdef tuple char_array_to_tuple(const char **a, int n, int free_after=0): + if not a: + return None + try: + return tuple( charptr_to_str(a[i]) for i in range(n) ) + finally: + if free_after and a: + free(a) + + +cdef bcf_array_to_object(void *data, int type, int n, int scalar=0): + cdef char *datac + cdef int8_t *data8 + cdef int16_t *data16 + cdef int32_t *data32 + cdef float *dataf + cdef int i + + if not data or n <= 0: + return None + + if type == BCF_BT_CHAR: + datac = data + value = datac[:n] if datac[0] != bcf_str_missing else None + else: + value = [] + if type == BCF_BT_INT8: + data8 = data + for i in range(n): + if data8[i] == 
bcf_int8_vector_end: + break + value.append(data8[i] if data8[i] != bcf_int8_missing else None) + elif type == BCF_BT_INT16: + data16 = data + for i in range(n): + if data16[i] == bcf_int16_vector_end: + break + value.append(data16[i] if data16[i] != bcf_int16_missing else None) + elif type == BCF_BT_INT32: + data32 = data + for i in range(n): + if data32[i] == bcf_int32_vector_end: + break + value.append(data32[i] if data32[i] != bcf_int32_missing else None) + elif type == BCF_BT_FLOAT: + dataf = data + for i in range(n): + if bcf_float_is_vector_end(dataf[i]): + break + value.append(dataf[i] if not bcf_float_is_missing(dataf[i]) else None) + else: + raise TypeError('unsupported info type code') + + if not value: + value = None + elif scalar and len(value) == 1: + value = value[0] + else: + value = tuple(value) + + return value + + +cdef object bcf_info_value(const bcf_info_t *z): + cdef char *s + + if not z: + return None + elif z.len == 0: + value = True + elif z.len == 1: + if z.type == BCF_BT_INT8: + value = z.v1.i if z.v1.i != bcf_int8_missing else None + elif z.type == BCF_BT_INT16: + value = z.v1.i if z.v1.i != bcf_int16_missing else None + elif z.type == BCF_BT_INT32: + value = z.v1.i if z.v1.i != bcf_int32_missing else None + elif z.type == BCF_BT_FLOAT: + value = z.v1.f if not bcf_float_is_missing(z.v1.f) else None + elif z.type == BCF_BT_CHAR: + s = &z.v1.i + value = s if not s or s[0] != bcf_str_missing else None + else: + raise TypeError('unsupported info type code') + else: + value = bcf_array_to_object(z.vptr, z.type, z.len) + + return value + + +cdef inline int is_gt_fmt(bcf_hdr_t *h, bcf_fmt_t *fmt): + return strcmp(bcf_hdr_int2id(h, BCF_DT_ID, fmt.id), "GT") == 0 + + +######################################################################## +######################################################################## +## Variant Header objects +######################################################################## + +#FIXME: implement a full mapping 
interface +#FIXME: passing bcf_hrec_t* may not be the safest approach once mutating +# operations are allowed. +cdef class VariantHeaderRecord(object): + """header record from a :class:`VariantHeader` object""" + + property type: + """header type: FILTER, INFO, FORMAT, CONTIG, STRUCTURED, or GENERIC""" + def __get__(self): + cdef bcf_hrec_t *r = self.ptr + return METADATA_TYPES[r.type] + + property key: + """header key (the part before '=', in FILTER/INFO/FORMAT/contig/fileformat etc.)""" + def __get__(self): + cdef bcf_hrec_t *r = self.ptr + return r.key if r.key else None + + property value: + """header value. Set only for generic lines, None for FILTER/INFO, etc.""" + def __get__(self): + cdef bcf_hrec_t *r = self.ptr + return r.value if r.value else None + + property attrs: + """sequence of additional header attributes""" + def __get__(self): + cdef bcf_hrec_t *r = self.ptr + cdef int i + return tuple( (r.keys[i] if r.keys[i] else None, + r.vals[i] if r.vals[i] else None) for i in range(r.nkeys) ) + + def __str__(self): + cdef bcf_hrec_t *r = self.ptr + if r.type == BCF_HL_GEN: + return '##{}={}'.format(self.key, self.value) + else: + attrs = ','.join('{}={}'.format(k, v) for k,v in self.attrs if k != 'IDX') + return '##{}=<{}>'.format(self.type, attrs) + + +cdef VariantHeaderRecord makeVariantHeaderRecord(VariantHeader header, bcf_hrec_t *h): + if not header: + raise ValueError('invalid VariantHeader') + + if not h: + return None + + cdef VariantHeaderRecord record = VariantHeaderRecord.__new__(VariantHeaderRecord) + record.header = header + record.ptr = h + + return record + + +cdef class VariantHeaderRecords(object): + """sequence of :class:`VariantHeaderRecord` object from a :class:`VariantHeader` object""" + + def __len__(self): + return self.header.ptr.nhrec + + def __bool__(self): + return self.header.ptr.nhrec != 0 + + def __getitem__(self, index): + cdef int32_t i = index + if i < 0 or i >= self.header.ptr.nhrec: + raise IndexError('invalid header 
record index') + return makeVariantHeaderRecord(self.header, self.header.ptr.hrec[i]) + + def __iter__(self): + cdef int32_t i + for i in range(self.header.ptr.nhrec): + yield makeVariantHeaderRecord(self.header, self.header.ptr.hrec[i]) + + __hash__ = None + + +cdef VariantHeaderRecords makeVariantHeaderRecords(VariantHeader header): + if not header: + raise ValueError('invalid VariantHeader') + + cdef VariantHeaderRecords records = VariantHeaderRecords.__new__(VariantHeaderRecords) + records.header = header + return records + + +cdef class VariantMetadata(object): + """filter, info or format metadata record from a :class:`VariantHeader` object""" + property name: + """metadata name""" + def __get__(self): + cdef bcf_hdr_t *h = self.header.ptr + return h.id[BCF_DT_ID][self.id].key + + # Q: Should this be exposed? + property id: + """metadata internal header id number""" + def __get__(self): + return self.id + + property number: + """metadata number (i.e. cardinality)""" + def __get__(self): + cdef bcf_hdr_t *h = self.header.ptr + if not bcf_hdr_idinfo_exists(h, self.type, self.id) or self.type == BCF_HL_FLT: + return None + cdef int l = bcf_hdr_id2length(h, self.type, self.id) + if l == BCF_VL_FIXED: + return bcf_hdr_id2number(h, self.type, self.id) + elif l == BCF_VL_VAR: + return '.' 
+ else: + return METADATA_LENGTHS[l] + + property type: + """metadata value type""" + def __get__(self): + cdef bcf_hdr_t *h = self.header.ptr + if not bcf_hdr_idinfo_exists(h, self.type, self.id) or self.type == BCF_HL_FLT: + return None + return VALUE_TYPES[bcf_hdr_id2type(h, self.type, self.id)] + + property header: + """:class:`VariantHeaderRecord` associated with this :class:`VariantMetadata` object""" + def __get__(self): + cdef bcf_hdr_t *h = self.header.ptr + if not bcf_hdr_idinfo_exists(h, self.type, self.id): + return None + cdef bcf_hrec_t *hrec = h.id[BCF_DT_ID][self.id].val.hrec[self.type] + if not hrec: + return None + return makeVariantHeaderRecord(self.header, hrec) + + +cdef VariantMetadata makeVariantMetadata(VariantHeader header, int type, int id): + if not header: + raise ValueError('invalid VariantHeader') + + if type != BCF_HL_FLT and type != BCF_HL_INFO and type != BCF_HL_FMT: + raise ValueError('invalid metadata type') + + if id < 0 or id >= header.ptr.n[BCF_DT_ID]: + raise ValueError('invalid metadata id') + + cdef VariantMetadata meta = VariantMetadata.__new__(VariantMetadata) + meta.header = header + meta.type = type + meta.id = id + + return meta + + +cdef class VariantHeaderMetadata(object): + """mapping from filter, info or format name to :class:`VariantMetadata` object""" + + def __len__(self): + cdef bcf_hdr_t *h = self.header.ptr + cdef bcf_idpair_t *idpair + cdef int32_t i, n = 0 + + for i in range(h.n[BCF_DT_ID]): + idpair = h.id[BCF_DT_ID] + i + if idpair.key and idpair.val and idpair.val.info[self.type] & 0xF != 0xF: + n += 1 + + return n + + def __bool__(self): + cdef bcf_hdr_t *h = self.header.ptr + cdef bcf_idpair_t *idpair + cdef int32_t i + + for i in range(h.n[BCF_DT_ID]): + idpair = h.id[BCF_DT_ID] + i + if idpair.key and idpair.val and idpair.val.info[self.type] & 0xF != 0xF: + return True + + return False + + def __getitem__(self, key): + cdef bcf_hdr_t *h = self.header.ptr + cdef vdict_t *d = h.dict[BCF_DT_ID] + cdef 
khiter_t k = kh_get_vdict(d, key) + + if k == kh_end(d) or kh_val_vdict(d, k).info[self.type] & 0xF == 0xF: + raise KeyError('invalid filter') + + return makeVariantMetadata(self.header, self.type, kh_val_vdict(d, k).id) + + def __iter__(self): + cdef bcf_hdr_t *h = self.header.ptr + cdef bcf_idpair_t *idpair + cdef int32_t i + + for i in range(h.n[BCF_DT_ID]): + idpair = h.id[BCF_DT_ID] + i + if idpair.key and idpair.val and idpair.val.info[self.type] & 0xF != 0xF: + yield idpair.key + + def get(self, key, default=None): + """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None.""" + try: + return self[key] + except KeyError: + return default + + def __contains__(self, key): + try: + self[key] + except KeyError: + return False + else: + return True + + def iterkeys(self): + """D.iterkeys() -> an iterator over the keys of D""" + return iter(self) + + def itervalues(self): + """D.itervalues() -> an iterator over the values of D""" + for key in self: + yield self[key] + + def iteritems(self): + """D.iteritems() -> an iterator over the (key, value) items of D""" + for key in self: + yield (key, self[key]) + + def keys(self): + """D.keys() -> list of D's keys""" + return list(self) + + def items(self): + """D.items() -> list of D's (key, value) pairs, as 2-tuples""" + return list(self.iteritems()) + + def values(self): + """D.values() -> list of D's values""" + return list(self.itervalues()) + + # Mappings are not hashable by default, but subclasses can change this + __hash__ = None + + #TODO: implement __richcmp__ + + +cdef VariantHeaderMetadata makeVariantHeaderMetadata(VariantHeader header, int32_t type): + if not header: + raise ValueError('invalid VariantHeader') + + cdef VariantHeaderMetadata meta = VariantHeaderMetadata.__new__(VariantHeaderMetadata) + meta.header = header + meta.type = type + + return meta + + +cdef class VariantContig(object): + """contig metadata from a :class:`VariantHeader`""" + + property name: + """contig name""" + def 
__get__(self): + cdef bcf_hdr_t *h = self.header.ptr + return h.id[BCF_DT_CTG][self.id].key + + property id: + """contig internal id number""" + def __get__(self): + return self.id + + property length: + """contig length or None if not available""" + def __get__(self): + cdef bcf_hdr_t *h = self.header.ptr + cdef uint32_t length = h.id[BCF_DT_CTG][self.id].val.info[0] + return length if length else None + + property header: + """:class:`VariantHeaderRecord` associated with this :class:`VariantContig` object""" + def __get__(self): + cdef bcf_hdr_t *h = self.header.ptr + cdef bcf_hrec_t *hrec = h.id[BCF_DT_CTG][self.id].val.hrec[0] + return makeVariantHeaderRecord(self.header, hrec) + + +cdef VariantContig makeVariantContig(VariantHeader header, int id): + if not header: + raise ValueError('invalid VariantHeader') + + if id < 0 or id >= header.ptr.n[BCF_DT_CTG]: + raise ValueError('invalid contig id') + + cdef VariantContig contig = VariantContig.__new__(VariantContig) + contig.header = header + contig.id = id + + return contig + + +cdef class VariantHeaderContigs(object): + """mapping from contig name or index to :class:`VariantContig` object.""" + + def __len__(self): + cdef bcf_hdr_t *h = self.header.ptr + assert kh_size(h.dict[BCF_DT_CTG]) == h.n[BCF_DT_CTG] + return h.n[BCF_DT_CTG] + + def __bool__(self): + cdef bcf_hdr_t *h = self.header.ptr + assert kh_size(h.dict[BCF_DT_CTG]) == h.n[BCF_DT_CTG] + return h.n[BCF_DT_CTG] != 0 + + def __getitem__(self, key): + cdef bcf_hdr_t *h = self.header.ptr + cdef int index + + if isinstance(key, int): + index = key + if index < 0 or index >= h.n[BCF_DT_CTG]: + raise IndexError('invalid contig index') + return makeVariantContig(self.header, index) + + cdef vdict_t *d = h.dict[BCF_DT_CTG] + cdef khiter_t k = kh_get_vdict(d, key) + + if k == kh_end(d): + raise KeyError('invalid contig') + + cdef int id = kh_val_vdict(d, k).id + + return makeVariantContig(self.header, id) + + def __iter__(self): + cdef bcf_hdr_t *h = 
self.header.ptr + cdef vdict_t *d = h.dict[BCF_DT_CTG] + cdef uint32_t n = kh_size(d) + + assert n == h.n[BCF_DT_CTG] + + for i in range(n): + yield bcf_hdr_id2name(h, i) + + def get(self, key, default=None): + """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None.""" + try: + return self[key] + except KeyError: + return default + + def __contains__(self, key): + try: + self[key] + except KeyError: + return False + else: + return True + + def iterkeys(self): + """D.iterkeys() -> an iterator over the keys of D""" + return iter(self) + + def itervalues(self): + """D.itervalues() -> an iterator over the values of D""" + for key in self: + yield self[key] + + def iteritems(self): + """D.iteritems() -> an iterator over the (key, value) items of D""" + for key in self: + yield (key, self[key]) + + def keys(self): + """D.keys() -> list of D's keys""" + return list(self) + + def items(self): + """D.items() -> list of D's (key, value) pairs, as 2-tuples""" + return list(self.iteritems()) + + def values(self): + """D.values() -> list of D's values""" + return list(self.itervalues()) + + # Mappings are not hashable by default, but subclasses can change this + __hash__ = None + + #TODO: implement __richcmp__ + + +cdef VariantHeaderContigs makeVariantHeaderContigs(VariantHeader header): + if not header: + raise ValueError('invalid VariantHeader') + + cdef VariantHeaderContigs contigs = VariantHeaderContigs.__new__(VariantHeaderContigs) + contigs.header = header + + return contigs + + +cdef class VariantHeaderSamples(object): + """sequence of sample names from a :class:`VariantHeader` object""" + + def __len__(self): + return bcf_hdr_nsamples(self.header.ptr) + + def __bool__(self): + return bcf_hdr_nsamples(self.header.ptr) != 0 + + def __getitem__(self, index): + cdef bcf_hdr_t *h = self.header.ptr + cdef int32_t n = bcf_hdr_nsamples(h) + cdef int32_t i = index + + if i < 0 or i >= n: + raise IndexError('invalid sample index') + + return h.samples[i] + + def 
__iter__(self): + cdef bcf_hdr_t *h = self.header.ptr + cdef int32_t n = bcf_hdr_nsamples(h) + cdef int32_t i + + for i in range(n): + yield h.samples[i] + + def __contains__(self, key): + cdef bcf_hdr_t *h = self.header.ptr + cdef vdict_t *d = h.dict[BCF_DT_SAMPLE] + cdef khiter_t k = kh_get_vdict(d, key) + + return k != kh_end(d) + + # Mappings are not hashable by default, but subclasses can change this + __hash__ = None + + #TODO: implement __richcmp__ + + +cdef VariantHeaderSamples makeVariantHeaderSamples(VariantHeader header): + if not header: + raise ValueError('invalid VariantHeader') + + cdef VariantHeaderSamples samples = VariantHeaderSamples.__new__(VariantHeaderSamples) + samples.header = header + + return samples + + +cdef class VariantHeader(object): + """header information for a :class:`VariantFile` object""" + + #FIXME: Add structured proxy + #FIXME: Add generic proxy + #FIXME: Add mutable methods + + # See makeVariantHeader for C constructor + def __cinit__(self, mode): + self.ptr = NULL + + # Python constructor + def __init__(self, mode): + """Allocate a new header via bcf_hdr_init(); mode must be exactly 'r' or 'w'. Raises ValueError for any other mode or if allocation fails.""" + # Compare against the two allowed values explicitly: a substring + # test against 'rw' would also accept '' and 'rw'. + if mode not in ('r', 'w'): + raise ValueError("invalid header mode specified '{}'".format(mode)) + + mode = force_bytes(mode) + self.ptr = bcf_hdr_init(mode) + + if not self.ptr: + raise ValueError('cannot create VariantHeader') + + def __dealloc__(self): + if self.ptr: + bcf_hdr_destroy(self.ptr) + self.ptr = NULL + + def __bool__(self): + # self.ptr == NULL should be impossible + return self.ptr != NULL + + def copy(self): + return makeVariantHeader(bcf_hdr_dup(self.ptr)) + + property version: + """VCF version""" + def __get__(self): + return bcf_hdr_get_version(self.ptr) + + property samples: + """samples (:class:`VariantHeaderSamples`)""" + def __get__(self): + return makeVariantHeaderSamples(self) + + property records: + """header records (:class:`VariantHeaderRecords`)""" + def __get__(self): + return makeVariantHeaderRecords(self) + + property contigs: + """contig information (:class:`VariantHeaderContigs`)"""
+ def __get__(self): + return makeVariantHeaderContigs(self) + + property filters: + """filter metadata (:class:`VariantHeaderMetadata`)""" + def __get__(self): + return makeVariantHeaderMetadata(self, BCF_HL_FLT) + + property info: + """info metadata (:class:`VariantHeaderMetadata`)""" + def __get__(self): + return makeVariantHeaderMetadata(self, BCF_HL_INFO) + + property formats: + """format metadata (:class:`VariantHeaderMetadata`)""" + def __get__(self): + return makeVariantHeaderMetadata(self, BCF_HL_FMT) + + # only safe to do when opening an htsfile + cdef _subset_samples(self, include_samples): + keep_samples = set(self.samples) + include_samples = set(include_samples) + missing_samples = include_samples - keep_samples + keep_samples &= include_samples + + if missing_samples: + # FIXME: add specialized exception with payload + raise ValueError('missing {:d} requested samples'.format(len(missing_samples))) + + keep_samples = ','.join(keep_samples) + cdef char *keep = keep_samples if keep_samples else NULL + cdef ret = bcf_hdr_set_samples(self.ptr, keep, 0) + + if ret != 0: + raise ValueError('bcf_hdr_set_samples failed: ret = {}'.format(ret)) + + def __str__(self): + """Return the header text produced by bcf_hdr_fmt_text().""" + cdef int hlen + cdef char *hstr = bcf_hdr_fmt_text(self.ptr, 0, &hlen) + + # Copy the C buffer into a Python object before releasing it; + # hstr must not be read again once free() has been called. + ret = hstr[:hlen] + free(hstr) + return force_str(ret) + + +cdef VariantHeader makeVariantHeader(bcf_hdr_t *h): + if not h: + raise ValueError('cannot create VariantHeader') + + cdef VariantHeader header = VariantHeader.__new__(VariantHeader, None) + header.ptr = h + + return header + + +######################################################################## +######################################################################## +## Variant Record objects +######################################################################## + +cdef class VariantRecordFilter(object): + """mapping from filter index or name to :class:`VariantMetadata` object for filters set on a :class:`VariantRecord` object.""" + + def __len__(self):
+ return self.record.ptr.d.n_flt + + def __bool__(self): + return self.record.ptr.d.n_flt != 0 + + def __getitem__(self, key): + cdef bcf_hdr_t *h = self.record.header.ptr + cdef bcf1_t *r = self.record.ptr + cdef int index, id + cdef int n = r.d.n_flt + + if isinstance(key, int): + index = key + + if index < 0 or index >= n: + raise IndexError('invalid filter index') + + id = r.d.flt[index] + else: + if key == '.': + key = 'PASS' + + id = bcf_hdr_id2int(h, BCF_DT_ID, key) + + if not bcf_hdr_idinfo_exists(h, BCF_HL_FLT, id) or not bcf_has_filter(h, self.record.ptr, key): + raise KeyError('Invalid filter') + + return makeVariantMetadata(self.record.header, BCF_HL_FLT, id) + + def __iter__(self): + cdef bcf_hdr_t *h = self.record.header.ptr + cdef bcf1_t *r = self.record.ptr + cdef int i, n = r.d.n_flt + + for i in range(n): + yield bcf_hdr_int2id(h, BCF_DT_ID, r.d.flt[i]) + + def get(self, key, default=None): + """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None.""" + try: + return self[key] + except KeyError: + return default + + def __contains__(self, key): + cdef bcf_hdr_t *h = self.record.header.ptr + cdef bcf1_t *r = self.record.ptr + return bcf_has_filter(h, r, key) == 1 + + def iterkeys(self): + """D.iterkeys() -> an iterator over the keys of D""" + return iter(self) + + def itervalues(self): + """D.itervalues() -> an iterator over the values of D""" + for key in self: + yield self[key] + + def iteritems(self): + """D.iteritems() -> an iterator over the (key, value) items of D""" + for key in self: + yield (key, self[key]) + + def keys(self): + """D.keys() -> list of D's keys""" + return list(self) + + def items(self): + """D.items() -> list of D's (key, value) pairs, as 2-tuples""" + return list(self.iteritems()) + + def values(self): + """D.values() -> list of D's values""" + return list(self.itervalues()) + + # Mappings are not hashable by default, but subclasses can change this + __hash__ = None + + #TODO: implement __richcmp__ + + +cdef 
VariantRecordFilter makeVariantRecordFilter(VariantRecord record): + if not record: + raise ValueError('invalid VariantRecord') + + cdef VariantRecordFilter filter = VariantRecordFilter.__new__(VariantRecordFilter) + filter.record = record + + return filter + + +cdef class VariantRecordFormat(object): + """mapping from format name or index to :class:`VariantMetadata` object for formats present in a :class:`VariantRecord` object.""" + + def __len__(self): + return self.record.ptr.n_fmt + + def __bool__(self): + return self.record.ptr.n_fmt != 0 + + def __getitem__(self, key): + cdef bcf_hdr_t *h = self.record.header.ptr + cdef bcf1_t *r = self.record.ptr + cdef bcf_fmt_t *fmt + cdef int index + cdef int n = r.n_fmt + + if isinstance(key, int): + index = key + if index < 0 or index >= n: + raise IndexError('invalid format index') + fmt = &r.d.fmt[index] + else: + fmt = bcf_get_fmt(h, r, key) + if not fmt: + raise KeyError('unknown format') + + return makeVariantMetadata(self.record.header, BCF_HL_FMT, fmt.id) + + def __iter__(self): + cdef bcf_hdr_t *h = self.record.header.ptr + cdef bcf1_t *r = self.record.ptr + cdef int i, n = r.n_fmt + + for i in range(n): + yield bcf_hdr_int2id(h, BCF_DT_ID, r.d.fmt[i].id) + + def get(self, key, default=None): + """D.get(k[,d]) -> D[k] if k in D, else d. 
d defaults to None.""" + try: + return self[key] + except KeyError: + return default + + def __contains__(self, key): + cdef bcf_hdr_t *h = self.record.header.ptr + cdef bcf1_t *r = self.record.ptr + cdef bcf_fmt_t *fmt = bcf_get_fmt(h, r, key) + return fmt != NULL + + def iterkeys(self): + """D.iterkeys() -> an iterator over the keys of D""" + return iter(self) + + def itervalues(self): + """D.itervalues() -> an iterator over the values of D""" + for key in self: + yield self[key] + + def iteritems(self): + """D.iteritems() -> an iterator over the (key, value) items of D""" + for key in self: + yield (key, self[key]) + + def keys(self): + """D.keys() -> list of D's keys""" + return list(self) + + def items(self): + """D.items() -> list of D's (key, value) pairs, as 2-tuples""" + return list(self.iteritems()) + + def values(self): + """D.values() -> list of D's values""" + return list(self.itervalues()) + + # Mappings are not hashable by default, but subclasses can change this + __hash__ = None + + #TODO: implement __richcmp__ + + +cdef VariantRecordFormat makeVariantRecordFormat(VariantRecord record): + if not record: + raise ValueError('invalid VariantRecord') + + cdef VariantRecordFormat format = VariantRecordFormat.__new__(VariantRecordFormat) + format.record = record + + return format + + +#TODO: Add a getmeta method to return the corresponding VariantMetadata? 
cdef class VariantRecordInfo(object):
    """mapping from info metadata name to value for info data present in
    a :class:`VariantRecord` object."""

    def __len__(self):
        return self.record.ptr.n_info

    def __bool__(self):
        return self.record.ptr.n_info != 0

    def __getitem__(self, key):
        cdef bcf_hdr_t *h = self.record.header.ptr
        cdef bcf1_t *r = self.record.ptr
        cdef bcf_info_t *info = bcf_get_info(h, r, key)

        if not info:
            raise KeyError('Unknown INFO field: {}'.format(key))

        return bcf_info_value(info)

    def __iter__(self):
        cdef bcf_hdr_t *h = self.record.header.ptr
        cdef bcf1_t *r = self.record.ptr
        cdef int i, n = r.n_info

        for i in range(n):
            yield bcf_hdr_int2id(h, BCF_DT_ID, r.d.info[i].key)

    def get(self, key, default=None):
        """D.get(k[,d]) -> D[k] if k in D, else d.  d defaults to None."""
        try:
            return self[key]
        except KeyError:
            return default

    def __contains__(self, key):
        cdef bcf_hdr_t *h = self.record.header.ptr
        cdef bcf1_t *r = self.record.ptr
        cdef bcf_info_t *info = bcf_get_info(h, r, key)

        return info != NULL

    def iterkeys(self):
        """D.iterkeys() -> an iterator over the keys of D"""
        return iter(self)

    def itervalues(self):
        """D.itervalues() -> an iterator over the values of D"""
        cdef bcf1_t *r = self.record.ptr
        cdef bcf_info_t *info
        cdef int i, n = r.n_info

        for i in range(n):
            info = &r.d.info[i]
            yield bcf_info_value(info)

    def iteritems(self):
        """D.iteritems() -> an iterator over the (key, value) items of D"""
        cdef bcf_hdr_t *h = self.record.header.ptr
        cdef bcf1_t *r = self.record.ptr
        cdef bcf_info_t *info
        cdef int i, n = r.n_info

        for i in range(n):
            info = &r.d.info[i]
            key = bcf_hdr_int2id(h, BCF_DT_ID, info.key)
            value = bcf_info_value(info)
            yield key, value

    def keys(self):
        """D.keys() -> list of D's keys"""
        return list(self)

    def items(self):
        """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
        return list(self.iteritems())

    def values(self):
        """D.values() -> list of D's values"""
        return list(self.itervalues())

    # Mappings are not hashable by default, but subclasses can change this
    __hash__ = None

    #TODO: implement __richcmp__


cdef VariantRecordInfo makeVariantRecordInfo(VariantRecord record):
    """Create the INFO mapping view bound to *record*.

    Raises ValueError if *record* is None/falsy.
    """
    if not record:
        raise ValueError('invalid VariantRecord')

    cdef VariantRecordInfo info = VariantRecordInfo.__new__(VariantRecordInfo)
    info.record = record

    return info


cdef class VariantRecordSamples(object):
    """mapping from sample index or name to :class:`VariantRecordSample` object."""

    def __len__(self):
        return bcf_hdr_nsamples(self.record.header.ptr)

    def __bool__(self):
        return bcf_hdr_nsamples(self.record.header.ptr) != 0

    def __getitem__(self, key):
        cdef bcf_hdr_t *h = self.record.header.ptr
        cdef int n = bcf_hdr_nsamples(h)
        cdef int sample_index

        if isinstance(key, int):
            sample_index = key
        else:
            sample_index = bcf_hdr_id2int(h, BCF_DT_SAMPLE, key)
            if sample_index < 0:
                raise KeyError('invalid sample name')

        if sample_index < 0 or sample_index >= n:
            raise IndexError('invalid sample index')

        return makeVariantRecordSample(self.record, sample_index)

    def __iter__(self):
        cdef bcf_hdr_t *h = self.record.header.ptr
        cdef int32_t i, n = bcf_hdr_nsamples(h)

        for i in range(n):
            yield h.samples[i]

    def get(self, key, default=None):
        """D.get(k[,d]) -> D[k] if k in D, else d.  d defaults to None."""
        try:
            return self[key]
        except (KeyError, IndexError):
            # unknown sample name or out-of-range sample index
            return default

    def __contains__(self, key):
        cdef bcf_hdr_t *h = self.record.header.ptr
        cdef int n = bcf_hdr_nsamples(h)
        cdef int sample_index

        if isinstance(key, int):
            sample_index = key
        else:
            # A negative result means the name is unknown; a membership
            # test must answer False rather than raise.
            sample_index = bcf_hdr_id2int(h, BCF_DT_SAMPLE, key)

        return 0 <= sample_index < n

    def iterkeys(self):
        """D.iterkeys() -> an iterator over the keys of D"""
        return iter(self)

    def itervalues(self):
        """D.itervalues() -> an iterator over the values of D"""
        cdef bcf_hdr_t *h = self.record.header.ptr
        cdef int32_t i, n = bcf_hdr_nsamples(h)

        for i in range(n):
            yield makeVariantRecordSample(self.record, i)

    def iteritems(self):
        """D.iteritems() -> an iterator over the (key, value) items of D"""
        cdef bcf_hdr_t *h = self.record.header.ptr
        cdef int32_t i, n = bcf_hdr_nsamples(h)

        for i in range(n):
            yield h.samples[i], makeVariantRecordSample(self.record, i)

    def keys(self):
        """D.keys() -> list of D's keys"""
        return list(self)

    def items(self):
        """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
        return list(self.iteritems())

    def values(self):
        """D.values() -> list of D's values"""
        return list(self.itervalues())

    # Mappings are not hashable by default, but subclasses can change this
    __hash__ = None

    #TODO: implement __richcmp__


cdef VariantRecordSamples makeVariantRecordSamples(VariantRecord record):
    """Create the per-sample mapping view bound to *record*.

    Raises ValueError if *record* is None/falsy.
    """
    if not record:
        raise ValueError('invalid VariantRecord')

    cdef VariantRecordSamples genos = VariantRecordSamples.__new__(VariantRecordSamples)
    genos.record = record

    return genos


cdef class VariantRecord(object):
    """Variant record"""

    def __dealloc__(self):
        if self.ptr:
            bcf_destroy1(self.ptr)
            self.ptr = NULL

    property rid:
        """internal reference id number"""
        def __get__(self):
            return self.ptr.rid

    property chrom:
        """chromosome/contig name"""
        def __get__(self):
            return bcf_hdr_id2name(self.header.ptr, self.ptr.rid)

    property contig:
        """chromosome/contig name"""
        def __get__(self):
            return bcf_hdr_id2name(self.header.ptr, self.ptr.rid)

    property pos:
        """record start position on chrom/contig (1-based inclusive)"""
        def __get__(self):
            return self.ptr.pos + 1

    property start:
        """record start position on chrom/contig (0-based inclusive)"""
        def __get__(self):
            return self.ptr.pos

    property stop:
        """record stop position on chrom/contig (0-based exclusive)"""
        def __get__(self):
            return self.ptr.pos + self.ptr.rlen

    property rlen:
        """record length on chrom/contig (rec.stop - rec.start)"""
        def __get__(self):
            return self.ptr.rlen

    property qual:
        """phred scaled quality score or None if not available"""
        def __get__(self):
            return self.ptr.qual if not bcf_float_is_missing(self.ptr.qual) else None

    property id:
        """record identifier or None if not available"""
        def __get__(self):
            if bcf_unpack(self.ptr, BCF_UN_STR) < 0:
                raise ValueError('Error unpacking VariantRecord')
            # local name chosen so the builtin id() is not shadowed
            record_id = self.ptr.d.id
            return record_id if record_id != b'.' else None

    property ref:
        """reference allele"""
        def __get__(self):
            if bcf_unpack(self.ptr, BCF_UN_STR) < 0:
                raise ValueError('Error unpacking VariantRecord')
            return self.ptr.d.allele[0] if self.ptr.d.allele else None

    property alleles:
        """tuple of reference allele followed by alt alleles"""
        def __get__(self):
            if bcf_unpack(self.ptr, BCF_UN_STR) < 0:
                raise ValueError('Error unpacking VariantRecord')
            if not self.ptr.d.allele:
                return None
            return tuple(self.ptr.d.allele[i] for i in range(self.ptr.n_allele))

    property alts:
        """tuple of alt alleles"""
        def __get__(self):
            if bcf_unpack(self.ptr, BCF_UN_STR) < 0:
                raise ValueError('Error unpacking VariantRecord')
            if self.ptr.n_allele < 2 or not self.ptr.d.allele:
                return None
            return tuple(self.ptr.d.allele[i] for i in range(1, self.ptr.n_allele))

    property filter:
        """filter information (see :class:`VariantRecordFilter`)"""
        def __get__(self):
            if bcf_unpack(self.ptr, BCF_UN_FLT) < 0:
                raise ValueError('Error unpacking VariantRecord')
            return makeVariantRecordFilter(self)

    property info:
        """info data (see :class:`VariantRecordInfo`)"""
        def __get__(self):
            if bcf_unpack(self.ptr, BCF_UN_INFO) < 0:
                raise ValueError('Error unpacking VariantRecord')
            return makeVariantRecordInfo(self)

    property format:
        """sample format metadata (see :class:`VariantRecordFormat`)"""
        def __get__(self):
            if bcf_unpack(self.ptr, BCF_UN_FMT) < 0:
                raise ValueError('Error unpacking VariantRecord')
            return makeVariantRecordFormat(self)

    property samples:
        """sample data (see :class:`VariantRecordSamples`)"""
        def __get__(self):
            if bcf_unpack(self.ptr, BCF_UN_IND) < 0:
                raise ValueError('Error unpacking VariantRecord')
            return makeVariantRecordSamples(self)

    def __str__(self):
        """Return the record formatted as a VCF text line.

        Raises ValueError if vcf_format reports failure.
        """
        cdef kstring_t line

        line.l = line.m = 0
        line.s = NULL

        if vcf_format(self.header.ptr, self.ptr, &line) < 0:
            if line.m:
                free(line.s)
            raise ValueError('vcf_format failed')

        # NOTE: trailing CR/LF is intentionally left in place.
        try:
            return force_str(line.s[:line.l])
        finally:
            # release the kstring buffer even if the conversion raises
            if line.m:
                free(line.s)


cdef VariantRecord makeVariantRecord(VariantHeader header, bcf1_t *r):
    """Wrap the C record *r* in a :class:`VariantRecord` bound to *header*.

    Raises ValueError if *header* is None/falsy or *r* is NULL.
    """
    if not header:
        raise ValueError('invalid VariantHeader')

    if not r:
        raise ValueError('cannot create VariantRecord')

    cdef VariantRecord record = VariantRecord.__new__(VariantRecord)
    record.header = header
    record.ptr = r

    return record


########################################################################
########################################################################
## Variant Sampletype object
########################################################################


cdef class VariantRecordSample(object):
    """Data for a single sample from a :class:`VariantRecord` object.
       Provides data accessors for genotypes and a mapping interface from format name to values.
    """

    property name:
        """sample name"""
        def __get__(self):
            cdef bcf_hdr_t *h = self.record.header.ptr
            cdef int32_t n = bcf_hdr_nsamples(h)

            if self.index < 0 or self.index >= n:
                raise ValueError('invalid sample index')

            return h.samples[self.index]

    property allele_indices:
        """allele indices for called genotype, if present.  Otherwise None"""
        def __get__(self):
            cdef bcf_hdr_t *h = self.record.header.ptr
            cdef bcf1_t *r = self.record.ptr
            cdef int32_t n = bcf_hdr_nsamples(h)

            if self.index < 0 or self.index >= n or not r.n_fmt:
                return None

            # Only the first FORMAT field is inspected for genotype data.
            cdef bcf_fmt_t *fmt0 = r.d.fmt
            cdef int gt0 = is_gt_fmt(h, fmt0)

            if not gt0 or not fmt0.n:
                return None

            cdef int8_t *data8
            cdef int16_t *data16
            cdef int32_t *data32
            alleles = []

            # fmt0.p is a raw byte buffer; reinterpret it according to the
            # stored integer width.  Each value is decoded as
            # (value >> 1) - 1, stopping at the vector-end sentinel.
            if fmt0.type == BCF_BT_INT8:
                data8 = <int8_t *>(fmt0.p + self.index * fmt0.size)
                for i in range(fmt0.n):
                    if data8[i] == bcf_int8_vector_end:
                        break
                    alleles.append((data8[i] >> 1) - 1)
            elif fmt0.type == BCF_BT_INT16:
                data16 = <int16_t *>(fmt0.p + self.index * fmt0.size)
                for i in range(fmt0.n):
                    if data16[i] == bcf_int16_vector_end:
                        break
                    alleles.append((data16[i] >> 1) - 1)
            elif fmt0.type == BCF_BT_INT32:
                data32 = <int32_t *>(fmt0.p + self.index * fmt0.size)
                for i in range(fmt0.n):
                    if data32[i] == bcf_int32_vector_end:
                        break
                    alleles.append((data32[i] >> 1) - 1)

            return tuple(alleles)

    property alleles:
        """alleles for called genotype, if present.  Otherwise None"""
        def __get__(self):
            cdef bcf1_t *r = self.record.ptr
            cdef int32_t nalleles = r.n_allele
            cdef int32_t a

            # Reuse the index decoding instead of duplicating it.
            indices = self.allele_indices
            if indices is None:
                return None

            alleles = []
            for a in indices:
                # indices outside [0, nalleles) map to None
                alleles.append(r.d.allele[a] if 0 <= a < nalleles else None)

            return tuple(alleles)

    def __len__(self):
        return self.record.ptr.n_fmt

    def __bool__(self):
        return self.record.ptr.n_fmt != 0

    def __getitem__(self, key):
        cdef bcf_hdr_t *h = self.record.header.ptr
        cdef bcf1_t *r = self.record.ptr
        cdef bcf_fmt_t *fmt
        cdef int index

        if isinstance(key, int):
            index = key
            if index < 0 or index >= r.n_fmt:
                raise IndexError('invalid format index')
            fmt = r.d.fmt + index
        else:
            fmt = bcf_get_fmt(h, r, key)

        if not fmt:
            raise KeyError('invalid format requested')

        if is_gt_fmt(h, fmt):
            return self.alleles
        elif fmt.p and fmt.n and fmt.size:
            return bcf_array_to_object(fmt.p + self.index * fmt.size, fmt.type, fmt.n, scalar=1)
        else:
            return None

    def __iter__(self):
        cdef bcf_hdr_t *h = self.record.header.ptr
        cdef bcf1_t *r = self.record.ptr
        cdef int i, n = r.n_fmt

        for i in range(n):
            yield bcf_hdr_int2id(h, BCF_DT_ID, r.d.fmt[i].id)

    def get(self, key, default=None):
        """D.get(k[,d]) -> D[k] if k in D, else d.  d defaults to None."""
        try:
            return self[key]
        except KeyError:
            return default

    def __contains__(self, key):
        cdef bcf_hdr_t *h = self.record.header.ptr
        cdef bcf1_t *r = self.record.ptr
        cdef bcf_fmt_t *fmt = bcf_get_fmt(h, r, key)
        return fmt != NULL

    def iterkeys(self):
        """D.iterkeys() -> an iterator over the keys of D"""
        return iter(self)

    def itervalues(self):
        """D.itervalues() -> an iterator over the values of D"""
        for key in self:
            yield self[key]

    def iteritems(self):
        """D.iteritems() -> an iterator over the (key, value) items of D"""
        for key in self:
            yield (key, self[key])

    def keys(self):
        """D.keys() -> list of D's keys"""
        return list(self)

    def items(self):
        """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
        return list(self.iteritems())

    def values(self):
        """D.values() -> list of D's values"""
        return list(self.itervalues())

    # Mappings are not hashable by default, but subclasses can change this
    __hash__ = None

    #TODO: implement __richcmp__


cdef VariantRecordSample makeVariantRecordSample(VariantRecord record, int32_t sample_index):
    """Create the view of sample number *sample_index* within *record*.

    Raises ValueError if *record* is None/falsy or the index is negative.
    """
    if not record or sample_index < 0:
        raise ValueError('cannot create VariantRecordSample')

    cdef VariantRecordSample sample = VariantRecordSample.__new__(VariantRecordSample)
    sample.record = record
    sample.index = sample_index

    return sample


########################################################################
########################################################################
## Index objects
########################################################################


cdef class BaseIndex(object):
    """Mapping from reference name (or position) to reference index/name.

    ``refs`` holds the reference names in order; ``refmap`` maps each
    name to its position in ``refs``.
    """
    def __init__(self):
        self.refs = ()
        # must be ``refmap``: that is the attribute every accessor reads
        self.refmap = {}

    def __len__(self):
        return len(self.refs)

    def __bool__(self):
        return len(self.refs) != 0

    def __getitem__(self, key):
        if isinstance(key, int):
            return self.refs[key]
        else:
            return self.refmap[key]

    def __iter__(self):
        return iter(self.refs)

    def get(self, key, default=None):
        """D.get(k[,d]) -> D[k] if k in D, else d.  d defaults to None."""
        try:
            return self[key]
        except (KeyError, IndexError):
            # unknown name or out-of-range integer position
            return default

    def __contains__(self, key):
        try:
            self[key]
        except (KeyError, IndexError):
            return False
        else:
            return True

    def iterkeys(self):
        """D.iterkeys() -> an iterator over the keys of D"""
        return iter(self)

    def itervalues(self):
        """D.itervalues() -> an iterator over the values of D"""
        for key in self:
            yield self[key]

    def iteritems(self):
        """D.iteritems() -> an iterator over the (key, value) items of D"""
        for key in self:
            yield (key, self[key])

    def keys(self):
        """D.keys() -> list of D's keys"""
        return list(self)

    def items(self):
        """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
        return list(self.iteritems())

    def values(self):
        """D.values() -> list of D's values"""
        return list(self.itervalues())

    # Mappings are not hashable by default, but subclasses can change this
    __hash__ = None

    #TODO: implement __richcmp__


# NOTE(review): TabixIndex derives from BaseIndex while this class is
# declared with ``object`` as its base here, although it uses the same
# refs/refmap attributes -- confirm against the declaration in the .pxd.
cdef class BCFIndex(object):
    """CSI index data structure for BCF files"""
    def __init__(self):
        self.refs = ()
        self.refmap = {}

        if not self.ptr:
            raise ValueError('Invalid index object')

        cdef int n
        cdef const char **refs = bcf_index_seqnames(self.ptr, self.header.ptr, &n)

        if not refs:
            raise ValueError('Cannot retrieve reference sequence names')

        self.refs = char_array_to_tuple(refs, n, free_after=1)
        self.refmap = { r:i for i,r in enumerate(self.refs) }

    def __dealloc__(self):
        if self.ptr:
            hts_idx_destroy(self.ptr)
            self.ptr = NULL

    def fetch(self, bcf, contig, start, stop, region, reopen):
        """Return a :class:`BCFIterator` over the requested region of *bcf*."""
        return BCFIterator(bcf, contig, start, stop, region, reopen)


cdef BCFIndex makeBCFIndex(VariantHeader header, hts_idx_t *idx):
    """Wrap the C index *idx* in a :class:`BCFIndex` bound to *header*.

    Returns None when *idx* is NULL (no index available).  Raises
    ValueError if *header* is None/falsy; the index is destroyed first in
    that case because no Python object has taken ownership of it yet.
    """
    if not idx:
        return None

    if not header:
        hts_idx_destroy(idx)
        raise ValueError('invalid VariantHeader')

    cdef BCFIndex index = BCFIndex.__new__(BCFIndex)
    index.header = header
    index.ptr = idx
    # __new__ bypasses __init__, so run it explicitly once ptr is set.
    index.__init__()

    return index


cdef class TabixIndex(BaseIndex):
    """Tabix index data structure for VCF files"""
    def __init__(self):
        self.refs = ()
        self.refmap = {}

        if not self.ptr:
            raise ValueError('Invalid index object')

        cdef int n
        cdef const char **refs = tbx_seqnames(self.ptr, &n)

        if not refs:
            raise ValueError('Cannot retrieve reference sequence names')

        self.refs = char_array_to_tuple(refs, n, free_after=1)
        self.refmap = { r:i for i,r in enumerate(self.refs) }

    def __dealloc__(self):
        if self.ptr:
            tbx_destroy(self.ptr)
            self.ptr = NULL

    def fetch(self, bcf, contig, start, stop, region, reopen):
        """Return a :class:`TabixIterator` over the requested region of *bcf*."""
        return TabixIterator(bcf, contig, start, stop, region, reopen)


cdef TabixIndex makeTabixIndex(tbx_t *idx):
    """Wrap the C tabix index *idx* in a :class:`TabixIndex`.

    Returns None when *idx* is NULL (no index available).
    """
    if not idx:
        return None

    cdef TabixIndex index = TabixIndex.__new__(TabixIndex)
    index.ptr = idx
    # __new__ bypasses __init__, so run it explicitly once ptr is set.
    index.__init__()

    return index


########################################################################
########################################################################
## Iterators
########################################################################


cdef class BaseIterator(object):
    pass


# Internal function to clean up after iteration stop or failure.
# This would be a nested function if it weren't a cdef function.
cdef void _stop_BCFIterator(BCFIterator self, bcf1_t *record):
    bcf_destroy1(record)

    # destroy iter so future calls to __next__ raise StopIteration
    bcf_itr_destroy(self.iter)
    self.iter = NULL


cdef class BCFIterator(BaseIterator):
    """Iterator over the records of an indexed BCF file within a region."""

    def __init__(self, VariantFile bcf, contig=None, start=None, stop=None, region=None, reopen=True):

        if not isinstance(bcf.index, BCFIndex):
            raise ValueError('bcf index required')

        cdef BCFIndex index = bcf.index

        if not index:
            raise ValueError('bcf index required')

        if reopen:
            bcf = bcf.copy()

        if region is not None:
            if contig is not None or start is not None or stop is not None:
                raise ValueError('region cannot be combined with contig, start or stop')

            self.iter = bcf_itr_querys(index.ptr, bcf.header.ptr, region)
        else:
            if contig is None:
                raise ValueError('either contig or region must be specified')

            # unknown contigs map to -1
            rid = index.refmap.get(contig, -1)

            if start is None:
                start = 0
            if stop is None:
                stop = MAX_POS

            self.iter = bcf_itr_queryi(index.ptr, rid, start, stop)

        # Do not fail on self.iter == NULL, since it signifies a null query.

        self.bcf = bcf
        self.index = index

    def __dealloc__(self):
        if self.iter:
            bcf_itr_destroy(self.iter)
            self.iter = NULL

    def __iter__(self):
        return self

    def __next__(self):
        if not self.iter:
            raise StopIteration

        cdef bcf1_t *record = bcf_init1()

        record.pos = -1
        if self.bcf.drop_samples:
            record.max_unpack = BCF_UN_SHR

        cdef int ret = bcf_itr_next(self.bcf.htsfile, self.iter, record)

        if ret < 0:
            _stop_BCFIterator(self, record)
            if ret == -1:
                raise StopIteration
            else:
                raise ValueError('error reading BCF file')

        ret = bcf_subset_format(self.bcf.header.ptr, record)

        if ret < 0:
            _stop_BCFIterator(self, record)
            raise ValueError('error in bcf_subset_format')

        return makeVariantRecord(self.bcf.header, record)


cdef class TabixIterator(BaseIterator):
    """Iterator over the records of a tabix-indexed VCF file within a region."""

    def __cinit__(self, *args, **kwargs):
        self.line_buffer.l = 0
        self.line_buffer.m = 0
        self.line_buffer.s = NULL

    def __init__(self, VariantFile bcf, contig=None, start=None, stop=None, region=None, reopen=True):
        if not isinstance(bcf.index, TabixIndex):
            raise ValueError('tabix index required')

        cdef TabixIndex index = bcf.index

        if not index:
            raise ValueError('tabix index required')

        if reopen:
            bcf = bcf.copy()

        if region is not None:
            if contig is not None or start is not None or stop is not None:
                raise ValueError('region cannot be combined with contig, start or stop')

            self.iter = tbx_itr_querys(index.ptr, region)
        else:
            if contig is None:
                raise ValueError('either contig or region must be specified')

            # unknown contigs map to -1
            rid = index.refmap.get(contig, -1)

            if start is None:
                start = 0
            if stop is None:
                stop = MAX_POS

            self.iter = tbx_itr_queryi(index.ptr, rid, start, stop)

        # Do not fail on self.iter == NULL, since it signifies a null query.

        self.bcf = bcf
        self.index = index

    def __dealloc__(self):
        if self.iter:
            tbx_itr_destroy(self.iter)
            self.iter = NULL

        if self.line_buffer.m:
            free(self.line_buffer.s)

        self.line_buffer.l = 0
        self.line_buffer.m = 0
        self.line_buffer.s = NULL

    def __iter__(self):
        return self

    def __next__(self):
        if not self.iter:
            raise StopIteration

        cdef int ret = tbx_itr_next(self.bcf.htsfile, self.index.ptr, self.iter, &self.line_buffer)

        if ret < 0:
            tbx_itr_destroy(self.iter)
            self.iter = NULL
            if ret == -1:
                raise StopIteration
            else:
                raise ValueError('error reading indexed VCF file')

        cdef bcf1_t *record = bcf_init1()

        record.pos = -1
        if self.bcf.drop_samples:
            record.max_unpack = BCF_UN_SHR

        ret = vcf_parse1(&self.line_buffer, self.bcf.header.ptr, record)

        # FIXME: stop iteration on parse failure?
        if ret < 0:
            bcf_destroy1(record)
            raise ValueError('error in vcf_parse')

        return makeVariantRecord(self.bcf.header, record)


########################################################################
########################################################################
## Variant File
########################################################################


cdef class VariantFile(object):
    """*(filename, mode=None, header=None, drop_samples=False)*

    A :term:`VCF`/:term:`BCF` formatted file. The file is automatically
    opened.

    *mode* should be ``r`` for reading or ``w`` for writing. The default is
    text mode (:term:`VCF`).  For binary (:term:`BCF`) I/O you should append
    ``b`` for compressed or ``u`` for uncompressed :term:`BCF` output.

    If ``b`` is present, it must immediately follow ``r`` or ``w``.  Valid
    modes are ``r``, ``w``, ``wh``, ``rb``, ``wb``, ``wbu`` and ``wb0``.
    For instance, to open a :term:`BCF` formatted file for reading, type::

        f = pysam.VariantFile('ex1.bcf','rb')

    If mode is not specified, we will try to auto-detect in the order 'rb',
    'r', thus both the following should work::

        f1 = pysam.VariantFile('ex1.bcf')
        f2 = pysam.VariantFile('ex1.vcf')

    If an index for a variant file exists (.csi or .tbi), it will be opened
    automatically.  Without an index random access to records via
    :meth:`fetch` is disabled.

    For writing, a :class:`VariantHeader` object must be provided, typically
    obtained from another :term:`VCF` file/:term:`BCF` file.
    """
    def __cinit__(self, *args, **kwargs):
        self.htsfile = NULL

    def __init__(self, *args, **kwargs):
        self.header = None
        self.index = None
        self.filename = None
        self.mode = None
        self.is_stream = False
        self.is_remote = False
        self.is_reading = False
        self.drop_samples = False
        self.start_offset = -1

        self.open(*args, **kwargs)

    def __dealloc__(self):
        if self.htsfile:
            hts_close(self.htsfile)
            self.htsfile = NULL

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False

    property category:
        """General file format category.  One of UNKNOWN, ALIGNMENTS, VARIANTS, INDEX, REGIONS"""
        def __get__(self):
            if not self.htsfile:
                raise ValueError('metadata not available on closed file')
            return FORMAT_CATEGORIES[self.htsfile.format.category]

    property format:
        """File format.

           One of UNKNOWN, BINARY_FORMAT, TEXT_FORMAT, SAM, BAM, BAI, CRAM, CRAI, VCF, BCF, CSI, GZI, TBI, BED.
        """
        def __get__(self):
            if not self.htsfile:
                raise ValueError('metadata not available on closed file')
            return FORMATS[self.htsfile.format.format]

    property version:
        """Tuple of file format version numbers (major, minor)"""
        def __get__(self):
            if not self.htsfile:
                raise ValueError('metadata not available on closed file')
            return self.htsfile.format.version.major, self.htsfile.format.version.minor

    property compression:
        """File compression.  One of NONE, GZIP, BGZF, CUSTOM."""
        def __get__(self):
            if not self.htsfile:
                raise ValueError('metadata not available on closed file')
            return COMPRESSION[self.htsfile.format.compression]

    property description:
        """Vaguely human readable description of the file format"""
        def __get__(self):
            if not self.htsfile:
                raise ValueError('metadata not available on closed file')
            cdef char *desc = hts_format_description(&self.htsfile.format)
            try:
                return force_str(desc)
            finally:
                free(desc)

    def close(self):
        """closes the :class:`pysam.VariantFile`."""
        if self.htsfile:
            hts_close(self.htsfile)
            self.htsfile = NULL
        self.header = self.index = None

    property is_open:
        """return True if VariantFile is open and in a valid state."""
        def __get__(self):
            return self.htsfile != NULL

    def __iter__(self):
        if not self.is_open:
            raise ValueError('I/O operation on closed file')

        if self.mode[0] != b'r':
            raise ValueError('cannot iterate over Variantfile opened for writing')

        self.is_reading = 1
        return self

    def __next__(self):
        # close() drops both the file handle and the header; refuse to read
        # rather than dereference them.
        if not self.htsfile:
            raise ValueError('I/O operation on closed file')

        cdef int ret
        cdef bcf1_t *record = bcf_init1()

        record.pos = -1
        if self.drop_samples:
            record.max_unpack = BCF_UN_SHR

        ret = bcf_read1(self.htsfile, self.header.ptr, record)

        if ret < 0:
            bcf_destroy1(record)
            if ret == -1:
                raise StopIteration
            elif ret == -2:
                raise IOError('truncated file')
            else:
                raise ValueError('Variant read failed')

        return makeVariantRecord(self.header, record)

    def copy(self):
        """Return a new :class:`VariantFile` on the same file with its own
        file handle, sharing this object's header and index.

        Raises ValueError if this file is closed or cannot be re-opened.
        """
        if not self.is_open:
            raise ValueError('I/O operation on closed file')

        cdef VariantFile vars = VariantFile.__new__(VariantFile)

        # FIXME: re-open using fd or else header and index could be invalid
        vars.htsfile = hts_open(self.filename, self.mode)

        if not vars.htsfile:
            raise ValueError('Cannot re-open htsfile')

        # minimize overhead by re-using header and index.  This approach is
        # currently risky, but see above for how this can be mitigated.
        vars.header = self.header
        vars.index = self.index

        vars.filename = self.filename
        vars.mode = self.mode
        vars.drop_samples = self.drop_samples
        vars.is_stream = self.is_stream
        vars.is_remote = self.is_remote
        vars.is_reading = self.is_reading
        vars.start_offset = self.start_offset

        if self.htsfile.is_bin:
            vars.seek(self.tell())
        else:
            # the parsed header is discarded here; only the read itself
            # is wanted on the freshly opened handle
            makeVariantHeader(bcf_hdr_read(vars.htsfile))

        return vars

    def open(self, filename, mode=None, VariantHeader header=None, drop_samples=False):
        """open a vcf/bcf file.

        If open is called on an existing VariantFile, the current file will be
        closed and a new file will be opened.

        Raises ValueError for an invalid mode, a missing header in write
        mode, or a file that cannot be opened/parsed, and IOError when a
        local file opened for reading does not exist.
        """
        # close a previously opened file
        if self.is_open:
            self.close()

        # read mode autodetection: try binary first, then text
        if mode is None:
            try:
                self.open(filename, 'rb', header=header, drop_samples=drop_samples)
                return
            except ValueError:
                pass

            self.open(filename, 'r', header=header, drop_samples=drop_samples)
            return

        if mode not in ('r','w','rb','wb', 'wh', 'wbu', 'rU', 'wb0'):
            raise ValueError('invalid file opening mode `{}`'.format(mode))

        mode = mode.encode('ascii')

        # for htslib, wbu seems to not work
        if mode == b'wbu':
            mode = b'wb0'

        self.mode = mode
        self.filename = filename = encode_filename(filename)
        self.drop_samples = bool(drop_samples)

        # FIXME: Use htsFormat when it is available
        self.is_remote = filename.startswith((b'http:', b'ftp:'))
        self.is_stream = filename == b'-'

        if mode[0] == b'w':
            # open file for writing

            # header structure (used for writing)
            if header:
                self.header = header.copy()
            else:
                raise ValueError('a VariantHeader must be specified')

            # open file. Header gets written to file at the same time for bam files
            # and sam files (in the latter case, the mode needs to be wh)
            self.htsfile = hts_open(filename, mode)

            if not self.htsfile:
                raise ValueError("could not open file `{}` (mode='{}')".format(filename, mode))

            bcf_hdr_write(self.htsfile, self.header.ptr)

        elif mode[0] == b'r':
            # open file for reading
            if filename != b'-' and not self.is_remote and not os.path.exists(filename):
                raise IOError('file `{}` not found'.format(filename))

            self.htsfile = hts_open(filename, mode)

            if not self.htsfile:
                raise ValueError("could not open file `{}` (mode='{}') - is it VCF/BCF format?".format(filename, mode))

            self.header = makeVariantHeader(bcf_hdr_read(self.htsfile))

            if not self.header:
                raise ValueError("file `{}` does not have valid header (mode='{}') - is it BCF format?".format(filename, mode))

            # check for index and open if present
            if self.htsfile.format.format == bcf:
                self.index = makeBCFIndex(self.header, bcf_index_load(filename))
            else:
                # filename is bytes at this point, so the suffix must be too
                self.index = makeTabixIndex(tbx_index_load(filename + b'.tbi'))

            if not self.is_stream:
                self.start_offset = self.tell()

    def reset(self):
        """reset file position to beginning of file just after the header."""
        # seek() takes the offset only
        return self.seek(self.start_offset)

    def seek(self, uint64_t offset):
        """move file pointer to position *offset*, see :meth:`pysam.VariantFile.tell`."""
        if not self.is_open:
            raise ValueError('I/O operation on closed file')
        if self.is_stream:
            raise OSError('seek not available in streams')

        if self.htsfile.format.compression != no_compression:
            return bgzf_seek(hts_get_bgzfp(self.htsfile), offset, SEEK_SET)
        else:
            return hts_useek(self.htsfile, offset, SEEK_SET)

    def tell(self):
        """return current file position, see :meth:`pysam.VariantFile.seek`."""
        if not self.is_open:
            raise ValueError('I/O operation on closed file')
        if self.is_stream:
            raise OSError('tell not available in streams')

        if self.htsfile.format.compression != no_compression:
            return bgzf_tell(hts_get_bgzfp(self.htsfile))
        else:
            return hts_utell(self.htsfile)

    def fetch(self, contig=None, start=None, stop=None, region=None, reopen=False):
        """fetch records in a :term:`region` using 0-based indexing. The
        region is specified by :term:`contig`, *start* and *stop*.
        Alternatively, a samtools :term:`region` string can be supplied.

        Without *contig* or *region* all mapped records will be fetched.  The
        records will be returned ordered by contig, which will not necessarily
        be the order within the file.

        Set *reopen* to true if you will be using multiple iterators on the
        same file at the same time.  The iterator returned will receive its
        own copy of a filehandle to the file effectively re-opening the
        file.  Re-opening a file incurs some overhead, so use with care.

        If only *contig* is set, all records on *contig* will be fetched.
        If both *region* and *contig* are given, an exception is raised.

        Note that a :term:`VCF` file without a tabix index (.tbi) or a
        :term:`BCF` file without a CSI index can only be read sequentially.
        """
        if not self.is_open:
            raise ValueError('I/O operation on closed file')

        if self.mode[0] != b'r':
            raise ValueError('cannot fetch from Variantfile opened for writing')

        if contig is None and region is None:
            # whole-file iteration: rewind to just after the header
            self.is_reading = 1
            bcf = self.copy() if reopen else self
            bcf.seek(self.start_offset)
            return iter(bcf)

        if not self.index:
            raise ValueError('fetch requires an index')

        self.is_reading = 1
        return self.index.fetch(self, contig, start, stop, region, reopen)

    cpdef int write(self, VariantRecord record) except -1:
        """
        write a single :class:`pysam.VariantRecord` to disk.

        returns the value reported by ``bcf_write1``, or 0 without writing
        anything when the file is closed.  Raises ValueError if the write
        fails.
        """
        if not self.is_open:
            return 0

        cdef int ret = bcf_write1(self.htsfile, self.header.ptr, record.ptr)

        if ret < 0:
            raise ValueError('write failed')

        return ret

    def subset_samples(self, include_samples):
        """
        Read only a subset of samples to reduce processing time and memory.
        Must be called prior to retrieving records.
        """
        if not self.is_open:
            raise ValueError('I/O operation on closed file')

        if self.mode[0] != b'r':
            raise ValueError('cannot subset samples from Variantfile opened for writing')

        if self.is_reading:
            raise ValueError('cannot subset samples after fetching records')

        self.header._subset_samples(include_samples)

        # potentially unnecessary optimization that also sets max_unpack
        if not include_samples:
            self.drop_samples = True
diff --git a/pysam/chtslib.pxd b/pysam/chtslib.pxd index d62e281d..4fd745ae 100644 --- a/pysam/chtslib.pxd +++ b/pysam/chtslib.pxd @@ -3,6 +3,7 @@ from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t from libc.stdlib cimport malloc, calloc, realloc, free from libc.string cimport memcpy, memcmp, strncpy, strlen, strdup from libc.stdio cimport FILE, printf +from posix.types cimport off_t cdef extern from "Python.h": long _Py_HashPointer(void*) @@ -20,16 +21,129 @@ cdef extern from "zlib.h" nogil: gzFile gzopen( char *path, char *mode) gzFile gzdopen (int fd, char *mode) char * gzgets(gzFile file, char *buf, int len) - int gzeof( gzFile file ) + int gzeof(gzFile file) + cdef extern from "htslib/kstring.h" nogil: ctypedef struct kstring_t: size_t l, m char *s + +cdef extern from "htslib_util.h" nogil: + ctypedef uint32_t khint32_t + ctypedef uint32_t khint_t + ctypedef khint_t khiter_t + + # Used to manage BCF Header info + ctypedef struct vdict_t: + khint_t n_buckets, size, n_occupied, upper_bound + khint32_t *flags + const char *keys + bcf_idinfo_t *vals + + # Used to manage indexed contigs in Tabix + ctypedef struct s2i_t: + khint_t n_buckets, size, n_occupied, upper_bound
+ khint32_t *flags + const char *keys + int64_t *vals + + # Generic khash methods + khint_t kh_size(void *d) + khint_t kh_begin(void *d) + khint_t kh_end(void *d) + int kh_exist(void *d, khiter_t i) + + # Specialized khash methods for vdict + khint_t kh_get_vdict(vdict_t *d, const char *key) + const char *kh_key_vdict "kh_key" (vdict_t *d, khint_t i) + bcf_idinfo_t kh_val_vdict "kh_val" (vdict_t *d, khint_t i) + + cdef extern from "htslib/hfile.h" nogil: ctypedef struct hFILE + # @abstract Open the named file or URL as a stream + # @return An hFILE pointer, or NULL (with errno set) if an error occurred. + hFILE *hopen(const char *filename, const char *mode) + + # @abstract Associate a stream with an existing open file descriptor + # @return An hFILE pointer, or NULL (with errno set) if an error occurred. + # @notes For socket descriptors (on Windows), mode should contain 's'. + hFILE *hdopen(int fd, const char *mode) + + # @abstract Report whether the file name or URL denotes remote storage + # @return 0 if local, 1 if remote. + # @notes "Remote" means involving e.g. explicit network access, with the + # implication that callers may wish to cache such files' contents locally. + int hisremote(const char *filename) + + # @abstract Flush (for output streams) and close the stream + # @return 0 if successful, or EOF (with errno set) if an error occurred. + int hclose(hFILE *fp) + + # @abstract Close the stream, without flushing or propagating errors + # @notes For use while cleaning up after an error only. Preserves errno. + void hclose_abruptly(hFILE *fp) + + # @abstract Return the stream's error indicator + # @return Non-zero (in fact, an errno value) if an error has occurred. + # @notes This would be called herror() and return true/false to parallel + # ferror(3), but a networking-related herror(3) function already exists. 
*/ + int herrno(hFILE *fp) + + # @abstract Clear the stream's error indicator + void hclearerr(hFILE *fp) + + # @abstract Reposition the read/write stream offset + # @return The resulting offset within the stream (as per lseek(2)), + # or negative if an error occurred. + off_t hseek(hFILE *fp, off_t offset, int whence) + + # @abstract Report the current stream offset + # @return The offset within the stream, starting from zero. + off_t htell(hFILE *fp) + + # @abstract Read one character from the stream + # @return The character read, or EOF on end-of-file or error + int hgetc(hFILE *fp) + + # @abstract Peek at characters to be read without removing them from buffers + # @param fp The file stream + # @param buffer The buffer to which the peeked bytes will be written + # @param nbytes The number of bytes to peek at; limited by the size of the + # internal buffer, which could be as small as 4K. + # @return The number of bytes peeked, which may be less than nbytes if EOF + # is encountered; or negative, if there was an I/O error. + # @notes The characters peeked at remain in the stream's internal buffer, + # and will be returned by later hread() etc calls. + ssize_t hpeek(hFILE *fp, void *buffer, size_t nbytes) + + # @abstract Read a block of characters from the file + # @return The number of bytes read, or negative if an error occurred. + # @notes The full nbytes requested will be returned, except as limited + # by EOF or I/O errors. + ssize_t hread(hFILE *fp, void *buffer, size_t nbytes) + + # @abstract Write a character to the stream + # @return The character written, or EOF if an error occurred. + int hputc(int c, hFILE *fp) + + # @abstract Write a string to the stream + # @return 0 if successful, or EOF if an error occurred. + int hputs(const char *text, hFILE *fp) + + # @abstract Write a block of characters to the file + # @return Either nbytes, or negative if an error occurred. + # @notes In the absence of I/O errors, the full nbytes will be written. 
+ ssize_t hwrite(hFILE *fp, const void *buffer, size_t nbytes) + + # @abstract For writing streams, flush buffered output to the underlying stream + # @return 0 if successful, or EOF if an error occurred. + int hflush(hFILE *fp) + + cdef extern from "htslib/bgzf.h" nogil: ctypedef struct bgzf_mtaux_t ctypedef struct bgzidx_t @@ -61,9 +175,10 @@ cdef extern from "htslib/bgzf.h" nogil: # Open an existing file descriptor for reading or writing. # # @param fd file descriptor - # @param mode mode matching /[rwa][u0-9]+/: 'r' for reading, 'w' for - # writing, or 'a' for appending, while a digit specifies - # the zlib compression level. + # @param mode mode matching /[rwag][u0-9]+/: 'r' for reading, 'w' for + # writing, 'a' for appending, 'g' for gzip rather than BGZF + # compression (with 'w' only), and digit specifies the zlib + # compression level. # Note that there is a distinction between 'u' and '0': the # first yields plain uncompressed output whereas the latter # outputs uncompressed data wrapped in the zlib format. @@ -124,12 +239,13 @@ cdef extern from "htslib/bgzf.h" nogil: # Write the data in the buffer to the file. int bgzf_flush(BGZF *fp) + int SEEK_SET + # Return a virtual file pointer to the current location in the file. # No interpetation of the value should be made, other than a subsequent # call to bgzf_seek can be used to position the file at the same point. # Return value is non-negative on success. - #define bgzf_tell(fp) (((fp)->block_address << 16) | ((fp)->block_offset & 0xFFFF)) - int64_t bgzf_tell(BGZF * fp) + int64_t bgzf_tell(BGZF *fp) # Set the file to read from the location specified by _pos_. 
# @@ -244,33 +360,82 @@ cdef extern from "htslib/hts.h" nogil: ctypedef struct cram_fd ctypedef union FilePointerUnion: - BGZF * bgzf - cram_fd * cram - hFILE * hfile - void * voidp + BGZF *bgzf + cram_fd *cram + hFILE *hfile + void *voidp + + ctypedef enum htsFormatCategory: + unknown_category + sequence_data # Sequence data -- SAM, BAM, CRAM, etc + variant_data # Variant calling data -- VCF, BCF, etc + index_file # Index file associated with some data file + region_list # Coordinate intervals or regions -- BED, etc + category_maximum + + ctypedef enum htsExactFormat: + unknown_format + binary_format + text_format + sam, bam, bai, cram, crai, vcf, bcf, csi, gzi, tbi, bed + format_maximum + + ctypedef enum htsCompression: + no_compression, gzip, bgzf, custom + compression_maximum + + cdef struct htsVersion: + short major, minor + + ctypedef struct htsFormat: + htsFormatCategory category + htsExactFormat format + htsVersion version + htsCompression compression ctypedef struct htsFile: - # uint32_t is_bin:1, is_write:1, is_be:1, is_cram:1, is_compressed:2, is_kstream:1, dummy:25; - uint32_t is_bin + uint8_t is_bin + uint8_t is_write + uint8_t is_be + uint8_t is_cram int64_t lineno kstring_t line - char * fn - char * fn_aux + char *fn + char *fn_aux FilePointerUnion fp + htsFormat format int hts_verbose - # @abstract Table for converting a nucleotide character to the 4-bit encoding. + # @abstract Table for converting a nucleotide character to 4-bit encoding. + # The input character may be either an IUPAC ambiguity code, '=' for 0, or + # '0'/'1'/'2'/'3' for a result of 1/2/4/8. The result is encoded as 1/2/4/8 + # for A/C/G/T or combinations of these bits for ambiguous bases. const unsigned char *seq_nt16_table - # @abstract Table for converting a 4-bit encoded nucleotide to a letter. + # @abstract Table for converting a 4-bit encoded nucleotide to an IUPAC + # ambiguity code letter (or '=' when given 0). 
const char *seq_nt16_str + # @abstract Table for converting a 4-bit encoded nucleotide to about 2 bits. + # Returns 0/1/2/3 for 1/2/4/8 (i.e., A/C/G/T), or 4 otherwise (0 or ambiguous). + const int *seq_nt16_int + # @abstract Get the htslib version number # @return For released versions, a string like "N.N[.N]"; or git describe # output if using a library built within a Git repository. const char *hts_version() + # @abstract Determine format by peeking at the start of a file + # @param fp File opened for reading, positioned at the beginning + # @param fmt Format structure that will be filled out on return + # @return 0 for success, or negative if an error occurred. + int hts_detect_format(hFILE *fp, htsFormat *fmt) + + # @abstract Get a human-readable description of the file format + # @return Description string, to be freed by the caller after use. + char *hts_format_description(const htsFormat *format) + # @abstract Open a SAM/BAM/CRAM/VCF/BCF/etc file # @param fn The file name or "-" for stdin/stdout # @param mode Mode matching /[rwa][bcuz0-9]+/ @@ -281,8 +446,9 @@ cdef extern from "htslib/hts.h" nogil: # specifier letters: # b binary format (BAM, BCF, etc) rather than text (SAM, VCF, etc) # c CRAM format + # g gzip compressed # u uncompressed - # z compressed + # z bgzf compressed # [0-9] zlib compression level # Note that there is a distinction between 'u' and '0': the first yields # plain uncompressed output whereas the latter outputs uncompressed data @@ -294,11 +460,29 @@ cdef extern from "htslib/hts.h" nogil: # [rw] .. 
uncompressed VCF htsFile *hts_open(const char *fn, const char *mode) + # @abstract Open an existing stream as a SAM/BAM/CRAM/VCF/BCF/etc file + # @param fp The already-open file handle + # @param fn The file name or "-" for stdin/stdout + # @param mode Open mode, as per hts_open() + htsFile *hts_hopen(hFILE *fp, const char *fn, const char *mode) + # @abstract Close a file handle, flushing buffered data for output streams # @param fp The file handle to be closed # @return 0 for success, or negative if an error occurred. int hts_close(htsFile *fp) + # @abstract Returns the file's format information + # @param fp The file handle + # @return Read-only pointer to the file's htsFormat. + const htsFormat *hts_get_format(htsFile *fp) + + # @abstract Sets a specified CRAM option on the open file handle. + # @param fp The file handle open the open file. + # @param opt The CRAM_OPT_* option. + # @param ... Optional arguments, dependent on the option used. + # @return 0 for success, or negative if an error occurred. + #int hts_set_opt(htsFile *fp, enum cram_option opt, ...) + int hts_getline(htsFile *fp, int delimiter, kstring_t *str) char **hts_readlines(const char *fn, int *_n) @@ -334,6 +518,10 @@ cdef extern from "htslib/hts.h" nogil: int8_t HTS_FMT_TBI int8_t HTS_FMT_CRAI + BGZF *hts_get_bgzfp(htsFile *fp) + int hts_useek(htsFile *fp, long uoffset, int where) + long hts_utell(htsFile *fp) + ctypedef struct hts_idx_t ctypedef struct hts_pair64_t: @@ -349,6 +537,7 @@ cdef extern from "htslib/hts.h" nogil: uint32_t read_rest uint32_t finished int tid, bed, end, n_off, i + int curr_tid, curr_beg, curr_end uint64_t curr_off hts_pair64_t *off hts_readrec_func *readfunc @@ -399,8 +588,8 @@ cdef extern from "htslib/hts.h" nogil: # # Returns one of the FT_* defines. # - # This function was added in order to avoid the need for excessive command - # line switches. + # DEPRECATED: This function has been replaced by hts_detect_format(). 
+ # It and these FT_* macros will be removed in a future HTSlib release. int FT_UNKN int FT_GZ int FT_VCF @@ -711,7 +900,7 @@ cdef extern from "htslib/sam.h" nogil: # set bam_pileup1_t::level, while the later does. Level helps the # implementation of alignment viewers, but calculating this has some # overhead. - # + # # is_del, is_head, etc are a bit field, declaring as below should # work as expected, see # https://groups.google.com/forum/#!msg/cython-users/24tD1kwRY7A/pmoPuSmanM0J @@ -763,6 +952,7 @@ cdef extern from "htslib/sam.h" nogil: # Added by AH # ctypedef bam_pileup1_t * const_bam_pileup1_t_ptr "const bam_pileup1_t *" + cdef extern from "pysam_stream.h" nogil: ctypedef struct kstream_t: @@ -775,12 +965,12 @@ cdef extern from "pysam_stream.h" nogil: kstring_t qual gzFile gzopen(char *, char *) - kseq_t * kseq_init(gzFile) + kseq_t *kseq_init(gzFile) int kseq_read(kseq_t *) void kseq_destroy(kseq_t *) int gzclose(gzFile) - kstream_t * ks_init(gzFile) + kstream_t *ks_init(gzFile) void ks_destroy(kstream_t *) # Retrieve characters from stream until delimiter @@ -790,6 +980,7 @@ cdef extern from "pysam_stream.h" nogil: kstring_t * str, int * dret) + cdef extern from "htslib/faidx.h": ctypedef struct faidx_t: @@ -817,9 +1008,10 @@ cdef extern from "htslib/faidx.h": int faidx_seq_len(faidx_t *fai, const char *seq) + # tabix support cdef extern from "htslib/tbx.h" nogil: - + # tbx.h definitions int8_t TBX_MAX_SHIFT int8_t TBX_GENERIC @@ -842,7 +1034,7 @@ cdef extern from "htslib/tbx.h" nogil: tbx_conf_t tbx_conf_psltbl tbx_conf_t tbx_conf_sam tbx_conf_t tbx_conf_vcf - + void tbx_itr_destroy(hts_itr_t * iter) hts_itr_t * tbx_itr_queryi(tbx_t * t, int tid, int bed, int end) hts_itr_t * tbx_itr_querys(tbx_t * t, char * s) @@ -853,7 +1045,7 @@ cdef extern from "htslib/tbx.h" nogil: int tbx_index_build(char *fn, int min_shift, tbx_conf_t *conf) - + tbx_t * tbx_index_load(char *fn) # free the array but not the values @@ -861,3 +1053,654 @@ cdef extern from 
"htslib/tbx.h" nogil: void tbx_destroy(tbx_t *tbx) + +# VCF/BCF API +cdef extern from "htslib/vcf.h" nogil: + + # Header struct + + uint8_t BCF_HL_FLT # header line + uint8_t BCF_HL_INFO + uint8_t BCF_HL_FMT + uint8_t BCF_HL_CTG + uint8_t BCF_HL_STR # structured header line TAG= + uint8_t BCF_HL_GEN # generic header line + + uint8_t BCF_HT_FLAG # header type + uint8_t BCF_HT_INT + uint8_t BCF_HT_REAL + uint8_t BCF_HT_STR + + uint8_t BCF_VL_FIXED # variable length + uint8_t BCF_VL_VAR + uint8_t BCF_VL_A + uint8_t BCF_VL_G + uint8_t BCF_VL_R + + # === Dictionary === + # + # The header keeps three dictonaries. The first keeps IDs in the + # "FILTER/INFO/FORMAT" lines, the second keeps the sequence names and lengths + # in the "contig" lines and the last keeps the sample names. bcf_hdr_t::dict[] + # is the actual hash table, which is opaque to the end users. In the hash + # table, the key is the ID or sample name as a C string and the value is a + # bcf_idinfo_t struct. bcf_hdr_t::id[] points to key-value pairs in the hash + # table in the order that they appear in the VCF header. bcf_hdr_t::n[] is the + # size of the hash table or, equivalently, the length of the id[] arrays. + + uint8_t BCF_DT_ID # dictionary type + uint8_t BCF_DT_CTG + uint8_t BCF_DT_SAMPLE + + # Complete textual representation of a header line + ctypedef struct bcf_hrec_t: + int type # One of the BCF_HL_* type + char *key # The part before '=', i.e. FILTER/INFO/FORMAT/contig/fileformat etc. + char *value # Set only for generic lines, NULL for FILTER/INFO, etc. 
+ int nkeys # Number of structured fields + char **keys # The key=value pairs + char **vals + + ctypedef struct bcf_idinfo_t: + uint32_t info[3] # stores Number:20, var:4, Type:4, ColType:4 in info[0..2] + bcf_hrec_t *hrec[3] # for BCF_HL_FLT,INFO,FMT and contig length in info[0] for BCF_HL_CTG + int id + + ctypedef struct bcf_idpair_t: + const char *key + const bcf_idinfo_t *val + + ctypedef struct bcf_hdr_t: + int32_t n[3] + bcf_idpair_t *id[3] + void *dict[3] # ID dictionary, contig dict and sample dict + char **samples + bcf_hrec_t **hrec + int nhrec, dirty + int ntransl + int *transl[2] # for bcf_translate() + int nsamples_ori # for bcf_hdr_set_samples() + uint8_t *keep_samples + kstring_t mem + + uint8_t bcf_type_shift[] + + # * VCF record * + + uint8_t BCF_BT_NULL + uint8_t BCF_BT_INT8 + uint8_t BCF_BT_INT16 + uint8_t BCF_BT_INT32 + uint8_t BCF_BT_FLOAT + uint8_t BCF_BT_CHAR + + uint8_t VCF_REF + uint8_t VCF_SNP + uint8_t VCF_MNP + uint8_t VCF_INDEL + uint8_t VCF_OTHER + + ctypedef struct variant_t: + int type, n # variant type and the number of bases affected, negative for deletions + + ctypedef struct bcf_fmt_t: + int id # id: numeric tag id, the corresponding string is bcf_hdr_t::id[BCF_DT_ID][$id].key + int n, size, type # n: number of values per-sample; size: number of bytes per-sample; type: one of BCF_BT_* types + uint8_t *p # same as vptr and vptr_* in bcf_info_t below + uint32_t p_len + uint32_t p_off + uint8_t p_free + + ctypedef union bcf_info_union_t: + int32_t i # integer value + float f # float value + + ctypedef struct bcf_info_t: + int key # key: numeric tag id, the corresponding string is bcf_hdr_t::id[BCF_DT_ID][$key].key + int type, len # type: one of BCF_BT_* types; len: vector length, 1 for scalars + + # v1 union only set if $len==1; for easier access + bcf_info_union_t v1 + uint8_t *vptr # pointer to data array in bcf1_t->shared.s, excluding the size+type and tag id bytes + uint32_t vptr_len # length of the vptr block or, when set, of 
the vptr_mod block, excluding offset + uint32_t vptr_off # vptr offset, i.e., the size of the INFO key plus size+type bytes + uint8_t vptr_free # indicates that vptr-vptr_off must be freed; set only when modified and the new + # data block is bigger than the original + + uint8_t BCF1_DIRTY_ID + uint8_t BCF1_DIRTY_ALS + uint8_t BCF1_DIRTY_FLT + uint8_t BCF1_DIRTY_INF + + ctypedef struct bcf_dec_t: + int m_fmt, m_info, m_id, m_als, m_allele, m_flt # allocated size (high-water mark); do not change + int n_flt # Number of FILTER fields + int *flt # FILTER keys in the dictionary + char *id # ID + char *als # REF+ALT block (\0-separated) + char **allele # allele[0] is the REF (allele[] pointers to the als block); all null terminated + bcf_info_t *info # INFO + bcf_fmt_t *fmt # FORMAT and individual sample + variant_t *var # $var and $var_type set only when set_variant_types called + int n_var, var_type + int shared_dirty # if set, shared.s must be recreated on BCF output + int indiv_dirty # if set, indiv.s must be recreated on BCF output + + uint8_t BCF_ERR_CTG_UNDEF + uint8_t BCF_ERR_TAG_UNDEF + uint8_t BCF_ERR_NCOLS + + # The bcf1_t structure corresponds to one VCF/BCF line. Reading from VCF file + # is slower because the string is first to be parsed, packed into BCF line + # (done in vcf_parse), then unpacked into internal bcf1_t structure. If it + # is known in advance that some of the fields will not be required (notably + # the sample columns), parsing of these can be skipped by setting max_unpack + # appropriately. + # Similarly, it is fast to output a BCF line because the columns (kept in + # shared.s, indiv.s, etc.) are written directly by bcf_write, whereas a VCF + # line must be formatted in vcf_format.
+ + ctypedef struct bcf1_t: + int32_t rid # CHROM + int32_t pos # POS + int32_t rlen # length of REF + float qual # QUAL + uint32_t n_info, n_allele + uint32_t n_fmt, n_sample + kstring_t shared, indiv + bcf_dec_t d # lazy evaluation: $d is not generated by bcf_read(), but by explicitly calling bcf_unpack() + int max_unpack # Set to BCF_UN_STR, BCF_UN_FLT, or BCF_UN_INFO to boost performance of vcf_parse when some of the fields won't be needed + int unpacked # remember what has been unpacked to allow calling bcf_unpack() repeatedly without redoing the work + int unpack_size[3] # the original block size of ID, REF+ALT and FILTER + int errcode # one of BCF_ERR_* codes + + ####### API ####### + + # BCF and VCF I/O + # + # A note about naming conventions: htslib internally represents VCF + # records as bcf1_t data structures, therefore most functions are + # prefixed with bcf_. There are a few exceptions where the functions must + # be aware of both BCF and VCF worlds, such as bcf_parse vs vcf_parse. In + # these cases, functions prefixed with bcf_ are more general and work + # with both BCF and VCF. + + # bcf_hdr_init() - create an empty BCF header. + # @param mode "r" or "w" + # + # When opened for writing, the mandatory fileFormat and + # FILTER=PASS lines are added automatically. + bcf_hdr_t *bcf_hdr_init(const char *mode) + + # Destroy a BCF header struct + void bcf_hdr_destroy(bcf_hdr_t *h) + + # Initialize a bcf1_t object; equivalent to calloc(1, sizeof(bcf1_t)) + bcf1_t *bcf_init() + + # Deallocate a bcf1_t object + void bcf_destroy(bcf1_t *v) + + # Same as bcf_destroy() but frees only the memory allocated by bcf1_t, + # not the bcf1_t object itself. + void bcf_empty(bcf1_t *v) + + # Make the bcf1_t object ready for next read. Intended mostly for + # internal use, the user should rarely need to call this function + # directly. 
+ void bcf_clear(bcf1_t *v) + + # Reads VCF or BCF header + bcf_hdr_t *bcf_hdr_read(htsFile *fp) + + # bcf_hdr_set_samples() - for more efficient VCF parsing when only one/few samples are needed + # @samples: samples to include or exclude from file or as a comma-separated string. + # LIST|FILE .. select samples in list/file + # ^LIST|FILE .. exclude samples from list/file + # - .. include all samples + # NULL .. exclude all samples + # @is_file: @samples is a file (1) or a comma-separated list (0) + # + # The bottleneck of VCF reading is parsing of genotype fields. If the + # reader knows in advance that only a subset of samples is needed (possibly + # no samples at all), the performance of bcf_read() can be significantly + # improved by calling bcf_hdr_set_samples after bcf_hdr_read(). + # The function bcf_read() will subset the VCF/BCF records automatically + # with the notable exception when reading records via bcf_itr_next(). + # In this case, bcf_subset_format() must be called explicitly, because + # bcf_readrec() does not see the header. + # + # Returns 0 on success, -1 on error or a positive integer if the list + # contains samples not present in the VCF header. In such a case, the + # return value is the index of the offending sample. + # + int bcf_hdr_set_samples(bcf_hdr_t *hdr, const char *samples, int is_file) + int bcf_subset_format(const bcf_hdr_t *hdr, bcf1_t *rec) + + # Writes VCF or BCF header + int bcf_hdr_write(htsFile *fp, bcf_hdr_t *h) + + # Parse VCF line contained in kstring and populate the bcf1_t struct + int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v) + + # The opposite of vcf_parse. It should rarely be called directly, see vcf_write + int vcf_format(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s) + + # bcf_read() - read next VCF or BCF record + # + # Returns -1 on critical errors, 0 otherwise.
On errors which are not + # critical for reading, such as missing header definitions, v->errcode is + # set to one of BCF_ERR* code and must be checked before calling + # vcf_write(). + int bcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v) + + # bcf_unpack() - unpack/decode a BCF record (fills the bcf1_t::d field) + # + # Note that bcf_unpack() must be called even when reading VCF. It is safe + # to call the function repeatedly, it will not unpack the same field + # twice. + uint8_t BCF_UN_STR # up to ALT inclusive + uint8_t BCF_UN_FLT # up to FILTER + uint8_t BCF_UN_INFO # up to INFO + uint8_t BCF_UN_SHR # all shared information + uint8_t BCF_UN_FMT # unpack format and each sample + uint8_t BCF_UN_IND # a synonym of BCF_UN_FMT + uint8_t BCF_UN_ALL # everything + + int bcf_unpack(bcf1_t *b, int which) + + # bcf_dup() - create a copy of BCF record. + # + # Note that bcf_unpack() must be called on the returned copy as if it was + # obtained from bcf_read(). Also note that bcf_dup() calls bcf_sync1(src) + # internally to reflect any changes made by bcf_update_* functions. + bcf1_t *bcf_dup(bcf1_t *src) + bcf1_t *bcf_copy(bcf1_t *dst, bcf1_t *src) + + # bcf_write() - write one VCF or BCF record. The type is determined at the open() call. + int bcf_write(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v) + + # The following functions work only with VCFs and should rarely be called + # directly. Usually one wants to use their bcf_* alternatives, which work + # transparently with both VCFs and BCFs.
+ bcf_hdr_t *vcf_hdr_read(htsFile *fp) + int vcf_hdr_write(htsFile *fp, const bcf_hdr_t *h) + int vcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v) + int vcf_write(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v) + + #************************************************************************ + # Header querying and manipulation routines + #************************************************************************ + + # Create a new header using the supplied template + bcf_hdr_t *bcf_hdr_dup(const bcf_hdr_t *hdr) + + # Copy header lines from src to dst if not already present in dst. See also bcf_translate(). + # Returns 0 on success or sets a bit on error: + # 1 .. conflicting definitions of tag length + # # todo + int bcf_hdr_combine(bcf_hdr_t *dst, const bcf_hdr_t *src) + + # bcf_hdr_add_sample() - add a new sample. + # @param sample: sample name to be added + int bcf_hdr_add_sample(bcf_hdr_t *hdr, const char *sample) + + # Read VCF header from a file and update the header + int bcf_hdr_set(bcf_hdr_t *hdr, const char *fname) + + # Returns formatted header (newly allocated string) and its length, + # excluding the terminating \0. If is_bcf parameter is unset, IDX + # fields are discarded. + char *bcf_hdr_fmt_text(const bcf_hdr_t *hdr, int is_bcf, int *len) + + # Append new VCF header line, returns 0 on success + int bcf_hdr_append(bcf_hdr_t *h, const char *line) + int bcf_hdr_printf(bcf_hdr_t *h, const char *format, ...) + + # VCF version, e.g. 
VCFv4.2 + const char *bcf_hdr_get_version(const bcf_hdr_t *hdr) + void bcf_hdr_set_version(bcf_hdr_t *hdr, const char *version) + + # bcf_hdr_remove() - remove VCF header tag + # @param type: one of BCF_HL_* + # @param key: tag name + void bcf_hdr_remove(bcf_hdr_t *h, int type, const char *key) + + # bcf_hdr_subset() - creates a new copy of the header removing unwanted samples + # @param n: number of samples to keep + # @param samples: names of the samples to keep + # @param imap: mapping from index in @samples to the sample index in the original file + # + # Sample names not present in h0 are ignored. The number of unmatched samples can be checked + # by comparing n and bcf_hdr_nsamples(out_hdr). + # This function can be used to reorder samples. + # See also bcf_subset() which subsets individual records. + # + bcf_hdr_t *bcf_hdr_subset(const bcf_hdr_t *h0, int n, char *const* samples, int *imap) + + # Creates a list of sequence names. It is up to the caller to free the list (but not the sequence names) + const char **bcf_hdr_seqnames(const bcf_hdr_t *h, int *nseqs) + + # Get number of samples + int32_t bcf_hdr_nsamples(const bcf_hdr_t *h) + + # The following functions are for internal use and should rarely be called directly + int bcf_hdr_parse(bcf_hdr_t *hdr, char *htxt) + int bcf_hdr_sync(bcf_hdr_t *h) + bcf_hrec_t *bcf_hdr_parse_line(const bcf_hdr_t *h, const char *line, int *len) + void bcf_hrec_format(const bcf_hrec_t *hrec, kstring_t *str) + int bcf_hdr_add_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec) + + # bcf_hdr_get_hrec() - get header line info + # @param type: one of the BCF_HL_* types: FLT,INFO,FMT,CTG,STR,GEN + # @param key: the header key for generic lines (e.g. "fileformat"), any field + # for structured lines, typically "ID". + # @param value: the value which pairs with key. Can be NULL for BCF_HL_GEN + # @param str_class: the class of BCF_HL_STR line (e.g.
"ALT" or "SAMPLE"), otherwise NULL + # + bcf_hrec_t *bcf_hdr_get_hrec(const bcf_hdr_t *hdr, int type, const char *key, const char *value, const char *str_class) + bcf_hrec_t *bcf_hrec_dup(bcf_hrec_t *hrec) + void bcf_hrec_add_key(bcf_hrec_t *hrec, const char *str, int len) + void bcf_hrec_set_val(bcf_hrec_t *hrec, int i, const char *str, int len, int is_quoted) + int bcf_hrec_find_key(bcf_hrec_t *hrec, const char *key) + void hrec_add_idx(bcf_hrec_t *hrec, int idx) + void bcf_hrec_destroy(bcf_hrec_t *hrec) + + #************************************************************************ + # Individual record querying and manipulation routines + #************************************************************************ + + # See the description of bcf_hdr_subset() + int bcf_subset(const bcf_hdr_t *h, bcf1_t *v, int n, int *imap) + + # bcf_translate() - translate tags ids to be consistent with different header. This function + # is useful when lines from multiple VCF need to be combined. + # @dst_hdr: the destination header, to be used in bcf_write(), see also bcf_hdr_combine() + # @src_hdr: the source header, used in bcf_read() + # @src_line: line obtained by bcf_read() + int bcf_translate(const bcf_hdr_t *dst_hdr, bcf_hdr_t *src_hdr, bcf1_t *src_line) + + # bcf_get_variant_type[s]() - returns one of VCF_REF, VCF_SNP, etc + int bcf_get_variant_types(bcf1_t *rec) + int bcf_get_variant_type(bcf1_t *rec, int ith_allele) + int bcf_is_snp(bcf1_t *v) + + # bcf_update_filter() - sets the FILTER column + # @flt_ids: The filter IDs to set, numeric IDs returned by bcf_id2int(hdr, BCF_DT_ID, "PASS") + # @n: Number of filters. If n==0, all filters are removed + int bcf_update_filter(const bcf_hdr_t *hdr, bcf1_t *line, int *flt_ids, int n) + + # bcf_add_filter() - adds to the FILTER column + # @flt_id: filter ID to add, numeric ID returned by bcf_id2int(hdr, BCF_DT_ID, "PASS") + # + # If flt_id is PASS, all existing filters are removed first. 
If other than PASS, existing PASS is removed. + int bcf_add_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id) + + # bcf_remove_filter() - removes from the FILTER column + # @flt_id: filter ID to remove, numeric ID returned by bcf_id2int(hdr, BCF_DT_ID, "PASS") + # @set_pass: when set to 1 and no filters are present, set to PASS + int bcf_remove_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id, int set_pass) + + # Returns 1 if present, 0 if absent, or -1 if filter does not exist. "PASS" and "." can be used interchangeably. + int bcf_has_filter(const bcf_hdr_t *hdr, bcf1_t *line, char *filter) + + # bcf_update_alleles() and bcf_update_alleles_str() - update REF and ALT column + # @alleles: Array of alleles + # @nals: Number of alleles + # @alleles_string: Comma-separated alleles, starting with the REF allele + # + # Note that in order for indexing to work correctly in presence of INFO/END tag, + # the length of reference allele (line->rlen) must be set explicitly by the caller, + # or otherwise, if rlen is zero, strlen(line->d.allele[0]) is used to set the length + # on bcf_write(). + # + int bcf_update_alleles(const bcf_hdr_t *hdr, bcf1_t *line, const char **alleles, int nals) + int bcf_update_alleles_str(const bcf_hdr_t *hdr, bcf1_t *line, const char *alleles_string) + int bcf_update_id(const bcf_hdr_t *hdr, bcf1_t *line, const char *id) + + # bcf_update_info_*() - functions for updating INFO fields + # @hdr: the BCF header + # @line: VCF line to be edited + # @key: the INFO tag to be updated + # @values: pointer to the array of values. Pass NULL to remove the tag. + # @n: number of values in the array. When set to 0, the INFO tag is removed + # + # The @string in bcf_update_info_flag() is optional, @n indicates whether + # the flag is set or removed. + # + # Returns 0 on success or negative value on error.
+ # + int bcf_update_info_int32(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const int32_t *values, int n) + int bcf_update_info_float(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const float *values, int n) + int bcf_update_info_flag(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const char *values, int n) + int bcf_update_info_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const char *values, int n) + int bcf_update_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type) + + # bcf_update_format_*() - functions for updating FORMAT fields + # @values: pointer to the array of values, the same number of elements + # is expected for each sample. Missing values must be padded + # with bcf_*_missing or bcf_*_vector_end values. + # @n: number of values in the array. If n==0, existing tag is removed. + # + # The function bcf_update_format_string() is a higher-level (slower) variant of + # bcf_update_format_char(). The former accepts array of \0-terminated strings + # whereas the latter requires that the strings are collapsed into a single array + # of fixed-length strings. In case of strings with variable length, shorter strings + # can be \0-padded. Note that the collapsed strings passed to bcf_update_format_char() + # are not \0-terminated. + # + # Returns 0 on success or negative value on error. 
+ # + int bcf_update_format_int32(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const int32_t *values, int n) + int bcf_update_format_float(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const float *values, int n) + int bcf_update_format_char(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const char *values, int n) + int bcf_update_genotypes(const bcf_hdr_t *hdr, bcf1_t *line, const int32_t *values, int n) + int bcf_update_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const char **values, int n) + int bcf_update_format(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type) + + # Macros for setting genotypes correctly, for use with bcf_update_genotypes only; idx corresponds + # to VCF's GT (1-based index to ALT or 0 for the reference allele) and val is the opposite, obtained + # from bcf_get_genotypes() below. + uint32_t bcf_gt_phased(uint32_t idx) + uint32_t bcf_gt_unphased(uint32_t idx) + uint32_t bcf_gt_missing + uint32_t bcf_gt_is_missing(uint32_t val) + uint32_t bcf_gt_is_phased(uint32_t idx) + uint32_t bcf_gt_allele(uint32_t val) + + # Conversion between alleles indexes to Number=G genotype index (assuming diploid, all 0-based) + uint32_t bcf_alleles2gt(uint32_t a, uint32_t b) + void bcf_gt2alleles(int igt, int *a, int *b) + + # bcf_get_fmt() - returns pointer to FORMAT's field data + # @header: for access to BCF_DT_ID dictionary + # @line: VCF line obtained from vcf_parse1 + # @fmt: one of GT,PL,... + # + # Returns bcf_fmt_t* if the call succeeded, or returns NULL when the field + # is not available. 
+ # + bcf_fmt_t *bcf_get_fmt(const bcf_hdr_t *hdr, bcf1_t *line, const char *key) + bcf_info_t *bcf_get_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key) + + # bcf_get_*_id() - returns pointer to FORMAT/INFO field data given the header index instead of the string ID + # @line: VCF line obtained from vcf_parse1 + # @id: The header index for the tag, obtained from bcf_hdr_id2int() + # + # Returns bcf_fmt_t* / bcf_info_t*. These functions do not check if the index is valid + # as their goal is to avoid the header lookup. + # + bcf_fmt_t *bcf_get_fmt_id(bcf1_t *line, const int id) + bcf_info_t *bcf_get_info_id(bcf1_t *line, const int id) + + # bcf_get_info_*() - get INFO values, integers or floats + # @hdr: BCF header + # @line: BCF record + # @tag: INFO tag to retrieve + # @dst: *dst is pointer to a memory location, can point to NULL + # @ndst: pointer to the size of allocated memory + # + # Returns negative value on error or the number of written values on + # success. bcf_get_info_string() returns on success the number of + # characters written excluding the null-terminating byte. bcf_get_info_flag() + # returns 1 when flag is set or 0 if not. + # + # List of return codes: + # -1 .. no such INFO tag defined in the header + # -2 .. clash between types defined in the header and encountered in the VCF record + # -3 .. 
tag is not present in the VCF record + # + int bcf_get_info_int32(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, int32_t **dst, int *ndst) + int bcf_get_info_float(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, float **dst, int *ndst) + int bcf_get_info_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, char **dst, int *ndst) + int bcf_get_info_flag(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, int **dst, int *ndst) + int bcf_get_info_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type) + + # bcf_get_format_*() - same as bcf_get_info*() above + # + # The function bcf_get_format_string() is a higher-level (slower) variant of bcf_get_format_char(). + # See the description of bcf_update_format_string() and bcf_update_format_char() above. + # Unlike other bcf_get_format_*() functions, bcf_get_format_string() allocates two arrays: + # a single block of \0-terminated strings collapsed into a single array and an array of pointers + # to these strings. Both arrays must be cleaned by the user. + # + # Returns negative value on error or the number of written values on success. + # + # Example: + # int ndst = 0; char **dst = NULL + # if ( bcf_get_format_string(hdr, line, "XX", &dst, &ndst) > 0 ) + # for (i=0; i=0 + # + # The returned values are: + # bcf_hdr_id2length .. whether the number of values is fixed or variable, one of BCF_VL_* + # bcf_hdr_id2number .. the number of values, 0xfffff for variable length fields + # bcf_hdr_id2type .. the field type, one of BCF_HT_* + # bcf_hdr_id2coltype .. the column type, one of BCF_HL_* + # + # Notes: Prior to using the macros, the presence of the info should be + # tested with bcf_hdr_idinfo_exists().
+ # + int bcf_hdr_id2length(const bcf_hdr_t *hdr, int type, int int_id) + int bcf_hdr_id2number(const bcf_hdr_t *hdr, int type, int int_id) + int bcf_hdr_id2type(const bcf_hdr_t *hdr, int type, int int_id) + int bcf_hdr_id2coltype(const bcf_hdr_t *hdr, int type, int int_id) + int bcf_hdr_idinfo_exists(const bcf_hdr_t *hdr, int type, int int_id) + bcf_hrec_t *bcf_hdr_id2hrec(const bcf_hdr_t *hdr, int type, int col_type, int int_id) + + void bcf_fmt_array(kstring_t *s, int n, int type, void *data) + uint8_t *bcf_fmt_sized_array(kstring_t *s, uint8_t *ptr) + + void bcf_enc_vchar(kstring_t *s, int l, const char *a) + void bcf_enc_vint(kstring_t *s, int n, int32_t *a, int wsize) + void bcf_enc_vfloat(kstring_t *s, int n, float *a) + + #************************************************************************ + # BCF index + # + # Note that these functions work with BCFs only. See synced_bcf_reader.h + # which provides (amongst other things) an API to work transparently with + # both indexed BCFs and VCFs. + #************************************************************************ + + int bcf_index_build(const char *fn, int min_shift) + + #******************* + # Typed value I/O * + #****************** + + # Note that in contrast with BCFv2.1 specification, HTSlib implementation + # allows missing values in vectors. For integer types, the values 0x80, + # 0x8000, 0x80000000 are interpreted as missing values and 0x81, 0x8001, + # 0x80000001 as end-of-vector indicators. Similarly for floats, the value of + # 0x7F800001 is interpreted as a missing value and 0x7F800002 as an + # end-of-vector indicator. + # Note that the end-of-vector byte is not part of the vector. + + # This trial BCF version (v2.2) is compatible with the VCF specification and + # enables to handle correctly vectors with different ploidy in presence of + # missing values. 
+ + int32_t bcf_int8_vector_end + int32_t bcf_int16_vector_end + int32_t bcf_int32_vector_end + int32_t bcf_str_vector_end + int32_t bcf_int8_missing + int32_t bcf_int16_missing + int32_t bcf_int32_missing + int32_t bcf_str_missing + + uint32_t bcf_float_vector_end + uint32_t bcf_float_missing + + void bcf_float_set(float *ptr, uint32_t value) + void bcf_float_set_vector_end(float *x) + void bcf_float_set_missing(float *x) + + int bcf_float_is_missing(float f) + int bcf_float_is_vector_end(float f) + void bcf_format_gt(bcf_fmt_t *fmt, int isample, kstring_t *str) + void bcf_enc_size(kstring_t *s, int size, int type) + int bcf_enc_inttype(long x) + void bcf_enc_int1(kstring_t *s, int32_t x) + int32_t bcf_dec_int1(const uint8_t *p, int type, uint8_t **q) + int32_t bcf_dec_typed_int1(const uint8_t *p, uint8_t **q) + int32_t bcf_dec_size(const uint8_t *p, uint8_t **q, int *type) + + # These trivial wrappers are defined only for consistency with other parts of htslib + bcf1_t *bcf_init1() + int bcf_read1(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v) + int vcf_read1(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v) + int bcf_write1(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v) + int vcf_write1(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v) + void bcf_destroy1(bcf1_t *v) + int vcf_parse1(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v) + void bcf_clear1(bcf1_t *v) + int vcf_format1(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s) + + # Other nice wrappers + void bcf_itr_destroy(hts_itr_t *iter) + hts_itr_t *bcf_itr_queryi(const hts_idx_t *idx, int tid, int beg, int end) + hts_itr_t *bcf_itr_querys(const hts_idx_t *idx, const bcf_hdr_t *hdr, char *s) + int bcf_itr_next(htsFile *fp, hts_itr_t *iter, void *r) + hts_idx_t *bcf_index_load(const char *fn) + const char **bcf_index_seqnames(const hts_idx_t *idx, const bcf_hdr_t *hdr, int *nptr) diff --git a/pysam/cvcf.pyx b/pysam/cvcf.pyx index 8a6708b7..5feb2a64 100644 --- a/pysam/cvcf.pyx +++ b/pysam/cvcf.pyx @@ -16,7 +16,7 @@ # The sample 
keys are accessible through vcf.getsamples() # # A dictionary of values contains value keys (defined in ##INFO or -# ##FORMAT lines) which map to a list, containign integers, floats, +# ##FORMAT lines) which map to a list, containing integers, floats, # strings, or characters. Missing values are replaced by a particular # value, often -1 or . # diff --git a/pysam/htslib_util.h b/pysam/htslib_util.h index 36144735..67642d1f 100644 --- a/pysam/htslib_util.h +++ b/pysam/htslib_util.h @@ -1,6 +1,19 @@ #ifndef PYSAM_UTIL_H #define PYSAM_UTIL_H +#include "htslib/sam.h" +#include "htslib/vcf.h" +#include "htslib/khash.h" + +KHASH_MAP_INIT_STR(vdict, bcf_idinfo_t) +typedef khash_t(vdict) vdict_t; + +KHASH_DECLARE(s2i, kh_cstr_t, int64_t) +typedef khash_t(s2i) s2i_t; + +int hts_useek(htsFile *fp, long uoffset, int where); +long hts_utell(htsFile *fp); + ////////////////////////////////////////////////////////////////// /*! set pysam standard error to point to file descriptor diff --git a/pysam/version.py b/pysam/version.py index 13749ed8..c4038691 100644 --- a/pysam/version.py +++ b/pysam/version.py @@ -1,7 +1,7 @@ # pysam versioning information -__version__ = "0.8.1" +__version__ = "0.8.2" -__samtools_version__ = "1.1" +__samtools_version__ = "1.2" -__htslib_version__ = "1.1" +__htslib_version__ = "1.2.1" diff --git a/requires.txt b/requires.txt index 743df07f..e9378dc1 100644 --- a/requires.txt +++ b/requires.txt @@ -1 +1 @@ -cython>=0.17 +cython>=0.21 diff --git a/samtools/bam.h b/samtools/bam.h index e822331d..b8f7bc1d 100644 --- a/samtools/bam.h +++ b/samtools/bam.h @@ -38,7 +38,7 @@ DEALINGS IN THE SOFTWARE. */ @copyright Genome Research Ltd. */ -#define BAM_VERSION "1.1" +#define BAM_VERSION "1.2" #include #include diff --git a/samtools/bam2bcf_indel.c b/samtools/bam2bcf_indel.c index c8d061eb..e80e4c27 100644 --- a/samtools/bam2bcf_indel.c +++ b/samtools/bam2bcf_indel.c @@ -1,7 +1,7 @@ /* bam2bcf_indel.c -- indel caller. Copyright (C) 2010, 2011 Broad Institute. 
- Copyright (C) 2012, 2013 Genome Research Ltd. + Copyright (C) 2012-2014 Genome Research Ltd. Author: Heng Li @@ -26,9 +26,8 @@ DEALINGS IN THE SOFTWARE. */ #include #include #include -#include "bam.h" +#include "htslib/sam.h" #include "bam2bcf.h" -#include "kaln.h" #include "kprobaln.h" #include "htslib/khash.h" KHASH_SET_INIT_STR(rg) @@ -197,7 +196,7 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla aux[m++] = MINUS_CONST + p->indel; } } - j = bam_cigar2qlen(&p->b->core, bam1_cigar(p->b)); + j = bam_cigar2qlen(p->b->core.n_cigar, bam_get_cigar(p->b)); if (j > max_rd_len) max_rd_len = j; } float frac = (float)na/nt; @@ -224,7 +223,8 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla } if (n_types >= 64) { free(aux); - if (bam_verbose >= 2) + // TODO revisit how/whether to control printing this warning + if (hts_verbose >= 2) fprintf(stderr, "[%s] excessive INDEL alleles at position %d. Skip the position.\n", __func__, pos + 1); return -1; } @@ -264,7 +264,7 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla cns = calloc(L, 4); ref0 = calloc(L, 1); for (i = 0; i < right - left; ++i) - ref0[i] = bam_nt16_table[(int)ref[i+left]]; + ref0[i] = seq_nt16_table[(int)ref[i+left]]; for (s = 0; s < n; ++s) { r = ref_sample[s] = calloc(L, 1); memset(cns, 0, sizeof(int) * L); @@ -272,8 +272,8 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla for (i = 0; i < n_plp[s]; ++i) { bam_pileup1_t *p = plp[s] + i; bam1_t *b = p->b; - uint32_t *cigar = bam1_cigar(b); - uint8_t *seq = bam1_seq(b); + uint32_t *cigar = bam_get_cigar(b); + uint8_t *seq = bam_get_seq(b); int x = b->core.pos, y = 0; for (k = 0; k < b->core.n_cigar; ++k) { int op = cigar[k]&0xf; @@ -281,7 +281,7 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { for (j = 0; j < l; ++j) if (x + j >= 
left && x + j < right) - cns[x+j-left] += (bam1_seqi(seq, y+j) == ref0[x+j-left])? 1 : 0x10000; + cns[x+j-left] += (bam_seqi(seq, y+j) == ref0[x+j-left])? 1 : 0x10000; x += l; y += l; } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) x += l; else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l; @@ -303,14 +303,14 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla free(ref0); free(cns); } { // the length of the homopolymer run around the current position - int c = bam_nt16_table[(int)ref[pos + 1]]; + int c = seq_nt16_table[(int)ref[pos + 1]]; if (c == 15) l_run = 1; else { for (i = pos + 2; ref[i]; ++i) - if (bam_nt16_table[(int)ref[i]] != c) break; + if (seq_nt16_table[(int)ref[i]] != c) break; l_run = i; for (i = pos; i >= 0; --i) - if (bam_nt16_table[(int)ref[i]] != c) break; + if (seq_nt16_table[(int)ref[i]] != c) break; l_run -= i + 1; } } @@ -325,9 +325,9 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla for (i = 0; i < n_plp[s]; ++i) { bam_pileup1_t *p = plp[s] + i; if (p->indel == types[t]) { - uint8_t *seq = bam1_seq(p->b); + uint8_t *seq = bam_get_seq(p->b); for (k = 1; k <= p->indel; ++k) { - int c = bam_nt16_nt4_table[bam1_seqi(seq, p->qpos + k)]; + int c = bam_nt16_nt4_table[bam_seqi(seq, p->qpos + k)]; assert(c<5); ++inscns_aux[(t*max_ins+(k-1))*5 + c]; } @@ -383,8 +383,8 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla for (i = 0; i < n_plp[s]; ++i, ++K) { bam_pileup1_t *p = plp[s] + i; int qbeg, qend, tbeg, tend, sc, kk; - uint8_t *seq = bam1_seq(p->b); - uint32_t *cigar = bam1_cigar(p->b); + uint8_t *seq = bam_get_seq(p->b); + uint32_t *cigar = bam_get_cigar(p->b); if (p->b->core.flag&4) continue; // unmapped reads // FIXME: the following loop should be better moved outside; nonetheless, realignment should be much slower anyway. 
for (kk = 0; kk < p->b->core.n_cigar; ++kk) @@ -392,17 +392,17 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla if (kk < p->b->core.n_cigar) continue; // FIXME: the following skips soft clips, but using them may be more sensitive. // determine the start and end of sequences for alignment - qbeg = tpos2qpos(&p->b->core, bam1_cigar(p->b), left, 0, &tbeg); - qend = tpos2qpos(&p->b->core, bam1_cigar(p->b), right, 1, &tend); + qbeg = tpos2qpos(&p->b->core, bam_get_cigar(p->b), left, 0, &tbeg); + qend = tpos2qpos(&p->b->core, bam_get_cigar(p->b), right, 1, &tend); if (types[t] < 0) { int l = -types[t]; tbeg = tbeg - l > left? tbeg - l : left; } // write the query sequence for (l = qbeg; l < qend; ++l) - query[l - qbeg] = bam_nt16_nt4_table[bam1_seqi(seq, l)]; + query[l - qbeg] = bam_nt16_nt4_table[bam_seqi(seq, l)]; { // do realignment; this is the bottleneck - const uint8_t *qual = bam1_qual(p->b), *bq; + const uint8_t *qual = bam_get_qual(p->b), *bq; uint8_t *qq; qq = calloc(qend - qbeg, 1); bq = (uint8_t*)bam_aux_get(p->b, "ZQ"); diff --git a/samtools/bam2bcf_indel.c.pysam.c b/samtools/bam2bcf_indel.c.pysam.c index 480481b5..8a469ee4 100644 --- a/samtools/bam2bcf_indel.c.pysam.c +++ b/samtools/bam2bcf_indel.c.pysam.c @@ -3,7 +3,7 @@ /* bam2bcf_indel.c -- indel caller. Copyright (C) 2010, 2011 Broad Institute. - Copyright (C) 2012, 2013 Genome Research Ltd. + Copyright (C) 2012-2014 Genome Research Ltd. Author: Heng Li @@ -28,9 +28,8 @@ DEALINGS IN THE SOFTWARE. 
*/ #include #include #include -#include "bam.h" +#include "htslib/sam.h" #include "bam2bcf.h" -#include "kaln.h" #include "kprobaln.h" #include "htslib/khash.h" KHASH_SET_INIT_STR(rg) @@ -199,7 +198,7 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla aux[m++] = MINUS_CONST + p->indel; } } - j = bam_cigar2qlen(&p->b->core, bam1_cigar(p->b)); + j = bam_cigar2qlen(p->b->core.n_cigar, bam_get_cigar(p->b)); if (j > max_rd_len) max_rd_len = j; } float frac = (float)na/nt; @@ -226,7 +225,8 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla } if (n_types >= 64) { free(aux); - if (bam_verbose >= 2) + // TODO revisit how/whether to control printing this warning + if (hts_verbose >= 2) fprintf(pysamerr, "[%s] excessive INDEL alleles at position %d. Skip the position.\n", __func__, pos + 1); return -1; } @@ -266,7 +266,7 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla cns = calloc(L, 4); ref0 = calloc(L, 1); for (i = 0; i < right - left; ++i) - ref0[i] = bam_nt16_table[(int)ref[i+left]]; + ref0[i] = seq_nt16_table[(int)ref[i+left]]; for (s = 0; s < n; ++s) { r = ref_sample[s] = calloc(L, 1); memset(cns, 0, sizeof(int) * L); @@ -274,8 +274,8 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla for (i = 0; i < n_plp[s]; ++i) { bam_pileup1_t *p = plp[s] + i; bam1_t *b = p->b; - uint32_t *cigar = bam1_cigar(b); - uint8_t *seq = bam1_seq(b); + uint32_t *cigar = bam_get_cigar(b); + uint8_t *seq = bam_get_seq(b); int x = b->core.pos, y = 0; for (k = 0; k < b->core.n_cigar; ++k) { int op = cigar[k]&0xf; @@ -283,7 +283,7 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { for (j = 0; j < l; ++j) if (x + j >= left && x + j < right) - cns[x+j-left] += (bam1_seqi(seq, y+j) == ref0[x+j-left])? 1 : 0x10000; + cns[x+j-left] += (bam_seqi(seq, y+j) == ref0[x+j-left])? 
1 : 0x10000; x += l; y += l; } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) x += l; else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l; @@ -305,14 +305,14 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla free(ref0); free(cns); } { // the length of the homopolymer run around the current position - int c = bam_nt16_table[(int)ref[pos + 1]]; + int c = seq_nt16_table[(int)ref[pos + 1]]; if (c == 15) l_run = 1; else { for (i = pos + 2; ref[i]; ++i) - if (bam_nt16_table[(int)ref[i]] != c) break; + if (seq_nt16_table[(int)ref[i]] != c) break; l_run = i; for (i = pos; i >= 0; --i) - if (bam_nt16_table[(int)ref[i]] != c) break; + if (seq_nt16_table[(int)ref[i]] != c) break; l_run -= i + 1; } } @@ -327,9 +327,9 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla for (i = 0; i < n_plp[s]; ++i) { bam_pileup1_t *p = plp[s] + i; if (p->indel == types[t]) { - uint8_t *seq = bam1_seq(p->b); + uint8_t *seq = bam_get_seq(p->b); for (k = 1; k <= p->indel; ++k) { - int c = bam_nt16_nt4_table[bam1_seqi(seq, p->qpos + k)]; + int c = bam_nt16_nt4_table[bam_seqi(seq, p->qpos + k)]; assert(c<5); ++inscns_aux[(t*max_ins+(k-1))*5 + c]; } @@ -385,8 +385,8 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla for (i = 0; i < n_plp[s]; ++i, ++K) { bam_pileup1_t *p = plp[s] + i; int qbeg, qend, tbeg, tend, sc, kk; - uint8_t *seq = bam1_seq(p->b); - uint32_t *cigar = bam1_cigar(p->b); + uint8_t *seq = bam_get_seq(p->b); + uint32_t *cigar = bam_get_cigar(p->b); if (p->b->core.flag&4) continue; // unmapped reads // FIXME: the following loop should be better moved outside; nonetheless, realignment should be much slower anyway. for (kk = 0; kk < p->b->core.n_cigar; ++kk) @@ -394,17 +394,17 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla if (kk < p->b->core.n_cigar) continue; // FIXME: the following skips soft clips, but using them may be more sensitive. 
// determine the start and end of sequences for alignment - qbeg = tpos2qpos(&p->b->core, bam1_cigar(p->b), left, 0, &tbeg); - qend = tpos2qpos(&p->b->core, bam1_cigar(p->b), right, 1, &tend); + qbeg = tpos2qpos(&p->b->core, bam_get_cigar(p->b), left, 0, &tbeg); + qend = tpos2qpos(&p->b->core, bam_get_cigar(p->b), right, 1, &tend); if (types[t] < 0) { int l = -types[t]; tbeg = tbeg - l > left? tbeg - l : left; } // write the query sequence for (l = qbeg; l < qend; ++l) - query[l - qbeg] = bam_nt16_nt4_table[bam1_seqi(seq, l)]; + query[l - qbeg] = bam_nt16_nt4_table[bam_seqi(seq, l)]; { // do realignment; this is the bottleneck - const uint8_t *qual = bam1_qual(p->b), *bq; + const uint8_t *qual = bam_get_qual(p->b), *bq; uint8_t *qq; qq = calloc(qend - qbeg, 1); bq = (uint8_t*)bam_aux_get(p->b, "ZQ"); diff --git a/samtools/bam2depth.c b/samtools/bam2depth.c index 70882bed..b749062f 100644 --- a/samtools/bam2depth.c +++ b/samtools/bam2depth.c @@ -98,7 +98,7 @@ int main_depth(int argc, char *argv[]) fprintf(stderr, "Options:\n"); fprintf(stderr, " -b list of positions or regions\n"); fprintf(stderr, " -f list of input BAM filenames, one per line [null]\n"); - fprintf(stderr, " -l minQLen\n"); + fprintf(stderr, " -l read length threshold (ignore reads shorter than )\n"); fprintf(stderr, " -q base quality threshold\n"); fprintf(stderr, " -Q mapping quality threshold\n"); fprintf(stderr, " -r region\n"); @@ -126,6 +126,16 @@ int main_depth(int argc, char *argv[]) status = EXIT_FAILURE; goto depth_end; } + if (hts_set_opt(data[i]->fp, CRAM_OPT_REQUIRED_FIELDS, + SAM_FLAG | SAM_RNAME | SAM_POS | SAM_MAPQ | SAM_CIGAR | + SAM_SEQ)) { + fprintf(stderr, "Failed to set CRAM_OPT_REQUIRED_FIELDS value\n"); + return 1; + } + if (hts_set_opt(data[i]->fp, CRAM_OPT_DECODE_MD, 0)) { + fprintf(stderr, "Failed to set CRAM_OPT_DECODE_MD value\n"); + return 1; + } data[i]->min_mapQ = mapQ; // set the mapQ filter data[i]->min_len = min_len; // set the qlen filter data[i]->hdr = 
sam_hdr_read(data[i]->fp); // read the BAM header @@ -177,8 +187,8 @@ int main_depth(int argc, char *argv[]) depth_end: for (i = 0; i < n && data[i]; ++i) { bam_hdr_destroy(data[i]->hdr); - sam_close(data[i]->fp); - if (data[i]->iter) hts_itr_destroy(data[i]->iter); + if (data[i]->fp) sam_close(data[i]->fp); + hts_itr_destroy(data[i]->iter); free(data[i]); } free(data); free(reg); diff --git a/samtools/bam2depth.c.pysam.c b/samtools/bam2depth.c.pysam.c index c991e084..5c588f9e 100644 --- a/samtools/bam2depth.c.pysam.c +++ b/samtools/bam2depth.c.pysam.c @@ -100,7 +100,7 @@ int main_depth(int argc, char *argv[]) fprintf(pysamerr, "Options:\n"); fprintf(pysamerr, " -b list of positions or regions\n"); fprintf(pysamerr, " -f list of input BAM filenames, one per line [null]\n"); - fprintf(pysamerr, " -l minQLen\n"); + fprintf(pysamerr, " -l read length threshold (ignore reads shorter than )\n"); fprintf(pysamerr, " -q base quality threshold\n"); fprintf(pysamerr, " -Q mapping quality threshold\n"); fprintf(pysamerr, " -r region\n"); @@ -128,6 +128,16 @@ int main_depth(int argc, char *argv[]) status = EXIT_FAILURE; goto depth_end; } + if (hts_set_opt(data[i]->fp, CRAM_OPT_REQUIRED_FIELDS, + SAM_FLAG | SAM_RNAME | SAM_POS | SAM_MAPQ | SAM_CIGAR | + SAM_SEQ)) { + fprintf(pysamerr, "Failed to set CRAM_OPT_REQUIRED_FIELDS value\n"); + return 1; + } + if (hts_set_opt(data[i]->fp, CRAM_OPT_DECODE_MD, 0)) { + fprintf(pysamerr, "Failed to set CRAM_OPT_DECODE_MD value\n"); + return 1; + } data[i]->min_mapQ = mapQ; // set the mapQ filter data[i]->min_len = min_len; // set the qlen filter data[i]->hdr = sam_hdr_read(data[i]->fp); // read the BAM header @@ -179,8 +189,8 @@ int main_depth(int argc, char *argv[]) depth_end: for (i = 0; i < n && data[i]; ++i) { bam_hdr_destroy(data[i]->hdr); - sam_close(data[i]->fp); - if (data[i]->iter) hts_itr_destroy(data[i]->iter); + if (data[i]->fp) sam_close(data[i]->fp); + hts_itr_destroy(data[i]->iter); free(data[i]); } free(data); free(reg); 
diff --git a/samtools/bam_mate.c b/samtools/bam_mate.c index 70ee18bd..017d5e1b 100644 --- a/samtools/bam_mate.c +++ b/samtools/bam_mate.c @@ -74,7 +74,7 @@ static void bam_template_cigar(bam1_t *b1, bam1_t *b2, kstring_t *str) * * How We Handle Input * - * Secondary Reads: + * Secondary and supplementary Reads: * -write to output unchanged * All Reads: * -if pos == 0 (1 based), tid == -1 set UNMAPPED flag @@ -94,6 +94,7 @@ static void bam_template_cigar(bam1_t *b1, bam1_t *b2, kstring_t *str) * -write to output * Limitations * -Does not handle tandem reads + * -Should mark supplementary reads the same as primary. * Notes * -CT definition appears to be something else in spec, this was in here before * I started tampering with it, anyone know what is going on here? To work diff --git a/samtools/bam_mate.c.pysam.c b/samtools/bam_mate.c.pysam.c index 210e6eae..be0dc379 100644 --- a/samtools/bam_mate.c.pysam.c +++ b/samtools/bam_mate.c.pysam.c @@ -76,7 +76,7 @@ static void bam_template_cigar(bam1_t *b1, bam1_t *b2, kstring_t *str) * * How We Handle Input * - * Secondary Reads: + * Secondary and supplementary Reads: * -write to output unchanged * All Reads: * -if pos == 0 (1 based), tid == -1 set UNMAPPED flag @@ -96,6 +96,7 @@ static void bam_template_cigar(bam1_t *b1, bam1_t *b2, kstring_t *str) * -write to output * Limitations * -Does not handle tandem reads + * -Should mark supplementary reads the same as primary. * Notes * -CT definition appears to be something else in spec, this was in here before * I started tampering with it, anyone know what is going on here? To work diff --git a/samtools/bam_md.c b/samtools/bam_md.c index 93f64f59..7d1c6a76 100644 --- a/samtools/bam_md.c +++ b/samtools/bam_md.c @@ -30,7 +30,6 @@ DEALINGS IN THE SOFTWARE. 
*/ #include "htslib/faidx.h" #include "sam.h" #include "htslib/kstring.h" -#include "kaln.h" #include "kprobaln.h" #define USE_EQUAL 1 diff --git a/samtools/bam_md.c.pysam.c b/samtools/bam_md.c.pysam.c index 840d7740..5f5bb8ac 100644 --- a/samtools/bam_md.c.pysam.c +++ b/samtools/bam_md.c.pysam.c @@ -32,7 +32,6 @@ DEALINGS IN THE SOFTWARE. */ #include "htslib/faidx.h" #include "sam.h" #include "htslib/kstring.h" -#include "kaln.h" #include "kprobaln.h" #define USE_EQUAL 1 diff --git a/samtools/bam_plcmd.c b/samtools/bam_plcmd.c index a1b381d5..d574cca1 100644 --- a/samtools/bam_plcmd.c +++ b/samtools/bam_plcmd.c @@ -61,7 +61,9 @@ static inline void pileup_seq(FILE *fp, const bam_pileup1_t *p, int pos, int ref putc(p->b->core.qual > 93? 126 : p->b->core.qual + 33, fp); } if (!p->is_del) { - int c = seq_nt16_str[bam_seqi(bam_get_seq(p->b), p->qpos)]; + int c = p->qpos < p->b->core.l_qseq + ? seq_nt16_str[bam_seqi(bam_get_seq(p->b), p->qpos)] + : 'N'; if (ref) { int rb = pos < ref_len? ref[pos] : 'N'; if (c == '=' || seq_nt16_table[c] == seq_nt16_table[rb]) c = bam_is_rev(p->b)? ',' : '.'; @@ -264,6 +266,10 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) fprintf(stderr, "[%s] failed to open %s: %s\n", __func__, fn[i], strerror(errno)); exit(1); } + if (hts_set_opt(data[i]->fp, CRAM_OPT_DECODE_MD, 0)) { + fprintf(stderr, "Failed to set CRAM_OPT_DECODE_MD value\n"); + return 1; + } hts_set_fai_filename(data[i]->fp, conf->fai_fname); data[i]->conf = conf; h_tmp = sam_hdr_read(data[i]->fp); @@ -271,7 +277,6 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) fprintf(stderr,"[%s] fail to read the header of %s\n", __func__, fn[i]); exit(1); } - data[i]->h = i? h : h_tmp; // for i==0, "h" has not been set yet bam_smpl_add(sm, fn[i], (conf->flag&MPLP_IGNORE_RG)? 
0 : h_tmp->text); // Collect read group IDs with PL (platform) listed in pl_list (note: fragile, strstr search) rghash = bcf_call_add_rg(rghash, h_tmp->text, conf->pl_list); @@ -281,17 +286,24 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) fprintf(stderr, "[%s] fail to load index for %s\n", __func__, fn[i]); exit(1); } - if ( (data[i]->iter=sam_itr_querys(idx, data[i]->h, conf->reg)) == 0) { - fprintf(stderr, "[E::%s] fail to parse region '%s'\n", __func__, conf->reg); + if ( (data[i]->iter=sam_itr_querys(idx, h_tmp, conf->reg)) == 0) { + fprintf(stderr, "[E::%s] fail to parse region '%s' with %s\n", __func__, conf->reg, fn[i]); exit(1); } if (i == 0) tid0 = data[i]->iter->tid, beg0 = data[i]->iter->beg, end0 = data[i]->iter->end; hts_idx_destroy(idx); } - if (i == 0) h = h_tmp; /* save the header of first file in list */ + else + data[i]->iter = NULL; + + if (i == 0) h = data[i]->h = h_tmp; // save the header of the first file else { - // FIXME: to check consistency + // FIXME: check consistency between h and h_tmp bam_hdr_destroy(h_tmp); + + // we store only the first file's header; it's (alleged to be) + // compatible with the i-th file's target_name lookup needs + data[i]->h = h; } } // allocate data storage proportionate to number of samples being studied sm->n @@ -316,6 +328,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) exit(1); } + // BCF header creation bcf_hdr = bcf_hdr_init("w"); kstring_t str = {0,0,0}; @@ -335,6 +348,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) bcf_hdr_append(bcf_hdr, str.s); } + // Translate BAM @SQ tags to BCF ##contig tags // todo: use/write new BAM header manipulation routines, fill also UR, M5 for (i=0; in_targets; i++) { @@ -381,7 +395,9 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) bcf_hdr_add_sample(bcf_hdr, sm->smpl[i]); bcf_hdr_add_sample(bcf_hdr, NULL); bcf_hdr_write(bcf_fp, bcf_hdr); + // End of BCF header creation + // Initialise the calling algorithm bca = 
bcf_call_init(-1., conf->min_baseQ); bcr = calloc(sm->n, sizeof(bcf_callret1_t)); bca->rghash = rghash; @@ -422,7 +438,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) for (i = 0; i < n; ++i) data[i]->ref = ref, data[i]->ref_id = tid0; } else ref_tid = -1, ref = 0; - // begin pileup + // init pileup iter = bam_mplp_init(n, mplp_func, (void**)data); if ( conf->flag & MPLP_SMART_OVERLAPS ) bam_mplp_init_overlaps(iter); max_depth = conf->max_depth; @@ -436,6 +452,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) bam_mplp_set_maxcnt(iter, max_depth); bcf1_t *bcf_rec = bcf_init1(); int ret; + // begin pileup while ( (ret=bam_mplp_auto(iter, &tid, &pos, n_plp, plp)) > 0) { if (conf->reg && (pos < beg0 || pos >= end0)) continue; // out of the region requested if (conf->bed && tid >= 0 && !bed_overlap(conf->bed, h->target_name[tid], pos, pos+1)) continue; @@ -477,7 +494,10 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) int j, cnt; for (j = cnt = 0; j < n_plp[i]; ++j) { const bam_pileup1_t *p = plp[i] + j; - if (bam_get_qual(p->b)[p->qpos] >= conf->min_baseQ) ++cnt; + int c = p->qpos < p->b->core.l_qseq + ? bam_get_qual(p->b)[p->qpos] + : 0; + if (c >= conf->min_baseQ) ++cnt; } fprintf(pileup_fp, "\t%d\t", cnt); if (n_plp[i] == 0) { @@ -487,13 +507,18 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) } else { for (j = 0; j < n_plp[i]; ++j) { const bam_pileup1_t *p = plp[i] + j; - if (bam_get_qual(p->b)[p->qpos] >= conf->min_baseQ) + int c = p->qpos < p->b->core.l_qseq + ? bam_get_qual(p->b)[p->qpos] + : 0; + if (c >= conf->min_baseQ) pileup_seq(pileup_fp, plp[i] + j, pos, ref_len, ref); } putc('\t', pileup_fp); for (j = 0; j < n_plp[i]; ++j) { const bam_pileup1_t *p = plp[i] + j; - int c = bam_get_qual(p->b)[p->qpos]; + int c = p->qpos < p->b->core.l_qseq + ? bam_get_qual(p->b)[p->qpos] + : 0; if (c >= conf->min_baseQ) { c = c + 33 < 126? 
c + 33 : 126; putc(c, pileup_fp); diff --git a/samtools/bam_plcmd.c.pysam.c b/samtools/bam_plcmd.c.pysam.c index 2151a1b4..9d2c9875 100644 --- a/samtools/bam_plcmd.c.pysam.c +++ b/samtools/bam_plcmd.c.pysam.c @@ -63,7 +63,9 @@ static inline void pileup_seq(FILE *fp, const bam_pileup1_t *p, int pos, int ref putc(p->b->core.qual > 93? 126 : p->b->core.qual + 33, fp); } if (!p->is_del) { - int c = seq_nt16_str[bam_seqi(bam_get_seq(p->b), p->qpos)]; + int c = p->qpos < p->b->core.l_qseq + ? seq_nt16_str[bam_seqi(bam_get_seq(p->b), p->qpos)] + : 'N'; if (ref) { int rb = pos < ref_len? ref[pos] : 'N'; if (c == '=' || seq_nt16_table[c] == seq_nt16_table[rb]) c = bam_is_rev(p->b)? ',' : '.'; @@ -266,6 +268,10 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) fprintf(pysamerr, "[%s] failed to open %s: %s\n", __func__, fn[i], strerror(errno)); exit(1); } + if (hts_set_opt(data[i]->fp, CRAM_OPT_DECODE_MD, 0)) { + fprintf(pysamerr, "Failed to set CRAM_OPT_DECODE_MD value\n"); + return 1; + } hts_set_fai_filename(data[i]->fp, conf->fai_fname); data[i]->conf = conf; h_tmp = sam_hdr_read(data[i]->fp); @@ -273,7 +279,6 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) fprintf(pysamerr,"[%s] fail to read the header of %s\n", __func__, fn[i]); exit(1); } - data[i]->h = i? h : h_tmp; // for i==0, "h" has not been set yet bam_smpl_add(sm, fn[i], (conf->flag&MPLP_IGNORE_RG)? 
0 : h_tmp->text); // Collect read group IDs with PL (platform) listed in pl_list (note: fragile, strstr search) rghash = bcf_call_add_rg(rghash, h_tmp->text, conf->pl_list); @@ -283,17 +288,24 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) fprintf(pysamerr, "[%s] fail to load index for %s\n", __func__, fn[i]); exit(1); } - if ( (data[i]->iter=sam_itr_querys(idx, data[i]->h, conf->reg)) == 0) { - fprintf(pysamerr, "[E::%s] fail to parse region '%s'\n", __func__, conf->reg); + if ( (data[i]->iter=sam_itr_querys(idx, h_tmp, conf->reg)) == 0) { + fprintf(pysamerr, "[E::%s] fail to parse region '%s' with %s\n", __func__, conf->reg, fn[i]); exit(1); } if (i == 0) tid0 = data[i]->iter->tid, beg0 = data[i]->iter->beg, end0 = data[i]->iter->end; hts_idx_destroy(idx); } - if (i == 0) h = h_tmp; /* save the header of first file in list */ + else + data[i]->iter = NULL; + + if (i == 0) h = data[i]->h = h_tmp; // save the header of the first file else { - // FIXME: to check consistency + // FIXME: check consistency between h and h_tmp bam_hdr_destroy(h_tmp); + + // we store only the first file's header; it's (alleged to be) + // compatible with the i-th file's target_name lookup needs + data[i]->h = h; } } // allocate data storage proportionate to number of samples being studied sm->n @@ -318,6 +330,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) exit(1); } + // BCF header creation bcf_hdr = bcf_hdr_init("w"); kstring_t str = {0,0,0}; @@ -337,6 +350,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) bcf_hdr_append(bcf_hdr, str.s); } + // Translate BAM @SQ tags to BCF ##contig tags // todo: use/write new BAM header manipulation routines, fill also UR, M5 for (i=0; in_targets; i++) { @@ -383,7 +397,9 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) bcf_hdr_add_sample(bcf_hdr, sm->smpl[i]); bcf_hdr_add_sample(bcf_hdr, NULL); bcf_hdr_write(bcf_fp, bcf_hdr); + // End of BCF header creation + // Initialise the calling algorithm bca = 
bcf_call_init(-1., conf->min_baseQ); bcr = calloc(sm->n, sizeof(bcf_callret1_t)); bca->rghash = rghash; @@ -424,7 +440,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) for (i = 0; i < n; ++i) data[i]->ref = ref, data[i]->ref_id = tid0; } else ref_tid = -1, ref = 0; - // begin pileup + // init pileup iter = bam_mplp_init(n, mplp_func, (void**)data); if ( conf->flag & MPLP_SMART_OVERLAPS ) bam_mplp_init_overlaps(iter); max_depth = conf->max_depth; @@ -438,6 +454,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) bam_mplp_set_maxcnt(iter, max_depth); bcf1_t *bcf_rec = bcf_init1(); int ret; + // begin pileup while ( (ret=bam_mplp_auto(iter, &tid, &pos, n_plp, plp)) > 0) { if (conf->reg && (pos < beg0 || pos >= end0)) continue; // out of the region requested if (conf->bed && tid >= 0 && !bed_overlap(conf->bed, h->target_name[tid], pos, pos+1)) continue; @@ -479,7 +496,10 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) int j, cnt; for (j = cnt = 0; j < n_plp[i]; ++j) { const bam_pileup1_t *p = plp[i] + j; - if (bam_get_qual(p->b)[p->qpos] >= conf->min_baseQ) ++cnt; + int c = p->qpos < p->b->core.l_qseq + ? bam_get_qual(p->b)[p->qpos] + : 0; + if (c >= conf->min_baseQ) ++cnt; } fprintf(pileup_fp, "\t%d\t", cnt); if (n_plp[i] == 0) { @@ -489,13 +509,18 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) } else { for (j = 0; j < n_plp[i]; ++j) { const bam_pileup1_t *p = plp[i] + j; - if (bam_get_qual(p->b)[p->qpos] >= conf->min_baseQ) + int c = p->qpos < p->b->core.l_qseq + ? bam_get_qual(p->b)[p->qpos] + : 0; + if (c >= conf->min_baseQ) pileup_seq(pileup_fp, plp[i] + j, pos, ref_len, ref); } putc('\t', pileup_fp); for (j = 0; j < n_plp[i]; ++j) { const bam_pileup1_t *p = plp[i] + j; - int c = bam_get_qual(p->b)[p->qpos]; + int c = p->qpos < p->b->core.l_qseq + ? bam_get_qual(p->b)[p->qpos] + : 0; if (c >= conf->min_baseQ) { c = c + 33 < 126? 
c + 33 : 126; putc(c, pileup_fp); diff --git a/samtools/bam_sort.c b/samtools/bam_sort.c index c9c1af31..e721c594 100644 --- a/samtools/bam_sort.c +++ b/samtools/bam_sort.c @@ -230,9 +230,6 @@ static void pretty_header(char** text_in_out, int32_t text_len) static void trans_tbl_init(bam_hdr_t* out, bam_hdr_t* translate, trans_tbl_t* tbl, bool merge_rg, bool merge_pg) { - // No need to translate header into itself - if (out == translate) { merge_rg = merge_pg = true; } - tbl->n_targets = translate->n_targets; tbl->tid_trans = (int*)calloc(translate->n_targets, sizeof(int)); tbl->rg_trans = kh_init(c2c); @@ -594,6 +591,7 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode, const char uint64_t idx = 0; char **RG = NULL; hts_itr_t **iter = NULL; + bam_hdr_t **hdr = NULL; trans_tbl_t *translation_tbl = NULL; // Is there a specified pre-prepared header to use for output? @@ -612,6 +610,7 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode, const char fp = (samFile**)calloc(n, sizeof(samFile*)); heap = (heap1_t*)calloc(n, sizeof(heap1_t)); iter = (hts_itr_t**)calloc(n, sizeof(hts_itr_t*)); + hdr = (bam_hdr_t**)calloc(n, sizeof(bam_hdr_t*)); translation_tbl = (trans_tbl_t*)calloc(n, sizeof(trans_tbl_t)); // prepare RG tag from file names if (flag & MERGE_RG) { @@ -641,9 +640,20 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode, const char return -1; } hin = sam_hdr_read(fp[i]); - if (hout == NULL) hout = hin; - trans_tbl_init(hout, hin, translation_tbl+i, flag & MERGE_COMBINE_RG, flag & MERGE_COMBINE_PG); - if (hin != hout) bam_hdr_destroy(hin); + if (hout) + trans_tbl_init(hout, hin, translation_tbl+i, flag & MERGE_COMBINE_RG, flag & MERGE_COMBINE_PG); + else { + // As yet, no headers to merge into... 
+ hout = bam_hdr_dup(hin); + // ...so no need to translate header into itself + trans_tbl_init(hout, hin, translation_tbl+i, true, true); + } + + // TODO sam_itr_next() doesn't yet work for SAM files, + // so for those keep the headers around for use with sam_read1() + if (hts_get_format(fp[i])->format == sam) hdr[i] = hin; + else { bam_hdr_destroy(hin); hdr[i] = NULL; } + if ((translation_tbl+i)->lost_coord_sort && !by_qname) { fprintf(stderr, "[bam_merge_core] Order of targets in file %s caused coordinate sort to be lost\n", fn[i]); } @@ -677,30 +687,38 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode, const char iter[i] = sam_itr_queryi(idx, HTS_IDX_NONE, 0, 0); } hts_idx_destroy(idx); + if (iter[i] == NULL) break; } free(rtrans); } else { for (i = 0; i < n; ++i) { - iter[i] = sam_itr_queryi(NULL, HTS_IDX_REST, 0, 0); - if (iter[i] == NULL) { - fprintf(stderr, "[%s] Memory allocation failed\n", __func__); - return -1; + if (hdr[i] == NULL) { + iter[i] = sam_itr_queryi(NULL, HTS_IDX_REST, 0, 0); + if (iter[i] == NULL) break; } + else iter[i] = NULL; } } + if (i < n) { + fprintf(stderr, "[%s] Memory allocation failed\n", __func__); + return -1; + } + // Load the first read from each file into the heap for (i = 0; i < n; ++i) { heap1_t *h = heap + i; h->i = i; - h->b = (bam1_t*)calloc(1, sizeof(bam1_t)); - if (sam_itr_next(fp[i], iter[i], h->b) >= 0) { + h->b = bam_init1(); + if ((iter[i]? 
sam_itr_next(fp[i], iter[i], h->b) : sam_read1(fp[i], hdr[i], h->b)) >= 0) { bam_translate(h->b, translation_tbl + i); h->pos = ((uint64_t)h->b->core.tid<<32) | (uint32_t)((int32_t)h->b->core.pos+1)<<1 | bam_is_rev(h->b); h->idx = idx++; } else { h->pos = HEAP_EMPTY; + bam_destroy1(h->b); + h->b = NULL; } } @@ -722,13 +740,13 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode, const char bam_aux_append(b, "RG", 'Z', RG_len[heap->i] + 1, (uint8_t*)RG[heap->i]); } sam_write1(fpout, hout, b); - if ((j = sam_itr_next(fp[heap->i], iter[heap->i], b)) >= 0) { + if ((j = (iter[heap->i]? sam_itr_next(fp[heap->i], iter[heap->i], b) : sam_read1(fp[heap->i], hdr[heap->i], b))) >= 0) { bam_translate(b, translation_tbl + heap->i); heap->pos = ((uint64_t)b->core.tid<<32) | (uint32_t)((int)b->core.pos+1)<<1 | bam_is_rev(b); heap->idx = idx++; } else if (j == -1) { heap->pos = HEAP_EMPTY; - free(heap->b->data); free(heap->b); + bam_destroy1(heap->b); heap->b = NULL; } else fprintf(stderr, "[bam_merge_core] '%s' is truncated. Continue anyway.\n", fn[heap->i]); ks_heapadjust(heap, 0, n, heap); @@ -742,11 +760,12 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode, const char for (i = 0; i < n; ++i) { trans_tbl_destroy(translation_tbl + i); hts_itr_destroy(iter[i]); + bam_hdr_destroy(hdr[i]); sam_close(fp[i]); } bam_hdr_destroy(hout); sam_close(fpout); - free(translation_tbl); free(fp); free(heap); free(iter); + free(translation_tbl); free(fp); free(heap); free(iter); free(hdr); return 0; } @@ -1020,12 +1039,12 @@ int bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix, const // write sub files for (;;) { if (k == max_k) { - size_t old_max = max_k; + size_t kk, old_max = max_k; max_k = max_k? 
max_k<<1 : 0x10000; buf = (bam1_t**)realloc(buf, max_k * sizeof(bam1_t*)); - memset(buf + old_max, 0, sizeof(bam1_t*) * (max_k - old_max)); + for (kk = old_max; kk < max_k; ++kk) buf[kk] = NULL; } - if (buf[k] == NULL) buf[k] = (bam1_t*)calloc(1, sizeof(bam1_t)); + if (buf[k] == NULL) buf[k] = bam_init1(); b = buf[k]; if ((ret = sam_read1(fp, header, b)) < 0) break; if (b->l_data < b->m_data>>2) { // shrink @@ -1067,11 +1086,7 @@ int bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix, const free(fns); } // free - for (k = 0; k < max_k; ++k) { - if (!buf[k]) continue; - free(buf[k]->data); - free(buf[k]); - } + for (k = 0; k < max_k; ++k) bam_destroy1(buf[k]); free(buf); bam_hdr_destroy(header); sam_close(fp); diff --git a/samtools/bam_sort.c.pysam.c b/samtools/bam_sort.c.pysam.c index 630eb26d..33d7f5cd 100644 --- a/samtools/bam_sort.c.pysam.c +++ b/samtools/bam_sort.c.pysam.c @@ -232,9 +232,6 @@ static void pretty_header(char** text_in_out, int32_t text_len) static void trans_tbl_init(bam_hdr_t* out, bam_hdr_t* translate, trans_tbl_t* tbl, bool merge_rg, bool merge_pg) { - // No need to translate header into itself - if (out == translate) { merge_rg = merge_pg = true; } - tbl->n_targets = translate->n_targets; tbl->tid_trans = (int*)calloc(translate->n_targets, sizeof(int)); tbl->rg_trans = kh_init(c2c); @@ -596,6 +593,7 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode, const char uint64_t idx = 0; char **RG = NULL; hts_itr_t **iter = NULL; + bam_hdr_t **hdr = NULL; trans_tbl_t *translation_tbl = NULL; // Is there a specified pre-prepared header to use for output? 
@@ -614,6 +612,7 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode, const char fp = (samFile**)calloc(n, sizeof(samFile*)); heap = (heap1_t*)calloc(n, sizeof(heap1_t)); iter = (hts_itr_t**)calloc(n, sizeof(hts_itr_t*)); + hdr = (bam_hdr_t**)calloc(n, sizeof(bam_hdr_t*)); translation_tbl = (trans_tbl_t*)calloc(n, sizeof(trans_tbl_t)); // prepare RG tag from file names if (flag & MERGE_RG) { @@ -643,9 +642,20 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode, const char return -1; } hin = sam_hdr_read(fp[i]); - if (hout == NULL) hout = hin; - trans_tbl_init(hout, hin, translation_tbl+i, flag & MERGE_COMBINE_RG, flag & MERGE_COMBINE_PG); - if (hin != hout) bam_hdr_destroy(hin); + if (hout) + trans_tbl_init(hout, hin, translation_tbl+i, flag & MERGE_COMBINE_RG, flag & MERGE_COMBINE_PG); + else { + // As yet, no headers to merge into... + hout = bam_hdr_dup(hin); + // ...so no need to translate header into itself + trans_tbl_init(hout, hin, translation_tbl+i, true, true); + } + + // TODO sam_itr_next() doesn't yet work for SAM files, + // so for those keep the headers around for use with sam_read1() + if (hts_get_format(fp[i])->format == sam) hdr[i] = hin; + else { bam_hdr_destroy(hin); hdr[i] = NULL; } + if ((translation_tbl+i)->lost_coord_sort && !by_qname) { fprintf(pysamerr, "[bam_merge_core] Order of targets in file %s caused coordinate sort to be lost\n", fn[i]); } @@ -679,30 +689,38 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode, const char iter[i] = sam_itr_queryi(idx, HTS_IDX_NONE, 0, 0); } hts_idx_destroy(idx); + if (iter[i] == NULL) break; } free(rtrans); } else { for (i = 0; i < n; ++i) { - iter[i] = sam_itr_queryi(NULL, HTS_IDX_REST, 0, 0); - if (iter[i] == NULL) { - fprintf(pysamerr, "[%s] Memory allocation failed\n", __func__); - return -1; + if (hdr[i] == NULL) { + iter[i] = sam_itr_queryi(NULL, HTS_IDX_REST, 0, 0); + if (iter[i] == NULL) break; } + else iter[i] = NULL; } } + if (i < n) { 
+ fprintf(pysamerr, "[%s] Memory allocation failed\n", __func__); + return -1; + } + // Load the first read from each file into the heap for (i = 0; i < n; ++i) { heap1_t *h = heap + i; h->i = i; - h->b = (bam1_t*)calloc(1, sizeof(bam1_t)); - if (sam_itr_next(fp[i], iter[i], h->b) >= 0) { + h->b = bam_init1(); + if ((iter[i]? sam_itr_next(fp[i], iter[i], h->b) : sam_read1(fp[i], hdr[i], h->b)) >= 0) { bam_translate(h->b, translation_tbl + i); h->pos = ((uint64_t)h->b->core.tid<<32) | (uint32_t)((int32_t)h->b->core.pos+1)<<1 | bam_is_rev(h->b); h->idx = idx++; } else { h->pos = HEAP_EMPTY; + bam_destroy1(h->b); + h->b = NULL; } } @@ -724,13 +742,13 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode, const char bam_aux_append(b, "RG", 'Z', RG_len[heap->i] + 1, (uint8_t*)RG[heap->i]); } sam_write1(fpout, hout, b); - if ((j = sam_itr_next(fp[heap->i], iter[heap->i], b)) >= 0) { + if ((j = (iter[heap->i]? sam_itr_next(fp[heap->i], iter[heap->i], b) : sam_read1(fp[heap->i], hdr[heap->i], b))) >= 0) { bam_translate(b, translation_tbl + heap->i); heap->pos = ((uint64_t)b->core.tid<<32) | (uint32_t)((int)b->core.pos+1)<<1 | bam_is_rev(b); heap->idx = idx++; } else if (j == -1) { heap->pos = HEAP_EMPTY; - free(heap->b->data); free(heap->b); + bam_destroy1(heap->b); heap->b = NULL; } else fprintf(pysamerr, "[bam_merge_core] '%s' is truncated. 
Continue anyway.\n", fn[heap->i]); ks_heapadjust(heap, 0, n, heap); @@ -744,11 +762,12 @@ int bam_merge_core2(int by_qname, const char *out, const char *mode, const char for (i = 0; i < n; ++i) { trans_tbl_destroy(translation_tbl + i); hts_itr_destroy(iter[i]); + bam_hdr_destroy(hdr[i]); sam_close(fp[i]); } bam_hdr_destroy(hout); sam_close(fpout); - free(translation_tbl); free(fp); free(heap); free(iter); + free(translation_tbl); free(fp); free(heap); free(iter); free(hdr); return 0; } @@ -1022,12 +1041,12 @@ int bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix, const // write sub files for (;;) { if (k == max_k) { - size_t old_max = max_k; + size_t kk, old_max = max_k; max_k = max_k? max_k<<1 : 0x10000; buf = (bam1_t**)realloc(buf, max_k * sizeof(bam1_t*)); - memset(buf + old_max, 0, sizeof(bam1_t*) * (max_k - old_max)); + for (kk = old_max; kk < max_k; ++kk) buf[kk] = NULL; } - if (buf[k] == NULL) buf[k] = (bam1_t*)calloc(1, sizeof(bam1_t)); + if (buf[k] == NULL) buf[k] = bam_init1(); b = buf[k]; if ((ret = sam_read1(fp, header, b)) < 0) break; if (b->l_data < b->m_data>>2) { // shrink @@ -1069,11 +1088,7 @@ int bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix, const free(fns); } // free - for (k = 0; k < max_k; ++k) { - if (!buf[k]) continue; - free(buf[k]->data); - free(buf[k]); - } + for (k = 0; k < max_k; ++k) bam_destroy1(buf[k]); free(buf); bam_hdr_destroy(header); sam_close(fp); diff --git a/samtools/bam_stat.c b/samtools/bam_stat.c index 5dbe04f4..1bbebdb8 100644 --- a/samtools/bam_stat.c +++ b/samtools/bam_stat.c @@ -23,7 +23,13 @@ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ #include -#include "bam.h" +#include +#include +#include +#include + +#include "htslib/sam.h" +//#include "bam.h" #include "samtools.h" typedef struct { @@ -59,7 +65,7 @@ typedef struct { if ((c)->flag & BAM_FDUP) ++(s)->n_dup[w]; \ } while (0) -bam_flagstat_t *bam_flagstat_core(bamFile fp) +bam_flagstat_t *bam_flagstat_core(samFile *fp, bam_hdr_t *h) { bam_flagstat_t *s; bam1_t *b; @@ -68,7 +74,7 @@ bam_flagstat_t *bam_flagstat_core(bamFile fp) s = (bam_flagstat_t*)calloc(1, sizeof(bam_flagstat_t)); b = bam_init1(); c = &b->core; - while ((ret = bam_read1(fp, b)) >= 0) + while ((ret = sam_read1(fp, h, b)) >= 0) flagstat_loop(s, c); bam_destroy1(b); if (ret != -1) @@ -77,23 +83,35 @@ bam_flagstat_t *bam_flagstat_core(bamFile fp) } int bam_flagstat(int argc, char *argv[]) { - bamFile fp; - bam_header_t *header; + samFile *fp; + bam_hdr_t *header; bam_flagstat_t *s; if (argc == optind) { fprintf(stderr, "Usage: samtools flagstat \n"); return 1; } - fp = strcmp(argv[optind], "-")? bam_open(argv[optind], "r") : bam_dopen(STDIN_FILENO, "r"); + fp = sam_open(argv[optind], "r"); if (fp == NULL) { print_error_errno("Cannot open input file \"%s\"", argv[optind]); return 1; } - header = bam_header_read(fp); - s = bam_flagstat_core(fp); + + if (hts_set_opt(fp, CRAM_OPT_REQUIRED_FIELDS, + SAM_FLAG | SAM_MAPQ | SAM_RNEXT)) { + fprintf(stderr, "Failed to set CRAM_OPT_REQUIRED_FIELDS value\n"); + return 1; + } + + if (hts_set_opt(fp, CRAM_OPT_DECODE_MD, 0)) { + fprintf(stderr, "Failed to set CRAM_OPT_DECODE_MD value\n"); + return 1; + } + + header = sam_hdr_read(fp); + s = bam_flagstat_core(fp, header); printf("%lld + %lld in total (QC-passed reads + QC-failed reads)\n", s->n_reads[0], s->n_reads[1]); printf("%lld + %lld secondary\n", s->n_secondary[0], s->n_secondary[1]); - printf("%lld + %lld supplimentary\n", s->n_supp[0], s->n_supp[1]); + printf("%lld + %lld supplementary\n", s->n_supp[0], s->n_supp[1]); printf("%lld + %lld duplicates\n", s->n_dup[0], s->n_dup[1]); 
printf("%lld + %lld mapped (%.2f%%:%.2f%%)\n", s->n_mapped[0], s->n_mapped[1], (float)s->n_mapped[0] / s->n_reads[0] * 100.0, (float)s->n_mapped[1] / s->n_reads[1] * 100.0); printf("%lld + %lld paired in sequencing\n", s->n_pair_all[0], s->n_pair_all[1]); @@ -105,7 +123,7 @@ int bam_flagstat(int argc, char *argv[]) printf("%lld + %lld with mate mapped to a different chr\n", s->n_diffchr[0], s->n_diffchr[1]); printf("%lld + %lld with mate mapped to a different chr (mapQ>=5)\n", s->n_diffhigh[0], s->n_diffhigh[1]); free(s); - bam_header_destroy(header); - bam_close(fp); + bam_hdr_destroy(header); + sam_close(fp); return 0; } diff --git a/samtools/bam_stat.c.pysam.c b/samtools/bam_stat.c.pysam.c index a07d32de..15a12426 100644 --- a/samtools/bam_stat.c.pysam.c +++ b/samtools/bam_stat.c.pysam.c @@ -25,7 +25,13 @@ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #include -#include "bam.h" +#include +#include +#include +#include + +#include "htslib/sam.h" +//#include "bam.h" #include "samtools.h" typedef struct { @@ -61,7 +67,7 @@ typedef struct { if ((c)->flag & BAM_FDUP) ++(s)->n_dup[w]; \ } while (0) -bam_flagstat_t *bam_flagstat_core(bamFile fp) +bam_flagstat_t *bam_flagstat_core(samFile *fp, bam_hdr_t *h) { bam_flagstat_t *s; bam1_t *b; @@ -70,7 +76,7 @@ bam_flagstat_t *bam_flagstat_core(bamFile fp) s = (bam_flagstat_t*)calloc(1, sizeof(bam_flagstat_t)); b = bam_init1(); c = &b->core; - while ((ret = bam_read1(fp, b)) >= 0) + while ((ret = sam_read1(fp, h, b)) >= 0) flagstat_loop(s, c); bam_destroy1(b); if (ret != -1) @@ -79,23 +85,35 @@ bam_flagstat_t *bam_flagstat_core(bamFile fp) } int bam_flagstat(int argc, char *argv[]) { - bamFile fp; - bam_header_t *header; + samFile *fp; + bam_hdr_t *header; bam_flagstat_t *s; if (argc == optind) { fprintf(pysamerr, "Usage: samtools flagstat \n"); return 1; } - fp = strcmp(argv[optind], "-")? 
bam_open(argv[optind], "r") : bam_dopen(STDIN_FILENO, "r"); + fp = sam_open(argv[optind], "r"); if (fp == NULL) { print_error_errno("Cannot open input file \"%s\"", argv[optind]); return 1; } - header = bam_header_read(fp); - s = bam_flagstat_core(fp); + + if (hts_set_opt(fp, CRAM_OPT_REQUIRED_FIELDS, + SAM_FLAG | SAM_MAPQ | SAM_RNEXT)) { + fprintf(pysamerr, "Failed to set CRAM_OPT_REQUIRED_FIELDS value\n"); + return 1; + } + + if (hts_set_opt(fp, CRAM_OPT_DECODE_MD, 0)) { + fprintf(pysamerr, "Failed to set CRAM_OPT_DECODE_MD value\n"); + return 1; + } + + header = sam_hdr_read(fp); + s = bam_flagstat_core(fp, header); printf("%lld + %lld in total (QC-passed reads + QC-failed reads)\n", s->n_reads[0], s->n_reads[1]); printf("%lld + %lld secondary\n", s->n_secondary[0], s->n_secondary[1]); - printf("%lld + %lld supplimentary\n", s->n_supp[0], s->n_supp[1]); + printf("%lld + %lld supplementary\n", s->n_supp[0], s->n_supp[1]); printf("%lld + %lld duplicates\n", s->n_dup[0], s->n_dup[1]); printf("%lld + %lld mapped (%.2f%%:%.2f%%)\n", s->n_mapped[0], s->n_mapped[1], (float)s->n_mapped[0] / s->n_reads[0] * 100.0, (float)s->n_mapped[1] / s->n_reads[1] * 100.0); printf("%lld + %lld paired in sequencing\n", s->n_pair_all[0], s->n_pair_all[1]); @@ -107,7 +125,7 @@ int bam_flagstat(int argc, char *argv[]) printf("%lld + %lld with mate mapped to a different chr\n", s->n_diffchr[0], s->n_diffchr[1]); printf("%lld + %lld with mate mapped to a different chr (mapQ>=5)\n", s->n_diffhigh[0], s->n_diffhigh[1]); free(s); - bam_header_destroy(header); - bam_close(fp); + bam_hdr_destroy(header); + sam_close(fp); return 0; } diff --git a/samtools/errmod.c b/samtools/errmod.c index 9f5740bc..e7759a04 100644 --- a/samtools/errmod.c +++ b/samtools/errmod.c @@ -134,11 +134,10 @@ int errmod_cal(const errmod_t *em, int n, int m, uint16_t *bases, float *q) // The total count of each base observed per strand int w[32]; - /* zero out q */ - memset(q, 0, m * m * sizeof(float)); + memset(q, 0, m * 
m * sizeof(float)); // initialise q to 0 if (n == 0) return 0; - // calculate aux.esum and aux.fsum - if (n > 255) { // then sample 255 bases + // This section randomly downsamples to 255 depth so as not to go beyond our precalculated matrix + if (n > 255) { // if we exceed 255 bases observed then shuffle them to sample and only keep the first 255 ks_shuffle(uint16_t, n, bases); n = 255; } diff --git a/samtools/errmod.c.pysam.c b/samtools/errmod.c.pysam.c index c19a2012..db570013 100644 --- a/samtools/errmod.c.pysam.c +++ b/samtools/errmod.c.pysam.c @@ -136,11 +136,10 @@ int errmod_cal(const errmod_t *em, int n, int m, uint16_t *bases, float *q) // The total count of each base observed per strand int w[32]; - /* zero out q */ - memset(q, 0, m * m * sizeof(float)); + memset(q, 0, m * m * sizeof(float)); // initialise q to 0 if (n == 0) return 0; - // calculate aux.esum and aux.fsum - if (n > 255) { // then sample 255 bases + // This section randomly downsamples to 255 depth so as not to go beyond our precalculated matrix + if (n > 255) { // if we exceed 255 bases observed then shuffle them to sample and only keep the first 255 ks_shuffle(uint16_t, n, bases); n = 255; } diff --git a/samtools/kaln.c b/samtools/kaln.c deleted file mode 100644 index cd4826e1..00000000 --- a/samtools/kaln.c +++ /dev/null @@ -1,486 +0,0 @@ -/* The MIT License - - Copyright (C) 2003-2006, 2008-2010 by Heng Li - - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. 
- - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. -*/ - -#include -#include -#include -#include -#include -#include "kaln.h" - -#define FROM_M 0 -#define FROM_I 1 -#define FROM_D 2 - -typedef struct { - int i, j; - unsigned char ctype; -} path_t; - -int aln_sm_blosum62[] = { -/* A R N D C Q E G H I L K M F P S T W Y V * X */ - 4,-1,-2,-2, 0,-1,-1, 0,-2,-1,-1,-1,-1,-2,-1, 1, 0,-3,-2, 0,-4, 0, - -1, 5, 0,-2,-3, 1, 0,-2, 0,-3,-2, 2,-1,-3,-2,-1,-1,-3,-2,-3,-4,-1, - -2, 0, 6, 1,-3, 0, 0, 0, 1,-3,-3, 0,-2,-3,-2, 1, 0,-4,-2,-3,-4,-1, - -2,-2, 1, 6,-3, 0, 2,-1,-1,-3,-4,-1,-3,-3,-1, 0,-1,-4,-3,-3,-4,-1, - 0,-3,-3,-3, 9,-3,-4,-3,-3,-1,-1,-3,-1,-2,-3,-1,-1,-2,-2,-1,-4,-2, - -1, 1, 0, 0,-3, 5, 2,-2, 0,-3,-2, 1, 0,-3,-1, 0,-1,-2,-1,-2,-4,-1, - -1, 0, 0, 2,-4, 2, 5,-2, 0,-3,-3, 1,-2,-3,-1, 0,-1,-3,-2,-2,-4,-1, - 0,-2, 0,-1,-3,-2,-2, 6,-2,-4,-4,-2,-3,-3,-2, 0,-2,-2,-3,-3,-4,-1, - -2, 0, 1,-1,-3, 0, 0,-2, 8,-3,-3,-1,-2,-1,-2,-1,-2,-2, 2,-3,-4,-1, - -1,-3,-3,-3,-1,-3,-3,-4,-3, 4, 2,-3, 1, 0,-3,-2,-1,-3,-1, 3,-4,-1, - -1,-2,-3,-4,-1,-2,-3,-4,-3, 2, 4,-2, 2, 0,-3,-2,-1,-2,-1, 1,-4,-1, - -1, 2, 0,-1,-3, 1, 1,-2,-1,-3,-2, 5,-1,-3,-1, 0,-1,-3,-2,-2,-4,-1, - -1,-1,-2,-3,-1, 0,-2,-3,-2, 1, 2,-1, 5, 0,-2,-1,-1,-1,-1, 1,-4,-1, - -2,-3,-3,-3,-2,-3,-3,-3,-1, 0, 0,-3, 0, 6,-4,-2,-2, 1, 3,-1,-4,-1, - -1,-2,-2,-1,-3,-1,-1,-2,-2,-3,-3,-1,-2,-4, 7,-1,-1,-4,-3,-2,-4,-2, - 1,-1, 1, 0,-1, 0, 0, 0,-1,-2,-2, 0,-1,-2,-1, 4, 1,-3,-2,-2,-4, 0, - 0,-1, 0,-1,-1,-1,-1,-2,-2,-1,-1,-1,-1,-2,-1, 1, 5,-2,-2, 0,-4, 0, - -3,-3,-4,-4,-2,-2,-3,-2,-2,-3,-2,-3,-1, 1,-4,-3,-2,11, 2,-3,-4,-2, - 
-2,-2,-2,-3,-2,-1,-2,-3, 2,-1,-1,-2,-1, 3,-3,-2,-2, 2, 7,-1,-4,-1, - 0,-3,-3,-3,-1,-2,-2,-3,-3, 3, 1,-2, 1,-1,-2,-2, 0,-3,-1, 4,-4,-1, - -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4, 1,-4, - 0,-1,-1,-1,-2,-1,-1,-1,-1,-1,-1,-1,-1,-1,-2, 0, 0,-2,-1,-1,-4,-1 -}; - -int aln_sm_blast[] = { - 1, -3, -3, -3, -2, - -3, 1, -3, -3, -2, - -3, -3, 1, -3, -2, - -3, -3, -3, 1, -2, - -2, -2, -2, -2, -2 -}; - -int aln_sm_qual[] = { - 0, -23, -23, -23, 0, - -23, 0, -23, -23, 0, - -23, -23, 0, -23, 0, - -23, -23, -23, 0, 0, - 0, 0, 0, 0, 0 -}; - -ka_param_t ka_param_blast = { 5, 2, 5, 2, aln_sm_blast, 5, 50 }; -ka_param_t ka_param_aa2aa = { 10, 2, 10, 2, aln_sm_blosum62, 22, 50 }; - -ka_param2_t ka_param2_qual = { 37, 11, 37, 11, 37, 11, 0, 0, aln_sm_qual, 5, 50 }; - -static uint32_t *ka_path2cigar32(const path_t *path, int path_len, int *n_cigar) -{ - int i, n; - uint32_t *cigar; - unsigned char last_type; - - if (path_len == 0 || path == 0) { - *n_cigar = 0; - return 0; - } - - last_type = path->ctype; - for (i = n = 1; i < path_len; ++i) { - if (last_type != path[i].ctype) ++n; - last_type = path[i].ctype; - } - *n_cigar = n; - cigar = (uint32_t*)calloc(*n_cigar, 4); - - cigar[0] = 1u << 4 | path[path_len-1].ctype; - last_type = path[path_len-1].ctype; - for (i = path_len - 2, n = 0; i >= 0; --i) { - if (path[i].ctype == last_type) cigar[n] += 1u << 4; - else { - cigar[++n] = 1u << 4 | path[i].ctype; - last_type = path[i].ctype; - } - } - - return cigar; -} - -/***************************/ -/* START OF common_align.c */ -/***************************/ - -#define SET_INF(s) (s).M = (s).I = (s).D = MINOR_INF; - -#define set_M(MM, cur, p, sc) \ -{ \ - if ((p)->M >= (p)->I) { \ - if ((p)->M >= (p)->D) { \ - (MM) = (p)->M + (sc); (cur)->Mt = FROM_M; \ - } else { \ - (MM) = (p)->D + (sc); (cur)->Mt = FROM_D; \ - } \ - } else { \ - if ((p)->I > (p)->D) { \ - (MM) = (p)->I + (sc); (cur)->Mt = FROM_I; \ - } else { \ - (MM) = (p)->D + (sc); (cur)->Mt = FROM_D; \ - } \ - } \ 
-} -#define set_I(II, cur, p) \ -{ \ - if ((p)->M - gap_open > (p)->I) { \ - (cur)->It = FROM_M; \ - (II) = (p)->M - gap_open - gap_ext; \ - } else { \ - (cur)->It = FROM_I; \ - (II) = (p)->I - gap_ext; \ - } \ -} -#define set_end_I(II, cur, p) \ -{ \ - if (gap_end_ext >= 0) { \ - if ((p)->M - gap_end_open > (p)->I) { \ - (cur)->It = FROM_M; \ - (II) = (p)->M - gap_end_open - gap_end_ext; \ - } else { \ - (cur)->It = FROM_I; \ - (II) = (p)->I - gap_end_ext; \ - } \ - } else set_I(II, cur, p); \ -} -#define set_D(DD, cur, p) \ -{ \ - if ((p)->M - gap_open > (p)->D) { \ - (cur)->Dt = FROM_M; \ - (DD) = (p)->M - gap_open - gap_ext; \ - } else { \ - (cur)->Dt = FROM_D; \ - (DD) = (p)->D - gap_ext; \ - } \ -} -#define set_end_D(DD, cur, p) \ -{ \ - if (gap_end_ext >= 0) { \ - if ((p)->M - gap_end_open > (p)->D) { \ - (cur)->Dt = FROM_M; \ - (DD) = (p)->M - gap_end_open - gap_end_ext; \ - } else { \ - (cur)->Dt = FROM_D; \ - (DD) = (p)->D - gap_end_ext; \ - } \ - } else set_D(DD, cur, p); \ -} - -typedef struct { - uint8_t Mt:3, It:2, Dt:3; -} dpcell_t; - -typedef struct { - int M, I, D; -} dpscore_t; - -/*************************** - * banded global alignment * - ***************************/ -uint32_t *ka_global_core(uint8_t *seq1, int len1, uint8_t *seq2, int len2, const ka_param_t *ap, int *_score, int *n_cigar) -{ - int i, j; - dpcell_t **dpcell, *q; - dpscore_t *curr, *last, *s; - int b1, b2, tmp_end; - int *mat, end, max = 0; - uint8_t type, ctype; - uint32_t *cigar = 0; - - int gap_open, gap_ext, gap_end_open, gap_end_ext, b; - int *score_matrix, N_MATRIX_ROW; - - /* initialize some align-related parameters. 
just for compatibility */ - gap_open = ap->gap_open; - gap_ext = ap->gap_ext; - gap_end_open = ap->gap_end_open; - gap_end_ext = ap->gap_end_ext; - b = ap->band_width; - score_matrix = ap->matrix; - N_MATRIX_ROW = ap->row; - - if (n_cigar) *n_cigar = 0; - if (len1 == 0 || len2 == 0) return 0; - - /* calculate b1 and b2 */ - if (len1 > len2) { - b1 = len1 - len2 + b; - b2 = b; - } else { - b1 = b; - b2 = len2 - len1 + b; - } - if (b1 > len1) b1 = len1; - if (b2 > len2) b2 = len2; - --seq1; --seq2; - - /* allocate memory */ - end = (b1 + b2 <= len1)? (b1 + b2 + 1) : (len1 + 1); - dpcell = (dpcell_t**)malloc(sizeof(dpcell_t*) * (len2 + 1)); - for (j = 0; j <= len2; ++j) - dpcell[j] = (dpcell_t*)malloc(sizeof(dpcell_t) * end); - for (j = b2 + 1; j <= len2; ++j) - dpcell[j] -= j - b2; - curr = (dpscore_t*)malloc(sizeof(dpscore_t) * (len1 + 1)); - last = (dpscore_t*)malloc(sizeof(dpscore_t) * (len1 + 1)); - - /* set first row */ - SET_INF(*curr); curr->M = 0; - for (i = 1, s = curr + 1; i < b1; ++i, ++s) { - SET_INF(*s); - set_end_D(s->D, dpcell[0] + i, s - 1); - } - s = curr; curr = last; last = s; - - /* core dynamic programming, part 1 */ - tmp_end = (b2 < len2)? b2 : len2 - 1; - for (j = 1; j <= tmp_end; ++j) { - q = dpcell[j]; s = curr; SET_INF(*s); - set_end_I(s->I, q, last); - end = (j + b1 <= len1 + 1)? (j + b1 - 1) : len1; - mat = score_matrix + seq2[j] * N_MATRIX_ROW; - ++s; ++q; - for (i = 1; i != end; ++i, ++s, ++q) { - set_M(s->M, q, last + i - 1, mat[seq1[i]]); /* this will change s->M ! */ - set_I(s->I, q, last + i); - set_D(s->D, q, s - 1); - } - set_M(s->M, q, last + i - 1, mat[seq1[i]]); - set_D(s->D, q, s - 1); - if (j + b1 - 1 > len1) { /* bug fixed, 040227 */ - set_end_I(s->I, q, last + i); - } else s->I = MINOR_INF; - s = curr; curr = last; last = s; - } - /* last row for part 1, use set_end_D() instead of set_D() */ - if (j == len2 && b2 != len2 - 1) { - q = dpcell[j]; s = curr; SET_INF(*s); - set_end_I(s->I, q, last); - end = (j + b1 <= len1 + 1)? 
(j + b1 - 1) : len1; - mat = score_matrix + seq2[j] * N_MATRIX_ROW; - ++s; ++q; - for (i = 1; i != end; ++i, ++s, ++q) { - set_M(s->M, q, last + i - 1, mat[seq1[i]]); /* this will change s->M ! */ - set_I(s->I, q, last + i); - set_end_D(s->D, q, s - 1); - } - set_M(s->M, q, last + i - 1, mat[seq1[i]]); - set_end_D(s->D, q, s - 1); - if (j + b1 - 1 > len1) { /* bug fixed, 040227 */ - set_end_I(s->I, q, last + i); - } else s->I = MINOR_INF; - s = curr; curr = last; last = s; - ++j; - } - - /* core dynamic programming, part 2 */ - for (; j <= len2 - b2 + 1; ++j) { - SET_INF(curr[j - b2]); - mat = score_matrix + seq2[j] * N_MATRIX_ROW; - end = j + b1 - 1; - for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i != end; ++i, ++s, ++q) { - set_M(s->M, q, last + i - 1, mat[seq1[i]]); - set_I(s->I, q, last + i); - set_D(s->D, q, s - 1); - } - set_M(s->M, q, last + i - 1, mat[seq1[i]]); - set_D(s->D, q, s - 1); - s->I = MINOR_INF; - s = curr; curr = last; last = s; - } - - /* core dynamic programming, part 3 */ - for (; j < len2; ++j) { - SET_INF(curr[j - b2]); - mat = score_matrix + seq2[j] * N_MATRIX_ROW; - for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i < len1; ++i, ++s, ++q) { - set_M(s->M, q, last + i - 1, mat[seq1[i]]); - set_I(s->I, q, last + i); - set_D(s->D, q, s - 1); - } - set_M(s->M, q, last + len1 - 1, mat[seq1[i]]); - set_end_I(s->I, q, last + i); - set_D(s->D, q, s - 1); - s = curr; curr = last; last = s; - } - /* last row */ - if (j == len2) { - SET_INF(curr[j - b2]); - mat = score_matrix + seq2[j] * N_MATRIX_ROW; - for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i < len1; ++i, ++s, ++q) { - set_M(s->M, q, last + i - 1, mat[seq1[i]]); - set_I(s->I, q, last + i); - set_end_D(s->D, q, s - 1); - } - set_M(s->M, q, last + len1 - 1, mat[seq1[i]]); - set_end_I(s->I, q, last + i); - set_end_D(s->D, q, s - 1); - s = curr; curr = last; last = s; - } - - *_score = last[len1].M; - if (n_cigar) { /* backtrace */ - path_t *p, *path = 
(path_t*)malloc(sizeof(path_t) * (len1 + len2 + 2)); - i = len1; j = len2; - q = dpcell[j] + i; - s = last + len1; - max = s->M; type = q->Mt; ctype = FROM_M; - if (s->I > max) { max = s->I; type = q->It; ctype = FROM_I; } - if (s->D > max) { max = s->D; type = q->Dt; ctype = FROM_D; } - - p = path; - p->ctype = ctype; p->i = i; p->j = j; /* bug fixed 040408 */ - ++p; - do { - switch (ctype) { - case FROM_M: --i; --j; break; - case FROM_I: --j; break; - case FROM_D: --i; break; - } - q = dpcell[j] + i; - ctype = type; - switch (type) { - case FROM_M: type = q->Mt; break; - case FROM_I: type = q->It; break; - case FROM_D: type = q->Dt; break; - } - p->ctype = ctype; p->i = i; p->j = j; - ++p; - } while (i || j); - cigar = ka_path2cigar32(path, p - path - 1, n_cigar); - free(path); - } - - /* free memory */ - for (j = b2 + 1; j <= len2; ++j) - dpcell[j] += j - b2; - for (j = 0; j <= len2; ++j) - free(dpcell[j]); - free(dpcell); - free(curr); free(last); - - return cigar; -} - -typedef struct { - int M, I, D; -} score_aux_t; - -#define MINUS_INF -0x40000000 - -// matrix: len2 rows and len1 columns -int ka_global_score(const uint8_t *_seq1, int len1, const uint8_t *_seq2, int len2, const ka_param2_t *ap) -{ - -#define __score_aux(_p, _q0, _sc, _io, _ie, _do, _de) { \ - int t1, t2; \ - score_aux_t *_q; \ - _q = _q0; \ - _p->M = _q->M >= _q->I? _q->M : _q->I; \ - _p->M = _p->M >= _q->D? _p->M : _q->D; \ - _p->M += (_sc); \ - ++_q; t1 = _q->M - _io - _ie; t2 = _q->I - _ie; _p->I = t1 >= t2? t1 : t2; \ - _q = _p-1; t1 = _q->M - _do - _de; t2 = _q->D - _de; _p->D = t1 >= t2? t1 : t2; \ - } - - int i, j, bw, scmat_size = ap->row, *scmat = ap->matrix, ret; - const uint8_t *seq1, *seq2; - score_aux_t *curr, *last, *swap; - bw = abs(len1 - len2) + ap->band_width; - i = len1 > len2? 
len1 : len2; - if (bw > i + 1) bw = i + 1; - seq1 = _seq1 - 1; seq2 = _seq2 - 1; - curr = calloc(len1 + 2, sizeof(score_aux_t)); - last = calloc(len1 + 2, sizeof(score_aux_t)); - { // the zero-th row - int x, end = len1; - score_aux_t *p; - j = 0; - x = j + bw; end = len1 < x? len1 : x; // band end - p = curr; - p->M = 0; p->I = p->D = MINUS_INF; - for (i = 1, p = &curr[1]; i <= end; ++i, ++p) - p->M = p->I = MINUS_INF, p->D = -(ap->edo + ap->ede * i); - p->M = p->I = p->D = MINUS_INF; - swap = curr; curr = last; last = swap; - } - for (j = 1; j < len2; ++j) { - int x, beg = 0, end = len1, *scrow, col_end; - score_aux_t *p; - x = j - bw; beg = 0 > x? 0 : x; // band start - x = j + bw; end = len1 < x? len1 : x; // band end - if (beg == 0) { // from zero-th column - p = curr; - p->M = p->D = MINUS_INF; p->I = -(ap->eio + ap->eie * j); - ++beg; // then beg = 1 - } - scrow = scmat + seq2[j] * scmat_size; - if (end == len1) col_end = 1, --end; - else col_end = 0; - for (i = beg, p = &curr[beg]; i <= end; ++i, ++p) - __score_aux(p, &last[i-1], scrow[(int)seq1[i]], ap->iio, ap->iie, ap->ido, ap->ide); - if (col_end) { - __score_aux(p, &last[i-1], scrow[(int)seq1[i]], ap->eio, ap->eie, ap->ido, ap->ide); - ++p; - } - p->M = p->I = p->D = MINUS_INF; -// for (i = 0; i <= len1; ++i) printf("(%d,%d,%d) ", curr[i].M, curr[i].I, curr[i].D); putchar('\n'); - swap = curr; curr = last; last = swap; - } - { // the last row - int x, beg = 0, *scrow; - score_aux_t *p; - j = len2; - x = j - bw; beg = 0 > x? 
0 : x; // band start - if (beg == 0) { // from zero-th column - p = curr; - p->M = p->D = MINUS_INF; p->I = -(ap->eio + ap->eie * j); - ++beg; // then beg = 1 - } - scrow = scmat + seq2[j] * scmat_size; - for (i = beg, p = &curr[beg]; i < len1; ++i, ++p) - __score_aux(p, &last[i-1], scrow[(int)seq1[i]], ap->iio, ap->iie, ap->edo, ap->ede); - __score_aux(p, &last[i-1], scrow[(int)seq1[i]], ap->eio, ap->eie, ap->edo, ap->ede); -// for (i = 0; i <= len1; ++i) printf("(%d,%d,%d) ", curr[i].M, curr[i].I, curr[i].D); putchar('\n'); - } - ret = curr[len1].M >= curr[len1].I? curr[len1].M : curr[len1].I; - ret = ret >= curr[len1].D? ret : curr[len1].D; - free(curr); free(last); - return ret; -} - -#ifdef _MAIN -int main(int argc, char *argv[]) -{ -// int len1 = 35, len2 = 35; -// uint8_t *seq1 = (uint8_t*)"\0\0\3\3\2\0\0\0\1\0\2\1\2\1\3\2\3\3\3\0\2\3\2\1\1\3\3\3\2\3\3\1\0\0\1"; -// uint8_t *seq2 = (uint8_t*)"\0\0\3\3\2\0\0\0\1\0\2\1\2\1\3\2\3\3\3\0\2\3\2\1\1\3\3\3\2\3\3\1\0\1\0"; - int len1 = 4, len2 = 4; - uint8_t *seq1 = (uint8_t*)"\1\0\0\1"; - uint8_t *seq2 = (uint8_t*)"\1\0\1\0"; - int sc; -// ka_global_core(seq1, 2, seq2, 1, &ka_param_qual, &sc, 0); - sc = ka_global_score(seq1, len1, seq2, len2, &ka_param2_qual); - printf("%d\n", sc); - return 0; -} -#endif diff --git a/samtools/kaln.c.pysam.c b/samtools/kaln.c.pysam.c deleted file mode 100644 index 1922cc1f..00000000 --- a/samtools/kaln.c.pysam.c +++ /dev/null @@ -1,488 +0,0 @@ -#include "pysam.h" - -/* The MIT License - - Copyright (C) 2003-2006, 2008-2010 by Heng Li - - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The 
above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. -*/ - -#include -#include -#include -#include -#include -#include "kaln.h" - -#define FROM_M 0 -#define FROM_I 1 -#define FROM_D 2 - -typedef struct { - int i, j; - unsigned char ctype; -} path_t; - -int aln_sm_blosum62[] = { -/* A R N D C Q E G H I L K M F P S T W Y V * X */ - 4,-1,-2,-2, 0,-1,-1, 0,-2,-1,-1,-1,-1,-2,-1, 1, 0,-3,-2, 0,-4, 0, - -1, 5, 0,-2,-3, 1, 0,-2, 0,-3,-2, 2,-1,-3,-2,-1,-1,-3,-2,-3,-4,-1, - -2, 0, 6, 1,-3, 0, 0, 0, 1,-3,-3, 0,-2,-3,-2, 1, 0,-4,-2,-3,-4,-1, - -2,-2, 1, 6,-3, 0, 2,-1,-1,-3,-4,-1,-3,-3,-1, 0,-1,-4,-3,-3,-4,-1, - 0,-3,-3,-3, 9,-3,-4,-3,-3,-1,-1,-3,-1,-2,-3,-1,-1,-2,-2,-1,-4,-2, - -1, 1, 0, 0,-3, 5, 2,-2, 0,-3,-2, 1, 0,-3,-1, 0,-1,-2,-1,-2,-4,-1, - -1, 0, 0, 2,-4, 2, 5,-2, 0,-3,-3, 1,-2,-3,-1, 0,-1,-3,-2,-2,-4,-1, - 0,-2, 0,-1,-3,-2,-2, 6,-2,-4,-4,-2,-3,-3,-2, 0,-2,-2,-3,-3,-4,-1, - -2, 0, 1,-1,-3, 0, 0,-2, 8,-3,-3,-1,-2,-1,-2,-1,-2,-2, 2,-3,-4,-1, - -1,-3,-3,-3,-1,-3,-3,-4,-3, 4, 2,-3, 1, 0,-3,-2,-1,-3,-1, 3,-4,-1, - -1,-2,-3,-4,-1,-2,-3,-4,-3, 2, 4,-2, 2, 0,-3,-2,-1,-2,-1, 1,-4,-1, - -1, 2, 0,-1,-3, 1, 1,-2,-1,-3,-2, 5,-1,-3,-1, 0,-1,-3,-2,-2,-4,-1, - -1,-1,-2,-3,-1, 0,-2,-3,-2, 1, 2,-1, 5, 0,-2,-1,-1,-1,-1, 1,-4,-1, - -2,-3,-3,-3,-2,-3,-3,-3,-1, 0, 0,-3, 0, 6,-4,-2,-2, 1, 3,-1,-4,-1, - -1,-2,-2,-1,-3,-1,-1,-2,-2,-3,-3,-1,-2,-4, 7,-1,-1,-4,-3,-2,-4,-2, - 1,-1, 1, 0,-1, 0, 0, 0,-1,-2,-2, 0,-1,-2,-1, 4, 1,-3,-2,-2,-4, 0, - 0,-1, 
0,-1,-1,-1,-1,-2,-2,-1,-1,-1,-1,-2,-1, 1, 5,-2,-2, 0,-4, 0, - -3,-3,-4,-4,-2,-2,-3,-2,-2,-3,-2,-3,-1, 1,-4,-3,-2,11, 2,-3,-4,-2, - -2,-2,-2,-3,-2,-1,-2,-3, 2,-1,-1,-2,-1, 3,-3,-2,-2, 2, 7,-1,-4,-1, - 0,-3,-3,-3,-1,-2,-2,-3,-3, 3, 1,-2, 1,-1,-2,-2, 0,-3,-1, 4,-4,-1, - -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4, 1,-4, - 0,-1,-1,-1,-2,-1,-1,-1,-1,-1,-1,-1,-1,-1,-2, 0, 0,-2,-1,-1,-4,-1 -}; - -int aln_sm_blast[] = { - 1, -3, -3, -3, -2, - -3, 1, -3, -3, -2, - -3, -3, 1, -3, -2, - -3, -3, -3, 1, -2, - -2, -2, -2, -2, -2 -}; - -int aln_sm_qual[] = { - 0, -23, -23, -23, 0, - -23, 0, -23, -23, 0, - -23, -23, 0, -23, 0, - -23, -23, -23, 0, 0, - 0, 0, 0, 0, 0 -}; - -ka_param_t ka_param_blast = { 5, 2, 5, 2, aln_sm_blast, 5, 50 }; -ka_param_t ka_param_aa2aa = { 10, 2, 10, 2, aln_sm_blosum62, 22, 50 }; - -ka_param2_t ka_param2_qual = { 37, 11, 37, 11, 37, 11, 0, 0, aln_sm_qual, 5, 50 }; - -static uint32_t *ka_path2cigar32(const path_t *path, int path_len, int *n_cigar) -{ - int i, n; - uint32_t *cigar; - unsigned char last_type; - - if (path_len == 0 || path == 0) { - *n_cigar = 0; - return 0; - } - - last_type = path->ctype; - for (i = n = 1; i < path_len; ++i) { - if (last_type != path[i].ctype) ++n; - last_type = path[i].ctype; - } - *n_cigar = n; - cigar = (uint32_t*)calloc(*n_cigar, 4); - - cigar[0] = 1u << 4 | path[path_len-1].ctype; - last_type = path[path_len-1].ctype; - for (i = path_len - 2, n = 0; i >= 0; --i) { - if (path[i].ctype == last_type) cigar[n] += 1u << 4; - else { - cigar[++n] = 1u << 4 | path[i].ctype; - last_type = path[i].ctype; - } - } - - return cigar; -} - -/***************************/ -/* START OF common_align.c */ -/***************************/ - -#define SET_INF(s) (s).M = (s).I = (s).D = MINOR_INF; - -#define set_M(MM, cur, p, sc) \ -{ \ - if ((p)->M >= (p)->I) { \ - if ((p)->M >= (p)->D) { \ - (MM) = (p)->M + (sc); (cur)->Mt = FROM_M; \ - } else { \ - (MM) = (p)->D + (sc); (cur)->Mt = FROM_D; \ - } \ - } else { \ - if ((p)->I > 
(p)->D) { \ - (MM) = (p)->I + (sc); (cur)->Mt = FROM_I; \ - } else { \ - (MM) = (p)->D + (sc); (cur)->Mt = FROM_D; \ - } \ - } \ -} -#define set_I(II, cur, p) \ -{ \ - if ((p)->M - gap_open > (p)->I) { \ - (cur)->It = FROM_M; \ - (II) = (p)->M - gap_open - gap_ext; \ - } else { \ - (cur)->It = FROM_I; \ - (II) = (p)->I - gap_ext; \ - } \ -} -#define set_end_I(II, cur, p) \ -{ \ - if (gap_end_ext >= 0) { \ - if ((p)->M - gap_end_open > (p)->I) { \ - (cur)->It = FROM_M; \ - (II) = (p)->M - gap_end_open - gap_end_ext; \ - } else { \ - (cur)->It = FROM_I; \ - (II) = (p)->I - gap_end_ext; \ - } \ - } else set_I(II, cur, p); \ -} -#define set_D(DD, cur, p) \ -{ \ - if ((p)->M - gap_open > (p)->D) { \ - (cur)->Dt = FROM_M; \ - (DD) = (p)->M - gap_open - gap_ext; \ - } else { \ - (cur)->Dt = FROM_D; \ - (DD) = (p)->D - gap_ext; \ - } \ -} -#define set_end_D(DD, cur, p) \ -{ \ - if (gap_end_ext >= 0) { \ - if ((p)->M - gap_end_open > (p)->D) { \ - (cur)->Dt = FROM_M; \ - (DD) = (p)->M - gap_end_open - gap_end_ext; \ - } else { \ - (cur)->Dt = FROM_D; \ - (DD) = (p)->D - gap_end_ext; \ - } \ - } else set_D(DD, cur, p); \ -} - -typedef struct { - uint8_t Mt:3, It:2, Dt:3; -} dpcell_t; - -typedef struct { - int M, I, D; -} dpscore_t; - -/*************************** - * banded global alignment * - ***************************/ -uint32_t *ka_global_core(uint8_t *seq1, int len1, uint8_t *seq2, int len2, const ka_param_t *ap, int *_score, int *n_cigar) -{ - int i, j; - dpcell_t **dpcell, *q; - dpscore_t *curr, *last, *s; - int b1, b2, tmp_end; - int *mat, end, max = 0; - uint8_t type, ctype; - uint32_t *cigar = 0; - - int gap_open, gap_ext, gap_end_open, gap_end_ext, b; - int *score_matrix, N_MATRIX_ROW; - - /* initialize some align-related parameters. 
just for compatibility */ - gap_open = ap->gap_open; - gap_ext = ap->gap_ext; - gap_end_open = ap->gap_end_open; - gap_end_ext = ap->gap_end_ext; - b = ap->band_width; - score_matrix = ap->matrix; - N_MATRIX_ROW = ap->row; - - if (n_cigar) *n_cigar = 0; - if (len1 == 0 || len2 == 0) return 0; - - /* calculate b1 and b2 */ - if (len1 > len2) { - b1 = len1 - len2 + b; - b2 = b; - } else { - b1 = b; - b2 = len2 - len1 + b; - } - if (b1 > len1) b1 = len1; - if (b2 > len2) b2 = len2; - --seq1; --seq2; - - /* allocate memory */ - end = (b1 + b2 <= len1)? (b1 + b2 + 1) : (len1 + 1); - dpcell = (dpcell_t**)malloc(sizeof(dpcell_t*) * (len2 + 1)); - for (j = 0; j <= len2; ++j) - dpcell[j] = (dpcell_t*)malloc(sizeof(dpcell_t) * end); - for (j = b2 + 1; j <= len2; ++j) - dpcell[j] -= j - b2; - curr = (dpscore_t*)malloc(sizeof(dpscore_t) * (len1 + 1)); - last = (dpscore_t*)malloc(sizeof(dpscore_t) * (len1 + 1)); - - /* set first row */ - SET_INF(*curr); curr->M = 0; - for (i = 1, s = curr + 1; i < b1; ++i, ++s) { - SET_INF(*s); - set_end_D(s->D, dpcell[0] + i, s - 1); - } - s = curr; curr = last; last = s; - - /* core dynamic programming, part 1 */ - tmp_end = (b2 < len2)? b2 : len2 - 1; - for (j = 1; j <= tmp_end; ++j) { - q = dpcell[j]; s = curr; SET_INF(*s); - set_end_I(s->I, q, last); - end = (j + b1 <= len1 + 1)? (j + b1 - 1) : len1; - mat = score_matrix + seq2[j] * N_MATRIX_ROW; - ++s; ++q; - for (i = 1; i != end; ++i, ++s, ++q) { - set_M(s->M, q, last + i - 1, mat[seq1[i]]); /* this will change s->M ! */ - set_I(s->I, q, last + i); - set_D(s->D, q, s - 1); - } - set_M(s->M, q, last + i - 1, mat[seq1[i]]); - set_D(s->D, q, s - 1); - if (j + b1 - 1 > len1) { /* bug fixed, 040227 */ - set_end_I(s->I, q, last + i); - } else s->I = MINOR_INF; - s = curr; curr = last; last = s; - } - /* last row for part 1, use set_end_D() instead of set_D() */ - if (j == len2 && b2 != len2 - 1) { - q = dpcell[j]; s = curr; SET_INF(*s); - set_end_I(s->I, q, last); - end = (j + b1 <= len1 + 1)? 
(j + b1 - 1) : len1; - mat = score_matrix + seq2[j] * N_MATRIX_ROW; - ++s; ++q; - for (i = 1; i != end; ++i, ++s, ++q) { - set_M(s->M, q, last + i - 1, mat[seq1[i]]); /* this will change s->M ! */ - set_I(s->I, q, last + i); - set_end_D(s->D, q, s - 1); - } - set_M(s->M, q, last + i - 1, mat[seq1[i]]); - set_end_D(s->D, q, s - 1); - if (j + b1 - 1 > len1) { /* bug fixed, 040227 */ - set_end_I(s->I, q, last + i); - } else s->I = MINOR_INF; - s = curr; curr = last; last = s; - ++j; - } - - /* core dynamic programming, part 2 */ - for (; j <= len2 - b2 + 1; ++j) { - SET_INF(curr[j - b2]); - mat = score_matrix + seq2[j] * N_MATRIX_ROW; - end = j + b1 - 1; - for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i != end; ++i, ++s, ++q) { - set_M(s->M, q, last + i - 1, mat[seq1[i]]); - set_I(s->I, q, last + i); - set_D(s->D, q, s - 1); - } - set_M(s->M, q, last + i - 1, mat[seq1[i]]); - set_D(s->D, q, s - 1); - s->I = MINOR_INF; - s = curr; curr = last; last = s; - } - - /* core dynamic programming, part 3 */ - for (; j < len2; ++j) { - SET_INF(curr[j - b2]); - mat = score_matrix + seq2[j] * N_MATRIX_ROW; - for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i < len1; ++i, ++s, ++q) { - set_M(s->M, q, last + i - 1, mat[seq1[i]]); - set_I(s->I, q, last + i); - set_D(s->D, q, s - 1); - } - set_M(s->M, q, last + len1 - 1, mat[seq1[i]]); - set_end_I(s->I, q, last + i); - set_D(s->D, q, s - 1); - s = curr; curr = last; last = s; - } - /* last row */ - if (j == len2) { - SET_INF(curr[j - b2]); - mat = score_matrix + seq2[j] * N_MATRIX_ROW; - for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i < len1; ++i, ++s, ++q) { - set_M(s->M, q, last + i - 1, mat[seq1[i]]); - set_I(s->I, q, last + i); - set_end_D(s->D, q, s - 1); - } - set_M(s->M, q, last + len1 - 1, mat[seq1[i]]); - set_end_I(s->I, q, last + i); - set_end_D(s->D, q, s - 1); - s = curr; curr = last; last = s; - } - - *_score = last[len1].M; - if (n_cigar) { /* backtrace */ - path_t *p, *path = 
(path_t*)malloc(sizeof(path_t) * (len1 + len2 + 2)); - i = len1; j = len2; - q = dpcell[j] + i; - s = last + len1; - max = s->M; type = q->Mt; ctype = FROM_M; - if (s->I > max) { max = s->I; type = q->It; ctype = FROM_I; } - if (s->D > max) { max = s->D; type = q->Dt; ctype = FROM_D; } - - p = path; - p->ctype = ctype; p->i = i; p->j = j; /* bug fixed 040408 */ - ++p; - do { - switch (ctype) { - case FROM_M: --i; --j; break; - case FROM_I: --j; break; - case FROM_D: --i; break; - } - q = dpcell[j] + i; - ctype = type; - switch (type) { - case FROM_M: type = q->Mt; break; - case FROM_I: type = q->It; break; - case FROM_D: type = q->Dt; break; - } - p->ctype = ctype; p->i = i; p->j = j; - ++p; - } while (i || j); - cigar = ka_path2cigar32(path, p - path - 1, n_cigar); - free(path); - } - - /* free memory */ - for (j = b2 + 1; j <= len2; ++j) - dpcell[j] += j - b2; - for (j = 0; j <= len2; ++j) - free(dpcell[j]); - free(dpcell); - free(curr); free(last); - - return cigar; -} - -typedef struct { - int M, I, D; -} score_aux_t; - -#define MINUS_INF -0x40000000 - -// matrix: len2 rows and len1 columns -int ka_global_score(const uint8_t *_seq1, int len1, const uint8_t *_seq2, int len2, const ka_param2_t *ap) -{ - -#define __score_aux(_p, _q0, _sc, _io, _ie, _do, _de) { \ - int t1, t2; \ - score_aux_t *_q; \ - _q = _q0; \ - _p->M = _q->M >= _q->I? _q->M : _q->I; \ - _p->M = _p->M >= _q->D? _p->M : _q->D; \ - _p->M += (_sc); \ - ++_q; t1 = _q->M - _io - _ie; t2 = _q->I - _ie; _p->I = t1 >= t2? t1 : t2; \ - _q = _p-1; t1 = _q->M - _do - _de; t2 = _q->D - _de; _p->D = t1 >= t2? t1 : t2; \ - } - - int i, j, bw, scmat_size = ap->row, *scmat = ap->matrix, ret; - const uint8_t *seq1, *seq2; - score_aux_t *curr, *last, *swap; - bw = abs(len1 - len2) + ap->band_width; - i = len1 > len2? 
len1 : len2; - if (bw > i + 1) bw = i + 1; - seq1 = _seq1 - 1; seq2 = _seq2 - 1; - curr = calloc(len1 + 2, sizeof(score_aux_t)); - last = calloc(len1 + 2, sizeof(score_aux_t)); - { // the zero-th row - int x, end = len1; - score_aux_t *p; - j = 0; - x = j + bw; end = len1 < x? len1 : x; // band end - p = curr; - p->M = 0; p->I = p->D = MINUS_INF; - for (i = 1, p = &curr[1]; i <= end; ++i, ++p) - p->M = p->I = MINUS_INF, p->D = -(ap->edo + ap->ede * i); - p->M = p->I = p->D = MINUS_INF; - swap = curr; curr = last; last = swap; - } - for (j = 1; j < len2; ++j) { - int x, beg = 0, end = len1, *scrow, col_end; - score_aux_t *p; - x = j - bw; beg = 0 > x? 0 : x; // band start - x = j + bw; end = len1 < x? len1 : x; // band end - if (beg == 0) { // from zero-th column - p = curr; - p->M = p->D = MINUS_INF; p->I = -(ap->eio + ap->eie * j); - ++beg; // then beg = 1 - } - scrow = scmat + seq2[j] * scmat_size; - if (end == len1) col_end = 1, --end; - else col_end = 0; - for (i = beg, p = &curr[beg]; i <= end; ++i, ++p) - __score_aux(p, &last[i-1], scrow[(int)seq1[i]], ap->iio, ap->iie, ap->ido, ap->ide); - if (col_end) { - __score_aux(p, &last[i-1], scrow[(int)seq1[i]], ap->eio, ap->eie, ap->ido, ap->ide); - ++p; - } - p->M = p->I = p->D = MINUS_INF; -// for (i = 0; i <= len1; ++i) printf("(%d,%d,%d) ", curr[i].M, curr[i].I, curr[i].D); putchar('\n'); - swap = curr; curr = last; last = swap; - } - { // the last row - int x, beg = 0, *scrow; - score_aux_t *p; - j = len2; - x = j - bw; beg = 0 > x? 
0 : x; // band start - if (beg == 0) { // from zero-th column - p = curr; - p->M = p->D = MINUS_INF; p->I = -(ap->eio + ap->eie * j); - ++beg; // then beg = 1 - } - scrow = scmat + seq2[j] * scmat_size; - for (i = beg, p = &curr[beg]; i < len1; ++i, ++p) - __score_aux(p, &last[i-1], scrow[(int)seq1[i]], ap->iio, ap->iie, ap->edo, ap->ede); - __score_aux(p, &last[i-1], scrow[(int)seq1[i]], ap->eio, ap->eie, ap->edo, ap->ede); -// for (i = 0; i <= len1; ++i) printf("(%d,%d,%d) ", curr[i].M, curr[i].I, curr[i].D); putchar('\n'); - } - ret = curr[len1].M >= curr[len1].I? curr[len1].M : curr[len1].I; - ret = ret >= curr[len1].D? ret : curr[len1].D; - free(curr); free(last); - return ret; -} - -#ifdef _MAIN -int main(int argc, char *argv[]) -{ -// int len1 = 35, len2 = 35; -// uint8_t *seq1 = (uint8_t*)"\0\0\3\3\2\0\0\0\1\0\2\1\2\1\3\2\3\3\3\0\2\3\2\1\1\3\3\3\2\3\3\1\0\0\1"; -// uint8_t *seq2 = (uint8_t*)"\0\0\3\3\2\0\0\0\1\0\2\1\2\1\3\2\3\3\3\0\2\3\2\1\1\3\3\3\2\3\3\1\0\1\0"; - int len1 = 4, len2 = 4; - uint8_t *seq1 = (uint8_t*)"\1\0\0\1"; - uint8_t *seq2 = (uint8_t*)"\1\0\1\0"; - int sc; -// ka_global_core(seq1, 2, seq2, 1, &ka_param_qual, &sc, 0); - sc = ka_global_score(seq1, len1, seq2, len2, &ka_param2_qual); - printf("%d\n", sc); - return 0; -} -#endif diff --git a/samtools/kaln.h b/samtools/kaln.h deleted file mode 100644 index 8f4a2c64..00000000 --- a/samtools/kaln.h +++ /dev/null @@ -1,67 +0,0 @@ -/* The MIT License - - Copyright (C) 2003-2006, 2008-2010 by Heng Li - - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission 
notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. -*/ - -#ifndef LH3_KALN_H_ -#define LH3_KALN_H_ - -#include - -#define MINOR_INF -1073741823 - -typedef struct { - int gap_open; - int gap_ext; - int gap_end_open; - int gap_end_ext; - - int *matrix; - int row; - int band_width; -} ka_param_t; - -typedef struct { - int iio, iie, ido, ide; - int eio, eie, edo, ede; - int *matrix; - int row; - int band_width; -} ka_param2_t; - -#ifdef __cplusplus -extern "C" { -#endif - - uint32_t *ka_global_core(uint8_t *seq1, int len1, uint8_t *seq2, int len2, const ka_param_t *ap, - int *_score, int *n_cigar); - int ka_global_score(const uint8_t *_seq1, int len1, const uint8_t *_seq2, int len2, const ka_param2_t *ap); -#ifdef __cplusplus -} -#endif - -extern ka_param_t ka_param_blast; /* = { 5, 2, 5, 2, aln_sm_blast, 5, 50 }; */ -extern ka_param_t ka_param_qual; // only use this for global alignment!!! -extern ka_param2_t ka_param2_qual; // only use this for global alignment!!! 
- -#endif diff --git a/samtools/misc/ace2sam.c b/samtools/misc/ace2sam.c index 078830a4..24b69338 100644 --- a/samtools/misc/ace2sam.c +++ b/samtools/misc/ace2sam.c @@ -109,7 +109,7 @@ int main(int argc, char *argv[]) if (t[1].s[i] != '*') ++k; } // write out the SAM header and contig sequences - fprintf(stderr, "H @SQ\tSN:%s\tLN:%ld\n", t[0].s, t[is_padded?1:2].l); // The SAM header line + fprintf(stderr, "H @SQ\tSN:%s\tLN:%llu\n", t[0].s, (unsigned long long)(t[is_padded?1:2].l)); // The SAM header line cns = &t[is_padded?1:2]; fprintf(stderr, "S >%s\n", t[0].s); for (i = 0; i < cns->l; i += LINE_LEN) { diff --git a/samtools/misc/ace2sam.c.pysam.c b/samtools/misc/ace2sam.c.pysam.c index 53c82dce..a7f92e26 100644 --- a/samtools/misc/ace2sam.c.pysam.c +++ b/samtools/misc/ace2sam.c.pysam.c @@ -111,7 +111,7 @@ int main(int argc, char *argv[]) if (t[1].s[i] != '*') ++k; } // write out the SAM header and contig sequences - fprintf(pysamerr, "H @SQ\tSN:%s\tLN:%ld\n", t[0].s, t[is_padded?1:2].l); // The SAM header line + fprintf(pysamerr, "H @SQ\tSN:%s\tLN:%llu\n", t[0].s, (unsigned long long)(t[is_padded?1:2].l)); // The SAM header line cns = &t[is_padded?1:2]; fprintf(pysamerr, "S >%s\n", t[0].s); for (i = 0; i < cns->l; i += LINE_LEN) { diff --git a/samtools/padding.c b/samtools/padding.c index 89916ed0..ea1c9334 100644 --- a/samtools/padding.c +++ b/samtools/padding.c @@ -196,7 +196,7 @@ int bam_pad2unpad(samfile_t *in, samfile_t *out, faidx_t *fai) r_tid = b->core.tid; unpad_seq(b, &r); if (h->target_len[r_tid] != r.l) { - fprintf(stderr, "[depad] ERROR: (Padded) length of '%s' is %d in BAM header, but %ld in embedded reference\n", bam1_qname(b), h->target_len[r_tid], r.l); + fprintf(stderr, "[depad] ERROR: (Padded) length of '%s' is %u in BAM header, but %llu in embedded reference\n", bam1_qname(b), h->target_len[r_tid], (unsigned long long)(r.l)); return -1; } if (fai) { diff --git a/samtools/padding.c.pysam.c b/samtools/padding.c.pysam.c index 25ed0f41..562ceba4 
100644 --- a/samtools/padding.c.pysam.c +++ b/samtools/padding.c.pysam.c @@ -198,7 +198,7 @@ int bam_pad2unpad(samfile_t *in, samfile_t *out, faidx_t *fai) r_tid = b->core.tid; unpad_seq(b, &r); if (h->target_len[r_tid] != r.l) { - fprintf(pysamerr, "[depad] ERROR: (Padded) length of '%s' is %d in BAM header, but %ld in embedded reference\n", bam1_qname(b), h->target_len[r_tid], r.l); + fprintf(pysamerr, "[depad] ERROR: (Padded) length of '%s' is %u in BAM header, but %llu in embedded reference\n", bam1_qname(b), h->target_len[r_tid], (unsigned long long)(r.l)); return -1; } if (fai) { diff --git a/samtools/sam.c b/samtools/sam.c index 61c7b3e5..9f5f6a0e 100644 --- a/samtools/sam.c +++ b/samtools/sam.c @@ -30,7 +30,7 @@ DEALINGS IN THE SOFTWARE. */ int samthreads(samfile_t *fp, int n_threads, int n_sub_blks) { - if (!fp->file->is_bin || !fp->file->is_write) return -1; + if (hts_get_format(fp->file)->format != bam || !fp->is_write) return -1; bgzf_mt(fp->x.bam, n_threads, n_sub_blks); return 0; } @@ -47,12 +47,14 @@ samfile_t *samopen(const char *fn, const char *mode, const void *aux) if (strchr(mode, 'r')) { if (aux) hts_set_fai_filename(fp->file, aux); fp->header = sam_hdr_read(fp->file); // samclose() will free this + fp->is_write = 0; if (fp->header->n_targets == 0 && bam_verbose >= 1) fprintf(stderr, "[samopen] no @SQ lines in the header.\n"); } else { fp->header = (bam_hdr_t *)aux; // For writing, we won't free it - if (fp->file->is_bin || fp->file->is_cram || strchr(mode, 'h')) sam_hdr_write(fp->file, fp->header); + fp->is_write = 1; + if (hts_get_format(fp->file)->format != sam || strchr(mode, 'h')) sam_hdr_write(fp->file, fp->header); } return fp; @@ -61,7 +63,7 @@ samfile_t *samopen(const char *fn, const char *mode, const void *aux) void samclose(samfile_t *fp) { if (fp) { - if (!fp->file->is_write && fp->header) bam_hdr_destroy(fp->header); + if (!fp->is_write && fp->header) bam_hdr_destroy(fp->header); sam_close(fp->file); free(fp); } diff --git 
a/samtools/sam.c.pysam.c b/samtools/sam.c.pysam.c index bfa4fc0b..3a2d8608 100644 --- a/samtools/sam.c.pysam.c +++ b/samtools/sam.c.pysam.c @@ -32,7 +32,7 @@ DEALINGS IN THE SOFTWARE. */ int samthreads(samfile_t *fp, int n_threads, int n_sub_blks) { - if (!fp->file->is_bin || !fp->file->is_write) return -1; + if (hts_get_format(fp->file)->format != bam || !fp->is_write) return -1; bgzf_mt(fp->x.bam, n_threads, n_sub_blks); return 0; } @@ -49,12 +49,14 @@ samfile_t *samopen(const char *fn, const char *mode, const void *aux) if (strchr(mode, 'r')) { if (aux) hts_set_fai_filename(fp->file, aux); fp->header = sam_hdr_read(fp->file); // samclose() will free this + fp->is_write = 0; if (fp->header->n_targets == 0 && bam_verbose >= 1) fprintf(pysamerr, "[samopen] no @SQ lines in the header.\n"); } else { fp->header = (bam_hdr_t *)aux; // For writing, we won't free it - if (fp->file->is_bin || fp->file->is_cram || strchr(mode, 'h')) sam_hdr_write(fp->file, fp->header); + fp->is_write = 1; + if (hts_get_format(fp->file)->format != sam || strchr(mode, 'h')) sam_hdr_write(fp->file, fp->header); } return fp; @@ -63,7 +65,7 @@ samfile_t *samopen(const char *fn, const char *mode, const void *aux) void samclose(samfile_t *fp) { if (fp) { - if (!fp->file->is_write && fp->header) bam_hdr_destroy(fp->header); + if (!fp->is_write && fp->header) bam_hdr_destroy(fp->header); sam_close(fp->file); free(fp); } diff --git a/samtools/sam.h b/samtools/sam.h index 39da0063..e642920b 100644 --- a/samtools/sam.h +++ b/samtools/sam.h @@ -1,6 +1,6 @@ /* sam.h -- format-neutral SAM/BAM API. - Copyright (C) 2009, 2013 Genome Research Ltd. + Copyright (C) 2009, 2013, 2014 Genome Research Ltd. 
Author: Heng Li @@ -50,6 +50,7 @@ typedef struct { samFile *file; struct { BGZF *bam; } x; // Hack so that fp->x.bam still works bam_hdr_t *header; + short is_write:1; } samfile_t; #ifdef __cplusplus diff --git a/samtools/sam_view.c b/samtools/sam_view.c index e2a44203..55e7e3df 100644 --- a/samtools/sam_view.c +++ b/samtools/sam_view.c @@ -353,17 +353,29 @@ int main_samview(int argc, char *argv[]) goto view_end; } if (fn_list) hts_set_fai_filename(out, fn_list); - if (*out_format || is_header) sam_hdr_write(out, header); + if (*out_format || is_header) { + if (sam_hdr_write(out, header) != 0) { + fprintf(stderr, "[main_samview] failed to write the SAM header\n"); + ret = 1; + goto view_end; + } + } if (fn_un_out) { if ((un_out = sam_open(fn_un_out, out_mode)) == 0) { print_error_errno("failed to open \"%s\" for writing", fn_un_out); ret = 1; goto view_end; } - if (*out_format || is_header) sam_hdr_write(un_out, header); + if (*out_format || is_header) { + if (sam_hdr_write(un_out, header) != 0) { + fprintf(stderr, "[main_samview] failed to write the SAM header\n"); + ret = 1; + goto view_end; + } + } } } - if (n_threads > 1) { hts_set_threads(out, n_threads); } + if (n_threads > 1) { if (out) hts_set_threads(out, n_threads); } if (is_header_only) goto view_end; // no need to print alignments if (argc == optind + 1) { // convert/print the entire file @@ -572,6 +584,15 @@ int main_bam2fq(int argc, char *argv[]) print_error_errno("Cannot read file \"%s\"", argv[optind]); return 1; } + if (hts_set_opt(fp, CRAM_OPT_REQUIRED_FIELDS, + SAM_QNAME | SAM_FLAG | SAM_SEQ | SAM_QUAL)) { + fprintf(stderr, "Failed to set CRAM_OPT_REQUIRED_FIELDS value\n"); + return 1; + } + if (hts_set_opt(fp, CRAM_OPT_DECODE_MD, 0)) { + fprintf(stderr, "Failed to set CRAM_OPT_DECODE_MD value\n"); + return 1; + } fpse = NULL; if (fnse) { fpse = fopen(fnse,"w"); diff --git a/samtools/sam_view.c.pysam.c b/samtools/sam_view.c.pysam.c index 9ae5ed68..34840b95 100644 --- a/samtools/sam_view.c.pysam.c 
+++ b/samtools/sam_view.c.pysam.c @@ -355,17 +355,29 @@ int main_samview(int argc, char *argv[]) goto view_end; } if (fn_list) hts_set_fai_filename(out, fn_list); - if (*out_format || is_header) sam_hdr_write(out, header); + if (*out_format || is_header) { + if (sam_hdr_write(out, header) != 0) { + fprintf(pysamerr, "[main_samview] failed to write the SAM header\n"); + ret = 1; + goto view_end; + } + } if (fn_un_out) { if ((un_out = sam_open(fn_un_out, out_mode)) == 0) { print_error_errno("failed to open \"%s\" for writing", fn_un_out); ret = 1; goto view_end; } - if (*out_format || is_header) sam_hdr_write(un_out, header); + if (*out_format || is_header) { + if (sam_hdr_write(un_out, header) != 0) { + fprintf(pysamerr, "[main_samview] failed to write the SAM header\n"); + ret = 1; + goto view_end; + } + } } } - if (n_threads > 1) { hts_set_threads(out, n_threads); } + if (n_threads > 1) { if (out) hts_set_threads(out, n_threads); } if (is_header_only) goto view_end; // no need to print alignments if (argc == optind + 1) { // convert/print the entire file @@ -574,6 +586,15 @@ int main_bam2fq(int argc, char *argv[]) print_error_errno("Cannot read file \"%s\"", argv[optind]); return 1; } + if (hts_set_opt(fp, CRAM_OPT_REQUIRED_FIELDS, + SAM_QNAME | SAM_FLAG | SAM_SEQ | SAM_QUAL)) { + fprintf(pysamerr, "Failed to set CRAM_OPT_REQUIRED_FIELDS value\n"); + return 1; + } + if (hts_set_opt(fp, CRAM_OPT_DECODE_MD, 0)) { + fprintf(pysamerr, "Failed to set CRAM_OPT_DECODE_MD value\n"); + return 1; + } fpse = NULL; if (fnse) { fpse = fopen(fnse,"w"); diff --git a/samtools/stats.c b/samtools/stats.c index 2eab4773..fe43e713 100644 --- a/samtools/stats.c +++ b/samtools/stats.c @@ -355,10 +355,8 @@ int unclipped_length(bam1_t *bam_line) return read_len; } -void count_mismatches_per_cycle(stats_t *stats,bam1_t *bam_line) +void count_mismatches_per_cycle(stats_t *stats, bam1_t *bam_line, int read_len) { - int read_len = unclipped_length(bam_line); - if ( read_len >= stats->nbases 
) realloc_buffers(stats,read_len); int is_fwd = IS_REVERSE(bam_line) ? 0 : 1; int icig,iread=0,icycle=0; int iref = bam_line->core.pos - stats->rseq_pos; @@ -428,7 +426,7 @@ void count_mismatches_per_cycle(stats_t *stats,bam1_t *bam_line) int idx = is_fwd ? icycle : read_len-icycle-1; if ( idx>stats->max_len ) - error("mpc: %d>%d\n",idx,stats->max_len); + error("mpc: %d>%d (%s %d %s)\n",idx,stats->max_len,stats->sam_header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam_get_qname(bam_line)); idx = idx*stats->nquals + qual; if ( idx>=stats->nquals*stats->nbases ) @@ -645,12 +643,13 @@ void collect_stats(bam1_t *bam_line, stats_t *stats) int seq_len = bam_line->core.l_qseq; if ( !seq_len ) return; - if ( seq_len >= stats->nbases ) - realloc_buffers(stats,seq_len); - if ( stats->max_lenmax_len = seq_len; + int read_len = unclipped_length(bam_line); + if ( read_len >= stats->nbases ) + realloc_buffers(stats,read_len); + if ( stats->max_lenmax_len = read_len; - stats->read_lengths[seq_len]++; + stats->read_lengths[read_len]++; // Count GC and ACGT per cycle. Note that cycle is approximate, clipping is ignored uint8_t base, *seq = bam_get_seq(bam_line); @@ -850,7 +849,7 @@ void collect_stats(bam1_t *bam_line, stats_t *stats) stats->gcd[ stats->igcd ].gc = fai_gc_content(stats, stats->gcd_pos, stats->gcd_bin_size); } - count_mismatches_per_cycle(stats,bam_line); + count_mismatches_per_cycle(stats,bam_line,read_len); } // No reference and first pass, new chromosome or sequence going beyond the end of the gcd bin else if ( stats->gcd_pos==-1 || stats->tid != bam_line->core.tid || bam_line->core.pos - stats->gcd_pos > stats->gcd_bin_size ) @@ -1053,7 +1052,7 @@ void output_stats(stats_t *stats, int sparse) if ( ! sum ) continue; printf("GCC\t%d\t%.2f\t%.2f\t%.2f\t%.2f\n", ibase+1,100.*ptr[0]/sum,100.*ptr[1]/sum,100.*ptr[2]/sum,100.*ptr[3]/sum); } - printf("# Insert sizes. Use `grep ^IS | cut -f 2-` to extract this part. 
The columns are: pairs total, inward oriented pairs, outward oriented pairs, other pairs\n"); + printf("# Insert sizes. Use `grep ^IS | cut -f 2-` to extract this part. The columns are: insert size, pairs total, inward oriented pairs, outward oriented pairs, other pairs\n"); for (isize=0; isizeisize->inward(stats->isize->data, isize)); long out = (long)(stats->isize->outward(stats->isize->data, isize)); diff --git a/samtools/stats.c.pysam.c b/samtools/stats.c.pysam.c index 507c6041..a7ea9e07 100644 --- a/samtools/stats.c.pysam.c +++ b/samtools/stats.c.pysam.c @@ -357,10 +357,8 @@ int unclipped_length(bam1_t *bam_line) return read_len; } -void count_mismatches_per_cycle(stats_t *stats,bam1_t *bam_line) +void count_mismatches_per_cycle(stats_t *stats, bam1_t *bam_line, int read_len) { - int read_len = unclipped_length(bam_line); - if ( read_len >= stats->nbases ) realloc_buffers(stats,read_len); int is_fwd = IS_REVERSE(bam_line) ? 0 : 1; int icig,iread=0,icycle=0; int iref = bam_line->core.pos - stats->rseq_pos; @@ -430,7 +428,7 @@ void count_mismatches_per_cycle(stats_t *stats,bam1_t *bam_line) int idx = is_fwd ? icycle : read_len-icycle-1; if ( idx>stats->max_len ) - error("mpc: %d>%d\n",idx,stats->max_len); + error("mpc: %d>%d (%s %d %s)\n",idx,stats->max_len,stats->sam_header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam_get_qname(bam_line)); idx = idx*stats->nquals + qual; if ( idx>=stats->nquals*stats->nbases ) @@ -647,12 +645,13 @@ void collect_stats(bam1_t *bam_line, stats_t *stats) int seq_len = bam_line->core.l_qseq; if ( !seq_len ) return; - if ( seq_len >= stats->nbases ) - realloc_buffers(stats,seq_len); - if ( stats->max_lenmax_len = seq_len; + int read_len = unclipped_length(bam_line); + if ( read_len >= stats->nbases ) + realloc_buffers(stats,read_len); + if ( stats->max_lenmax_len = read_len; - stats->read_lengths[seq_len]++; + stats->read_lengths[read_len]++; // Count GC and ACGT per cycle. 
Note that cycle is approximate, clipping is ignored uint8_t base, *seq = bam_get_seq(bam_line); @@ -852,7 +851,7 @@ void collect_stats(bam1_t *bam_line, stats_t *stats) stats->gcd[ stats->igcd ].gc = fai_gc_content(stats, stats->gcd_pos, stats->gcd_bin_size); } - count_mismatches_per_cycle(stats,bam_line); + count_mismatches_per_cycle(stats,bam_line,read_len); } // No reference and first pass, new chromosome or sequence going beyond the end of the gcd bin else if ( stats->gcd_pos==-1 || stats->tid != bam_line->core.tid || bam_line->core.pos - stats->gcd_pos > stats->gcd_bin_size ) @@ -1055,7 +1054,7 @@ void output_stats(stats_t *stats, int sparse) if ( ! sum ) continue; printf("GCC\t%d\t%.2f\t%.2f\t%.2f\t%.2f\n", ibase+1,100.*ptr[0]/sum,100.*ptr[1]/sum,100.*ptr[2]/sum,100.*ptr[3]/sum); } - printf("# Insert sizes. Use `grep ^IS | cut -f 2-` to extract this part. The columns are: pairs total, inward oriented pairs, outward oriented pairs, other pairs\n"); + printf("# Insert sizes. Use `grep ^IS | cut -f 2-` to extract this part. 
The columns are: insert size, pairs total, inward oriented pairs, outward oriented pairs, other pairs\n"); for (isize=0; isizeisize->inward(stats->isize->data, isize)); long out = (long)(stats->isize->outward(stats->isize->data, isize)); diff --git a/samtools/test/merge/test_rtrans_build.c b/samtools/test/merge/test_rtrans_build.c index d3fbbb3a..df509215 100644 --- a/samtools/test/merge/test_rtrans_build.c +++ b/samtools/test/merge/test_rtrans_build.c @@ -76,8 +76,8 @@ int main(int argc, char**argv) break; } } - const long GIMIC_SEED = 0x1234abcd330e; - srand48(GIMIC_SEED); + const long GIMMICK_SEED = 0x1234330e; + srand48(GIMMICK_SEED); if (verbose) printf("BEGIN test 1\n"); // setup diff --git a/samtools/test/merge/test_rtrans_build.c.pysam.c b/samtools/test/merge/test_rtrans_build.c.pysam.c index ad7f36a6..fcbc4585 100644 --- a/samtools/test/merge/test_rtrans_build.c.pysam.c +++ b/samtools/test/merge/test_rtrans_build.c.pysam.c @@ -78,8 +78,8 @@ int main(int argc, char**argv) break; } } - const long GIMIC_SEED = 0x1234abcd330e; - srand48(GIMIC_SEED); + const long GIMMICK_SEED = 0x1234330e; + srand48(GIMMICK_SEED); if (verbose) printf("BEGIN test 1\n"); // setup diff --git a/samtools/test/merge/test_trans_tbl_init.c b/samtools/test/merge/test_trans_tbl_init.c index 2a18e2fe..64b97867 100644 --- a/samtools/test/merge/test_trans_tbl_init.c +++ b/samtools/test/merge/test_trans_tbl_init.c @@ -320,7 +320,7 @@ int main(int argc, char**argv) } // Set the seed to a fixed value so that calls to lrand48 within functions return predictable values - const long GIMMICK_SEED = 0x1234abcd330e; + const long GIMMICK_SEED = 0x1234330e; srand48(GIMMICK_SEED); bam_hdr_t* out; diff --git a/samtools/test/merge/test_trans_tbl_init.c.pysam.c b/samtools/test/merge/test_trans_tbl_init.c.pysam.c index 2c69e21f..594bf2c6 100644 --- a/samtools/test/merge/test_trans_tbl_init.c.pysam.c +++ b/samtools/test/merge/test_trans_tbl_init.c.pysam.c @@ -322,7 +322,7 @@ int main(int argc, char**argv) } 
// Set the seed to a fixed value so that calls to lrand48 within functions return predictable values - const long GIMMICK_SEED = 0x1234abcd330e; + const long GIMMICK_SEED = 0x1234330e; srand48(GIMMICK_SEED); bam_hdr_t* out; diff --git a/samtools/version.h b/samtools/version.h index 45580079..64eb5420 100644 --- a/samtools/version.h +++ b/samtools/version.h @@ -1 +1 @@ -#define SAMTOOLS_VERSION "1.1" +#define SAMTOOLS_VERSION "1.2" diff --git a/setup.py b/setup.py index 45fb8bc1..8c0a132a 100644 --- a/setup.py +++ b/setup.py @@ -40,18 +40,29 @@ version = version.__version__ # exclude sources that contains a main function -samtools_exclude = ("bamtk.c", "razip.c", "bgzip.c", - "main.c", "calDepth.c", "bam2bed.c", - "wgsim.c", "md5fa.c", "maq2sam.c", +samtools_exclude = ("bamtk.c", + "razip.c", + "bgzip.c", + "main.c", + "calDepth.c", + "bam2bed.c", + "wgsim.c", + "md5fa.c", + "maq2sam.c", "bamcheck.c", "chk_indel.c", - "vcf-miniview.c") -htslib_exclude = ('htslib/tabix.c', 'htslib/bgzip.c') -tabix_exclude = ("main.c",) + "vcf-miniview.c", + "htslib-1.2.1", # do not import twice + "hfile_irods.c", # requires irods library + ) + +htslib_exclude = ('htslib/tabix.c', + 'htslib/bgzip.c', + 'htslib/htsfile.c', + 'htslib/hfile_irods.c') # destination directories for import of samtools and tabix samtools_dest = os.path.abspath("samtools") -tabix_dest = os.path.abspath("tabix") if HTSLIB_LIBRARY_DIR: # linking against a shared, externally installed htslib version, no @@ -93,7 +104,8 @@ def locate(pattern, root=os.curdir): '''Locate all files matching supplied filename pattern in and below - supplied root directory.''' + supplied root directory. 
+ ''' for path, dirs, files in os.walk(os.path.abspath(root)): for filename in fnmatch.filter(files, pattern): yield os.path.join(path, filename) @@ -137,58 +149,68 @@ def _update_pysam_files(cf, destdir): if len(sys.argv) < 3: raise ValueError("missing PATH to samtools source directory") - for destdir, srcdir, exclude in zip( - (samtools_dest,), - sys.argv[2:3], - (samtools_exclude,)): - - srcdir = os.path.abspath(srcdir) - if not os.path.exists(srcdir): - raise IOError("source directory `%s` does not exist." % srcdir) - - cfiles = locate("*.c", srcdir) - hfiles = locate("*.h", srcdir) - ncopied = 0 - - def _compareAndCopy(src, srcdir, destdir, exclude): - - d, f = os.path.split(src) - if f in exclude: - return None - common_prefix = os.path.commonprefix((d, srcdir)) - subdir = re.sub(common_prefix, "", d)[1:] - targetdir = os.path.join(destdir, subdir) - if not os.path.exists(targetdir): - os.makedirs(targetdir) - old_file = os.path.join(targetdir, f) - if os.path.exists(old_file): - md5_old = hashlib.md5( - "".join(open(old_file, "r").readlines())).digest() - md5_new = hashlib.md5( - "".join(open(src, "r").readlines())).digest() - if md5_old != md5_new: - raise ValueError( - "incompatible files for %s and %s" % (old_file, src)) - - shutil.copy(src, targetdir) - return old_file - - for src_file in hfiles: - _compareAndCopy(src_file, srcdir, destdir, exclude) - ncopied += 1 - - cf = [] - for src_file in cfiles: - cf.append(_compareAndCopy(src_file, srcdir, destdir, exclude)) - ncopied += 1 - - sys.stdout.write( - "installed latest source code from %s: " - "%i files copied\n" % (srcdir, ncopied)) - # redirect stderr to pysamerr and replace bam.h with a stub. - sys.stdout.write("applying stderr redirection\n") - - _update_pysam_files(cf, destdir) + destdir = samtools_dest + srcdir = sys.argv[2] + exclude = samtools_exclude + + srcdir = os.path.abspath(srcdir) + if not os.path.exists(srcdir): + raise IOError( + "source directory `%s` does not exist." 
% srcdir) + + cfiles = locate("*.c", srcdir) + hfiles = locate("*.h", srcdir) + + # remove unwanted files and htslib subdirectory. + cfiles = [x for x in cfiles if os.path.basename(x) not in exclude + and not re.search("htslib-", x)] + + hfiles = [x for x in hfiles if os.path.basename(x) not in exclude + and not re.search("htslib-", x)] + + ncopied = 0 + + def _compareAndCopy(src, srcdir, destdir, exclude): + + d, f = os.path.split(src) + common_prefix = os.path.commonprefix((d, srcdir)) + subdir = re.sub(common_prefix, "", d)[1:] + targetdir = os.path.join(destdir, subdir) + if not os.path.exists(targetdir): + os.makedirs(targetdir) + old_file = os.path.join(targetdir, f) + if os.path.exists(old_file): + md5_old = hashlib.md5( + "".join(open(old_file, "r").readlines())).digest() + md5_new = hashlib.md5( + "".join(open(src, "r").readlines())).digest() + if md5_old != md5_new: + raise ValueError( + "incompatible files for %s and %s" % + (old_file, src)) + + shutil.copy(src, targetdir) + return old_file + + for src_file in hfiles: + _compareAndCopy(src_file, srcdir, destdir, exclude) + ncopied += 1 + + cf = [] + for src_file in cfiles: + cf.append(_compareAndCopy(src_file, + srcdir, + destdir, + exclude)) + ncopied += 1 + + sys.stdout.write( + "installed latest source code from %s: " + "%i files copied\n" % (srcdir, ncopied)) + # redirect stderr to pysamerr and replace bam.h with a stub. + sys.stdout.write("applying stderr redirection\n") + + _update_pysam_files(cf, destdir) sys.exit(0) @@ -197,7 +219,7 @@ def _compareAndCopy(src, srcdir, destdir, exclude): sys.stdout.write("refreshing latest source code from .c to .pysam.c") # redirect stderr to pysamerr and replace bam.h with a stub. 
sys.stdout.write("applying stderr redirection") - for destdir in ('samtools', 'tabix'): + for destdir in ('samtools', ): pysamcfiles = locate("*.pysam.c", destdir) for f in pysamcfiles: os.remove(f) @@ -231,6 +253,7 @@ def _compareAndCopy(src, srcdir, destdir, exclude): calignmentfile_sources = ["pysam/calignmentfile.c"] tabproxies_sources = ["pysam/TabProxies.c"] cvcf_sources = ["pysam/cvcf.c"] + cbcf_sources = ["pysam/cbcf.c"] else: # remove existing files to recompute # necessary to be both compatible for python 2.7 and 3.3 @@ -241,7 +264,9 @@ def _compareAndCopy(src, srcdir, destdir, exclude): "pysam/cfaidx.c", "pysam/csamfile.c", "pysam/TabProxies.c", - "pysam/cvcf.c"): + "pysam/cvcf.c", + "pysam/bvcf.c", + ): try: os.unlink(f) except: @@ -256,6 +281,7 @@ def _compareAndCopy(src, srcdir, destdir, exclude): faidx_sources = ["pysam/cfaidx.pyx"] tabproxies_sources = ["pysam/TabProxies.pyx"] cvcf_sources = ["pysam/cvcf.pyx"] + cbcf_sources = ["pysam/cbcf.pyx"] ####################################################### @@ -415,6 +441,22 @@ def _compareAndCopy(src, srcdir, destdir, exclude): extra_compile_args=["-Wno-error=declaration-after-statement"], ) +cbcf = Extension( + "pysam.cbcf", + cbcf_sources + + htslib_sources + + os_c_files, + library_dirs=htslib_library_dirs, + include_dirs=["htslib"] + include_os + htslib_include_dirs, + libraries=["z"] + htslib_libraries, + language="c", + extra_compile_args=[ + "-Wno-error=declaration-after-statement", + "-DSAMTOOLS=1"], + define_macros=[('_FILE_OFFSET_BITS', '64'), + ('_USE_KNETFILE', '')] +) + metadata = { 'name': name, 'version': version, @@ -432,7 +474,7 @@ def _compareAndCopy(src, srcdir, destdir, exclude): 'pysam.include.samtools', # 'pysam.include.samtools.bcftools', 'pysam.include.samtools.win32'], - 'requires': ['cython (>=0.20.1)'], + 'requires': ['cython (>=0.21)'], 'ext_modules': [samtools, htslib, samfile, @@ -440,6 +482,7 @@ def _compareAndCopy(src, srcdir, destdir, exclude): tabix, tabproxies, cvcf, + 
cbcf, faidx], 'cmdclass': cmdclass, 'package_dir': {'pysam': 'pysam', diff --git a/tests/AlignedSegment_test.py b/tests/AlignedSegment_test.py new file mode 100644 index 00000000..bdf72fbd --- /dev/null +++ b/tests/AlignedSegment_test.py @@ -0,0 +1,325 @@ +import os +import pysam +import unittest +from TestUtils import checkFieldEqual + +SAMTOOLS = "samtools" +WORKDIR = "pysam_test_work" +DATADIR = "pysam_data" + + +class ReadTest(unittest.TestCase): + + def buildRead(self): + '''build an example read.''' + + a = pysam.AlignedSegment() + a.query_name = "read_12345" + a.query_sequence = "ACGT" * 10 + a.flag = 0 + a.reference_id = 0 + a.reference_start = 20 + a.mapping_quality = 20 + a.cigartuples = ((0, 10), (2, 1), (0, 9), (1, 1), (0, 20)) + a.next_reference_id = 0 + a.next_reference_start = 200 + a.template_length = 167 + a.query_qualities = pysam.fromQualityString("1234") * 10 + # todo: create tags + return a + + +class TestAlignedSegment(ReadTest): + + '''tests to check if aligned read can be constructed + and manipulated. 
+ ''' + + def testEmpty(self): + a = pysam.AlignedSegment() + self.assertEqual(a.query_name, None) + self.assertEqual(a.query_sequence, None) + self.assertEqual(pysam.toQualityString(a.query_qualities), None) + self.assertEqual(a.flag, 0) + self.assertEqual(a.reference_id, 0) + self.assertEqual(a.mapping_quality, 0) + self.assertEqual(a.cigartuples, None) + self.assertEqual(a.tags, []) + self.assertEqual(a.next_reference_id, 0) + self.assertEqual(a.next_reference_start, 0) + self.assertEqual(a.template_length, 0) + + def testStrOfEmptyRead(self): + a = pysam.AlignedSegment() + s = str(a) + self.assertEqual( + "None\t0\t0\t0\t0\tNone\t0\t0\t0\tNone\tNone\t[]", + s) + + def testSettingTagInEmptyRead(self): + '''see issue 62''' + a = pysam.AlignedSegment() + a.tags = (("NM", 1),) + a.query_qualities = None + self.assertEqual(a.tags, [("NM", 1), ]) + + def testUpdate(self): + '''check if updating fields affects other variable length data + ''' + a = self.buildRead() + b = self.buildRead() + + # check qname + b.query_name = "read_123" + checkFieldEqual(self, a, b, "query_name") + b.query_name = "read_12345678" + checkFieldEqual(self, a, b, "query_name") + b.query_name = "read_12345" + checkFieldEqual(self, a, b) + + # check cigar + b.cigartuples = ((0, 10), ) + checkFieldEqual(self, a, b, "cigartuples") + b.cigartuples = ((0, 10), (2, 1), (0, 10)) + checkFieldEqual(self, a, b, "cigartuples") + b.cigartuples = ((0, 10), (2, 1), (0, 9), (1, 1), (0, 20)) + checkFieldEqual(self, a, b) + + # check seq + b.query_sequence = "ACGT" + checkFieldEqual(self, + a, b, + ("query_sequence", "query_qualities", "query_length")) + b.query_sequence = "ACGT" * 3 + checkFieldEqual(self, + a, b, + ("query_sequence", "query_qualities", "query_length")) + b.query_sequence = "ACGT" * 10 + checkFieldEqual(self, a, b, ("query_qualities",)) + + # reset qual + b = self.buildRead() + + # check flags: + for x in ( + "is_paired", "is_proper_pair", + "is_unmapped", "mate_is_unmapped", + "is_reverse", 
"mate_is_reverse", + "is_read1", "is_read2", + "is_secondary", "is_qcfail", + "is_duplicate"): + setattr(b, x, True) + self.assertEqual(getattr(b, x), True) + checkFieldEqual(self, a, b, ("flag", x,)) + setattr(b, x, False) + self.assertEqual(getattr(b, x), False) + checkFieldEqual(self, a, b) + + def testUpdate2(self): + '''issue 135: inplace update of sequence and quality score. + + This does not work as setting the sequence will erase + the quality scores. + ''' + a = self.buildRead() + a.query_sequence = a.query_sequence[5:10] + self.assertEqual(pysam.toQualityString(a.query_qualities), None) + + a = self.buildRead() + s = pysam.toQualityString(a.query_qualities) + a.query_sequence = a.query_sequence[5:10] + a.query_qualities = pysam.fromQualityString(s[5:10]) + + self.assertEqual(pysam.toQualityString(a.query_qualities), s[5:10]) + + def testLargeRead(self): + '''build an example read.''' + + a = pysam.AlignedSegment() + a.query_name = "read_12345" + a.query_sequence = "ACGT" * 200 + a.flag = 0 + a.reference_id = 0 + a.reference_start = 20 + a.mapping_quality = 20 + a.cigartuples = ((0, 4 * 200), ) + a.next_reference_id = 0 + a.next_reference_start = 200 + a.template_length = 167 + a.query_qualities = pysam.fromQualityString("1234") * 200 + + return a + + def testUpdateTlen(self): + '''check if updating tlen works''' + a = self.buildRead() + oldlen = a.template_length + oldlen *= 2 + a.template_length = oldlen + self.assertEqual(a.template_length, oldlen) + + def testPositions(self): + a = self.buildRead() + self.assertEqual(a.get_reference_positions(), + [20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 31, 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, + 50, 51, 52, 53, 54, 55, 56, 57, 58, 59]) + + self.assertEqual(a.get_aligned_pairs(), + [(0, 20), (1, 21), (2, 22), (3, 23), (4, 24), + (5, 25), (6, 26), (7, 27), (8, 28), (9, 29), + (None, 30), + (10, 31), (11, 32), (12, 33), (13, 34), (14, 35), + (15, 36), (16, 37), (17, 38), (18, 39), 
(19, None), + (20, 40), (21, 41), (22, 42), (23, 43), (24, 44), + (25, 45), (26, 46), (27, 47), (28, 48), (29, 49), + (30, 50), (31, 51), (32, 52), (33, 53), (34, 54), + (35, 55), (36, 56), (37, 57), (38, 58), (39, 59)]) + + self.assertEqual( + a.get_reference_positions(), + [x[1] for x in a.get_aligned_pairs() + if x[0] is not None and x[1] is not None]) + # alen is the length of the aligned read in genome + self.assertEqual(a.reference_length, + a.get_aligned_pairs()[-1][0] + 1) + # aend points to one beyond last aligned base in ref + self.assertEqual(a.get_reference_positions()[-1], + a.reference_end - 1) + + def testFullReferencePositions(self): + '''see issue 26''' + a = self.buildRead() + a.cigar = [(4, 30), (0, 20), (1, 3), (0, 47)] + + self.assertEqual(100, + len(a.get_reference_positions(full_length=True))) + + def testBlocks(self): + a = self.buildRead() + self.assertEqual(a.get_blocks(), + [(20, 30), (31, 40), (40, 60)]) + + +class TestTags(ReadTest): + + def testMissingTag(self): + a = self.buildRead() + self.assertRaises(KeyError, a.get_tag, "XP") + + def testEmptyTag(self): + a = self.buildRead() + self.assertRaises(KeyError, a.get_tag, "XT") + + def testSetTag(self): + a = self.buildRead() + self.assertEqual(False, a.has_tag("NM")) + a.set_tag("NM", 2) + self.assertEqual(True, a.has_tag("NM")) + self.assertEqual(a.get_tag("NM"), 2) + a.set_tag("NM", 3) + self.assertEqual(a.get_tag("NM"), 3) + a.set_tag("NM", None) + self.assertEqual(False, a.has_tag("NM")) + # check if deleting a non-existing tag is fine + a.set_tag("NM", None) + + def testAddTagsType(self): + a = self.buildRead() + a.tags = None + self.assertEqual(a.tags, []) + + a.setTag('X1', 5.0) + a.setTag('X2', "5.0") + a.setTag('X3', 5) + + self.assertEqual(sorted(a.tags), + sorted([('X1', 5.0), + ('X2', "5.0"), + ('X3', 5)])) + + # test setting float for int value + a.setTag('X4', 5, value_type='d') + self.assertEqual(sorted(a.tags), + sorted([('X1', 5.0), + ('X2', "5.0"), + ('X3', 5), + 
('X4', 5.0)])) + + # test setting int for float value - the + # value will be rounded. + a.setTag('X5', 5.2, value_type='i') + self.assertEqual(sorted(a.tags), + sorted([('X1', 5.0), + ('X2', "5.0"), + ('X3', 5), + ('X4', 5.0), + ('X5', 5)])) + + # test setting invalid type code + self.assertRaises(ValueError, a.setTag, 'X6', 5.2, 'g') + + def testTagsUpdatingFloat(self): + a = self.buildRead() + a.tags = [('NM', 1), ('RG', 'L1'), + ('PG', 'P1'), ('XT', 'U')] + + self.assertEqual(a.tags, + [('NM', 1), ('RG', 'L1'), + ('PG', 'P1'), ('XT', 'U')]) + a.tags += [('XC', 5.0)] + self.assertEqual(a.tags, + [('NM', 1), ('RG', 'L1'), + ('PG', 'P1'), ('XT', 'U'), ('XC', 5.0)]) + + def testAddTags(self): + a = self.buildRead() + a.tags = [('NM', 1), ('RG', 'L1'), + ('PG', 'P1'), ('XT', 'U')] + + self.assertEqual(sorted(a.tags), + sorted([('NM', 1), ('RG', 'L1'), + ('PG', 'P1'), ('XT', 'U')])) + + a.setTag('X1', 'C') + self.assertEqual(sorted(a.tags), + sorted([('X1', 'C'), ('NM', 1), ('RG', 'L1'), + ('PG', 'P1'), ('XT', 'U'), ])) + a.setTag('X2', 5) + self.assertEqual(sorted(a.tags), + sorted([('X2', 5), ('X1', 'C'), + ('NM', 1), ('RG', 'L1'), + ('PG', 'P1'), ('XT', 'U'), ])) + # add with replacement + a.setTag('X2', 10) + self.assertEqual(sorted(a.tags), + sorted([('X2', 10), ('X1', 'C'), + ('NM', 1), ('RG', 'L1'), + ('PG', 'P1'), ('XT', 'U'), ])) + + # add without replacement + a.setTag('X2', 5, replace=False) + self.assertEqual(sorted(a.tags), + sorted([('X2', 10), ('X1', 'C'), + ('X2', 5), + ('NM', 1), ('RG', 'L1'), + ('PG', 'P1'), ('XT', 'U'), ])) + + def testTagParsing(self): + '''test for tag parsing + + see http://groups.google.com/group/pysam-user-group/browse_thread/thread/67ca204059ea465a + ''' + samfile = pysam.AlignmentFile( + os.path.join(DATADIR, "ex8.bam"), + "rb") + + for entry in samfile: + before = entry.get_tags() + entry.set_tags(before) + after = entry.get_tags() + self.assertEqual(after, before) + +if __name__ == "__main__": + unittest.main() diff --git 
a/tests/AlignmentFile_test.py b/tests/AlignmentFile_test.py index 081f5af7..65dbf44c 100644 --- a/tests/AlignmentFile_test.py +++ b/tests/AlignmentFile_test.py @@ -13,7 +13,8 @@ import collections import subprocess import logging -from TestUtils import checkBinaryEqual, checkURL +from functools import partial +from TestUtils import checkBinaryEqual, checkURL, checkSamtoolsViewEqual, checkFieldEqual IS_PYTHON3 = sys.version_info[0] >= 3 @@ -22,7 +23,15 @@ DATADIR = "pysam_data" -class BasicTestBAMFetch(unittest.TestCase): +################################################## +# +# Detailed test of file contents +# +# Data are read either through file based iterator +# access (BasicTestBAMFromFile) or by calling fetch +# without coordinates (BasicTestBAMFromFetch) +################################################## +class BasicTestBAMFromFetch(unittest.TestCase): '''basic first test - detailed testing if information in file is consistent @@ -164,30 +173,62 @@ def testARqual(self): "quality string mismatch in read 3: %s != %s" % (pysam.toQualityString(self.reads[3].query_qualities), "<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<")) def testARquery(self): - self.assertEqual(self.reads[0].query_alignment_sequence, "AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG", "query mismatch in read 1: %s != %s" % ( - self.reads[0].query_alignment_sequence, "AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG")) - self.assertEqual(self.reads[1].query_alignment_sequence, "ACCTATATCTTGGCCTTGGCCGATGCGGCCTTGCA", "query size mismatch in read 2: %s != %s" % ( - self.reads[1].query_alignment_sequence, "ACCTATATCTTGGCCTTGGCCGATGCGGCCTTGCA")) - self.assertEqual(self.reads[3].query_alignment_sequence, "TAGCTAGCTACCTATATCTTGGTCTT", "query mismatch in read 4: %s != %s" % ( - self.reads[3].query_alignment_sequence, "TAGCTAGCTACCTATATCTTGGTCTT")) + self.assertEqual( + self.reads[0].query_alignment_sequence, + "AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG", + "query mismatch in read 1: %s != %s" % + (self.reads[0].query_alignment_sequence, + 
"AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG")) + self.assertEqual( + self.reads[1].query_alignment_sequence, + "ACCTATATCTTGGCCTTGGCCGATGCGGCCTTGCA", + "query size mismatch in read 2: %s != %s" % + (self.reads[1].query_alignment_sequence, + "ACCTATATCTTGGCCTTGGCCGATGCGGCCTTGCA")) + self.assertEqual( + self.reads[3].query_alignment_sequence, + "TAGCTAGCTACCTATATCTTGGTCTT", + "query mismatch in read 4: %s != %s" % + (self.reads[3].query_alignment_sequence, + "TAGCTAGCTACCTATATCTTGGTCTT")) def testARqqual(self): - self.assertEqual(pysam.toQualityString(self.reads[0].query_alignment_qualities), "<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<", - "qquality string mismatch in read 1: %s != %s" % (pysam.toQualityString(self.reads[0].query_alignment_qualities), "<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<")) - self.assertEqual(pysam.toQualityString(self.reads[1].query_alignment_qualities), "<<<<<;<<<<7;:<<<6;<<<<<<<<<<<<7<<<<", "qquality string mismatch in read 2: %s != %s" % ( - pysam.toQualityString(self.reads[1].query_alignment_qualities), "<<<<<;<<<<7;:<<<6;<<<<<<<<<<<<7<<<<")) - self.assertEqual(pysam.toQualityString(self.reads[3].query_alignment_qualities), "<<<<<<<<<<<<<<<<<:<9/,&,22", - "qquality string mismatch in read 3: %s != %s" % (pysam.toQualityString(self.reads[3].query_alignment_qualities), "<<<<<<<<<<<<<<<<<:<9/,&,22")) + self.assertEqual( + pysam.toQualityString(self.reads[0].query_alignment_qualities), + "<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<", + "qquality string mismatch in read 1: %s != %s" % + (pysam.toQualityString(self.reads[0].query_alignment_qualities), + "<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<")) + self.assertEqual( + pysam.toQualityString(self.reads[1].query_alignment_qualities), + "<<<<<;<<<<7;:<<<6;<<<<<<<<<<<<7<<<<", + "qquality string mismatch in read 2: %s != %s" % + (pysam.toQualityString(self.reads[1].query_alignment_qualities), + "<<<<<;<<<<7;:<<<6;<<<<<<<<<<<<7<<<<")) + self.assertEqual( + pysam.toQualityString(self.reads[3].query_alignment_qualities), + 
"<<<<<<<<<<<<<<<<<:<9/,&,22", + "qquality string mismatch in read 3: %s != %s" % + (pysam.toQualityString(self.reads[3].query_alignment_qualities), + "<<<<<<<<<<<<<<<<<:<9/,&,22")) def testPresentOptionalFields(self): - self.assertEqual(self.reads[0].opt( - 'NM'), 1, "optional field mismatch in read 1, NM: %s != %s" % (self.reads[0].opt('NM'), 1)) - self.assertEqual(self.reads[0].opt( - 'RG'), 'L1', "optional field mismatch in read 1, RG: %s != %s" % (self.reads[0].opt('RG'), 'L1')) - self.assertEqual(self.reads[1].opt( - 'RG'), 'L2', "optional field mismatch in read 2, RG: %s != %s" % (self.reads[1].opt('RG'), 'L2')) - self.assertEqual(self.reads[1].opt( - 'MF'), 18, "optional field mismatch in read 2, MF: %s != %s" % (self.reads[1].opt('MF'), 18)) + self.assertEqual( + self.reads[0].opt('NM'), 1, + "optional field mismatch in read 1, NM: %s != %s" % + (self.reads[0].opt('NM'), 1)) + self.assertEqual( + self.reads[0].opt('RG'), 'L1', + "optional field mismatch in read 1, RG: %s != %s" % + (self.reads[0].opt('RG'), 'L1')) + self.assertEqual( + self.reads[1].opt('RG'), 'L2', + "optional field mismatch in read 2, RG: %s != %s" % + (self.reads[1].opt('RG'), 'L2')) + self.assertEqual( + self.reads[1].opt('MF'), 18, + "optional field mismatch in read 2, MF: %s != %s" % + (self.reads[1].opt('MF'), 18)) def testPairedBools(self): self.assertEqual(self.reads[0].is_paired, True, "is paired mismatch in read 1: %s != %s" % ( @@ -207,102 +248,67 @@ def testTags(self): [('MF', 18), ('RG', 'L2'), ('PG', 'P2'), ('XT', 'R')]) - def testAddTags(self): - self.assertEqual(sorted(self.reads[0].tags), - sorted([('NM', 1), ('RG', 'L1'), - ('PG', 'P1'), ('XT', 'U')])) - - self.reads[0].setTag('X1', 'C') - self.assertEqual(sorted(self.reads[0].tags), - sorted([('X1', 'C'), ('NM', 1), ('RG', 'L1'), - ('PG', 'P1'), ('XT', 'U'), ])) - self.reads[0].setTag('X2', 5) - self.assertEqual(sorted(self.reads[0].tags), - sorted([('X2', 5), ('X1', 'C'), - ('NM', 1), ('RG', 'L1'), - ('PG', 'P1'), 
('XT', 'U'), ])) - # add with replacement - self.reads[0].setTag('X2', 10) - self.assertEqual(sorted(self.reads[0].tags), - sorted([('X2', 10), ('X1', 'C'), - ('NM', 1), ('RG', 'L1'), - ('PG', 'P1'), ('XT', 'U'), ])) - - # add without replacement - self.reads[0].setTag('X2', 5, replace=False) - self.assertEqual(sorted(self.reads[0].tags), - sorted([('X2', 10), ('X1', 'C'), - ('X2', 5), - ('NM', 1), ('RG', 'L1'), - ('PG', 'P1'), ('XT', 'U'), ])) - - def testAddTagsType(self): - self.reads[0].tags = None - self.assertEqual(self.reads[0].tags, []) - - self.reads[0].setTag('X1', 5.0) - self.reads[0].setTag('X2', "5.0") - self.reads[0].setTag('X3', 5) - - self.assertEqual(sorted(self.reads[0].tags), - sorted([('X1', 5.0), - ('X2', "5.0"), - ('X3', 5)])) - - # test setting float for int value - self.reads[0].setTag('X4', 5, value_type='d') - self.assertEqual(sorted(self.reads[0].tags), - sorted([('X1', 5.0), - ('X2', "5.0"), - ('X3', 5), - ('X4', 5.0)])) - - # test setting int for float value - the - # value will be rounded. 
- self.reads[0].setTag('X5', 5.2, value_type='i') - self.assertEqual(sorted(self.reads[0].tags), - sorted([('X1', 5.0), - ('X2', "5.0"), - ('X3', 5), - ('X4', 5.0), - ('X5', 5)])) - - # test setting invalid type code - self.assertRaises(ValueError, self.reads[0].setTag, 'X6', 5.2, 'g') - - def testTagsUpdatingFloat(self): - self.assertEqual(self.reads[0].tags, - [('NM', 1), ('RG', 'L1'), - ('PG', 'P1'), ('XT', 'U')]) - self.reads[0].tags += [('XC', 5.0)] - self.assertEqual(self.reads[0].tags, - [('NM', 1), ('RG', 'L1'), - ('PG', 'P1'), ('XT', 'U'), ('XC', 5.0)]) - def testOpt(self): self.assertEqual(self.reads[0].opt("XT"), "U") self.assertEqual(self.reads[1].opt("XT"), "R") - def testMissingOpt(self): - self.assertRaises(KeyError, self.reads[0].opt, "XP") - - def testEmptyOpt(self): - self.assertRaises(KeyError, self.reads[2].opt, "XT") - def tearDown(self): self.samfile.close() -class BasicTestBAMFile(BasicTestBAMFetch): - +class BasicTestSAMFromFetch(BasicTestBAMFromFetch): def setUp(self): self.samfile = pysam.AlignmentFile( os.path.join(DATADIR, "ex3.sam"), "r") - self.reads = [r for r in self.samfile] + self.reads = list(self.samfile.fetch()) + + +class BasicTestCRAMFromFetch(BasicTestBAMFromFetch): + def setUp(self): + self.samfile = pysam.AlignmentFile( + os.path.join(DATADIR, "ex3.cram"), + "rc") + self.reads = list(self.samfile.fetch()) -class BasicTestSAMFile(BasicTestBAMFetch): + def testTags(self): + self.assertEqual( + sorted(self.reads[0].tags), + sorted([('RG', 'L1'), + ('NM', 22), + ('MD','0C0T1G1C0C0A1G0^G0C1C1G1A0T2G0G0G0A1C1G1G1A2C0'), + ('PG', 'P1'), + ('XT', 'U'), + ])) + self.assertEqual( + sorted(self.reads[1].tags), + sorted([('RG', 'L2'), + ('NM', 26), + ('MD', '1G0A0A1G1G0G2C0A0G0A0A0C0T0T0G0A0A0G0A0C0A0A1T2C0T0T1'), + ('MF', 18), + ('PG', 'P2'), + ('XT', 'R')])) + + def testPresentOptionalFields(self): + self.assertEqual( + self.reads[0].opt('NM'), 22, + "optional field mismatch in read 1, NM: %s != %s" % + (self.reads[0].opt('NM'), 22)) 
+ self.assertEqual( + self.reads[0].opt('RG'), 'L1', + "optional field mismatch in read 1, RG: %s != %s" % + (self.reads[0].opt('RG'), 'L1')) + self.assertEqual( + self.reads[1].opt('RG'), 'L2', + "optional field mismatch in read 2, RG: %s != %s" % + (self.reads[1].opt('RG'), 'L2')) + self.assertEqual( + self.reads[1].opt('MF'), 18, + "optional field mismatch in read 2, MF: %s != %s" % + (self.reads[1].opt('MF'), 18)) + +class BasicTestSAMFromFile(BasicTestBAMFromFetch): def setUp(self): self.samfile = pysam.AlignmentFile( @@ -311,77 +317,167 @@ def setUp(self): self.reads = [r for r in self.samfile] -class BasicTestSAMFetch(BasicTestBAMFetch): +class BasicTestCRAMFromFile(BasicTestCRAMFromFetch): + def setUp(self): self.samfile = pysam.AlignmentFile( - os.path.join(DATADIR, "ex3.sam"), - "r") - self.reads = list(self.samfile.fetch()) + os.path.join(DATADIR, "ex3.cram"), + "rc") + self.reads = [r for r in self.samfile] -# needs to be implemented -# class TestAlignedSegmentFromSamWithoutHeader(TestAlignedSegmentFromBam): -# -# def setUp(self): -# self.samfile=pysam.AlignmentFile( "ex7.sam","r" ) -# self.reads=list(self.samfile.fetch()) +class BasicTestBAMFromFile(BasicTestBAMFromFetch): + def setUp(self): + self.samfile = pysam.AlignmentFile( + os.path.join(DATADIR, "ex3.bam"), + "rb") + self.reads = [r for r in self.samfile] +################################################## +# +# Test of basic File I/O +# +# * format conversions +# * reading with/without index +# * reading from closed files +# +################################################## class TestIO(unittest.TestCase): - - '''check if reading samfile and writing a samfile are consistent.''' + '''check if reading samfile and writing a samfile + are consistent.''' def checkEcho(self, input_filename, reference_filename, output_filename, input_mode, output_mode, - use_template=True): - '''iterate through *input_filename* writing to *output_filename* and - comparing the output to *reference_filename*. 
+ sequence_filename=None, + use_template=True, + checkf=checkBinaryEqual): + '''iterate through *input_filename* writing to + *output_filename* and comparing the output to + *reference_filename*. - The files are opened according to the *input_mode* and *output_mode*. + The files are opened according to the *input_mode* and + *output_mode*. If *use_template* is set, the header is copied from infile using the template mechanism, otherwise target names and lengths are passed explicitely. + The *checkf* is used to determine if the files are + equal. ''' - - infile = pysam.AlignmentFile(os.path.join(DATADIR, input_filename), - input_mode) + infile = pysam.AlignmentFile( + os.path.join(DATADIR, input_filename), + input_mode) if use_template: - outfile = pysam.AlignmentFile(output_filename, - output_mode, - template=infile) + outfile = pysam.AlignmentFile( + output_filename, + output_mode, + reference_filename=sequence_filename, + template=infile) else: - outfile = pysam.AlignmentFile(output_filename, - output_mode, - referencenames=infile.references, - referencelengths=infile.lengths, - add_sq_text=False) + outfile = pysam.AlignmentFile( + output_filename, + output_mode, + reference_names=infile.references, + reference_lengths=infile.lengths, + reference_filename=sequence_filename, + add_sq_text=False) iter = infile.fetch() for x in iter: outfile.write(x) + infile.close() outfile.close() - self.assertTrue( - checkBinaryEqual(os.path.join(DATADIR, reference_filename), - output_filename), - "files %s and %s are not the same" % (reference_filename, - output_filename)) + self.assertTrue(checkf( + os.path.join(DATADIR, reference_filename), + output_filename), + "files %s and %s are not the same" % + (reference_filename, + output_filename)) - def testReadWriteBam(self): + os.unlink(output_filename) - input_filename = "ex1.bam" - output_filename = "pysam_ex1.bam" - reference_filename = "ex1.bam" + def testSAM2SAM(self): + self.checkEcho("ex2.sam", + "ex2.sam", + "tmp_ex2.sam", 
+ "r", "wh") + + def testBAM2BAM(self): + self.checkEcho("ex2.bam", + "ex2.bam", + "tmp_ex2.bam", + "rb", "wb") + + def testCRAM2CRAM(self): + self.checkEcho("ex2.cram", + "ex2.cram", + "tmp_ex2.cram", + "rc", "wc", + sequence_filename="pysam_data/ex1.fa", + checkf=checkSamtoolsViewEqual) + + def testSAM2BAM(self): + self.checkEcho("ex2.sam", + "ex2.bam", + "tmp_ex2.bam", + "r", "wb") + + def testBAM2SAM(self): + self.checkEcho("ex2.bam", + "ex2.sam", + "tmp_ex2.sam", + "rb", "wh") + + def testBAM2CRAM(self): + # ignore header (md5 sum) + self.checkEcho("ex2.bam", + "ex2.cram", + "tmp_ex2.cram", + "rb", "wc", + sequence_filename="pysam_data/ex1.fa", + checkf=partial( + checkSamtoolsViewEqual, + without_header=True)) + + def testCRAM2BAM(self): + # ignore header (md5 sum) + self.checkEcho("ex2.cram", + "ex2.bam", + "tmp_ex2.bam", + "rc", "wb", + sequence_filename="pysam_data/ex1.fa", + checkf=partial( + checkSamtoolsViewEqual, + without_header=True)) + + def testSAM2CRAM(self): + self.checkEcho("ex2.sam", + "ex2.cram", + "tmp_ex2.cram", + "r", "wc", + sequence_filename="pysam_data/ex1.fa", + checkf=partial( + checkSamtoolsViewEqual, + without_header=True)) + + def testCRAM2SAM(self): + self.checkEcho("ex2.cram", + "ex2.sam", + "tmp_ex2.sam", + "rc", "wh", + sequence_filename="pysam_data/ex1.fa", + checkf=partial( + checkSamtoolsViewEqual, + without_header=True)) - self.checkEcho(input_filename, reference_filename, output_filename, - "rb", "wb", use_template=True) # Disabled - should work, files are not binary equal, but are # non-binary equal: @@ -394,40 +490,30 @@ def testReadWriteBam(self): # self.checkEcho(input_filename, reference_filename, output_filename, # "rb", "wb", use_template=False) - def testReadWriteSamWithHeader(self): - - input_filename = "ex2.sam" - output_filename = "pysam_ex2.sam" - reference_filename = "ex2.sam" - - self.checkEcho(input_filename, - reference_filename, - output_filename, - "r", "wh") - # Release 0.8.0 # no samfiles without header 
- def testReadWriteSamWithoutHeader(self): - - input_filename = "ex2.sam" - output_filename = "pysam_ex2.sam" - reference_filename = "ex1.sam" - - self.checkEcho(input_filename, - reference_filename, - output_filename, + def testSAM2SAMWithoutHeader(self): + self.checkEcho("ex2.sam", + "ex1.sam", + "tmp_ex2.sam", "r", "w") + def testReadSamWithoutTargetNames(self): '''see issue 104.''' - input_filename = os.path.join(DATADIR, - "example_unmapped_reads_no_sq.sam") + input_filename = os.path.join( + DATADIR, + "example_unmapped_reads_no_sq.sam") # raise exception in default mode - self.assertRaises(ValueError, pysam.AlignmentFile, input_filename, "r") + self.assertRaises(ValueError, + pysam.AlignmentFile, + input_filename, + "r") # raise exception if no SQ files - self.assertRaises(ValueError, pysam.AlignmentFile, + self.assertRaises(ValueError, + pysam.AlignmentFile, input_filename, "r", check_header=True) @@ -446,10 +532,16 @@ def testReadBamWithoutTargetNames(self): DATADIR, "example_unmapped_reads_no_sq.bam") # raise exception in default mode - self.assertRaises(ValueError, pysam.AlignmentFile, input_filename, "r") + self.assertRaises(ValueError, + pysam.AlignmentFile, + input_filename, + "r") # raise exception if no SQ files - self.assertRaises(ValueError, pysam.AlignmentFile, input_filename, "r", + self.assertRaises(ValueError, + pysam.AlignmentFile, + input_filename, + "r", check_header=True) infile = pysam.AlignmentFile( @@ -502,19 +594,24 @@ def testReadSamWithoutHeader(self): def testBAMWithoutAlignedSegments(self): '''see issue 117''' input_filename = os.path.join(DATADIR, "test_unaligned.bam") - samfile = pysam.AlignmentFile(input_filename, "rb", check_sq=False) + samfile = pysam.AlignmentFile(input_filename, + "rb", + check_sq=False) samfile.fetch(until_eof=True) def testBAMWithShortBAI(self): '''see issue 116''' input_filename = os.path.join(DATADIR, "example_bai.bam") - samfile = pysam.AlignmentFile(input_filename, "rb", check_sq=False) + samfile = 
pysam.AlignmentFile(input_filename, + "rb", + check_sq=False) samfile.fetch('chr2') def testFetchFromClosedFile(self): - samfile = pysam.AlignmentFile(os.path.join(DATADIR, "ex1.bam"), - "rb") + samfile = pysam.AlignmentFile( + os.path.join(DATADIR, "ex1.bam"), + "rb") samfile.close() self.assertRaises(ValueError, samfile.fetch, 'chr1', 100, 120) @@ -563,12 +660,14 @@ def testAutoDetection(self): def testReadingFromFileWithoutIndex(self): '''read from bam file without index.''' - shutil.copyfile(os.path.join(DATADIR, "ex2.bam"), 'tmp_ex2.bam') + shutil.copyfile(os.path.join(DATADIR, "ex2.bam"), + 'tmp_ex2.bam') samfile = pysam.AlignmentFile('tmp_ex2.bam', "rb") self.assertRaises(ValueError, samfile.fetch) - self.assertEqual(len(list(samfile.fetch(until_eof=True))), - 3270) + self.assertEqual( + len(list(samfile.fetch(until_eof=True))), + 3270) os.unlink('tmp_ex2.bam') # def testReadingUniversalFileMode(self): @@ -612,12 +711,254 @@ def testWriteUncompressedBAMFile(self): output_filename, "r", "wbu") - def testEmptyBAM(self): - samfile = pysam.Samfile(os.path.join(DATADIR, "empty.bam"), - "rb") - self.assertEqual(samfile.mapped, 0) - self.assertEqual(samfile.unmapped, 0) - self.assertEqual(samfile.nocoordinate, 0) + def testEmptyBAM(self): + samfile = pysam.Samfile(os.path.join(DATADIR, "empty.bam"), + "rb") + self.assertEqual(samfile.mapped, 0) + self.assertEqual(samfile.unmapped, 0) + self.assertEqual(samfile.nocoordinate, 0) + + +################################################## +# +# Random access iterator tests +# +################################################## +class TestIteratorRowBAM(unittest.TestCase): + + filename = os.path.join(DATADIR, "ex2.bam") + mode = "rb" + + def setUp(self): + self.samfile = pysam.AlignmentFile( + self.filename, self.mode, + ) + + def checkRange(self, rnge): + '''compare results from iterator with those from samtools.''' + ps = list(self.samfile.fetch(region=rnge)) + sa = list(pysam.view(self.filename, + rnge, + raw=True)) + 
self.assertEqual( + len(ps), len(sa), + "unequal number of results for range %s: %i != %i" % + (rnge, len(ps), len(sa))) + # check if the same reads are returned and in the same order + for line, (a, b) in enumerate(list(zip(ps, sa))): + d = b.split("\t") + self.assertEqual( + a.query_name, d[0], + "line %i: read id mismatch: %s != %s" % + (line, a.reference_id, d[0])) + self.assertEqual( + a.reference_start, + int(d[3]) - 1, + "line %i: read position mismatch: %s != %s, \n%s\n%s\n" % + (line, a.reference_start, int(d[3]) - 1, + str(a), str(d))) + qual = d[10] + self.assertEqual( + pysam.toQualityString(a.query_qualities), + qual, + "line %i: quality mismatch: %s != %s, \n%s\n%s\n" % + (line, pysam.toQualityString(a.query_qualities), qual, + str(a), str(d))) + + def testIteratePerContig(self): + '''check random access per contig''' + for contig in self.samfile.references: + self.checkRange(contig) + + def testIterateRanges(self): + '''check random access per range''' + for contig, length in zip(self.samfile.references, + self.samfile.lengths): + for start in range(1, length, 90): + # this includes empty ranges + self.checkRange("%s:%i-%i" % + (contig, start, start + 90)) + + def tearDown(self): + self.samfile.close() + + +class TestIteratorRowAllBAM(unittest.TestCase): + + filename = os.path.join(DATADIR, "ex2.bam") + mode = "rb" + + def setUp(self): + self.samfile = pysam.AlignmentFile( + self.filename, + self.mode) + + def testIterate(self): + '''compare results from iterator with those from samtools.''' + ps = list(self.samfile.fetch()) + sa = list(pysam.view(self.filename, + raw=True)) + self.assertEqual( + len(ps), len(sa), + "unequal number of results: %i != %i" % + (len(ps), len(sa))) + # check if the same reads are returned + for line, pair in enumerate(list(zip(ps, sa))): + data = pair[1].split("\t") + self.assertEqual( + pair[0].query_name, + data[0], + "read id mismatch in line %i: %s != %s" % + (line, pair[0].reference_id, data[0])) + + def 
tearDown(self): + self.samfile.close() + + +class TestIteratorColumnBAM(unittest.TestCase): + + '''test iterator column against contents of ex4.bam.''' + + # note that samfile contains 1-based coordinates + # 1D means deletion with respect to reference sequence + # + mCoverages = {'chr1': [0] * 20 + [1] * 36 + [0] * (100 - 20 - 35), + 'chr2': [0] * 20 + [1] * 35 + [0] * (100 - 20 - 35), + } + + def setUp(self): + self.samfile = pysam.AlignmentFile(os.path.join(DATADIR, "ex4.bam"), + "rb") + + def checkRange(self, contig, start=None, end=None, truncate=False): + '''compare results from iterator with those from samtools.''' + # check if the same reads are returned and in the same order + for column in self.samfile.pileup( + contig, start, end, truncate=truncate): + if truncate: + self.assertGreaterEqual(column.reference_pos, start) + self.assertLess(column.reference_pos, end) + thiscov = len(column.pileups) + refcov = self.mCoverages[ + self.samfile.getrname(column.reference_id)][column.reference_pos] + self.assertEqual(thiscov, refcov, + "wrong coverage at pos %s:%i %i should be %i" % ( + self.samfile.getrname(column.reference_id), + column.reference_pos, thiscov, refcov)) + + def testIterateAll(self): + '''check random access per contig''' + self.checkRange(None) + + def testIteratePerContig(self): + '''check random access per contig''' + for contig in self.samfile.references: + self.checkRange(contig) + + def testIterateRanges(self): + '''check random access per range''' + for contig, length in zip( + self.samfile.references, self.samfile.lengths): + for start in range(1, length, 90): + # this includes empty ranges + self.checkRange(contig, start, start + 90) + + def testInverse(self): + '''test the inverse, is point-wise pileup accurate.''' + for contig, refseq in list(self.mCoverages.items()): + refcolumns = sum(refseq) + for pos, refcov in enumerate(refseq): + columns = list(self.samfile.pileup(contig, pos, pos + 1)) + if refcov == 0: + # if no read, no 
coverage + self.assertEqual( + len(columns), + refcov, + "wrong number of pileup columns returned for position %s:%i, %i should be %i" % ( + contig, pos, + len(columns), refcov)) + elif refcov == 1: + # one read, all columns of the read are returned + self.assertEqual( + len(columns), + refcolumns, + "pileup incomplete at position %i: got %i, expected %i " % + (pos, len(columns), refcolumns)) + + def testIterateTruncate(self): + '''check random access per range''' + for contig, length in zip(self.samfile.references, + self.samfile.lengths): + for start in range(1, length, 90): + # this includes empty ranges + self.checkRange(contig, start, start + 90, truncate=True) + + def tearDown(self): + self.samfile.close() + + +class TestIteratorRowCRAM(TestIteratorRowBAM): + filename = os.path.join(DATADIR, "ex2.cram") + mode = "rc" + + +class TestIteratorRowCRAM(TestIteratorRowBAM): + filename = os.path.join(DATADIR, "ex2.cram") + mode = "rc" + +########################################################## +########################################################## +########################################################## +# needs to be implemented +# class TestAlignedSegmentFromSamWithoutHeader(TestAlignedSegmentFromBam): +# +# def setUp(self): +# self.samfile=pysam.AlignmentFile( "ex7.sam","r" ) +# self.reads=list(self.samfile.fetch()) + + +class TestIteratorColumn2(unittest.TestCase): + + '''test iterator column against contents of ex1.bam.''' + + def setUp(self): + self.samfile = pysam.AlignmentFile( + os.path.join(DATADIR, "ex1.bam"), + "rb") + + def testStart(self): + # print self.samfile.fetch().next().reference_start + # print self.samfile.pileup().next().reference_start + pass + + def testTruncate(self): + '''see issue 107.''' + # note that ranges in regions start from 1 + p = self.samfile.pileup(region='chr1:170:172', truncate=True) + columns = [x.reference_pos for x in p] + self.assertEqual(len(columns), 3) + self.assertEqual(columns, [169, 170, 171]) + + p = 
self.samfile.pileup('chr1', 169, 172, truncate=True) + columns = [x.reference_pos for x in p] + + self.assertEqual(len(columns), 3) + self.assertEqual(columns, [169, 170, 171]) + + def testAccessOnClosedIterator(self): + '''see issue 131 + + Accessing pileup data after iterator has closed. + ''' + pcolumn = self.samfile.pileup('chr1', 170, 180).__next__() + self.assertRaises(ValueError, getattr, pcolumn, "pileups") + + def testStr(self): + '''test if PileupRead can be printed.''' + iter = self.samfile.pileup('chr1', 170, 180) + pcolumn = iter.__next__() + s = str(pcolumn) + self.assertEqual(len(s.split("\n")), 2) class TestFloatTagBug(unittest.TestCase): @@ -808,200 +1149,8 @@ def testClipping(self): '01234') -class TestIteratorRow(unittest.TestCase): - - def setUp(self): - self.samfile = pysam.AlignmentFile(os.path.join(DATADIR, "ex1.bam"), - "rb") - - def checkRange(self, rnge): - '''compare results from iterator with those from samtools.''' - ps = list(self.samfile.fetch(region=rnge)) - sa = list(pysam.view(os.path.join(DATADIR, "ex1.bam"), - rnge, - raw=True)) - self.assertEqual(len(ps), len( - sa), "unequal number of results for range %s: %i != %i" % (rnge, len(ps), len(sa))) - # check if the same reads are returned and in the same order - for line, (a, b) in enumerate(list(zip(ps, sa))): - d = b.split("\t") - self.assertEqual( - a.query_name, d[0], "line %i: read id mismatch: %s != %s" % (line, a.reference_id, d[0])) - self.assertEqual(a.reference_start, int(d[3]) - 1, "line %i: read position mismatch: %s != %s, \n%s\n%s\n" % - (line, a.reference_start, int(d[3]) - 1, - str(a), str(d))) - qual = d[10] - self.assertEqual(pysam.toQualityString(a.query_qualities), qual, "line %i: quality mismatch: %s != %s, \n%s\n%s\n" % - (line, pysam.toQualityString(a.query_qualities), qual, - str(a), str(d))) - - def testIteratePerContig(self): - '''check random access per contig''' - for contig in self.samfile.references: - self.checkRange(contig) - - def 
testIterateRanges(self): - '''check random access per range''' - for contig, length in zip(self.samfile.references, - self.samfile.lengths): - for start in range(1, length, 90): - # this includes empty ranges - self.checkRange("%s:%i-%i" % (contig, start, start + 90)) - - def tearDown(self): - self.samfile.close() - - -class TestIteratorRowAll(unittest.TestCase): - - def setUp(self): - self.samfile = pysam.AlignmentFile(os.path.join(DATADIR, "ex1.bam"), - "rb") - - def testIterate(self): - '''compare results from iterator with those from samtools.''' - ps = list(self.samfile.fetch()) - sa = list(pysam.view(os.path.join(DATADIR, "ex1.bam"), - raw=True)) - self.assertEqual( - len(ps), len(sa), "unequal number of results: %i != %i" % (len(ps), len(sa))) - # check if the same reads are returned - for line, pair in enumerate(list(zip(ps, sa))): - data = pair[1].split("\t") - self.assertEqual(pair[0].query_name, data[ - 0], "read id mismatch in line %i: %s != %s" % (line, pair[0].reference_id, data[0])) - - def tearDown(self): - self.samfile.close() - - -class TestIteratorColumn(unittest.TestCase): - - '''test iterator column against contents of ex4.bam.''' - - # note that samfile contains 1-based coordinates - # 1D means deletion with respect to reference sequence - # - mCoverages = {'chr1': [0] * 20 + [1] * 36 + [0] * (100 - 20 - 35), - 'chr2': [0] * 20 + [1] * 35 + [0] * (100 - 20 - 35), - } - - def setUp(self): - self.samfile = pysam.AlignmentFile(os.path.join(DATADIR, "ex4.bam"), - "rb") - - def checkRange(self, contig, start=None, end=None, truncate=False): - '''compare results from iterator with those from samtools.''' - # check if the same reads are returned and in the same order - for column in self.samfile.pileup( - contig, start, end, truncate=truncate): - if truncate: - self.assertGreaterEqual(column.reference_pos, start) - self.assertLess(column.reference_pos, end) - thiscov = len(column.pileups) - refcov = self.mCoverages[ - 
self.samfile.getrname(column.reference_id)][column.reference_pos] - self.assertEqual(thiscov, refcov, - "wrong coverage at pos %s:%i %i should be %i" % ( - self.samfile.getrname(column.reference_id), - column.reference_pos, thiscov, refcov)) - - def testIterateAll(self): - '''check random access per contig''' - self.checkRange(None) - - def testIteratePerContig(self): - '''check random access per contig''' - for contig in self.samfile.references: - self.checkRange(contig) - - def testIterateRanges(self): - '''check random access per range''' - for contig, length in zip( - self.samfile.references, self.samfile.lengths): - for start in range(1, length, 90): - # this includes empty ranges - self.checkRange(contig, start, start + 90) - - def testInverse(self): - '''test the inverse, is point-wise pileup accurate.''' - for contig, refseq in list(self.mCoverages.items()): - refcolumns = sum(refseq) - for pos, refcov in enumerate(refseq): - columns = list(self.samfile.pileup(contig, pos, pos + 1)) - if refcov == 0: - # if no read, no coverage - self.assertEqual( - len(columns), - refcov, - "wrong number of pileup columns returned for position %s:%i, %i should be %i" % ( - contig, pos, - len(columns), refcov)) - elif refcov == 1: - # one read, all columns of the read are returned - self.assertEqual( - len(columns), - refcolumns, - "pileup incomplete at position %i: got %i, expected %i " % - (pos, len(columns), refcolumns)) - - def testIterateTruncate(self): - '''check random access per range''' - for contig, length in zip(self.samfile.references, - self.samfile.lengths): - for start in range(1, length, 90): - # this includes empty ranges - self.checkRange(contig, start, start + 90, truncate=True) - - def tearDown(self): - self.samfile.close() - - -class TestIteratorColumn2(unittest.TestCase): - - '''test iterator column against contents of ex1.bam.''' - - def setUp(self): - self.samfile = pysam.AlignmentFile( - os.path.join(DATADIR, "ex1.bam"), - "rb") - - def 
testStart(self): - # print self.samfile.fetch().next().reference_start - # print self.samfile.pileup().next().reference_start - pass - - def testTruncate(self): - '''see issue 107.''' - # note that ranges in regions start from 1 - p = self.samfile.pileup(region='chr1:170:172', truncate=True) - columns = [x.reference_pos for x in p] - self.assertEqual(len(columns), 3) - self.assertEqual(columns, [169, 170, 171]) - - p = self.samfile.pileup('chr1', 169, 172, truncate=True) - columns = [x.reference_pos for x in p] - - self.assertEqual(len(columns), 3) - self.assertEqual(columns, [169, 170, 171]) - - def testAccessOnClosedIterator(self): - '''see issue 131 - - Accessing pileup data after iterator has closed. - ''' - pcolumn = self.samfile.pileup('chr1', 170, 180).__next__() - self.assertRaises(ValueError, getattr, pcolumn, "pileups") - - def testStr(self): - '''test if PileupRead can be printed.''' - iter = self.samfile.pileup('chr1', 170, 180) - pcolumn = iter.__next__() - s = str(pcolumn) - self.assertEqual(len(s.split("\n")), 2) - - -class TestHeaderSam(unittest.TestCase): +class TestHeaderSAM(unittest.TestCase): + """testing header manipulation""" header = {'SQ': [{'LN': 1575, 'SN': 'chr1'}, {'LN': 1584, 'SN': 'chr2'}], @@ -1021,8 +1170,9 @@ def compareHeaders(self, a, b): self.assertEqual(av, b[ak]) def setUp(self): - self.samfile = pysam.AlignmentFile(os.path.join(DATADIR, "ex3.sam"), - "r") + self.samfile = pysam.AlignmentFile( + os.path.join(DATADIR, "ex3.sam"), + "r") def testHeaders(self): self.compareHeaders(self.header, self.samfile.header) @@ -1042,11 +1192,35 @@ def tearDown(self): self.samfile.close() -class TestHeaderBam(TestHeaderSam): +class TestHeaderBAM(TestHeaderSAM): + + def setUp(self): + self.samfile = pysam.AlignmentFile( + os.path.join(DATADIR, "ex3.bam"), + "rb") + + +class TestHeaderCRAM(TestHeaderSAM): def setUp(self): - self.samfile = pysam.AlignmentFile(os.path.join(DATADIR, "ex3.bam"), - "rb") + self.samfile = pysam.AlignmentFile( + 
os.path.join(DATADIR, "ex3.cram"), + "rc") + + def compareHeaders(self, a, b): + '''compare two headers a and b.''' + def _strip(dd): + for x in dd: + for y in ("M5", "UR"): + if y in x: + del x[y] + + for ak, av in a.items(): + _strip(av) + self.assertTrue(ak in b, "key '%s' not in '%s' " % (ak, b)) + _strip(b[ak]) + + self.assertEqual(sorted(av), sorted(b[ak])) class TestHeaderFromRefs(unittest.TestCase): @@ -1145,9 +1319,8 @@ def testPileupRead(self): ": %s != %s" % (len(pcolumn1.pileups), 2)) - -# self.assertEqual( pcolumn1.pileups[0] # need to test additional -# properties here + # self.assertEqual( pcolumn1.pileups[0] # need to test additional + # properties here def tearDown(self): self.samfile.close() @@ -1269,252 +1442,7 @@ def testOpenFastaAsBam(self): 'rb') -class ReadTest(unittest.TestCase): - - def checkFieldEqual(self, read1, read2, exclude=[]): - '''check if two reads are equal by comparing each field.''' - - # add the . for refactoring purposes. - for x in (".query_name", - ".query_sequence", - ".flag", - ".reference_id", - ".reference_start", - ".mapping_quality", - ".cigartuples", - ".next_reference_id", - ".next_reference_start", - ".template_length", - ".query_length", - ".query_qualities", - ".bin", - ".is_paired", ".is_proper_pair", - ".is_unmapped", ".mate_is_unmapped", - ".is_reverse", ".mate_is_reverse", - ".is_read1", ".is_read2", - ".is_secondary", ".is_qcfail", - ".is_duplicate"): - n = x[1:] - if n in exclude: - continue - self.assertEqual(getattr(read1, n), getattr(read2, n), - "attribute mismatch for %s: %s != %s" % - (n, getattr(read1, n), getattr(read2, n))) - - -class TestAlignedSegment(ReadTest): - - '''tests to check if aligned read can be constructed - and manipulated. 
- ''' - - def testEmpty(self): - a = pysam.AlignedSegment() - self.assertEqual(a.query_name, None) - self.assertEqual(a.query_sequence, None) - self.assertEqual(pysam.toQualityString(a.query_qualities), None) - self.assertEqual(a.flag, 0) - self.assertEqual(a.reference_id, 0) - self.assertEqual(a.mapping_quality, 0) - self.assertEqual(a.cigartuples, None) - self.assertEqual(a.tags, []) - self.assertEqual(a.next_reference_id, 0) - self.assertEqual(a.next_reference_start, 0) - self.assertEqual(a.template_length, 0) - - def testStrOfEmptyRead(self): - a = pysam.AlignedSegment() - s = str(a) - self.assertEqual( - "None\t0\t0\t0\t0\tNone\t0\t0\t0\tNone\tNone\t[]", - s) - - def buildRead(self): - '''build an example read.''' - - a = pysam.AlignedSegment() - a.query_name = "read_12345" - a.query_sequence = "ACGT" * 10 - a.flag = 0 - a.reference_id = 0 - a.reference_start = 20 - a.mapping_quality = 20 - a.cigartuples = ((0, 10), (2, 1), (0, 9), (1, 1), (0, 20)) - a.next_reference_id = 0 - a.next_reference_start = 200 - a.template_length = 167 - a.query_qualities = pysam.fromQualityString("1234") * 10 - # todo: create tags - return a - - def testSettingTagInEmptyRead(self): - '''see issue 62''' - a = pysam.AlignedSegment() - a.tags = (("NM", 1),) - a.query_qualities = None - self.assertEqual(a.tags, [("NM", 1), ]) - - def testUpdate(self): - '''check if updating fields affects other variable length data - ''' - a = self.buildRead() - b = self.buildRead() - - # check qname - b.query_name = "read_123" - self.checkFieldEqual(a, b, "query_name") - b.query_name = "read_12345678" - self.checkFieldEqual(a, b, "query_name") - b.query_name = "read_12345" - self.checkFieldEqual(a, b) - - # check cigar - b.cigartuples = ((0, 10), ) - self.checkFieldEqual(a, b, "cigartuples") - b.cigartuples = ((0, 10), (2, 1), (0, 10)) - self.checkFieldEqual(a, b, "cigartuples") - b.cigartuples = ((0, 10), (2, 1), (0, 9), (1, 1), (0, 20)) - self.checkFieldEqual(a, b) - - # check seq - b.query_sequence 
= "ACGT" - self.checkFieldEqual( - a, b, - ("query_sequence", "query_qualities", "query_length")) - b.query_sequence = "ACGT" * 3 - self.checkFieldEqual( - a, b, - ("query_sequence", "query_qualities", "query_length")) - b.query_sequence = "ACGT" * 10 - self.checkFieldEqual(a, b, ("query_qualities",)) - - # reset qual - b = self.buildRead() - - # check flags: - for x in ( - "is_paired", "is_proper_pair", - "is_unmapped", "mate_is_unmapped", - "is_reverse", "mate_is_reverse", - "is_read1", "is_read2", - "is_secondary", "is_qcfail", - "is_duplicate"): - setattr(b, x, True) - self.assertEqual(getattr(b, x), True) - self.checkFieldEqual(a, b, ("flag", x,)) - setattr(b, x, False) - self.assertEqual(getattr(b, x), False) - self.checkFieldEqual(a, b) - - def testUpdate2(self): - '''issue 135: inplace update of sequence and quality score. - - This does not work as setting the sequence will erase - the quality scores. - ''' - a = self.buildRead() - a.query_sequence = a.query_sequence[5:10] - self.assertEqual(pysam.toQualityString(a.query_qualities), None) - - a = self.buildRead() - s = pysam.toQualityString(a.query_qualities) - a.query_sequence = a.query_sequence[5:10] - a.query_qualities = pysam.fromQualityString(s[5:10]) - - self.assertEqual(pysam.toQualityString(a.query_qualities), s[5:10]) - - def testLargeRead(self): - '''build an example read.''' - - a = pysam.AlignedSegment() - a.query_name = "read_12345" - a.query_sequence = "ACGT" * 200 - a.flag = 0 - a.reference_id = 0 - a.reference_start = 20 - a.mapping_quality = 20 - a.cigartuples = ((0, 4 * 200), ) - a.next_reference_id = 0 - a.next_reference_start = 200 - a.template_length = 167 - a.query_qualities = pysam.fromQualityString("1234") * 200 - - return a - - def testTagParsing(self): - '''test for tag parsing - - see http://groups.google.com/group/pysam-user-group/browse_thread/thread/67ca204059ea465a - ''' - samfile = pysam.AlignmentFile(os.path.join(DATADIR, "ex8.bam"), - "rb") - - for entry in samfile: - 
before = entry.tags - entry.tags = entry.tags - after = entry.tags - self.assertEqual(after, before) - - def testUpdateTlen(self): - '''check if updating tlen works''' - a = self.buildRead() - oldlen = a.template_length - oldlen *= 2 - a.template_length = oldlen - self.assertEqual(a.template_length, oldlen) - - def testPositions(self): - a = self.buildRead() - self.assertEqual(a.get_reference_positions(), - [20, 21, 22, 23, 24, 25, 26, 27, 28, 29, - 31, 32, 33, 34, 35, 36, 37, 38, 39, - 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, - 50, 51, 52, 53, 54, 55, 56, 57, 58, 59]) - - self.assertEqual(a.get_aligned_pairs(), - [(0, 20), (1, 21), (2, 22), (3, 23), (4, 24), - (5, 25), (6, 26), (7, 27), (8, 28), (9, 29), - (None, 30), - (10, 31), (11, 32), (12, 33), (13, 34), (14, 35), - (15, 36), (16, 37), (17, 38), (18, 39), (19, None), - (20, 40), (21, 41), (22, 42), (23, 43), (24, 44), - (25, 45), (26, 46), (27, 47), (28, 48), (29, 49), - (30, 50), (31, 51), (32, 52), (33, 53), (34, 54), - (35, 55), (36, 56), (37, 57), (38, 58), (39, 59)]) - - self.assertEqual( - a.get_reference_positions(), - [x[1] for x in a.get_aligned_pairs() - if x[0] is not None and x[1] is not None]) - # alen is the length of the aligned read in genome - self.assertEqual(a.reference_length, - a.get_aligned_pairs()[-1][0] + 1) - # aend points to one beyond last aligned base in ref - self.assertEqual(a.get_reference_positions()[-1], - a.reference_end - 1) - - def testFullReferencePositions(self): - '''see issue 26''' - a = self.buildRead() - a.cigar = [(4, 30), (0, 20), (1, 3), (0, 47)] - - self.assertEqual(100, - len(a.get_reference_positions(full_length=True))) - - def testBlocks(self): - a = self.buildRead() - self.assertEqual(a.get_blocks(), - [(20, 30), (31, 40), (40, 60)]) - - # Disabled as not backwards compatible - # def testFancyStr(self): - # a = self.buildRead() - # output = a.fancy_str() - # self.assertEqual(len(output), 9) - - -class TestDeNovoConstruction(ReadTest): - +class 
TestDeNovoConstruction(unittest.TestCase): '''check BAM/SAM file construction using ex6.sam (note these are +1 coordinates): @@ -1587,7 +1515,7 @@ def testBAMPerRead(self): others = list(infile) for denovo, other in zip(others, self.reads): - self.checkFieldEqual(other, denovo) + checkFieldEqual(self, other, denovo) self.assertEqual(other.compare(denovo), 0) # TODO @@ -1597,7 +1525,7 @@ def testBAMPerRead(self): # others = list(infile) # for denovo, other in zip(others, self.reads): - # self.checkFieldEqual(other, denovo) + # checkFieldEqual(self, other, denovo) # self.assertEqual(other.compare(denovo), 0) def testBAMWholeFile(self): diff --git a/tests/TestUtils.py b/tests/TestUtils.py index d005cd9e..8f07022a 100644 --- a/tests/TestUtils.py +++ b/tests/TestUtils.py @@ -1,5 +1,8 @@ import sys import os +import pysam +import difflib + IS_PYTHON3 = sys.version_info[0] >= 3 @@ -12,7 +15,8 @@ def checkBinaryEqual(filename1, filename2): - '''return true if the two files are binary equal.''' + '''return true if the two files are binary equal. + ''' if os.path.getsize(filename1) != os.path.getsize(filename2): return False @@ -38,6 +42,42 @@ def chariter(infile): return found +def checkSamtoolsViewEqual(filename1, filename2, + without_header=False): + '''return true if the two files are equal in their + content through samtools view. 
+ ''' + + # strip MD and NM tags, as not preserved in CRAM files + args = ["-x", "MD", "-x", "NM"] + if not without_header: + args.append("-h") + + lines1 = pysam.view(*(args + [filename1])) + lines2 = pysam.view(*(args + [filename2])) + + if len(lines1) != len(lines2): + return False + + if lines1 != lines2: + # line by line comparison + # sort each line, as tags get rearranged between + # BAM/CRAM + for n, pair in enumerate(zip(lines1, lines2)): + l1, l2 = pair + l1 = sorted(l1[:-1].split("\t")) + l2 = sorted(l2[:-1].split("\t")) + if l1 != l2: + print "mismatch in line %i" % n + print l1 + print l2 + return False + else: + return False + + return True + + def checkURL(url): '''return True if URL is available. @@ -50,3 +90,34 @@ def checkURL(url): except: return False + +def checkFieldEqual(cls, read1, read2, exclude=[]): + '''check if two reads are equal by comparing each field.''' + + # add the . for refactoring purposes. + for x in (".query_name", + ".query_sequence", + ".flag", + ".reference_id", + ".reference_start", + ".mapping_quality", + ".cigartuples", + ".next_reference_id", + ".next_reference_start", + ".template_length", + ".query_length", + ".query_qualities", + ".bin", + ".is_paired", ".is_proper_pair", + ".is_unmapped", ".mate_is_unmapped", + ".is_reverse", ".mate_is_reverse", + ".is_read1", ".is_read2", + ".is_secondary", ".is_qcfail", + ".is_duplicate"): + n = x[1:] + if n in exclude: + continue + cls.assertEqual(getattr(read1, n), getattr(read2, n), + "attribute mismatch for %s: %s != %s" % + (n, getattr(read1, n), getattr(read2, n))) + diff --git a/tests/pysam_data/Makefile b/tests/pysam_data/Makefile index 092d552f..8b0964a2 100644 --- a/tests/pysam_data/Makefile +++ b/tests/pysam_data/Makefile @@ -1,14 +1,17 @@ SAM=$(wildcard *.sam) BAM=$(SAM:%.sam=%.bam) BAI=$(BAM:%.bam=%.bam.bai) +CRAM=ex1.cram ex2.cram ex3.cram +CRAI=$(CRAM:%.cram=%.cram.crai) # ex2.bam - bam file without index all: ex1.pileup.gz \ ex1.sam ex1.bam \ - ex2.sam.gz ex2.sam 
ex2.bam \ + ex2.sam.gz ex2.sam ex2.bam ex2.bam.bai \ uncompressed.bam \ $(BAM) $(BAI) \ + $(CRAM) $(CRAI) \ example_bai.bam \ rg_with_tab.bam \ ex2_truncated.bam ex2_truncated.bam.bai \ @@ -27,6 +30,12 @@ uncompressed.bam: ex2.sam %.bam: %.sam samtools view -bS $< > $@ +%.cram: %.sam + samtools view -bC -T ex1.fa $< > $@ + +%.cram.crai: %.cram + samtools index $< + %.sam: %.sam.gz gunzip < $< > $@ diff --git a/tests/samtools_test.py b/tests/samtools_test.py index 203c0a7a..a88bc9b1 100644 --- a/tests/samtools_test.py +++ b/tests/samtools_test.py @@ -23,7 +23,6 @@ def runSamtools(cmd): '''run a samtools command''' - try: retcode = subprocess.call(cmd, shell=True, stderr=subprocess.PIPE) @@ -224,16 +223,11 @@ def setUp(self): # copy the source files to WORKDIR os.makedirs(WORKDIR) - shutil.copy(os.path.join(DATADIR, "ex1.fa"), - os.path.join(WORKDIR, "pysam_ex1.fa")) - shutil.copy(os.path.join(DATADIR, "ex1.fa"), - os.path.join(WORKDIR, "ex1.fa")) - shutil.copy(os.path.join(DATADIR, "ex1.sam.gz"), - os.path.join(WORKDIR, "ex1.sam.gz")) - shutil.copy(os.path.join(DATADIR, "ex1.sam"), - os.path.join(WORKDIR, "ex1.sam")) - shutil.copy(os.path.join(DATADIR, "ex2.bam"), - os.path.join(WORKDIR, "ex2.bam")) + for f in ("ex1.fa", "ex1.sam.gz", + "ex1.sam", "ex2.bam", + "ex1.bed"): + shutil.copy(os.path.join(DATADIR, f), + os.path.join(WORKDIR, f)) # cd to workdir savedir = os.getcwd()