diff --git a/.cirrus.yml b/.cirrus.yml index bbd36a40..98331454 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -1,3 +1,24 @@ +freebsd_ci_task: + name: CI / FreeBSD + + freebsd_instance: + image_family: freebsd-14-2 + + install_script: | + pkg install -y bcftools gmake py311-cython3 py311-mypy py311-pytest samtools + + env: + CC: "clang -isystem /usr/local/include" + MAKE: "gmake" + REF_PATH: ":" + + build_script: | + python setup.py build + + test_script: | + PYTHONPATH="$(echo $PWD/build/lib.*)" pytest + + build_wheels_task: only_if: $CIRRUS_BRANCH =~ "release/.*" || $CIRRUS_TAG =~ "v0\..*" diff --git a/doc/api.rst b/doc/api.rst index 47fe314b..fc88f172 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -15,7 +15,7 @@ To use the module to read a file in BAM format, create a import pysam samfile = pysam.AlignmentFile("ex1.bam", "rb") -Once a file is opened you can iterate over all of the read mapping to +Once a file is opened you can iterate over all of the reads mapping to a specified region using :meth:`~pysam.AlignmentFile.fetch`. Each iteration returns a :class:`~pysam.AlignedSegment` object which represents a single read along with its fields and optional tags:: @@ -103,7 +103,7 @@ tabix indexed tab-separated file formats with genomic data:: :class:`~pysam.TabixFile` implements lazy parsing in order to iterate over large tables efficiently. -More detailed usage instructions is at :ref:`usage`. +More detailed usage instructions are available at :ref:`usage`. .. note:: @@ -200,7 +200,6 @@ FASTQ files .. autoclass:: pysam.FastxFile :members: - .. autoclass:: pysam.FastqProxy :members: @@ -214,10 +213,10 @@ VCF/BCF files .. autoclass:: pysam.VariantHeader :members: -.. autoclass:: pysam.VariantRecord +.. autoclass:: pysam.VariantHeaderRecord :members: -.. autoclass:: pysam.VariantHeaderRecord +.. autoclass:: pysam.VariantRecord :members: HTSFile diff --git a/doc/developer.rst b/doc/developer.rst index cefc9eaf..8d424847 100644 --- a/doc/developer.rst +++ b/doc/developer.rst @@ -36,19 +36,21 @@ directories: Python language level ===================== -Pysam currently requires Python 3.6 as a minimum language level. +Pysam currently requires Python 3.8 as a minimum language level. For example, this means that the following comparatively recent language features and library functions are available for use: * f-strings * ``raise ... from None`` * :meth:`str.startswith`, :meth:`str.endswith`, :meth:`str.rstrip`, etc +* walrus ``:=`` operator in Python code -However in particular the the following should not be used in +However in particular the following should not be used in pysam source code or infrastructure scripts: * :meth:`str.removeprefix`, :meth:`str.removesuffix` (new in 3.9) -* walrus ``:=`` operator (new in 3.8) +* walrus ``:=`` operator in Cython code (requires Cython 3) +* ``Optional[type]`` type hints written as ``type | None`` etc (new in 3.10) Importing new versions of htslib and samtools diff --git a/pysam/cbcftools_util.h b/pysam/cbcftools_util.h deleted file mode 100644 index 4a9f2e9c..00000000 --- a/pysam/cbcftools_util.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef CBCFTOOLS_UTIL_H -#define CBCFTOOLS_UTIL_H - -int bcftools_main(int argc, char *argv[]); - -#endif diff --git a/pysam/csamtools_util.h b/pysam/csamtools_util.h deleted file mode 100644 index 0a03c138..00000000 --- a/pysam/csamtools_util.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef CSAMTOOLS_UTIL_H -#define CSAMTOOLS_UTIL_H - -int samtools_main(int argc, char *argv[]); - -#endif diff --git a/pysam/htslib_util.c b/pysam/htslib_util.c index 08309006..bc8ab894 100644 --- a/pysam/htslib_util.c +++ b/pysam/htslib_util.c @@ -1,13 +1,8 @@ -#include #include #include "htslib/khash.h" -#include "htslib/ksort.h" #include "htslib/sam.h" #include "htslib/hts.h" -#include "htslib/knetfile.h" -#include "htslib/kseq.h" #include "htslib_util.h" -#include #ifndef inline #define inline __inline diff --git a/pysam/libcalignedsegment.pyi b/pysam/libcalignedsegment.pyi index 5665d3e6..66da76ed 100644 --- a/pysam/libcalignedsegment.pyi +++ b/pysam/libcalignedsegment.pyi @@ -2,12 +2,7 @@ import enum import re import sys from array import array -from typing import Any, List, Optional, Dict, Tuple, Union, overload - -if sys.version_info < (3, 8): - from typing_extensions import Literal -else: - from typing import Literal +from typing import Any, List, Literal, Optional, Dict, Tuple, Union, overload from pysam import AlignmentHeader # type: ignore diff --git a/pysam/libcalignmentfile.pyi b/pysam/libcalignmentfile.pyi index 6f106af9..5723a5af 100644 --- a/pysam/libcalignmentfile.pyi +++ b/pysam/libcalignmentfile.pyi @@ -11,14 +11,10 @@ from typing import ( Union, Callable, List, + Literal, Iterable, ) -if sys.version_info < (3, 8): - from typing_extensions import Literal -else: - from typing import Literal - from pysam.libchtslib import HTSFile, _HasFileNo from pysam.libcalignedsegment import AlignedSegment, PileupColumn from pysam.libcfaidx import FastaFile diff --git a/pysam/libcalignmentfile.pyx b/pysam/libcalignmentfile.pyx index 4373a75f..d65d06b4 100644 --- a/pysam/libcalignmentfile.pyx +++ b/pysam/libcalignmentfile.pyx @@ -952,7 +952,7 @@ cdef class AlignmentFile(HTSFile): else: raise ValueError("could not open alignment file `{}`".format(force_str(filename))) - if self.htsfile.format.category != sequence_data: + if hts_get_format(self.htsfile).category != sequence_data: raise ValueError("file does not contain alignment data") if format_options and len(format_options): @@ -2320,38 +2320,30 @@ cdef class IteratorRowSelection(IteratorRow): raise IOError(read_failure_reason(ret)) -cdef int __advance_nofilter(void *data, bam1_t *b): +cdef int __advance_nofilter(void *data, bam1_t *b) noexcept nogil: '''advance without any read filtering. ''' cdef __iterdata * d = <__iterdata*>data - cdef int ret - with nogil: - ret = sam_itr_next(d.htsfile, d.iter, b) - return ret + return sam_itr_next(d.htsfile, d.iter, b) -cdef int __advance_raw_nofilter(void *data, bam1_t *b): +cdef int __advance_raw_nofilter(void *data, bam1_t *b) noexcept nogil: '''advance (without iterator) without any read filtering. ''' cdef __iterdata * d = <__iterdata*>data - cdef int ret - with nogil: - ret = sam_read1(d.htsfile, d.header, b) - return ret + return sam_read1(d.htsfile, d.header, b) -cdef int __advance_all(void *data, bam1_t *b): +cdef int __advance_all(void *data, bam1_t *b) noexcept nogil: '''only use reads for pileup passing basic filters such as BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL, BAM_FDUP ''' cdef __iterdata * d = <__iterdata*>data - cdef mask = BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP cdef int ret while 1: - with nogil: - ret = sam_itr_next(d.htsfile, d.iter, b) + ret = sam_itr_next(d.htsfile, d.iter, b) if ret < 0: break if b.core.flag & d.flag_filter: @@ -2360,7 +2352,7 @@ cdef int __advance_all(void *data, bam1_t *b): return ret -cdef int __advance_raw_all(void *data, bam1_t *b): +cdef int __advance_raw_all(void *data, bam1_t *b) noexcept nogil: '''only use reads for pileup passing basic filters such as BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL, BAM_FDUP @@ -2369,8 +2361,7 @@ cdef int __advance_raw_all(void *data, bam1_t *b): cdef __iterdata * d = <__iterdata*>data cdef int ret while 1: - with nogil: - ret = sam_read1(d.htsfile, d.header, b) + ret = sam_read1(d.htsfile, d.header, b) if ret < 0: break if b.core.flag & d.flag_filter: @@ -2379,7 +2370,7 @@ cdef int __advance_raw_all(void *data, bam1_t *b): return ret -cdef int __advance_samtools(void * data, bam1_t * b): +cdef int __advance_samtools(void * data, bam1_t *b) nogil: '''advance using same filter and read processing as in the samtools pileup. ''' @@ -2388,8 +2379,7 @@ cdef int __advance_samtools(void * data, bam1_t * b): cdef int q while 1: - with nogil: - ret = sam_itr_next(d.htsfile, d.iter, b) if d.iter else sam_read1(d.htsfile, d.header, b) + ret = sam_itr_next(d.htsfile, d.iter, b) if d.iter else sam_read1(d.htsfile, d.header, b) if ret < 0: break if b.core.flag & d.flag_filter: @@ -2402,13 +2392,7 @@ cdef int __advance_samtools(void * data, bam1_t * b): if d.seq != NULL: free(d.seq) d.tid = b.core.tid - with nogil: - d.seq = faidx_fetch_seq( - d.fastafile, - d.header.target_name[d.tid], - 0, MAX_POS, - &d.seq_len) - + d.seq = faidx_fetch_seq(d.fastafile, d.header.target_name[d.tid], 0, MAX_POS, &d.seq_len) if d.seq == NULL: raise ValueError( "reference sequence for '{}' (tid={}) not found".format( @@ -2560,19 +2544,13 @@ cdef class IteratorColumn: if self.stepper is None or self.stepper == "all": with nogil: - self.pileup_iter = bam_mplp_init(1, - &__advance_all, - data) + self.pileup_iter = bam_mplp_init(1, __advance_all, data) elif self.stepper == "nofilter": with nogil: - self.pileup_iter = bam_mplp_init(1, - &__advance_nofilter, - data) + self.pileup_iter = bam_mplp_init(1, __advance_nofilter, data) elif self.stepper == "samtools": with nogil: - self.pileup_iter = bam_mplp_init(1, - &__advance_samtools, - data) + self.pileup_iter = bam_mplp_init(1, __advance_samtools, data) else: raise ValueError( "unknown stepper option `%s` in IteratorColumn" % self.stepper) @@ -2609,19 +2587,13 @@ cdef class IteratorColumn: if self.stepper is None or self.stepper == "all": with nogil: - self.pileup_iter = bam_mplp_init(1, - &__advance_raw_all, - data) + self.pileup_iter = bam_mplp_init(1, __advance_raw_all, data) elif self.stepper == "nofilter": with nogil: - self.pileup_iter = bam_mplp_init(1, - &__advance_raw_nofilter, - data) + self.pileup_iter = bam_mplp_init(1, __advance_raw_nofilter, data) elif self.stepper == "samtools": with nogil: - self.pileup_iter = bam_mplp_init(1, - &__advance_samtools, - data) + self.pileup_iter = bam_mplp_init(1, __advance_samtools, data) else: raise ValueError( "unknown stepper option `%s` in IteratorColumn" % self.stepper) diff --git a/pysam/libcbcf.pyi b/pysam/libcbcf.pyi index e643562d..f5a7e34c 100644 --- a/pysam/libcbcf.pyi +++ b/pysam/libcbcf.pyi @@ -7,6 +7,7 @@ from typing import ( Tuple, Iterator, List, + Literal, Iterable, Dict, overload, @@ -15,11 +16,6 @@ from typing import ( Generic, ) -if sys.version_info < (3, 8): - from typing_extensions import Literal -else: - from typing import Literal - from pysam.libchtslib import HTSFile, _HasFileNo _D = TypeVar("_D") diff --git a/pysam/libcbcf.pyx b/pysam/libcbcf.pyx index f1eac7c0..24179d6c 100644 --- a/pysam/libcbcf.pyx +++ b/pysam/libcbcf.pyx @@ -4248,6 +4248,7 @@ cdef class VariantFile(HTSFile): """ cdef bcf_hdr_t *hdr cdef BGZF *bgzfp + cdef const htsFormat *fmt cdef hts_idx_t *idx cdef tbx_t *tidx cdef char *cfilename @@ -4341,7 +4342,8 @@ cdef class VariantFile(HTSFile): else: raise ValueError('could not open variant file `{}`'.format(filename)) - if self.htsfile.format.format not in (bcf, vcf): + fmt = hts_get_format(self.htsfile) + if fmt.format not in (bcf, vcf): raise ValueError('invalid file `{}` (mode=`{}`) - is it VCF/BCF format?'.format(filename, mode)) self.check_truncation(ignore_truncation) @@ -4360,14 +4362,14 @@ cdef class VariantFile(HTSFile): cfilename = NULL # check for index and open if present - if self.htsfile.format.format == bcf and cfilename: + if fmt.format == bcf and cfilename: if index_filename is not None: cindex_filename = index_filename with nogil: idx = bcf_index_load2(cfilename, cindex_filename) self.index = makeBCFIndex(self.header, idx) - elif self.htsfile.format.compression == bgzf and cfilename: + elif fmt.compression == bgzf and cfilename: if index_filename is not None: cindex_filename = index_filename with nogil: diff --git a/pysam/libcbgzf.pyi b/pysam/libcbgzf.pyi index 4d64e8db..6c19d785 100644 --- a/pysam/libcbgzf.pyi +++ b/pysam/libcbgzf.pyi @@ -1,11 +1,6 @@ import sys -from typing import Optional, Union, Any, NoReturn - -if sys.version_info < (3, 8): - from typing_extensions import Literal -else: - from typing import Literal +from typing import Literal, Optional, Union, Any, NoReturn BUFFER_SIZE: int diff --git a/pysam/libcfaidx.pxd b/pysam/libcfaidx.pxd index c17d0ba0..8380e90e 100644 --- a/pysam/libcfaidx.pxd +++ b/pysam/libcfaidx.pxd @@ -9,31 +9,19 @@ cimport cython from cpython cimport array from pysam.libchtslib cimport faidx_t, kstring_t, BGZF -# These functions are put here and not in chtslib.pxd in order -# to avoid warnings for unused functions. -cdef extern from "pysam_stream.h" nogil: - - ctypedef struct kstream_t: - pass - +cdef extern from "htslib/kseq.h" nogil: + """ + struct __kstream_t; + #define kstream_t struct __kstream_t + __KSEQ_TYPE(type_t_unused_here) + #undef kstream_t + """ ctypedef struct kseq_t: kstring_t name kstring_t comment kstring_t seq kstring_t qual - kseq_t *kseq_init(BGZF *) - int kseq_read(kseq_t *) - void kseq_destroy(kseq_t *) - kstream_t *ks_init(BGZF *) - void ks_destroy(kstream_t *) - - # Retrieve characters from stream until delimiter - # is reached placing results in str. - int ks_getuntil(kstream_t *, - int delimiter, - kstring_t * str, - int * dret) cdef class FastaFile: cdef bint is_remote diff --git a/pysam/libcfaidx.pyx b/pysam/libcfaidx.pyx index d4e7427b..dd214170 100644 --- a/pysam/libcfaidx.pyx +++ b/pysam/libcfaidx.pyx @@ -70,6 +70,17 @@ from pysam.libcutils cimport force_bytes, force_str, charptr_to_str from pysam.libcutils cimport encode_filename, from_string_and_size from pysam.libcutils cimport qualitystring_to_array, parse_region +cdef extern from "htslib/kseq.h" nogil: + """ + #undef __KSEQ_TYPE + #define __KSEQ_TYPE(type_t) + KSEQ_INIT2(static, BGZF *, bgzf_read) + """ + kseq_t *kseq_init(BGZF *) + int kseq_read(kseq_t *) + void kseq_destroy(kseq_t *) + + cdef class FastqProxy cdef makeFastqProxy(kseq_t * src): '''enter src into AlignedRead.''' diff --git a/pysam/libchtslib.pyi b/pysam/libchtslib.pyi index 61b64b6a..ffa1b43d 100644 --- a/pysam/libchtslib.pyi +++ b/pysam/libchtslib.pyi @@ -1,10 +1,5 @@ import sys -from typing import List, Union, NoReturn, Iterable, Any, Tuple, Optional, TypeVar - -if sys.version_info < (3, 8): - from typing_extensions import Protocol -else: - from typing import Protocol +from typing import List, Union, NoReturn, Iterable, Any, Tuple, Optional, Protocol, TypeVar class _HasFileNo(Protocol): def fileno(self) -> int: ... diff --git a/pysam/libchtslib.pyx b/pysam/libchtslib.pyx index 25d79ecc..ce471765 100644 --- a/pysam/libchtslib.pyx +++ b/pysam/libchtslib.pyx @@ -349,7 +349,7 @@ cdef class HTSFile(object): if not self.htsfile: return - if self.htsfile.format.compression != bgzf: + if hts_get_format(self.htsfile).compression != bgzf: return cdef BGZF *bgzfp = hts_get_bgzfp(self.htsfile) @@ -379,7 +379,7 @@ cdef class HTSFile(object): VARIANTS, INDEX, REGIONS""" if not self.htsfile: raise ValueError('metadata not available on closed file') - return FORMAT_CATEGORIES[self.htsfile.format.category] + return FORMAT_CATEGORIES[hts_get_format(self.htsfile).category] @property def format(self): @@ -390,14 +390,15 @@ cdef class HTSFile(object): """ if not self.htsfile: raise ValueError('metadata not available on closed file') - return FORMATS[self.htsfile.format.format] + return FORMATS[hts_get_format(self.htsfile).format] @property def version(self): """Tuple of file format version numbers (major, minor)""" if not self.htsfile: raise ValueError('metadata not available on closed file') - return self.htsfile.format.version.major, self.htsfile.format.version.minor + cdef const htsFormat *fmt = hts_get_format(self.htsfile) + return fmt.version.major, fmt.version.minor @property def compression(self): @@ -406,14 +407,14 @@ cdef class HTSFile(object): One of NONE, GZIP, BGZF, CUSTOM.""" if not self.htsfile: raise ValueError('metadata not available on closed file') - return COMPRESSION[self.htsfile.format.compression] + return COMPRESSION[hts_get_format(self.htsfile).compression] @property def description(self): """Vaguely human readable description of the file format""" if not self.htsfile: raise ValueError('metadata not available on closed file') - cdef char *desc = hts_format_description(&self.htsfile.format) + cdef char *desc = hts_format_description(hts_get_format(self.htsfile)) try: return charptr_to_str(desc) finally: @@ -447,27 +448,27 @@ cdef class HTSFile(object): @property def is_sam(self): """return True if HTSFile is reading or writing a SAM alignment file""" - return self.htsfile != NULL and self.htsfile.format.format == sam + return self.htsfile != NULL and hts_get_format(self.htsfile).format == sam @property def is_bam(self): """return True if HTSFile is reading or writing a BAM alignment file""" - return self.htsfile != NULL and self.htsfile.format.format == bam + return self.htsfile != NULL and hts_get_format(self.htsfile).format == bam @property def is_cram(self): """return True if HTSFile is reading or writing a BAM alignment file""" - return self.htsfile != NULL and self.htsfile.format.format == cram + return self.htsfile != NULL and hts_get_format(self.htsfile).format == cram @property def is_vcf(self): """return True if HTSFile is reading or writing a VCF variant file""" - return self.htsfile != NULL and self.htsfile.format.format == vcf + return self.htsfile != NULL and hts_get_format(self.htsfile).format == vcf @property def is_bcf(self): """return True if HTSFile is reading or writing a BCF variant file""" - return self.htsfile != NULL and self.htsfile.format.format == bcf + return self.htsfile != NULL and hts_get_format(self.htsfile).format == bcf def reset(self): """reset file position to beginning of file just after the header. @@ -490,14 +491,14 @@ cdef class HTSFile(object): whence = libc_whence_from_io(whence) cdef int64_t ret - if self.htsfile.format.compression == bgzf: + cdef htsCompression compression = hts_get_format(self.htsfile).compression + if compression == bgzf: with nogil: ret = bgzf_seek(hts_get_bgzfp(self.htsfile), offset, whence) - elif self.htsfile.format.compression == no_compression: + elif compression == no_compression: ret = 0 if (hseek(self.htsfile.fp.hfile, offset, whence) >= 0) else -1 else: - raise NotImplementedError("seek not implemented in files compressed by method {}".format( - self.htsfile.format.compression)) + raise NotImplementedError(f"seek not implemented in files compressed by method {compression}") return ret def tell(self): @@ -508,17 +509,17 @@ cdef class HTSFile(object): raise IOError('tell not available in streams') cdef int64_t ret - if self.htsfile.format.compression == bgzf: + cdef const htsFormat *fmt = hts_get_format(self.htsfile) + if fmt.compression == bgzf: with nogil: ret = bgzf_tell(hts_get_bgzfp(self.htsfile)) - elif self.htsfile.format.compression == no_compression: + elif fmt.compression == no_compression: ret = htell(self.htsfile.fp.hfile) - elif self.htsfile.format.format == cram: + elif fmt.format == cram: with nogil: ret = htell(cram_fd_get_fp(self.htsfile.fp.cram)) else: - raise NotImplementedError("seek not implemented in files compressed by method {}".format( - self.htsfile.format.compression)) + raise NotImplementedError(f"seek not implemented in files compressed by method {fmt.compression}") return ret diff --git a/pysam/libctabix.pxd b/pysam/libctabix.pxd index 174dd8b4..4096790c 100644 --- a/pysam/libctabix.pxd +++ b/pysam/libctabix.pxd @@ -18,36 +18,9 @@ from pysam.libchtslib cimport hts_idx_t, hts_itr_t, htsFile, \ tbx_t, kstring_t, BGZF, HTSFile -# These functions are put here and not in chtslib.pxd in order -# to avoid warnings for unused functions. -cdef extern from "pysam_stream.h" nogil: - - ctypedef struct kstream_t: - pass - - ctypedef struct kseq_t: - kstring_t name - kstring_t comment - kstring_t seq - kstring_t qual - - kseq_t *kseq_init(BGZF *) - int kseq_read(kseq_t *) - void kseq_destroy(kseq_t *) - kstream_t *ks_init(BGZF *) - void ks_destroy(kstream_t *) - - # Retrieve characters from stream until delimiter - # is reached placing results in str. - int ks_getuntil(kstream_t *, - int delimiter, - kstring_t * str, - int * dret) - - cdef class tabix_file_iterator: cdef BGZF * fh - cdef kstream_t * kstream + cdef void * unused cdef kstring_t buffer cdef size_t size cdef Parser parser @@ -109,7 +82,7 @@ cdef class TabixIteratorParsed(TabixIterator): cdef class GZIterator: cdef object _filename cdef BGZF * gzipfile - cdef kstream_t * kstream + cdef void * unused cdef kstring_t buffer cdef int __cnext__(self) cdef encoding diff --git a/pysam/libctabix.pyi b/pysam/libctabix.pyi index 517a74d2..3a4f5b57 100644 --- a/pysam/libctabix.pyi +++ b/pysam/libctabix.pyi @@ -1,10 +1,5 @@ import sys -from typing import Optional, List, Any - -if sys.version_info < (3, 8): - from typing_extensions import Literal -else: - from typing import Literal +from typing import Optional, List, Literal, Any from pysam.libchtslib import HTSFile diff --git a/pysam/libctabix.pyx b/pysam/libctabix.pyx index deae908b..54a2006a 100644 --- a/pysam/libctabix.pyx +++ b/pysam/libctabix.pyx @@ -69,7 +69,7 @@ from cpython cimport PyErr_SetString, PyBytes_Check, \ cimport pysam.libctabixproxies as ctabixproxies from pysam.libchtslib cimport htsFile, hts_open, hts_close, HTS_IDX_START,\ - BGZF, bgzf_open, bgzf_dopen, bgzf_close, bgzf_write, \ + BGZF, bgzf_open, bgzf_dopen, bgzf_close, bgzf_getline, bgzf_write, \ tbx_index_build2, tbx_index_load2, tbx_itr_queryi, tbx_itr_querys, \ tbx_conf_t, tbx_seqnames, tbx_itr_next, tbx_itr_destroy, \ tbx_destroy, hisremote, region_list, hts_getline, \ @@ -79,6 +79,7 @@ from pysam.libchtslib cimport htsFile, hts_open, hts_close, HTS_IDX_START,\ from pysam.libcutils cimport force_bytes, force_str, charptr_to_str from pysam.libcutils cimport encode_filename, from_string_and_size + cdef class Parser: def __init__(self, encoding="ascii"): @@ -389,7 +390,7 @@ cdef class TabixFile: if self.htsfile == NULL: raise IOError("could not open file `%s`" % filename) - #if self.htsfile.format.category != region_list: + #if hts_get_format(self.htsfile).category != region_list: # raise ValueError("file does not contain region data") with nogil: @@ -732,7 +733,6 @@ cdef class GZIterator: with nogil: self.gzipfile = bgzf_open(cfilename, "r") self._filename = filename - self.kstream = ks_init(self.gzipfile) self.encoding = encoding self.buffer.l = 0 @@ -746,24 +746,15 @@ cdef class GZIterator: self.gzipfile = NULL if self.buffer.s != NULL: free(self.buffer.s) - if self.kstream != NULL: - ks_destroy(self.kstream) def __iter__(self): return self cdef int __cnext__(self): - cdef int dret = 0 - cdef int retval = 0 - while 1: - with nogil: - retval = ks_getuntil(self.kstream, b'\n', &self.buffer, &dret) - - if retval < 0: - break - - return dret - return -1 + cdef int retval + with nogil: + retval = bgzf_getline(self.gzipfile, b'\n', &self.buffer) + return retval def __next__(self): """python version of next(). @@ -1021,70 +1012,6 @@ def tabix_index(filename, return filename -# ######################################################### -# cdef class tabix_file_iterator_old: -# '''iterate over ``infile``. - -# This iterator is not safe. If the :meth:`__next__()` method is called -# after ``infile`` is closed, the result is undefined (see ``fclose()``). - -# The iterator might either raise a StopIteration or segfault. -# ''' - - -# def __cinit__(self, -# infile, -# Parser parser, -# int buffer_size = 65536 ): - -# cdef int fd = PyObject_AsFileDescriptor( infile ) -# if fd == -1: raise ValueError( "I/O operation on closed file." ) -# self.infile = fdopen( fd, 'r') - -# if self.infile == NULL: raise ValueError( "I/O operation on closed file." ) - -# self.buffer = malloc( buffer_size ) -# self.size = buffer_size -# self.parser = parser - -# def __iter__(self): -# return self - -# cdef __cnext__(self): - -# cdef char * b -# cdef size_t nbytes -# b = self.buffer - -# while not feof( self.infile ): -# nbytes = getline( &b, &self.size, self.infile) - -# # stop at first error or eof -# if (nbytes == -1): break -# # skip comments -# if (b[0] == '#'): continue - -# # skip empty lines -# if b[0] == '\0' or b[0] == '\n' or b[0] == '\r': continue - -# # make sure that entry is complete -# if b[nbytes-1] != '\n' and b[nbytes-1] != '\r': -# result = b -# raise ValueError( "incomplete line at %s" % result ) - -# # make sure that this goes fully through C -# # otherwise buffer is copied to/from a -# # Python object causing segfaults as -# # the wrong memory is freed -# return self.parser.parse( b, nbytes ) - -# raise StopIteration - -# def __dealloc__(self): -# free(self.buffer) - -# def __next__(self): -# return self.__cnext__() ######################################################### ######################################################### @@ -1127,8 +1054,6 @@ cdef class tabix_file_iterator: if self.fh == NULL: raise IOError('%s' % strerror(errno)) - self.kstream = ks_init(self.fh) - self.buffer.s = malloc(buffer_size) #if self.buffer == NULL: # raise MemoryError( "tabix_file_iterator: could not allocate %i bytes" % buffer_size) @@ -1141,12 +1066,11 @@ cdef class tabix_file_iterator: cdef __cnext__(self): cdef char * b - cdef int dret = 0 cdef int retval = 0 while 1: with nogil: - retval = ks_getuntil(self.kstream, b'\n', &self.buffer, &dret) - + retval = bgzf_getline(self.fh, b'\n', &self.buffer) + if retval < 0: break #raise IOError('gzip error: %s' % buildGzipError( self.fh )) @@ -1170,7 +1094,6 @@ cdef class tabix_file_iterator: def __dealloc__(self): free(self.buffer.s) - ks_destroy(self.kstream) bgzf_close(self.fh) def __next__(self): diff --git a/pysam/pysam_stream.h b/pysam/pysam_stream.h deleted file mode 100644 index 3a4eb16c..00000000 --- a/pysam/pysam_stream.h +++ /dev/null @@ -1,13 +0,0 @@ -#ifndef PYSAM_STREAM_H -#define PYSAM_STREAM_H - -#include "htslib/kseq.h" - -// ####################################################### -// fastq parsing -// KSEQ_INIT(gzFile, gzread) -KSEQ_INIT(BGZF *, bgzf_read) - -//KSTREAM_INIT( gzFile, gzread, 16384) - -#endif diff --git a/pysam/pysam_util.c b/pysam/pysam_util.c deleted file mode 100644 index 349af44d..00000000 --- a/pysam/pysam_util.c +++ /dev/null @@ -1,36 +0,0 @@ -#include -#include -#include -#include -#include -#include - -#include "htslib/khash.h" -#include "htslib/ksort.h" -#include "htslib/knetfile.h" - -#if !(_POSIX_C_SOURCE >= 200809L || _XOPEN_SOURCE >= 700) -/* - * A rudimentary emulation of getline() for systems that dont support it - * natively. Since this is used for PPD file reading, it assumes (possibly - * falsely) that BUFSIZ is big enough. - */ -ssize_t -getline(char **line, size_t *linelen, FILE *fp) -{ - if (*linelen == 0) - { - *linelen = BUFSIZ; - *line = malloc(*linelen); - } - - memset(*line, 0, *linelen); - fgets(*line, *linelen, fp); - - return (strlen(*line)); - -} -#endif - - - diff --git a/pysam/pysam_util.h b/pysam/pysam_util.h deleted file mode 100644 index 789e9d0d..00000000 --- a/pysam/pysam_util.h +++ /dev/null @@ -1,5 +0,0 @@ -#ifndef PYSAM_UTIL_H -#define PYSAM_UTIL_H - - -#endif diff --git a/setup.py b/setup.py index 455b42a1..5c263257 100644 --- a/setup.py +++ b/setup.py @@ -78,7 +78,7 @@ def run_make(targets): def run_make_print_config(): - stdout = subprocess.check_output(["make", "-s", "print-config"], encoding="ascii") + stdout = subprocess.check_output([os.environ.get("MAKE", "make"), "-s", "print-config"], encoding="ascii") make_print_config = {} for line in stdout.splitlines(): @@ -630,7 +630,7 @@ def prebuild_libcsamtools(ext, force): extra_objects=separate_htslib_objects, libraries=external_htslib_libraries + internal_htslib_libraries), dict(name="pysam.libcutils", - sources=[source_pattern % "utils", "pysam/pysam_util.c"] + os_c_files, + sources=[source_pattern % "utils"] + os_c_files, extra_objects=separate_htslib_objects, libraries=external_htslib_libraries + internal_htslib_libraries + internal_samtools_libraries), dict(name="pysam.libcalignmentfile", diff --git a/tests/compile_test.py b/tests/compile_test.py index 2de1b1e7..4ce3d2e4 100644 --- a/tests/compile_test.py +++ b/tests/compile_test.py @@ -63,6 +63,15 @@ def test_alignments(self): assert hdr.__sizeof__() == 24 assert aln.__sizeof__() == 72 + def test_tabix(self): + gzit = pysam.GZIterator(os.path.join(TABIX_DATADIR, "example.gtf.gz")) + + with open(os.path.join(TABIX_DATADIR, "example.gtf.gz")) as fp: + tfit = pysam.tabix_file_iterator(fp, pysam.asTuple()) + + assert gzit.__sizeof__() == 80 + assert tfit.__sizeof__() == 96 + def test_variants(self): fp = pysam.VariantFile(os.path.join(CBCF_DATADIR, "example_vcf43.vcf")) hdr = pysam.VariantHeader()