From 55b9c0b4662ade149348eb4f9c9f2c7467e1c1db Mon Sep 17 00:00:00 2001 From: John Marshall Date: Tue, 3 Jun 2025 22:16:59 +1200 Subject: [PATCH 01/11] Take advantage of our minimum Python now being 3.8 --- pysam/libcalignedsegment.pyi | 7 +------ pysam/libcalignmentfile.pyi | 6 +----- pysam/libcbcf.pyi | 6 +----- pysam/libcbgzf.pyi | 7 +------ pysam/libchtslib.pyi | 7 +------ pysam/libctabix.pyi | 7 +------ 6 files changed, 6 insertions(+), 34 deletions(-) diff --git a/pysam/libcalignedsegment.pyi b/pysam/libcalignedsegment.pyi index 5665d3e6..66da76ed 100644 --- a/pysam/libcalignedsegment.pyi +++ b/pysam/libcalignedsegment.pyi @@ -2,12 +2,7 @@ import enum import re import sys from array import array -from typing import Any, List, Optional, Dict, Tuple, Union, overload - -if sys.version_info < (3, 8): - from typing_extensions import Literal -else: - from typing import Literal +from typing import Any, List, Literal, Optional, Dict, Tuple, Union, overload from pysam import AlignmentHeader # type: ignore diff --git a/pysam/libcalignmentfile.pyi b/pysam/libcalignmentfile.pyi index 6f106af9..5723a5af 100644 --- a/pysam/libcalignmentfile.pyi +++ b/pysam/libcalignmentfile.pyi @@ -11,14 +11,10 @@ from typing import ( Union, Callable, List, + Literal, Iterable, ) -if sys.version_info < (3, 8): - from typing_extensions import Literal -else: - from typing import Literal - from pysam.libchtslib import HTSFile, _HasFileNo from pysam.libcalignedsegment import AlignedSegment, PileupColumn from pysam.libcfaidx import FastaFile diff --git a/pysam/libcbcf.pyi b/pysam/libcbcf.pyi index e643562d..f5a7e34c 100644 --- a/pysam/libcbcf.pyi +++ b/pysam/libcbcf.pyi @@ -7,6 +7,7 @@ from typing import ( Tuple, Iterator, List, + Literal, Iterable, Dict, overload, @@ -15,11 +16,6 @@ from typing import ( Generic, ) -if sys.version_info < (3, 8): - from typing_extensions import Literal -else: - from typing import Literal - from pysam.libchtslib import HTSFile, _HasFileNo _D = TypeVar("_D") diff 
--git a/pysam/libcbgzf.pyi b/pysam/libcbgzf.pyi index 4d64e8db..6c19d785 100644 --- a/pysam/libcbgzf.pyi +++ b/pysam/libcbgzf.pyi @@ -1,11 +1,6 @@ import sys -from typing import Optional, Union, Any, NoReturn - -if sys.version_info < (3, 8): - from typing_extensions import Literal -else: - from typing import Literal +from typing import Literal, Optional, Union, Any, NoReturn BUFFER_SIZE: int diff --git a/pysam/libchtslib.pyi b/pysam/libchtslib.pyi index 61b64b6a..ffa1b43d 100644 --- a/pysam/libchtslib.pyi +++ b/pysam/libchtslib.pyi @@ -1,10 +1,5 @@ import sys -from typing import List, Union, NoReturn, Iterable, Any, Tuple, Optional, TypeVar - -if sys.version_info < (3, 8): - from typing_extensions import Protocol -else: - from typing import Protocol +from typing import List, Union, NoReturn, Iterable, Any, Tuple, Optional, Protocol, TypeVar class _HasFileNo(Protocol): def fileno(self) -> int: ... diff --git a/pysam/libctabix.pyi b/pysam/libctabix.pyi index 517a74d2..3a4f5b57 100644 --- a/pysam/libctabix.pyi +++ b/pysam/libctabix.pyi @@ -1,10 +1,5 @@ import sys -from typing import Optional, List, Any - -if sys.version_info < (3, 8): - from typing_extensions import Literal -else: - from typing import Literal +from typing import Optional, List, Literal, Any from pysam.libchtslib import HTSFile From fee8d8224209393da29fe59c18c84b4b45943d0c Mon Sep 17 00:00:00 2001 From: John Marshall Date: Mon, 2 Jun 2025 23:38:40 +1200 Subject: [PATCH 02/11] Add FreeBSD CI workflow running on Cirrus CI Fix one subprocess call that neglected to use $MAKE to override "make". (On FreeBSD we need to use "gmake" instead.) 
--- .cirrus.yml | 21 +++++++++++++++++++++ setup.py | 2 +- 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/.cirrus.yml b/.cirrus.yml index bbd36a40..11acf2f4 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -1,3 +1,24 @@ +freebsd_ci_task: + name: CI / FreeBSD + + freebsd_instance: + image_family: freebsd-14-2 + + install_script: | + pkg install -y bcftools gmake py311-cython3 py311-mypy py311-pytest samtools + + env: + CC: "clang -I/usr/local/include" + MAKE: "gmake" + REF_PATH: ":" + + build_script: | + python setup.py build + + test_script: | + PYTHONPATH="$(echo $PWD/build/lib.*)" pytest + + build_wheels_task: only_if: $CIRRUS_BRANCH =~ "release/.*" || $CIRRUS_TAG =~ "v0\..*" diff --git a/setup.py b/setup.py index 4952bb39..05409c38 100644 --- a/setup.py +++ b/setup.py @@ -78,7 +78,7 @@ def run_make(targets): def run_make_print_config(): - stdout = subprocess.check_output(["make", "-s", "print-config"], encoding="ascii") + stdout = subprocess.check_output([os.environ.get("MAKE", "make"), "-s", "print-config"], encoding="ascii") make_print_config = {} for line in stdout.splitlines(): From 96e26524f378ec0958e225499bd754129e730c74 Mon Sep 17 00:00:00 2001 From: John Marshall Date: Tue, 10 Jun 2025 21:50:47 +1200 Subject: [PATCH 03/11] Work around Cython 3.1.2 non-static internal function bug Cython recently added __pyx_CommonTypesMetaclass_get_module() but neglected to make it static; cf cython/cython#6957. Remove it from the lists of defined symbols to be checked for collisions. Also ensure that we only strip one leading underscore on macOS, so that this function's name is as expected. 
--- setup.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 05409c38..d461ef1d 100644 --- a/setup.py +++ b/setup.py @@ -99,11 +99,14 @@ def run_nm_defined_symbols(objfile): if symtype not in "UFNWw": if IS_DARWIN: # On macOS, all symbols have a leading underscore - symbols.add(sym.lstrip('_')) + symbols.add(sym[1:] if sym.startswith("_") else sym) else: # Ignore symbols such as _edata (present in all shared objects) if sym[0] not in "_$.@": symbols.add(sym) + # Work around Cython 3.1.2 bug whereby this function is not static + symbols.discard("__pyx_CommonTypesMetaclass_get_module") + return symbols From 0c5e147a594683d9206330c14af6568056e9abf4 Mon Sep 17 00:00:00 2001 From: John Marshall Date: Mon, 16 Jun 2025 18:50:39 +1200 Subject: [PATCH 04/11] Refactor htslib/kseq.h usage to avoid unused function warnings Rearrange so that KSTREAM_INIT/KSEQ_INIT is used only in *.pyx files rather than *.pxd headers, so that the functions don't pollute other extension modules. Declare only the needed types in the *.pxd headers. --- pysam/libcfaidx.pxd | 26 +++++++------------------- pysam/libcfaidx.pyx | 11 +++++++++++ pysam/libctabix.pxd | 31 +++++-------------------------- pysam/libctabix.pyx | 17 +++++++++++++++++ pysam/pysam_stream.h | 13 ------------- 5 files changed, 40 insertions(+), 58 deletions(-) delete mode 100644 pysam/pysam_stream.h diff --git a/pysam/libcfaidx.pxd b/pysam/libcfaidx.pxd index c17d0ba0..8380e90e 100644 --- a/pysam/libcfaidx.pxd +++ b/pysam/libcfaidx.pxd @@ -9,31 +9,19 @@ cimport cython from cpython cimport array from pysam.libchtslib cimport faidx_t, kstring_t, BGZF -# These functions are put here and not in chtslib.pxd in order -# to avoid warnings for unused functions. 
-cdef extern from "pysam_stream.h" nogil: - - ctypedef struct kstream_t: - pass - +cdef extern from "htslib/kseq.h" nogil: + """ + struct __kstream_t; + #define kstream_t struct __kstream_t + __KSEQ_TYPE(type_t_unused_here) + #undef kstream_t + """ ctypedef struct kseq_t: kstring_t name kstring_t comment kstring_t seq kstring_t qual - kseq_t *kseq_init(BGZF *) - int kseq_read(kseq_t *) - void kseq_destroy(kseq_t *) - kstream_t *ks_init(BGZF *) - void ks_destroy(kstream_t *) - - # Retrieve characters from stream until delimiter - # is reached placing results in str. - int ks_getuntil(kstream_t *, - int delimiter, - kstring_t * str, - int * dret) cdef class FastaFile: cdef bint is_remote diff --git a/pysam/libcfaidx.pyx b/pysam/libcfaidx.pyx index d4e7427b..dd214170 100644 --- a/pysam/libcfaidx.pyx +++ b/pysam/libcfaidx.pyx @@ -70,6 +70,17 @@ from pysam.libcutils cimport force_bytes, force_str, charptr_to_str from pysam.libcutils cimport encode_filename, from_string_and_size from pysam.libcutils cimport qualitystring_to_array, parse_region +cdef extern from "htslib/kseq.h" nogil: + """ + #undef __KSEQ_TYPE + #define __KSEQ_TYPE(type_t) + KSEQ_INIT2(static, BGZF *, bgzf_read) + """ + kseq_t *kseq_init(BGZF *) + int kseq_read(kseq_t *) + void kseq_destroy(kseq_t *) + + cdef class FastqProxy cdef makeFastqProxy(kseq_t * src): '''enter src into AlignedRead.''' diff --git a/pysam/libctabix.pxd b/pysam/libctabix.pxd index 174dd8b4..b9636314 100644 --- a/pysam/libctabix.pxd +++ b/pysam/libctabix.pxd @@ -17,32 +17,11 @@ cdef extern from "unistd.h" nogil: from pysam.libchtslib cimport hts_idx_t, hts_itr_t, htsFile, \ tbx_t, kstring_t, BGZF, HTSFile - -# These functions are put here and not in chtslib.pxd in order -# to avoid warnings for unused functions. 
-cdef extern from "pysam_stream.h" nogil: - - ctypedef struct kstream_t: - pass - - ctypedef struct kseq_t: - kstring_t name - kstring_t comment - kstring_t seq - kstring_t qual - - kseq_t *kseq_init(BGZF *) - int kseq_read(kseq_t *) - void kseq_destroy(kseq_t *) - kstream_t *ks_init(BGZF *) - void ks_destroy(kstream_t *) - - # Retrieve characters from stream until delimiter - # is reached placing results in str. - int ks_getuntil(kstream_t *, - int delimiter, - kstring_t * str, - int * dret) +cdef extern from "htslib/kseq.h" nogil: + """ + __KS_TYPE(BGZF *) + """ + ctypedef struct kstream_t cdef class tabix_file_iterator: diff --git a/pysam/libctabix.pyx b/pysam/libctabix.pyx index deae908b..e3e3ff83 100644 --- a/pysam/libctabix.pyx +++ b/pysam/libctabix.pyx @@ -79,6 +79,23 @@ from pysam.libchtslib cimport htsFile, hts_open, hts_close, HTS_IDX_START,\ from pysam.libcutils cimport force_bytes, force_str, charptr_to_str from pysam.libcutils cimport encode_filename, from_string_and_size +cdef extern from "htslib/kseq.h" nogil: + """ + #undef __KS_TYPE + #define __KS_TYPE(type_t) + KSTREAM_INIT2(static, BGZF *, bgzf_read, 16384) + """ + kstream_t *ks_init(BGZF *) + void ks_destroy(kstream_t *) + + # Retrieve characters from stream until delimiter + # is reached placing results in str. 
+ int ks_getuntil(kstream_t *, + int delimiter, + kstring_t * str, + int * dret) + + cdef class Parser: def __init__(self, encoding="ascii"): diff --git a/pysam/pysam_stream.h b/pysam/pysam_stream.h deleted file mode 100644 index 3a4eb16c..00000000 --- a/pysam/pysam_stream.h +++ /dev/null @@ -1,13 +0,0 @@ -#ifndef PYSAM_STREAM_H -#define PYSAM_STREAM_H - -#include "htslib/kseq.h" - -// ####################################################### -// fastq parsing -// KSEQ_INIT(gzFile, gzread) -KSEQ_INIT(BGZF *, bgzf_read) - -//KSTREAM_INIT( gzFile, gzread, 16384) - -#endif From 41f6b1974a6c7f14ef9f8e24d54aa03e626b9f0f Mon Sep 17 00:00:00 2001 From: John Marshall Date: Mon, 16 Jun 2025 21:18:06 +1200 Subject: [PATCH 05/11] Simplify libctabix.pyx by using bgzf_getline() directly Remove kstream_t, so htslib/kseq.h is used only by libcfaidx.pyx. --- pysam/libctabix.pxd | 10 ++-------- pysam/libctabix.pyx | 44 +++++++------------------------------------ tests/compile_test.py | 9 +++++++++ 3 files changed, 18 insertions(+), 45 deletions(-) diff --git a/pysam/libctabix.pxd b/pysam/libctabix.pxd index b9636314..4096790c 100644 --- a/pysam/libctabix.pxd +++ b/pysam/libctabix.pxd @@ -17,16 +17,10 @@ cdef extern from "unistd.h" nogil: from pysam.libchtslib cimport hts_idx_t, hts_itr_t, htsFile, \ tbx_t, kstring_t, BGZF, HTSFile -cdef extern from "htslib/kseq.h" nogil: - """ - __KS_TYPE(BGZF *) - """ - ctypedef struct kstream_t - cdef class tabix_file_iterator: cdef BGZF * fh - cdef kstream_t * kstream + cdef void * unused cdef kstring_t buffer cdef size_t size cdef Parser parser @@ -88,7 +82,7 @@ cdef class TabixIteratorParsed(TabixIterator): cdef class GZIterator: cdef object _filename cdef BGZF * gzipfile - cdef kstream_t * kstream + cdef void * unused cdef kstring_t buffer cdef int __cnext__(self) cdef encoding diff --git a/pysam/libctabix.pyx b/pysam/libctabix.pyx index e3e3ff83..2e8c05f5 100644 --- a/pysam/libctabix.pyx +++ b/pysam/libctabix.pyx @@ -69,7 +69,7 @@ from cpython cimport 
PyErr_SetString, PyBytes_Check, \ cimport pysam.libctabixproxies as ctabixproxies from pysam.libchtslib cimport htsFile, hts_open, hts_close, HTS_IDX_START,\ - BGZF, bgzf_open, bgzf_dopen, bgzf_close, bgzf_write, \ + BGZF, bgzf_open, bgzf_dopen, bgzf_close, bgzf_getline, bgzf_write, \ tbx_index_build2, tbx_index_load2, tbx_itr_queryi, tbx_itr_querys, \ tbx_conf_t, tbx_seqnames, tbx_itr_next, tbx_itr_destroy, \ tbx_destroy, hisremote, region_list, hts_getline, \ @@ -79,22 +79,6 @@ from pysam.libchtslib cimport htsFile, hts_open, hts_close, HTS_IDX_START,\ from pysam.libcutils cimport force_bytes, force_str, charptr_to_str from pysam.libcutils cimport encode_filename, from_string_and_size -cdef extern from "htslib/kseq.h" nogil: - """ - #undef __KS_TYPE - #define __KS_TYPE(type_t) - KSTREAM_INIT2(static, BGZF *, bgzf_read, 16384) - """ - kstream_t *ks_init(BGZF *) - void ks_destroy(kstream_t *) - - # Retrieve characters from stream until delimiter - # is reached placing results in str. - int ks_getuntil(kstream_t *, - int delimiter, - kstring_t * str, - int * dret) - cdef class Parser: @@ -749,7 +733,6 @@ cdef class GZIterator: with nogil: self.gzipfile = bgzf_open(cfilename, "r") self._filename = filename - self.kstream = ks_init(self.gzipfile) self.encoding = encoding self.buffer.l = 0 @@ -763,24 +746,15 @@ cdef class GZIterator: self.gzipfile = NULL if self.buffer.s != NULL: free(self.buffer.s) - if self.kstream != NULL: - ks_destroy(self.kstream) def __iter__(self): return self cdef int __cnext__(self): - cdef int dret = 0 - cdef int retval = 0 - while 1: - with nogil: - retval = ks_getuntil(self.kstream, b'\n', &self.buffer, &dret) - - if retval < 0: - break - - return dret - return -1 + cdef int retval + with nogil: + retval = bgzf_getline(self.gzipfile, b'\n', &self.buffer) + return retval def __next__(self): """python version of next(). 
@@ -1144,8 +1118,6 @@ cdef class tabix_file_iterator: if self.fh == NULL: raise IOError('%s' % strerror(errno)) - self.kstream = ks_init(self.fh) - self.buffer.s = malloc(buffer_size) #if self.buffer == NULL: # raise MemoryError( "tabix_file_iterator: could not allocate %i bytes" % buffer_size) @@ -1158,12 +1130,11 @@ cdef class tabix_file_iterator: cdef __cnext__(self): cdef char * b - cdef int dret = 0 cdef int retval = 0 while 1: with nogil: - retval = ks_getuntil(self.kstream, b'\n', &self.buffer, &dret) - + retval = bgzf_getline(self.fh, b'\n', &self.buffer) + if retval < 0: break #raise IOError('gzip error: %s' % buildGzipError( self.fh )) @@ -1187,7 +1158,6 @@ cdef class tabix_file_iterator: def __dealloc__(self): free(self.buffer.s) - ks_destroy(self.kstream) bgzf_close(self.fh) def __next__(self): diff --git a/tests/compile_test.py b/tests/compile_test.py index 2de1b1e7..4ce3d2e4 100644 --- a/tests/compile_test.py +++ b/tests/compile_test.py @@ -63,6 +63,15 @@ def test_alignments(self): assert hdr.__sizeof__() == 24 assert aln.__sizeof__() == 72 + def test_tabix(self): + gzit = pysam.GZIterator(os.path.join(TABIX_DATADIR, "example.gtf.gz")) + + with open(os.path.join(TABIX_DATADIR, "example.gtf.gz")) as fp: + tfit = pysam.tabix_file_iterator(fp, pysam.asTuple()) + + assert gzit.__sizeof__() == 80 + assert tfit.__sizeof__() == 96 + def test_variants(self): fp = pysam.VariantFile(os.path.join(CBCF_DATADIR, "example_vcf43.vcf")) hdr = pysam.VariantHeader() From 4dfc9e99c31df0dc6fe54357731e5a185dd330f2 Mon Sep 17 00:00:00 2001 From: John Marshall Date: Mon, 16 Jun 2025 22:00:40 +1200 Subject: [PATCH 06/11] Remove obsolete source files and unnecessary includes c{sam,bcf}tools_util.h are long since replaced by {sam,bcf}tools.pysam.h. Only this commented-out libctabix.pyx code used getline(), so remove its fallback definition in pysam_util.c and hence remove the now-empty pysam_util.[ch]. Remove unused header inclusions from htslib_util.c. 
--- pysam/cbcftools_util.h | 6 ---- pysam/csamtools_util.h | 6 ---- pysam/htslib_util.c | 5 ---- pysam/libctabix.pyx | 64 ------------------------------------------ pysam/pysam_util.c | 36 ------------------------ pysam/pysam_util.h | 5 ---- setup.py | 2 +- 7 files changed, 1 insertion(+), 123 deletions(-) delete mode 100644 pysam/cbcftools_util.h delete mode 100644 pysam/csamtools_util.h delete mode 100644 pysam/pysam_util.c delete mode 100644 pysam/pysam_util.h diff --git a/pysam/cbcftools_util.h b/pysam/cbcftools_util.h deleted file mode 100644 index 4a9f2e9c..00000000 --- a/pysam/cbcftools_util.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef CBCFTOOLS_UTIL_H -#define CBCFTOOLS_UTIL_H - -int bcftools_main(int argc, char *argv[]); - -#endif diff --git a/pysam/csamtools_util.h b/pysam/csamtools_util.h deleted file mode 100644 index 0a03c138..00000000 --- a/pysam/csamtools_util.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef CSAMTOOLS_UTIL_H -#define CSAMTOOLS_UTIL_H - -int samtools_main(int argc, char *argv[]); - -#endif diff --git a/pysam/htslib_util.c b/pysam/htslib_util.c index 08309006..bc8ab894 100644 --- a/pysam/htslib_util.c +++ b/pysam/htslib_util.c @@ -1,13 +1,8 @@ -#include #include #include "htslib/khash.h" -#include "htslib/ksort.h" #include "htslib/sam.h" #include "htslib/hts.h" -#include "htslib/knetfile.h" -#include "htslib/kseq.h" #include "htslib_util.h" -#include #ifndef inline #define inline __inline diff --git a/pysam/libctabix.pyx b/pysam/libctabix.pyx index 2e8c05f5..d570655d 100644 --- a/pysam/libctabix.pyx +++ b/pysam/libctabix.pyx @@ -1012,70 +1012,6 @@ def tabix_index(filename, return filename -# ######################################################### -# cdef class tabix_file_iterator_old: -# '''iterate over ``infile``. - -# This iterator is not safe. If the :meth:`__next__()` method is called -# after ``infile`` is closed, the result is undefined (see ``fclose()``). - -# The iterator might either raise a StopIteration or segfault. 
-# ''' - - -# def __cinit__(self, -# infile, -# Parser parser, -# int buffer_size = 65536 ): - -# cdef int fd = PyObject_AsFileDescriptor( infile ) -# if fd == -1: raise ValueError( "I/O operation on closed file." ) -# self.infile = fdopen( fd, 'r') - -# if self.infile == NULL: raise ValueError( "I/O operation on closed file." ) - -# self.buffer = malloc( buffer_size ) -# self.size = buffer_size -# self.parser = parser - -# def __iter__(self): -# return self - -# cdef __cnext__(self): - -# cdef char * b -# cdef size_t nbytes -# b = self.buffer - -# while not feof( self.infile ): -# nbytes = getline( &b, &self.size, self.infile) - -# # stop at first error or eof -# if (nbytes == -1): break -# # skip comments -# if (b[0] == '#'): continue - -# # skip empty lines -# if b[0] == '\0' or b[0] == '\n' or b[0] == '\r': continue - -# # make sure that entry is complete -# if b[nbytes-1] != '\n' and b[nbytes-1] != '\r': -# result = b -# raise ValueError( "incomplete line at %s" % result ) - -# # make sure that this goes fully through C -# # otherwise buffer is copied to/from a -# # Python object causing segfaults as -# # the wrong memory is freed -# return self.parser.parse( b, nbytes ) - -# raise StopIteration - -# def __dealloc__(self): -# free(self.buffer) - -# def __next__(self): -# return self.__cnext__() ######################################################### ######################################################### diff --git a/pysam/pysam_util.c b/pysam/pysam_util.c deleted file mode 100644 index 349af44d..00000000 --- a/pysam/pysam_util.c +++ /dev/null @@ -1,36 +0,0 @@ -#include -#include -#include -#include -#include -#include - -#include "htslib/khash.h" -#include "htslib/ksort.h" -#include "htslib/knetfile.h" - -#if !(_POSIX_C_SOURCE >= 200809L || _XOPEN_SOURCE >= 700) -/* - * A rudimentary emulation of getline() for systems that dont support it - * natively. Since this is used for PPD file reading, it assumes (possibly - * falsely) that BUFSIZ is big enough. 
- */ -ssize_t -getline(char **line, size_t *linelen, FILE *fp) -{ - if (*linelen == 0) - { - *linelen = BUFSIZ; - *line = malloc(*linelen); - } - - memset(*line, 0, *linelen); - fgets(*line, *linelen, fp); - - return (strlen(*line)); - -} -#endif - - - diff --git a/pysam/pysam_util.h b/pysam/pysam_util.h deleted file mode 100644 index 789e9d0d..00000000 --- a/pysam/pysam_util.h +++ /dev/null @@ -1,5 +0,0 @@ -#ifndef PYSAM_UTIL_H -#define PYSAM_UTIL_H - - -#endif diff --git a/setup.py b/setup.py index d461ef1d..5c263257 100644 --- a/setup.py +++ b/setup.py @@ -630,7 +630,7 @@ def prebuild_libcsamtools(ext, force): extra_objects=separate_htslib_objects, libraries=external_htslib_libraries + internal_htslib_libraries), dict(name="pysam.libcutils", - sources=[source_pattern % "utils", "pysam/pysam_util.c"] + os_c_files, + sources=[source_pattern % "utils"] + os_c_files, extra_objects=separate_htslib_objects, libraries=external_htslib_libraries + internal_htslib_libraries + internal_samtools_libraries), dict(name="pysam.libcalignmentfile", From bb20683c76127c3db75668c553242fc12505cad5 Mon Sep 17 00:00:00 2001 From: John Marshall Date: Sat, 21 Jun 2025 20:19:22 +1200 Subject: [PATCH 07/11] Update language level implications for the new py3.8 baseline --- doc/developer.rst | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/doc/developer.rst b/doc/developer.rst index cefc9eaf..8d424847 100644 --- a/doc/developer.rst +++ b/doc/developer.rst @@ -36,19 +36,21 @@ directories: Python language level ===================== -Pysam currently requires Python 3.6 as a minimum language level. +Pysam currently requires Python 3.8 as a minimum language level. For example, this means that the following comparatively recent language features and library functions are available for use: * f-strings * ``raise ... 
from None`` * :meth:`str.startswith`, :meth:`str.endswith`, :meth:`str.rstrip`, etc +* walrus ``:=`` operator in Python code -However in particular the the following should not be used in +However in particular the following should not be used in pysam source code or infrastructure scripts: * :meth:`str.removeprefix`, :meth:`str.removesuffix` (new in 3.9) -* walrus ``:=`` operator (new in 3.8) +* walrus ``:=`` operator in Cython code (requires Cython 3) +* ``Optional[type]`` type hints written as ``type | None`` etc (new in 3.10) Importing new versions of htslib and samtools From 89380af7c06118a91ad48065d6fbfd2c3627679b Mon Sep 17 00:00:00 2001 From: John Marshall Date: Sat, 23 Aug 2025 23:25:15 +1200 Subject: [PATCH 08/11] Use hts_get_format() accessor instead of directly accessing htsFile.format --- pysam/libcalignmentfile.pyx | 2 +- pysam/libcbcf.pyx | 8 +++++--- pysam/libchtslib.pyx | 41 +++++++++++++++++++------------------ pysam/libctabix.pyx | 2 +- 4 files changed, 28 insertions(+), 25 deletions(-) diff --git a/pysam/libcalignmentfile.pyx b/pysam/libcalignmentfile.pyx index 4373a75f..8bc32e78 100644 --- a/pysam/libcalignmentfile.pyx +++ b/pysam/libcalignmentfile.pyx @@ -952,7 +952,7 @@ cdef class AlignmentFile(HTSFile): else: raise ValueError("could not open alignment file `{}`".format(force_str(filename))) - if self.htsfile.format.category != sequence_data: + if hts_get_format(self.htsfile).category != sequence_data: raise ValueError("file does not contain alignment data") if format_options and len(format_options): diff --git a/pysam/libcbcf.pyx b/pysam/libcbcf.pyx index f1eac7c0..24179d6c 100644 --- a/pysam/libcbcf.pyx +++ b/pysam/libcbcf.pyx @@ -4248,6 +4248,7 @@ cdef class VariantFile(HTSFile): """ cdef bcf_hdr_t *hdr cdef BGZF *bgzfp + cdef const htsFormat *fmt cdef hts_idx_t *idx cdef tbx_t *tidx cdef char *cfilename @@ -4341,7 +4342,8 @@ cdef class VariantFile(HTSFile): else: raise ValueError('could not open variant file `{}`'.format(filename)) - if 
self.htsfile.format.format not in (bcf, vcf): + fmt = hts_get_format(self.htsfile) + if fmt.format not in (bcf, vcf): raise ValueError('invalid file `{}` (mode=`{}`) - is it VCF/BCF format?'.format(filename, mode)) self.check_truncation(ignore_truncation) @@ -4360,14 +4362,14 @@ cdef class VariantFile(HTSFile): cfilename = NULL # check for index and open if present - if self.htsfile.format.format == bcf and cfilename: + if fmt.format == bcf and cfilename: if index_filename is not None: cindex_filename = index_filename with nogil: idx = bcf_index_load2(cfilename, cindex_filename) self.index = makeBCFIndex(self.header, idx) - elif self.htsfile.format.compression == bgzf and cfilename: + elif fmt.compression == bgzf and cfilename: if index_filename is not None: cindex_filename = index_filename with nogil: diff --git a/pysam/libchtslib.pyx b/pysam/libchtslib.pyx index 25d79ecc..ce471765 100644 --- a/pysam/libchtslib.pyx +++ b/pysam/libchtslib.pyx @@ -349,7 +349,7 @@ cdef class HTSFile(object): if not self.htsfile: return - if self.htsfile.format.compression != bgzf: + if hts_get_format(self.htsfile).compression != bgzf: return cdef BGZF *bgzfp = hts_get_bgzfp(self.htsfile) @@ -379,7 +379,7 @@ cdef class HTSFile(object): VARIANTS, INDEX, REGIONS""" if not self.htsfile: raise ValueError('metadata not available on closed file') - return FORMAT_CATEGORIES[self.htsfile.format.category] + return FORMAT_CATEGORIES[hts_get_format(self.htsfile).category] @property def format(self): @@ -390,14 +390,15 @@ cdef class HTSFile(object): """ if not self.htsfile: raise ValueError('metadata not available on closed file') - return FORMATS[self.htsfile.format.format] + return FORMATS[hts_get_format(self.htsfile).format] @property def version(self): """Tuple of file format version numbers (major, minor)""" if not self.htsfile: raise ValueError('metadata not available on closed file') - return self.htsfile.format.version.major, self.htsfile.format.version.minor + cdef const htsFormat *fmt = 
hts_get_format(self.htsfile) + return fmt.version.major, fmt.version.minor @property def compression(self): @@ -406,14 +407,14 @@ cdef class HTSFile(object): One of NONE, GZIP, BGZF, CUSTOM.""" if not self.htsfile: raise ValueError('metadata not available on closed file') - return COMPRESSION[self.htsfile.format.compression] + return COMPRESSION[hts_get_format(self.htsfile).compression] @property def description(self): """Vaguely human readable description of the file format""" if not self.htsfile: raise ValueError('metadata not available on closed file') - cdef char *desc = hts_format_description(&self.htsfile.format) + cdef char *desc = hts_format_description(hts_get_format(self.htsfile)) try: return charptr_to_str(desc) finally: @@ -447,27 +448,27 @@ cdef class HTSFile(object): @property def is_sam(self): """return True if HTSFile is reading or writing a SAM alignment file""" - return self.htsfile != NULL and self.htsfile.format.format == sam + return self.htsfile != NULL and hts_get_format(self.htsfile).format == sam @property def is_bam(self): """return True if HTSFile is reading or writing a BAM alignment file""" - return self.htsfile != NULL and self.htsfile.format.format == bam + return self.htsfile != NULL and hts_get_format(self.htsfile).format == bam @property def is_cram(self): """return True if HTSFile is reading or writing a BAM alignment file""" - return self.htsfile != NULL and self.htsfile.format.format == cram + return self.htsfile != NULL and hts_get_format(self.htsfile).format == cram @property def is_vcf(self): """return True if HTSFile is reading or writing a VCF variant file""" - return self.htsfile != NULL and self.htsfile.format.format == vcf + return self.htsfile != NULL and hts_get_format(self.htsfile).format == vcf @property def is_bcf(self): """return True if HTSFile is reading or writing a BCF variant file""" - return self.htsfile != NULL and self.htsfile.format.format == bcf + return self.htsfile != NULL and 
hts_get_format(self.htsfile).format == bcf def reset(self): """reset file position to beginning of file just after the header. @@ -490,14 +491,14 @@ cdef class HTSFile(object): whence = libc_whence_from_io(whence) cdef int64_t ret - if self.htsfile.format.compression == bgzf: + cdef htsCompression compression = hts_get_format(self.htsfile).compression + if compression == bgzf: with nogil: ret = bgzf_seek(hts_get_bgzfp(self.htsfile), offset, whence) - elif self.htsfile.format.compression == no_compression: + elif compression == no_compression: ret = 0 if (hseek(self.htsfile.fp.hfile, offset, whence) >= 0) else -1 else: - raise NotImplementedError("seek not implemented in files compressed by method {}".format( - self.htsfile.format.compression)) + raise NotImplementedError(f"seek not implemented in files compressed by method {compression}") return ret def tell(self): @@ -508,17 +509,17 @@ cdef class HTSFile(object): raise IOError('tell not available in streams') cdef int64_t ret - if self.htsfile.format.compression == bgzf: + cdef const htsFormat *fmt = hts_get_format(self.htsfile) + if fmt.compression == bgzf: with nogil: ret = bgzf_tell(hts_get_bgzfp(self.htsfile)) - elif self.htsfile.format.compression == no_compression: + elif fmt.compression == no_compression: ret = htell(self.htsfile.fp.hfile) - elif self.htsfile.format.format == cram: + elif fmt.format == cram: with nogil: ret = htell(cram_fd_get_fp(self.htsfile.fp.cram)) else: - raise NotImplementedError("seek not implemented in files compressed by method {}".format( - self.htsfile.format.compression)) + raise NotImplementedError(f"seek not implemented in files compressed by method {fmt.compression}") return ret diff --git a/pysam/libctabix.pyx b/pysam/libctabix.pyx index d570655d..54a2006a 100644 --- a/pysam/libctabix.pyx +++ b/pysam/libctabix.pyx @@ -390,7 +390,7 @@ cdef class TabixFile: if self.htsfile == NULL: raise IOError("could not open file `%s`" % filename) - #if self.htsfile.format.category != 
region_list: + #if hts_get_format(self.htsfile).category != region_list: # raise ValueError("file does not contain region data") with nogil: From bfa5a41a2e1f226c91ecb2baa63b9f1f0881582a Mon Sep 17 00:00:00 2001 From: John Marshall Date: Tue, 26 Aug 2025 19:31:15 +1200 Subject: [PATCH 09/11] Mark callback functions used from C as nogil and (mostly) noexcept Prevents "Casting a GIL-requiring function into a nogil function" warnings. --- pysam/libcalignmentfile.pyx | 62 ++++++++++--------------------------- 1 file changed, 17 insertions(+), 45 deletions(-) diff --git a/pysam/libcalignmentfile.pyx b/pysam/libcalignmentfile.pyx index 8bc32e78..d65d06b4 100644 --- a/pysam/libcalignmentfile.pyx +++ b/pysam/libcalignmentfile.pyx @@ -2320,38 +2320,30 @@ cdef class IteratorRowSelection(IteratorRow): raise IOError(read_failure_reason(ret)) -cdef int __advance_nofilter(void *data, bam1_t *b): +cdef int __advance_nofilter(void *data, bam1_t *b) noexcept nogil: '''advance without any read filtering. ''' cdef __iterdata * d = <__iterdata*>data - cdef int ret - with nogil: - ret = sam_itr_next(d.htsfile, d.iter, b) - return ret + return sam_itr_next(d.htsfile, d.iter, b) -cdef int __advance_raw_nofilter(void *data, bam1_t *b): +cdef int __advance_raw_nofilter(void *data, bam1_t *b) noexcept nogil: '''advance (without iterator) without any read filtering. 
''' cdef __iterdata * d = <__iterdata*>data - cdef int ret - with nogil: - ret = sam_read1(d.htsfile, d.header, b) - return ret + return sam_read1(d.htsfile, d.header, b) -cdef int __advance_all(void *data, bam1_t *b): +cdef int __advance_all(void *data, bam1_t *b) noexcept nogil: '''only use reads for pileup passing basic filters such as BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL, BAM_FDUP ''' cdef __iterdata * d = <__iterdata*>data - cdef mask = BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP cdef int ret while 1: - with nogil: - ret = sam_itr_next(d.htsfile, d.iter, b) + ret = sam_itr_next(d.htsfile, d.iter, b) if ret < 0: break if b.core.flag & d.flag_filter: @@ -2360,7 +2352,7 @@ cdef int __advance_all(void *data, bam1_t *b): return ret -cdef int __advance_raw_all(void *data, bam1_t *b): +cdef int __advance_raw_all(void *data, bam1_t *b) noexcept nogil: '''only use reads for pileup passing basic filters such as BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL, BAM_FDUP @@ -2369,8 +2361,7 @@ cdef int __advance_raw_all(void *data, bam1_t *b): cdef __iterdata * d = <__iterdata*>data cdef int ret while 1: - with nogil: - ret = sam_read1(d.htsfile, d.header, b) + ret = sam_read1(d.htsfile, d.header, b) if ret < 0: break if b.core.flag & d.flag_filter: @@ -2379,7 +2370,7 @@ cdef int __advance_raw_all(void *data, bam1_t *b): return ret -cdef int __advance_samtools(void * data, bam1_t * b): +cdef int __advance_samtools(void * data, bam1_t *b) nogil: '''advance using same filter and read processing as in the samtools pileup. 
''' @@ -2388,8 +2379,7 @@ cdef int __advance_samtools(void * data, bam1_t * b): cdef int q while 1: - with nogil: - ret = sam_itr_next(d.htsfile, d.iter, b) if d.iter else sam_read1(d.htsfile, d.header, b) + ret = sam_itr_next(d.htsfile, d.iter, b) if d.iter else sam_read1(d.htsfile, d.header, b) if ret < 0: break if b.core.flag & d.flag_filter: @@ -2402,13 +2392,7 @@ cdef int __advance_samtools(void * data, bam1_t * b): if d.seq != NULL: free(d.seq) d.tid = b.core.tid - with nogil: - d.seq = faidx_fetch_seq( - d.fastafile, - d.header.target_name[d.tid], - 0, MAX_POS, - &d.seq_len) - + d.seq = faidx_fetch_seq(d.fastafile, d.header.target_name[d.tid], 0, MAX_POS, &d.seq_len) if d.seq == NULL: raise ValueError( "reference sequence for '{}' (tid={}) not found".format( @@ -2560,19 +2544,13 @@ cdef class IteratorColumn: if self.stepper is None or self.stepper == "all": with nogil: - self.pileup_iter = bam_mplp_init(1, - &__advance_all, - data) + self.pileup_iter = bam_mplp_init(1, __advance_all, data) elif self.stepper == "nofilter": with nogil: - self.pileup_iter = bam_mplp_init(1, - &__advance_nofilter, - data) + self.pileup_iter = bam_mplp_init(1, __advance_nofilter, data) elif self.stepper == "samtools": with nogil: - self.pileup_iter = bam_mplp_init(1, - &__advance_samtools, - data) + self.pileup_iter = bam_mplp_init(1, __advance_samtools, data) else: raise ValueError( "unknown stepper option `%s` in IteratorColumn" % self.stepper) @@ -2609,19 +2587,13 @@ cdef class IteratorColumn: if self.stepper is None or self.stepper == "all": with nogil: - self.pileup_iter = bam_mplp_init(1, - &__advance_raw_all, - data) + self.pileup_iter = bam_mplp_init(1, __advance_raw_all, data) elif self.stepper == "nofilter": with nogil: - self.pileup_iter = bam_mplp_init(1, - &__advance_raw_nofilter, - data) + self.pileup_iter = bam_mplp_init(1, __advance_raw_nofilter, data) elif self.stepper == "samtools": with nogil: - self.pileup_iter = bam_mplp_init(1, - &__advance_samtools, - data) 
+ self.pileup_iter = bam_mplp_init(1, __advance_samtools, data) else: raise ValueError( "unknown stepper option `%s` in IteratorColumn" % self.stepper) From 5ac440fa55179133d55c3f3251e9fa82ac378873 Mon Sep 17 00:00:00 2001 From: John Marshall Date: Mon, 15 Sep 2025 23:52:48 +1200 Subject: [PATCH 10/11] Ensure -I./htslib takes priority over /usr/local/include We need to search /usr/local/include to find curl/curl.h etc. However it needs to be searched AFTER ./htslib et al so that pysam is compiled against headers from ./htslib/htslib/*.h rather than any system-installed HTSlib headers in /usr/local/include. --- .cirrus.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.cirrus.yml b/.cirrus.yml index 11acf2f4..98331454 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -8,7 +8,7 @@ freebsd_ci_task: pkg install -y bcftools gmake py311-cython3 py311-mypy py311-pytest samtools env: - CC: "clang -I/usr/local/include" + CC: "clang -isystem /usr/local/include" MAKE: "gmake" REF_PATH: ":" From 2f9d50dde8eea7ddcdb643dcde0a088380f4549a Mon Sep 17 00:00:00 2001 From: Tim Dunn Date: Wed, 10 Sep 2025 10:48:15 -0400 Subject: [PATCH 11/11] Minor documentation wordsmithing and alphabetising [trivial] --- doc/api.rst | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/doc/api.rst b/doc/api.rst index 47fe314b..fc88f172 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -15,7 +15,7 @@ To use the module to read a file in BAM format, create a import pysam samfile = pysam.AlignmentFile("ex1.bam", "rb") -Once a file is opened you can iterate over all of the read mapping to +Once a file is opened you can iterate over all of the reads mapping to a specified region using :meth:`~pysam.AlignmentFile.fetch`. 
Each iteration returns a :class:`~pysam.AlignedSegment` object which represents a single read along with its fields and optional tags:: @@ -103,7 +103,7 @@ tabix indexed tab-separated file formats with genomic data:: :class:`~pysam.TabixFile` implements lazy parsing in order to iterate over large tables efficiently. -More detailed usage instructions is at :ref:`usage`. +More detailed usage instructions are available at :ref:`usage`. .. note:: @@ -200,7 +200,6 @@ FASTQ files .. autoclass:: pysam.FastxFile :members: - .. autoclass:: pysam.FastqProxy :members: @@ -214,10 +213,10 @@ VCF/BCF files .. autoclass:: pysam.VariantHeader :members: -.. autoclass:: pysam.VariantRecord +.. autoclass:: pysam.VariantHeaderRecord :members: -.. autoclass:: pysam.VariantHeaderRecord +.. autoclass:: pysam.VariantRecord :members: HTSFile