diff --git a/.cirrus.yml b/.cirrus.yml
index 2b20b311c..98331454c 100644
--- a/.cirrus.yml
+++ b/.cirrus.yml
@@ -1,3 +1,24 @@
+freebsd_ci_task:
+  name: CI / FreeBSD
+
+  freebsd_instance:
+    image_family: freebsd-14-2
+
+  install_script: |
+    pkg install -y bcftools gmake py311-cython3 py311-mypy py311-pytest samtools
+
+  env:
+    CC: "clang -isystem /usr/local/include"
+    MAKE: "gmake"
+    REF_PATH: ":"
+
+  build_script: |
+    python setup.py build
+
+  test_script: |
+    PYTHONPATH="$(echo $PWD/build/lib.*)" pytest
+
+
 build_wheels_task:
   only_if: $CIRRUS_BRANCH =~ "release/.*" || $CIRRUS_TAG =~ "v0\..*"
 
@@ -8,18 +29,21 @@ build_wheels_task:
         architecture: arm64
         platform: linux
       matrix:
-        - name: Build ARM Linux py3.6-9 wheels
+        - name: Build ARM Linux py3.8-9 wheels
+          env:
+            CIBW_BUILD: "cp38-* cp39-*"
+        - name: Build ARM Linux py3.10-11 wheels
           env:
-            CIBW_BUILD: "cp36-* cp37-* cp38-* cp39-*"
-        - name: Build ARM Linux py3.10-12 wheels
+            CIBW_BUILD: "cp310-* cp311-*"
+        - name: Build ARM Linux py3.12-13 wheels
           env:
-            CIBW_BUILD: "cp310-* cp311-* cp312-*"
+            CIBW_BUILD: "cp312-* cp313-*"
 
     - name: Build ARM macOS wheels
       macos_instance:
         image: ghcr.io/cirruslabs/macos-sonoma-base:latest
       env:
-        CIBW_BUILD: "cp39-* cp310-* cp311-* cp312-*"
+        CIBW_BUILD: "cp39-* cp310-* cp311-* cp312-* cp313-*"
 
   alias: build_wheels
 
@@ -29,14 +53,17 @@ build_wheels_task:
     VENV: $HOME/relenv
     PATH: $VENV/bin:$PATH
 
-    CIBW_SKIP: "*-musllinux_*"
     CIBW_BUILD_VERBOSITY: 1
 
+    # Avoid linking with non-system library libdeflate.dylib
+    CIBW_ENVIRONMENT_MACOS: HTSLIB_CONFIGURE_OPTIONS="--without-libdeflate"
+
     CIBW_MANYLINUX_AARCH64_IMAGE: manylinux_2_28
+    CIBW_MUSLLINUX_AARCH64_IMAGE: musllinux_1_2
 
   install_script: |
     python3 -m venv $VENV
-    pip3 install cibuildwheel==2.17.0
+    pip3 install cibuildwheel==2.23.3
 
   build_script: |
     cibuildwheel
diff --git a/.gitattributes b/.gitattributes
index 25c63bd72..407fba65d 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -1,7 +1,7 @@
 # Omit these files from GitHub's generated tarballs
+/.cirrus.yml                export-ignore
 /.git*                      export-ignore
 /.readthedocs.yaml          export-ignore
-/.travis.disabled.yml       export-ignore
 
 # Omit imported C files from GitHub's language statistics
 htslib/**                   linguist-vendored
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index c300579c8..7ccc0f5c7 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -8,15 +8,15 @@ jobs:
     strategy:
       matrix:
         os: [ubuntu, macos]
-        python-version: ['3.7', '3.8', '3.9', '3.10', '3.11', '3.12']
+        python-version: ['3.8', '3.9', '3.10', '3.11', '3.12', '3.13']
         exclude:
           # Run only the latest few 3.x versions on macOS
-          - os: macos
-            python-version: 3.7
           - os: macos
             python-version: 3.8
           - os: macos
             python-version: 3.9
+          - os: macos
+            python-version: 3.10
 
     steps:
       - name: Checkout pysam
@@ -28,13 +28,13 @@ jobs:
           python-version: ${{ matrix.python-version }}
 
       - name: Install prerequisite Python libraries
-        run:  pip install cython pytest pytest-pep8 setuptools
+        run:  pip install cython mypy pytest setuptools
 
       - name: Install Linux build prerequisites
         if:   runner.os == 'Linux'
         run: |
           sudo apt-get update
-          sudo apt-get install -q --no-install-recommends --no-install-suggests libcurl4-openssl-dev
+          sudo apt-get install -q --no-install-recommends --no-install-suggests libbz2-dev libcurl4-openssl-dev liblzma-dev
 
       - name: Update macOS build prerequisites
         if:   runner.os == 'macOS'
@@ -79,13 +79,13 @@ jobs:
           python-version: ${{ matrix.python-version }}
 
       - name: Install prerequisite Python libraries
-        run:  pip install cython pytest pytest-pep8
+        run:  pip install cython pytest
 
       - name: Install build prerequisites
         if:   runner.os == 'Linux'
         run: |
           sudo apt-get update
-          sudo apt-get install -q --no-install-recommends --no-install-suggests libcurl4-openssl-dev
+          sudo apt-get install -q --no-install-recommends --no-install-suggests libbz2-dev libcurl4-openssl-dev liblzma-dev
 
       - name: Create source distribution
         run:  python setup.py sdist --owner=root --group=root
diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml
index e3ba36a79..999b96a01 100644
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -3,25 +3,22 @@ name: Publish wheels
 on:
   push:
     branches:
-      - v[0-9]+.*
+      - release/*
     tags:
       - v[0-9]+.*
-  release:
-    types:
-      - published
 
 jobs:
   build_wheels:
-    runs-on: ${{ matrix.os }}-latest
+    runs-on: ${{ matrix.os }}
     strategy:
       matrix:
-        os: [ubuntu, macos]
-        build: ["cp36-* cp37-* cp38-* cp39-*", "cp310-* cp311-* cp312-*"]
+        os: [ubuntu-latest, macos-13]
+        build: ["cp38-* cp39-*", "cp310-* cp311-*", "cp312-* cp313-*"]
         x64image: [manylinux_2_28]
         nametag: [none]
 
         include:
-          - os: ubuntu
+          - os: ubuntu-latest
             build: "cp38-manylinux_x86_64"
             x64image: manylinux2014
             nametag: focal
@@ -30,19 +27,25 @@ jobs:
       - name: Checkout pysam
         uses: actions/checkout@v4
 
+      - name: Check platform ${{ matrix.os }} is the expected architecture
+        run:  devtools/check-platform.sh ${{ matrix.os }}
+
       - name: Build wheels
-        uses: pypa/cibuildwheel@v2.17.0
+        uses: pypa/cibuildwheel@v2.23.3
         env:
           CIBW_BUILD: ${{ matrix.build }}
-          CIBW_SKIP: "*-musllinux_*"
           CIBW_BUILD_VERBOSITY: 1
 
+          # Avoid linking with non-system library libdeflate.dylib
+          CIBW_ENVIRONMENT_MACOS: HTSLIB_CONFIGURE_OPTIONS="--without-libdeflate"
+
           CIBW_ARCHS_LINUX: x86_64
           CIBW_ARCHS_MACOS: x86_64
 
           CIBW_MANYLINUX_X86_64_IMAGE:  ${{ matrix.x64image }}
           CIBW_MANYLINUX_I686_IMAGE:    manylinux2014
           CIBW_MANYLINUX_AARCH64_IMAGE: manylinux_2_28
+          CIBW_MUSLLINUX_X86_64_IMAGE:  musllinux_1_2
 
       - name: Check wheelhouse
         run:  devtools/artifactname.py wheelhouse/*.whl >> $GITHUB_ENV
@@ -75,7 +78,7 @@ jobs:
       - name: Install build prerequisites
         run: |
           sudo apt-get update
-          sudo apt-get install -q --no-install-recommends --no-install-suggests libcurl4-openssl-dev
+          sudo apt-get install -q --no-install-recommends --no-install-suggests libbz2-dev libcurl4-openssl-dev liblzma-dev
 
       - name: Create source distribution
         run:  python setup.py sdist --owner=root --group=root
@@ -88,7 +91,7 @@ jobs:
   upload_pypi:
     needs: [build_wheels, build_sdist]
     runs-on: ubuntu-latest
-    environment: ${{ github.event_name == 'release' && 'pypi' || 'testpypi' }}
+    environment: ${{ github.ref_type == 'tag' && 'pypi' || 'testpypi' }}
 
     permissions:
       id-token: write
@@ -101,11 +104,11 @@ jobs:
           path: dist
 
       - name: Publish distribution to Test PyPI
-        if: github.event_name == 'push'
+        if: github.ref_type == 'branch'
         uses: pypa/gh-action-pypi-publish@release/v1
         with:
           repository-url: https://test.pypi.org/legacy/
 
       - name: Publish distribution to PyPI
-        if: github.event_name == 'release' && github.event.action == 'published'
+        if: github.ref_type == 'tag'
         uses: pypa/gh-action-pypi-publish@release/v1
diff --git a/.travis.disabled.yml b/.travis.disabled.yml
deleted file mode 100644
index 5b7bcc841..000000000
--- a/.travis.disabled.yml
+++ /dev/null
@@ -1,114 +0,0 @@
-os:
-  - linux
-  - osx
-
-language: c
-
-stages:
-  - test
-  - name: deploy
-    if: tag IS present
-
-env:
-  matrix:
-    - CONDA_PY=2.7
-    - CONDA_PY=3.6
-    - CONDA_PY=3.7
-    - CONDA_PY=3.8
-  global:
-    - PYSAM_LINKING_TEST=1
-    - TWINE_USERNAME=grepall
-    - secure: bTbky3Un19NAl62lix8bMLmBv9IGNhFkRXlZH+B253nYub7jwQwPQKum3ct9ea+XHJT5//uM0B8WAF6eyugpNkPQ7+S7SEH5BJuCt30nv6qvGhSO2AffZKeHEDnfW2kqGrivn87TqeomlSBlO742CD/V0wOIUwkTT9tutd+E7FU=
-
-_cibw_common: &cibw_common
-  addons: {}
-  install:
-    - python3 -m pip install cibuildwheel>=1.1.0 twine
-  script:
-    - set -e
-    - cibuildwheel --output-dir dist
-    - twine check dist/*
-    - twine upload --skip-existing dist/*
-
-_cibw_linux: &cibw_linux
-  stage: deploy
-  os: linux
-  language: python
-  python: '3.5'
-  services:
-    - docker
-  <<: *cibw_common
-
-_cibw_linux_aarch64: &cibw_linux_aarch64
-  stage: deploy
-  os: linux
-  arch: arm64
-  language: python
-  python: '3.9'
-  services:
-    - docker
-  <<: *cibw_common
-
-matrix:
-  include:
-    - stage: deploy
-      os: linux
-      language: python
-      python: '3.5'
-      addons:
-        apt:
-          packages:
-            - gcc
-            - g++
-            - libcurl4-openssl-dev  # for libcurl support in sdist
-            - libssl-dev  # for s3 support in sdist
-      install:
-        - python3 -m pip install Cython twine
-      script:
-        - set -e
-        - python3 setup.py build_ext --inplace
-        - python3 setup.py sdist
-        - twine check dist/*
-        - twine upload --skip-existing dist/*
-    - <<: *cibw_linux
-      env:
-        - CIBW_BUILD="*_x86_64"
-        - CIBW_BEFORE_BUILD="yum install -y zlib-devel bzip2-devel xz-devel && python -m pip install -r requirements.txt"
-        - CIBW_ENVIRONMENT='HTSLIB_CONFIGURE_OPTIONS="--disable-libcurl"'
-        - CIBW_REPAIR_WHEEL_COMMAND_LINUX='auditwheel repair -L . -w {dest_dir} {wheel}'
-        - CIBW_TEST_COMMAND='python -c "import pysam"'
-    - <<: *cibw_linux
-      env:
-        - CIBW_BUILD="*_i686"
-        - CIBW_BEFORE_BUILD="yum install -y zlib-devel bzip2-devel xz-devel && python -m pip install -r requirements.txt"
-        - CIBW_ENVIRONMENT='HTSLIB_CONFIGURE_OPTIONS="--disable-libcurl"'
-        - CIBW_REPAIR_WHEEL_COMMAND_LINUX='auditwheel repair -L . -w {dest_dir} {wheel}'
-        - CIBW_TEST_COMMAND='python -c "import pysam"'
-    - <<: *cibw_linux_aarch64
-      env:
-        - CIBW_BUILD="*_aarch64"
-        - CIBW_BEFORE_BUILD="yum install -y zlib-devel bzip2-devel xz-devel && python -m pip install -r requirements.txt"
-        - CIBW_ENVIRONMENT='HTSLIB_CONFIGURE_OPTIONS="--disable-libcurl"'
-        - CIBW_REPAIR_WHEEL_COMMAND_LINUX='auditwheel repair -L . -w {dest_dir} {wheel}'
-        - CIBW_TEST_COMMAND='python -c "import pysam"'
-    - stage: deploy
-      os: osx
-      language: generic
-      env:
-        - CIBW_BEFORE_BUILD="python -m pip install -r requirements.txt"
-        - CIBW_ENVIRONMENT='HTSLIB_CONFIGURE_OPTIONS="--disable-libcurl"'
-        - CIBW_TEST_COMMAND='python -c "import pysam"'
-      <<: *cibw_common
-
-addons:
-  apt:
-    packages:
-    - gcc
-    - g++
-
-script:
-  - ./devtools/run_tests_travis.sh
-
-notifications:
-  email:
-    - andreas.heger@gmail.com
diff --git a/AUTHORS b/AUTHORS
index 4e9c5eb5f..091cd91ab 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -1,12 +1,12 @@
-Many people have contributed to pysam. The list of github contributors
+Many people have contributed to pysam. The list of GitHub contributors
 is the best place to get a full list of authors and their contributions.
 The list and summary below is a out-of-date and represents the earlier
 stages of the project.
 
 List of contributors:
 
-Andreas Heger, Tildon Grant Belgard, Florian Finkernagel, Leo
-Goodstadt, Martin Goodson all contributed code to pysam.
+Andreas Heger, Tildon Grant Belgard, Florian Finkernagel,
+Leo Goodstadt, Martin Goodson all contributed code to pysam.
 
 John Marshall has been looking after pysam and its community for
 several years, as well as making many code contributions and improving
@@ -17,7 +17,7 @@ reader/writer in htslib.
 
 Gerton Lunter provided a validating VCF parser.
 
-Marcel Martin implemented python 3 compatibility.
+Marcel Martin implemented Python 3 compatibility and added type hints.
 Ben Schiller contributed a Windows compatible clone.
 
 The sources in the directory samtools are from the samtools project:
@@ -31,13 +31,3 @@ Bob Handsaker from the Broad Institute is a major contributor to the
 SAM/BAM specification. He designed and implemented the BGZF format, the
 underlying indexable compression format for the BAM format. BGZF does
 not support arithmetic between file offsets.
-
-Jue Ruan for the Beijing Genome Institute designed and implemented the
-RAZF format, an alternative indexable compression format. RAZF supports
-arithmetic between file offsets, at the cost of increased index file
-size and the full compatibility with gzip. RAZF is optional and only
-used in `faidx' for indexing RAZF compressed fasta files.
-
-Colin Hercus updated novo2sam.pl to support gapped alignment by
-novoalign.
-
diff --git a/Containerfile b/Containerfile
deleted file mode 100644
index 633f07e52..000000000
--- a/Containerfile
+++ /dev/null
@@ -1,13 +0,0 @@
-FROM ubi8:latest
-
-RUN yum update \
-  && yum install -y python3-pip python3-devel pigz \
-  && cd /usr/local/bin \
-  && ln -s /usr/bin/python3 python \
-  && pip3 --no-cache-dir install --upgrade pip \
-  && yum clean all \
-  && echo "system packages installed"
-
-RUN python -m pip install pysam
-
-WORKDIR /opt/
diff --git a/INSTALL b/INSTALL
index 5016dcc75..1f3677ad4 100644
--- a/INSTALL
+++ b/INSTALL
@@ -5,7 +5,7 @@ http://pysam.readthedocs.io/en/latest/installation.html
 Installing pysam
 ================
 
-Pysam can be installed through conda_, pypi_ and from the repository.
+Pysam can be installed through conda_, PyPI_ and from the repository.
 The recommended way to install pysam is through conda/bioconda.
 
 Conda installation
@@ -13,8 +13,8 @@ Conda installation
 
 To install pysam in your current conda_ environment, type::
 
-   conda config --add channels r
    conda config --add channels bioconda
+   conda config --add channels conda-forge
    conda install pysam
 
 This will install pysam from the bioconda_ channel and automatically
@@ -22,7 +22,7 @@ makes sure that dependencies are installed. Also, compilation flags
 will be set automatically, which will potentially save a lot of
 trouble on OS X.
 
-Pypi installation
+PyPI installation
 =================
 
 Pysam provides a python interface to the functionality contained
@@ -32,7 +32,7 @@ can be combined, ``builtin`` and ``external``.
 Builtin
 -------
 
-The typical installation will be through pypi_::
+The typical installation will be through PyPI_::
 
    pip install pysam
 
@@ -86,7 +86,7 @@ To install from repository, type::
 
     python setup.py install
 
-For compilation options, see the section on Pypi installation above.
+For compilation options, see the section on PyPI installation above.
 
 Requirements
 ============
diff --git a/MANIFEST.in b/MANIFEST.in
index 5711f0902..47e77029c 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -19,8 +19,7 @@ exclude pysam/config.py
 
 include win32/*.[ch]
 
-# exclude tests from pypi tar-ball - they
-# require additional data
+# exclude tests from sdist tarball as they require additional data
 prune tests/
 
 # samtools
@@ -41,7 +40,7 @@ recursive-include htslib *.[ch]
 exclude htslib/*config*.h
 
 include htslib/configure.ac htslib/m4/*.m4 htslib/*.in
-include htslib/configure htslib/version.sh
+include htslib/configure htslib/config.guess htslib/config.sub htslib/version.sh
 include htslib/Makefile htslib/*.mk
 exclude htslib/config.mk htslib/htscodecs.mk
 
diff --git a/NEWS b/NEWS
index 83ee80324..17c916cc2 100644
--- a/NEWS
+++ b/NEWS
@@ -1,6 +1,182 @@
 .. An online version of the release history can be found here:
 .. http://pysam.readthedocs.io/en/latest/release.html
 
+Release 0.23.3
+==============
+
+.. rubric:: 10 June 2025
+
+This is a bugfix release, still wrapping htslib/samtools/bcftools 1.21.
+
+It has been tested with Python versions 3.8 through 3.13, and wheels are
+available via PyPI_ for all of those Python versions. Wheels are built for
+macOS and Linux (manylinux_2_28 and musllinux_1_2) on both ARM and x86-64.
+
+The final pysam release that supported Python 3.6 and 3.7 was v0.23.0.
+
+Bugs fixed:
+
+* Worked around Cython 3.1.2 bug whereby a new Cython-internal function
+  incorrectly triggered the pysam build system's duplicate symbol check.
+
+
+Release 0.23.2
+==============
+
+.. rubric:: 6 June 2025
+
+This is a bugfix release, still wrapping htslib/samtools/bcftools 1.21.
+
+It has been tested with Python versions 3.8 through 3.13, and wheels are
+available via PyPI_ for all of those Python versions. Wheels are built for
+macOS and Linux (manylinux_2_28 and musllinux_1_2) on both ARM and x86-64.
+
+Bugs fixed:
+
+* Pysam 0.23.1 inadvertently broke binary compatibility for Cython projects
+  by changing the size of :class:`.AlignedSegment`. That class has been
+  restored to its previous size, so v0.23.2 restores binary compatibility
+  with Cython projects previously compiled against v0.23.0 and earlier.
+  (This bug does not affect pure Python projects using pysam.)
+
+* Improved I/O exception messages in :class:`.AlignmentFile` and
+  :class:`.VariantFile`, and ensured that error codes returned by all
+  HTSlib function invocations result in an exception being raised.
+
+
+Release 0.23.1
+==============
+
+.. rubric:: 28 May 2025
+
+This is a bugfix release, still wrapping htslib/samtools/bcftools 1.21.
+
+It has been tested with Python versions 3.8 through 3.13, and wheels are
+available via PyPI_ for all of those Python versions. Wheels are built for
+macOS and Linux (manylinux_2_28 and musllinux_1_2) on both ARM and x86-64.
+
+    This release inadvertently broke binary compatibility for Cython projects.
+    This should not affect Python projects using pysam. However Cython projects
+    should update to release 0.23.2.
+
+Bugs fixed:
+
+* Worked around Cython 3.1.0 behaviour change so that ``pysam.CMATCH`` etc
+  remain as synonyms for ``pysam.CIGAR_OPS.CMATCH`` etc. Note that a future
+  release will remove these synonyms to comply with Cython's new convention.
+  (#1339, PR #1340, reported by Adam Taranto, Andy Bond, et al)
+
+* The Python interpreter no longer exits abruptly on x86-64 Linux
+  when ``pysam.bcftools.*()`` commands terminate with error messages.
+  (#1333, #1335, reported by Maximilian Blacher and Ben Habermeyer)
+
+* Fixed :meth:`.AlignedSegment.get_aligned_pairs` type hint, which now
+  has overloaded hints reflecting the different tuples returned.
+  (PR #1342, thanks to Gosuke Shibahara)
+
+* Corrected "invalid type for record" exception message.
+  (PR #1334, thanks to Gaik Tamazian)
+
+* Fixed :class:`.TabixIterator` error handling. (#1328, reported by @Fan-iX)
+
+* Fixed bugs in the caching of :attr:`.AlignedSegment.query_sequence` and
+  :attr:`.AlignedSegment.query_qualities` values.
+
+* Pysam once again builds on CentOS 7, as it now works around limitations
+  in that obsolete platform's obsolete GCC 4.8.x system compiler.
+  (#1327, reported by Mario Fasold)
+
+New functionality:
+
+* New :attr:`.AlignedSegment.query_qualities_str` property enables
+  accessing the QUAL field as an ASCII-encoded base quality string.
+  The existing :attr:`.AlignedSegment.query_qualities` property can
+  now take an array or such a string when setting the value.
+  (PR #1324, PR #1341, requested by Nils Homer)
+
+* Improved :attr:`.AlignedSegment.cigarstring` performance.
+  (PR #1295, thanks to @limwz01)
+
+* Wheels are now built for musllinux in addition to manylinux and macOS.
+  (Requested by Nils Homer)
+
+
+Release 0.23.0
+==============
+
+.. rubric:: 5 February 2025
+
+This pysam release wraps htslib/samtools/bcftools 1.21 (PR #1310).
+
+It has been tested with Python versions 3.6 through 3.13, and wheels are
+available via PyPI_ for all of those Python versions. Python versions 3.6
+through 3.8 are end-of-life; particularly if you use pysam with one of
+these versions, please vote in the version survey at issue #1230.
+
+This is expected to be the last release supporting Python 3.6 and 3.7.
+
+Bugs fixed:
+
+* Fix :func:`pysam.samtools.command(save_stdout=filename) <.samtools.command>`
+  redirection of output to a file. (#677, reported by Haowen Zhang and analysed
+  by Youri Hoogstrate)
+
+* Reinstate HTTPS/S3/GCS support in pre-built Linux wheels when used on Debian
+  and Ubuntu: these wheels now work around the discrepancy between Red Hat and
+  Debian CA Certificate file locations. (#1257, #1268, reported by Daisie Huang
+  and Li Tai Fang)
+
+* Calling :meth:`.VariantHeader.new_record` repeatedly with the same
+  :obj:`samples` object now sets GT correctly every time. (#1308, reported
+  by Arthur Gymer)
+
+* Correct the exception produced when :meth:`.AlignedSegment.set_tags` is
+  used with an invalid value type. (#1233, PR #1235, reported by Weisheng Wu
+  and Marcus Stoiber)
+
+* Many type hinting corrections. (#1298, #1316, PR #1296, PR #1306, PR #1313,
+  PR #1315, thanks to Victor Epain, Arthur Gymer, @mshunjan, and Matt Stone)
+
+* The undocumented :func:`!pysam.samtools.import_` alias for invoking
+  ``samtools import`` has been removed;
+  use :func:`pysam.samtools.fqimport() <.samtools.command>` instead.
+
+* Corrections to several test data files to account for HTSlib 1.20 and later's
+  improved validity checking. (#1291, reported by David Seifert; etc)
+
+New functionality:
+
+* :meth:`.AlignedSegment.get_aligned_pairs` now optionally returns the
+  associated CIGAR operator in each position tuple. (#1292, PR #1294,
+  thanks to Lara Fuhrmann and Ivan Blagoev Topolsky)
+
+* New :meth:`AlignmentFile.flush() <.HTSFile.flush>` and
+  :meth:`VariantFile.flush() <.HTSFile.flush>` methods for flushing buffered
+  output to streams. (#1299, requested by @blex-max)
+
+* Improved :class:`str() <str>` for :class:`.AlignedSegment` now displays
+  reference sequence names when they are available and uses ``#N`` only when
+  necessary. (#1318, requested by Liu)
+
+* Implement :func:`repr` for :class:`.AlignedSegment` so that it displays the
+  most useful fields. (PR #1267, thanks to Marcel Martin)
+
+* Pysam's tests can now run in parallel. (#1284, reported by Yuri Victorovich)
+
+Documentation improvements:
+
+* More complete documentation of invoking SAMtools and BCFtools subcommands
+  via Pysam. (#1096, #1241, PR #1261, PR #1275, #1323, et al, reported by
+  Seung-been "Steven" Lee, Robert Baldwin, Michael Hall, Indraniel Das,
+  and @shokrofont)
+
+* Use the Pysam names for CIGAR operators rather than internal C names.
+  (#1255, reported by Ilya Shlyakhter)
+
+* Mention :meth:`AlignmentFile.fetch("*") <.AlignmentFile.fetch>` in the
+  FAQ entry on fetching unmapped reads. (#424, reported by Ben Weisburd)
+
+
 Release 0.22.1
 ==============
 
@@ -11,24 +187,24 @@ Bugfix release, which still wraps htslib/samtools/bcftools 1.18.
 Bugs fixed:
 
 * Preserve all header field tags defined in the SAM specification (notably TP)
-  in :meth:`.AlignmentHeader.from_dict` and :meth:`.AlignmentHeader.to_dict`
+  in :meth:`.AlignmentHeader.from_dict` and :meth:`.AlignmentHeader.to_dict`.
   (#1237, PR #1238, thanks to Tim Fennell and Nils Homer)
 
 * Adjust HTSlib's Makefile so that ``make distclean`` no longer tries to
-  rebuild the htscodecs configury (PR #1247, reported by Nicola Soranzo)
+  rebuild the htscodecs configury. (PR #1247, reported by Nicola Soranzo)
 
 * Reinstate S3 support in pre-built Linux wheels: support for this protocol
-  was inadvertently omitted from the pre-built 0.22.0 wheels on Linux
+  was inadvertently omitted from the pre-built 0.22.0 wheels on Linux.
   (#1249, #1277, etc varying circumstances; likely it is this that was
   reported by Mathew Baines, Benjamin Sargsyan, et al)
 
-* Add missing :attr:`.AlignedSegment.is_mapped` etc properties to type stubs
+* Add missing :attr:`.AlignedSegment.is_mapped` etc properties to type stubs.
   (PR #1273, thanks to Matt Stone)
 
-* Fix off-by-one NamedTupleProxy, :class:`.asBed`, etc array bounds check
+* Fix off-by-one NamedTupleProxy, :class:`.asBed`, etc array bounds check.
   (#1279, reported by Dan Bolser)
 
-* Make pysam's klib headers compatible with C++ (reported by Martin Grigorov)
+* Make pysam's klib headers compatible with C++. (reported by Martin Grigorov)
 
 
 Release 0.22.0
@@ -39,7 +215,7 @@ Release 0.22.0
 This pysam release wraps htslib/samtools/bcftools 1.18 (PR #1208).
 
 It has been tested with Python versions 3.6 through 3.12, and wheels are
-available via pypi_ for all of those Python versions. Python versions 3.6
+available via PyPI_ for all of those Python versions. Python versions 3.6
 and 3.7 are end-of-life; particularly if you use pysam with either of
 these versions, please vote in the version survey at issue #1230.
 
@@ -47,54 +223,54 @@ The final pysam release that supported Python 2.7 was v0.20.0.
 
 Bugs fixed:
 
-* Remove Cython from runtime dependencies (PR #1186, thanks to Nicola Soranzo,
+* Remove Cython from runtime dependencies. (PR #1186, thanks to Nicola Soranzo,
   also reported by Arya Massarat in PR #1194)
 
-* Miscellaneous dependency improvements (PR #1216, #1217, PR #1218, PR #1219,
+* Miscellaneous dependency improvements. (PR #1216, #1217, PR #1218, PR #1219,
   thanks to Martin Larralde and Arthur Vigil)
 
 * Suppress spurious "Could not retrieve index file" message when opening an
-  AlignmentFile (#939, #1214, reported by ChengYong Tham and Sebastian Röner)
+  AlignmentFile. (#939, #1214, reported by ChengYong Tham and Sebastian Röner)
 
-* Propagate SAM parsing errors encounted in :meth:`.AlignedSegment.fromstring`
+* Propagate SAM parsing errors encounted in :meth:`.AlignedSegment.fromstring`.
   (#1196, reported by DV Klopfenstein)
 
 * Accept invalid MD:A tagged fields produced by HTSeq instead of crashing
   in :meth:`AlignedSegment.get_aligned_pairs(with_seq=True)
-  <.AlignedSegment.get_aligned_pairs>` (#1226, reported by Isaac Vock)
+  <.AlignedSegment.get_aligned_pairs>`. (#1226, reported by Isaac Vock)
 
-* Fix multiarch macOS CI builds by removing brewed liblzma (#1205, reported
+* Fix multiarch macOS CI builds by removing brewed liblzma. (#1205, reported
   by Till Hartmann)
 
-* Fix :attr:`.VariantRecordSample.alleles` type hint (#1179, reported by
+* Fix :attr:`.VariantRecordSample.alleles` type hint. (#1179, reported by
   David Seifert)
 
 New functionality:
 
 * Add optional :meth:`HTSFile.seek(..., whence) <.HTSFile.seek>` parameter
-  and clarify which functions use libc.SEEK_SET vs io.SEEK_SET
+  and clarify which functions use libc.SEEK_SET vs io.SEEK_SET.
   (#1185, requested by luyulin)
 
-* File handling improvements in samtools & bcftools commands (should improve
+* File handling improvements in samtools & bcftools commands. (should improve
   #1193 and #1195, reported by Rob Bierman and Sam Chorlton)
 
-* Improve :class:`.FastxFile` performance (PR #1227, thanks to Fabian Klötzl
+* Improve :class:`.FastxFile` performance. (PR #1227, thanks to Fabian Klötzl
   and Valentyn Bezshapkin)
 
-* Improve the accuracy of type hints for :class:`.AlignmentFile` iteration
+* Improve the accuracy of type hints for :class:`.AlignmentFile` iteration.
   (#1184, PR #1189, reported by @PikalaxALT)
 
 Documentation improvements:
 
-* Clarify that :meth:`.AlignedSegment.get_aligned_pairs` results are 0-based
+* Clarify that :meth:`.AlignedSegment.get_aligned_pairs` results are 0-based.
   (#1180, reported by Nick Semenkovich)
 
-* Clarify :meth:`.AlignedSegment.get_reference_positions` documentation
+* Clarify :meth:`.AlignedSegment.get_reference_positions` documentation.
   (#836, #838, reported by Liang Ou and Nick Stoler)
 
 * Clarify that installation via pip usually uses a wheel, and that configuring
   the build via $HTSLIB_CONFIGURE_OPTIONS etc only applies when installing from
-  an sdist (#1086, reported by Layne Sadler)
+  an sdist. (#1086, reported by Layne Sadler)
 
 A message from pysam's founder, Andreas Heger:
 
@@ -620,11 +796,11 @@ Overview
 The 0.9.0 release upgrades htslib to htslib 1.3 and numerous other
 enhancements and bugfixes. See below for a detailed list.
 
-`Htslib 1.3 <https://github.com/samtools/htslib/releases/tag/1.3>`_
-comes with additional capabilities for remote file access which depend
-on the presence of optional system libraries. As a consequence, the
-installation script :file:`setup.py` has become more complex. For an
-overview, see :ref:`installation`.  We have tested installation on
+The `Htslib 1.3 <https://github.com/samtools/htslib/releases/tag/1.3>`_
+release comes with additional capabilities for remote file access which
+depend on the presence of optional system libraries. As a consequence,
+the installation script :file:`setup.py` has become more complex. For
+an overview, see :ref:`installation`.  We have tested installation on
 linux and OS X, but could not capture all variations. It is possible
 that a 0.9.1 release might follow soon addressing installation issues.
 
diff --git a/README.rst b/README.rst
index b50e2e51b..68f2fd55e 100644
--- a/README.rst
+++ b/README.rst
@@ -15,9 +15,9 @@ includes an interface for tabix_.
 If you are using the conda packaging manager (e.g. miniconda or anaconda),
 you can install pysam from the `bioconda channel <https://bioconda.github.io/>`_::
 
-   conda config --add channels defaults
-   conda config --add channels conda-forge
    conda config --add channels bioconda
+   conda config --add channels conda-forge
+   conda config --set channel_priority strict
    conda install pysam
 
 Installation through bioconda is the recommended way to install pysam
@@ -25,10 +25,10 @@ as it resolves non-python dependencies and uses pre-configured
 compilation options. Especially for OS X this will potentially save a
 lot of trouble.
 
-The current version of pysam wraps 3rd-party code from htslib-1.18, samtools-1.18, and bcftools-1.18.
+The current version of pysam wraps 3rd-party code from htslib-1.21, samtools-1.21, and bcftools-1.21.
 
-Pysam is available through `pypi
-<https://pypi.python.org/pypi/pysam>`_. To install, type::
+Pysam is available through `PyPI <https://pypi.org/project/pysam/>`_.
+To install, type::
 
    pip install pysam
 
diff --git a/bcftools/HMM.h b/bcftools/HMM.h
index 3a6cab30a..75d3f8b49 100644
--- a/bcftools/HMM.h
+++ b/bcftools/HMM.h
@@ -127,7 +127,7 @@ double *hmm_get_fwd_bwd_prob(hmm_t *hmm);
  *   @sites:    list of positions
  *
  *   Same as hmm_run_fwd_bwd, in addition a pointer to a matrix with the new
- *   transition probabilities is returned. In this verison, emission
+ *   transition probabilities is returned. In this version, emission
  *   probabilities are not updated.
  */
 double *hmm_run_baum_welch(hmm_t *hmm, int nsites, double *eprob, uint32_t *sites);
diff --git a/bcftools/LICENSE b/bcftools/LICENSE
index 46dc0e0e3..dbe9739ea 100644
--- a/bcftools/LICENSE
+++ b/bcftools/LICENSE
@@ -9,7 +9,7 @@ the INSTALL document), the use of this software is governed by the GPL license.
 
 The MIT/Expat License
 
-Copyright (C) 2012-2023 Genome Research Ltd.
+Copyright (C) 2012-2024 Genome Research Ltd.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -772,3 +772,28 @@ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+-----------------------------------------------------------------------------
+
+License for edlib.[ch]
+
+The MIT License (MIT)
+
+Copyright (c) 2014 Martin Šošić
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+the Software, and to permit persons to whom the Software is furnished to do so,
+subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/bcftools/abuf.c b/bcftools/abuf.c
index 7958cf570..b125679b9 100644
--- a/bcftools/abuf.c
+++ b/bcftools/abuf.c
@@ -1,6 +1,6 @@
 /* The MIT License
 
-   Copyright (c) 2021-2023 Genome Research Ltd.
+   Copyright (c) 2021-2024 Genome Research Ltd.
 
    Author: Petr Danecek <pd3@sanger.ac.uk>
 
@@ -43,6 +43,7 @@ typedef struct
     kstring_t ref, alt;
     int ial;        // the index of the original ALT allele, 1-based
     int beg, end;   // 0-based inclusive offsets to ref,alt
+    int plen;       // the ref,alt prefix length, eg plen=1 for C>CA
 }
 atom_t;
 
@@ -175,8 +176,9 @@ static void _atomize_allele(abuf_t *buf, bcf1_t *rec, int ial)
                 atom->alt.l = 0;
                 kputc(refb, &atom->ref);
                 kputc(refb, &atom->alt);
-                atom->beg = atom->end = i;
-                atom->ial = ial;
+                atom->beg  = atom->end = i;
+                atom->ial  = ial;
+                atom->plen = 1;
             }
             continue;
         }
@@ -202,6 +204,35 @@ static int _atoms_inconsistent(const atom_t *a, const atom_t *b)
     if ( rcmp ) return rcmp;
     return strcasecmp(a->alt.s,b->alt.s);
 }
+
+// returns
+//      0 .. identical beg,ref,alt
+//      1 .. non-overlapping variants, but record may overlap (A>AT vs A>C)
+//      2 .. overlapping (conflicting) variants
+static int _atoms_overlap(const atom_t *a, const atom_t *b)
+{
+    if ( a->beg < b->beg ) return 2;
+    if ( a->beg > b->beg ) return 2;
+
+    // consider SNV followed by DEL as not overlapping
+    //      CC > C      a.plen=1 (ref,alt prefix len=1)
+    //      C  > T      b.plen=0 (ref,alt prefix len=0)
+    if ( a->plen && a->plen >= b->ref.l ) return 1;
+    if ( b->plen && b->plen >= a->ref.l ) return 1;
+
+    int rcmp = strcasecmp(a->ref.s,b->ref.s);
+    if ( rcmp ) return 2;
+
+    // consider SNV followed by INS as not overlapping
+    //      A > AT      a.plen=1 (ref,alt prefix len=1)
+    //      A > C       b.plen=0 (ref,alt prefix len=0)
+    if ( a->plen && a->plen >= b->alt.l ) return 1;
+    if ( b->plen && b->plen >= a->alt.l ) return 1;
+
+    rcmp = strcasecmp(a->alt.s,b->alt.s);
+    if ( rcmp ) return 2;
+    return 0;
+}
 /*
     For reproducibility of tests on different platforms, we need to guarantee the same order of identical
     atoms originating from different source ALTs.  Even though they are consistent, different values can be
@@ -238,7 +269,14 @@ static void _split_table_new(abuf_t *buf, atom_t *atom)
 static void _split_table_overlap(abuf_t *buf, int iout, atom_t *atom)
 {
     uint8_t *ptr = buf->split.tbl + iout*buf->split.nori;
-    ptr[atom->ial-1] = _atoms_inconsistent(atom,buf->split.atoms[iout]) ? 2 : 1;
+    int olap = _atoms_overlap(atom,buf->split.atoms[iout]);
+    ptr[atom->ial-1] =  olap > 1 ? 2 : 1;
+
+    // The test test/atomize.split.5.vcf shows why we sometimes can and sometimes
+    // cannot remove the star allele like this
+    //      buf->split.overlaps[iout] = olap > 1 ? 1 : 0;
+    // I forgot the details of the code, so don't immediately see
+    // if this could be made smarter
     buf->split.overlaps[iout] = 1;
 }
 #if 0
@@ -411,13 +449,21 @@ static void _split_table_set_info(abuf_t *buf, bcf_info_t *info, merge_rule_t mo
             buf->tmp2  = dst.s;
             ret = bcf_update_info(buf->out_hdr, out, tag, buf->tmp2, dst.l, type);
         }
-        if ( ret!=0 ) error("An error occurred while updating INFO/%s\n",tag);
+        if ( ret!=0 ) error("An error occurred while updating INFO/%s (errcode=%d)\n",tag,ret);
     }
 }
 static void _split_table_set_history(abuf_t *buf)
 {
-    int i,j;
+    int i,j,ret;
     bcf1_t *rec = buf->split.rec;
+
+    // Don't update if the tag already exists. This is to prevent -a from overwriting -m
+    int m = 0;
+    char *tmp = NULL;
+    ret = bcf_get_info_string(buf->hdr,rec,buf->split.info_tag,&tmp,&m);
+    free(tmp);
+    if ( ret>0 ) return;
+
     buf->tmps.l = 0;
     ksprintf(&buf->tmps,"%s|%"PRIhts_pos"|%s|",bcf_seqname(buf->hdr,rec),rec->pos+1,rec->d.allele[0]);
     for (i=1; i<rec->n_allele; i++)
@@ -441,8 +487,8 @@ static void _split_table_set_history(abuf_t *buf)
             kputc(',',&buf->tmps);
         }
         buf->tmps.s[--buf->tmps.l] = 0;
-        if ( (bcf_update_info_string(buf->out_hdr, out, buf->split.info_tag, buf->tmps.s))!=0 )
-            error("An error occurred while updating INFO/%s\n",buf->split.info_tag);
+        if ( (ret=bcf_update_info_string(buf->out_hdr, out, buf->split.info_tag, buf->tmps.s))!=0 )
+            error("An error occurred while updating INFO/%s (errcode=%d)\n",buf->split.info_tag,ret);
     }
 }
 static void _split_table_set_gt(abuf_t *buf)
@@ -668,7 +714,7 @@ static void _split_table_set_format(abuf_t *buf, bcf_fmt_t *fmt, merge_rule_t mo
             #undef BRANCH
             ret = bcf_update_format(buf->out_hdr, out, tag, buf->tmp2, 3*(1+star_allele)*nsmpl, type);
         }
-        if ( ret!=0 ) error("An error occurred while updating FORMAT/%s\n",tag);
+        if ( ret!=0 ) error("An error occurred while updating FORMAT/%s (errcode=%d)\n",tag,ret);
     }
 }
 static inline int _is_acgtn(char *seq)
@@ -737,7 +783,7 @@ void _abuf_split(abuf_t *buf, bcf1_t *rec)
     _split_table_init(buf,rec,buf->natoms);
     for (i=0; i<buf->natoms; i++)
     {
-        if ( i && !_atoms_inconsistent(&buf->atoms[i-1],&buf->atoms[i]) ) continue;
+        if ( i && _atoms_inconsistent(&buf->atoms[i-1],&buf->atoms[i])==0 ) continue;
         _split_table_new(buf, &buf->atoms[i]);  // add a new unique output atom
     }
     for (i=0; i<buf->natoms; i++)
diff --git a/bcftools/abuf.c.pysam.c b/bcftools/abuf.c.pysam.c
index 6ac6d1832..6e0e3de5c 100644
--- a/bcftools/abuf.c.pysam.c
+++ b/bcftools/abuf.c.pysam.c
@@ -2,7 +2,7 @@
 
 /* The MIT License
 
-   Copyright (c) 2021-2023 Genome Research Ltd.
+   Copyright (c) 2021-2024 Genome Research Ltd.
 
    Author: Petr Danecek <pd3@sanger.ac.uk>
 
@@ -45,6 +45,7 @@ typedef struct
     kstring_t ref, alt;
     int ial;        // the index of the original ALT allele, 1-based
     int beg, end;   // 0-based inclusive offsets to ref,alt
+    int plen;       // the ref,alt prefix length, eg plen=1 for C>CA
 }
 atom_t;
 
@@ -177,8 +178,9 @@ static void _atomize_allele(abuf_t *buf, bcf1_t *rec, int ial)
                 atom->alt.l = 0;
                 kputc(refb, &atom->ref);
                 kputc(refb, &atom->alt);
-                atom->beg = atom->end = i;
-                atom->ial = ial;
+                atom->beg  = atom->end = i;
+                atom->ial  = ial;
+                atom->plen = 1;
             }
             continue;
         }
@@ -204,6 +206,35 @@ static int _atoms_inconsistent(const atom_t *a, const atom_t *b)
     if ( rcmp ) return rcmp;
     return strcasecmp(a->alt.s,b->alt.s);
 }
+
+// returns
+//      0 .. identical beg,ref,alt
+//      1 .. non-overlapping variants, but record may overlap (A>AT vs A>C)
+//      2 .. overlapping (conflicting) variants
+static int _atoms_overlap(const atom_t *a, const atom_t *b)
+{
+    if ( a->beg < b->beg ) return 2;
+    if ( a->beg > b->beg ) return 2;
+
+    // consider SNV followed by DEL as not overlapping
+    //      CC > C      a.plen=1 (ref,alt prefix len=1)
+    //      C  > T      b.plen=0 (ref,alt prefix len=0)
+    if ( a->plen && a->plen >= b->ref.l ) return 1;
+    if ( b->plen && b->plen >= a->ref.l ) return 1;
+
+    int rcmp = strcasecmp(a->ref.s,b->ref.s);
+    if ( rcmp ) return 2;
+
+    // consider SNV followed by INS as not overlapping
+    //      A > AT      a.plen=1 (ref,alt prefix len=1)
+    //      A > C       b.plen=0 (ref,alt prefix len=0)
+    if ( a->plen && a->plen >= b->alt.l ) return 1;
+    if ( b->plen && b->plen >= a->alt.l ) return 1;
+
+    rcmp = strcasecmp(a->alt.s,b->alt.s);
+    if ( rcmp ) return 2;
+    return 0;
+}
 /*
     For reproducibility of tests on different platforms, we need to guarantee the same order of identical
     atoms originating from different source ALTs.  Even though they are consistent, different values can be
@@ -240,7 +271,14 @@ static void _split_table_new(abuf_t *buf, atom_t *atom)
 static void _split_table_overlap(abuf_t *buf, int iout, atom_t *atom)
 {
     uint8_t *ptr = buf->split.tbl + iout*buf->split.nori;
-    ptr[atom->ial-1] = _atoms_inconsistent(atom,buf->split.atoms[iout]) ? 2 : 1;
+    int olap = _atoms_overlap(atom,buf->split.atoms[iout]);
+    ptr[atom->ial-1] =  olap > 1 ? 2 : 1;
+
+    // The test test/atomize.split.5.vcf shows why we sometimes can and sometimes
+    // cannot remove the star allele like this
+    //      buf->split.overlaps[iout] = olap > 1 ? 1 : 0;
+    // I forgot the details of the code, so don't immediately see
+    // if this could be made smarter
     buf->split.overlaps[iout] = 1;
 }
 #if 0
@@ -413,13 +451,21 @@ static void _split_table_set_info(abuf_t *buf, bcf_info_t *info, merge_rule_t mo
             buf->tmp2  = dst.s;
             ret = bcf_update_info(buf->out_hdr, out, tag, buf->tmp2, dst.l, type);
         }
-        if ( ret!=0 ) error("An error occurred while updating INFO/%s\n",tag);
+        if ( ret!=0 ) error("An error occurred while updating INFO/%s (errcode=%d)\n",tag,ret);
     }
 }
 static void _split_table_set_history(abuf_t *buf)
 {
-    int i,j;
+    int i,j,ret;
     bcf1_t *rec = buf->split.rec;
+
+    // Don't update if the tag already exists. This is to prevent -a from overwriting -m
+    int m = 0;
+    char *tmp = NULL;
+    ret = bcf_get_info_string(buf->hdr,rec,buf->split.info_tag,&tmp,&m);
+    free(tmp);
+    if ( ret>0 ) return;
+
     buf->tmps.l = 0;
     ksprintf(&buf->tmps,"%s|%"PRIhts_pos"|%s|",bcf_seqname(buf->hdr,rec),rec->pos+1,rec->d.allele[0]);
     for (i=1; i<rec->n_allele; i++)
@@ -443,8 +489,8 @@ static void _split_table_set_history(abuf_t *buf)
             kputc(',',&buf->tmps);
         }
         buf->tmps.s[--buf->tmps.l] = 0;
-        if ( (bcf_update_info_string(buf->out_hdr, out, buf->split.info_tag, buf->tmps.s))!=0 )
-            error("An error occurred while updating INFO/%s\n",buf->split.info_tag);
+        if ( (ret=bcf_update_info_string(buf->out_hdr, out, buf->split.info_tag, buf->tmps.s))!=0 )
+            error("An error occurred while updating INFO/%s (errcode=%d)\n",buf->split.info_tag,ret);
     }
 }
 static void _split_table_set_gt(abuf_t *buf)
@@ -670,7 +716,7 @@ static void _split_table_set_format(abuf_t *buf, bcf_fmt_t *fmt, merge_rule_t mo
             #undef BRANCH
             ret = bcf_update_format(buf->out_hdr, out, tag, buf->tmp2, 3*(1+star_allele)*nsmpl, type);
         }
-        if ( ret!=0 ) error("An error occurred while updating FORMAT/%s\n",tag);
+        if ( ret!=0 ) error("An error occurred while updating FORMAT/%s (errcode=%d)\n",tag,ret);
     }
 }
 static inline int _is_acgtn(char *seq)
@@ -739,7 +785,7 @@ void _abuf_split(abuf_t *buf, bcf1_t *rec)
     _split_table_init(buf,rec,buf->natoms);
     for (i=0; i<buf->natoms; i++)
     {
-        if ( i && !_atoms_inconsistent(&buf->atoms[i-1],&buf->atoms[i]) ) continue;
+        if ( i && _atoms_inconsistent(&buf->atoms[i-1],&buf->atoms[i])==0 ) continue;
         _split_table_new(buf, &buf->atoms[i]);  // add a new unique output atom
     }
     for (i=0; i<buf->natoms; i++)
diff --git a/bcftools/bam2bcf.c b/bcftools/bam2bcf.c
index 88e25de1f..55c208122 100644
--- a/bcftools/bam2bcf.c
+++ b/bcftools/bam2bcf.c
@@ -1,7 +1,7 @@
 /*  bam2bcf.c -- variant calling.
 
     Copyright (C) 2010-2012 Broad Institute.
-    Copyright (C) 2012-2022 Genome Research Ltd.
+    Copyright (C) 2012-2024 Genome Research Ltd.
 
     Author: Heng Li <lh3@sanger.ac.uk>
 
@@ -249,6 +249,10 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t
 {
     int i, n, ref4, is_indel, ori_depth = 0;
 
+#ifdef GLF_DEBUG
+    fprintf(stderr, "Call GLFGEN\n");
+#endif
+
     // clean from previous run
     r->ori_depth = 0;
     r->mq0 = 0;
@@ -268,6 +272,15 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t
         bca->bases = (uint16_t*)realloc(bca->bases, 2 * bca->max_bases);
     }
 
+    // Detect if indel occurs anywhere in this sample
+    int indel_in_sample = 0;
+    if (bca->edlib) {
+        for (i = n = 0; i < _n; ++i) {
+            const bam_pileup1_t *p = pl + i;
+            if (p->indel) indel_in_sample = 1;
+        }
+    }
+
     // fill the bases array
     double nqual_over_60 = bca->nqual / 60.0;
     int ADR_ref_missed[4] = {0};
@@ -298,7 +311,19 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t
             b = p->aux>>16&0x3f;        // indel type
             seqQ = q = (p->aux & 0xff); // mp2 + builtin indel-bias
 
-            if ( !bca->indels_v20 )
+            if (bca->edlib) {
+                if (indel_in_sample) {
+                    seqQ = q = (p->aux & 0xff); // mp2 + builtin indel-bias
+                } else if (p->aux & 0xff) {
+                    // An indel in another sample, but not this.  So just use
+                    // basic sequence confidences.
+                    q = bam_get_qual(p->b)[p->qpos];
+                    if (q > bca->max_baseQ) q = bca->max_baseQ;
+                    seqQ = 99;
+                }
+            }
+
+            if ( !bca->indels_v20 && !bca->edlib )
             {
                 /*
                     This heuristics was introduced by e4e161068 and claims to fix #1446. However, we obtain
@@ -330,6 +355,10 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t
                 }
             }
 
+#ifdef GLF_DEBUG
+            fprintf(stderr, "GLF %s\t%d\t%d\n", bam_get_qname(p->b),
+                    bca->indel_types[b], q);
+#endif
             if (q < bca->min_baseQ)
             {
                 if (!p->indel && b < 4) // not an indel read
@@ -341,6 +370,50 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t
                 }
                 continue;
             }
+
+#ifndef MIN
+#define MIN(a,b) ((a)<(b)?(a):(b))
+#endif
+
+#ifndef MAX
+#define MAX(a,b) ((a)>(b)?(a):(b))
+#endif
+
+#if 1 // TEST 6
+            if (bca->edlib) {
+                // Deeper data should rely more heavily on counts of data
+                // than quality, as quality can be unreliable and prone to
+                // miscalculations through BAQ, STR analysis, etc.
+                // So we put a cap on how good seqQ can be.
+                //
+                // Is it simply the equivalent of increasing -F filter?
+                // Not quite, as the latter removes many real variants upfront.
+                // This calls them and then post-adjusts quality, potentially
+                // dropping it later or changing genotype. So we still get
+                // calls, but lower qual.
+                seqQ = MIN(seqQ, bca->seqQ_offset-(MIN(20,_n)*5));
+
+                if (indel_in_sample && p->indel == 0 && b != 0) {
+                    // This read doesn't contain an indel in CIGAR, but it
+                    // is assigned to an indel now (b != 0),  These are
+                    // reads we've corrected with realignment, but they're
+                    // also enriched for FPs so at high depth we reduce their
+                    // confidence and let the depth do the talking.  If it's
+                    // real and deep, then we don't need every read aligning.
+                    // We also reduce base quality too to reflect the
+                    // chance of our realignment being incorrect.
+
+                    seqQ = MIN(seqQ, seqQ/2 + 5); // q2p5
+
+                    // Finally reduce indel quality.
+                    // This is a blend of indelQ and base QUAL.
+                    q = MIN((int)bam_get_qual(p->b)[p->qpos]/4+10, q/4+1);
+                }
+            }
+#endif
+
+            // Note baseQ changes some output fields such as I16, but has no
+            // significant affect on "call".
             baseQ  = p->aux>>8&0xff;
         }
         else
@@ -375,6 +448,11 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t
         }
         mapQ  = p->b->core.qual < 255? p->b->core.qual : DEF_MAPQ; // special case for mapQ==255
         if ( !mapQ ) r->mq0++;
+#ifdef GLF_DEBUG
+        fprintf(stderr, "GLF2 %s\t%d\t%d\t%d,%d\n",
+                bam_get_qname(p->b), b, q,
+                seqQ, mapQ);
+#endif
         if (q > seqQ) q = seqQ;
         mapQ = mapQ < bca->capQ? mapQ : bca->capQ;
         if (q > mapQ) q = mapQ;
@@ -478,9 +556,19 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t
             for (i=0; i<4; i++) r->ADF[i] += lroundf((float)dp_ambig * r->ADF[i]/dp);
     }
 
+    // Else consider downgrading bca->bases[] scores by AD vs AD_ref_missed
+    // ratios.  This is detrimental on Illumina, but beneficial on PacBio CCS.
+    // It's possibly related to the homopolyer error likelihoods or overall
+    // Indel accuracy.  Maybe tie this in to the -h option?
+
     r->ori_depth = ori_depth;
     // glfgen
     errmod_cal(bca->e, n, 5, bca->bases, r->p); // calculate PL of each genotype
+
+    // TODO: account for the number of unassigned reads.  If depth is 50,
+    // but AD is 5,7 then it may look like a variant but it probably
+    // should be low quality.
+
     return n;
 }
 
@@ -1147,10 +1235,30 @@ int bcf_call2bcf(bcf_call_t *bc, bcf1_t *rec, bcf_callret1_t *bcr, int fmt_flag,
     if ( bc->ori_ref < 0 )
     {
         bcf_update_info_flag(hdr, rec, "INDEL", NULL, 1);
-        if ( fmt_flag&B2B_INFO_IDV )
-            bcf_update_info_int32(hdr, rec, "IDV", &bca->max_support, 1);
-        if ( fmt_flag&B2B_INFO_IMF )
-            bcf_update_info_float(hdr, rec, "IMF", &bca->max_frac, 1);
+        uint32_t idv = bca->max_support;
+        if ( fmt_flag&B2B_INFO_IMF) {
+            float max_frac;
+            // Recompute IDV and IMF based on alignment results for more
+            // accurate counts, but only when in new "--indels-cns" mode.
+            if (bc->ADF && bc->ADR && bca->edlib) {
+                int max_ad = 0;
+                for (int k = 1; k < rec->n_allele; k++) {
+                    if (max_ad < bc->ADF[k] + bc->ADR[k])
+                        max_ad = bc->ADF[k] + bc->ADR[k];
+                }
+                max_frac = (double)(max_ad) / bc->ori_depth;
+                idv = max_ad;
+            } else {
+                max_frac = bca->max_frac;
+            }
+            // Copied here to maintain order for consistency of "make check"
+            if ( fmt_flag&B2B_INFO_IDV )
+                bcf_update_info_int32(hdr, rec, "IDV", &idv, 1);
+            bcf_update_info_float(hdr, rec, "IMF", &max_frac, 1);
+        } else {
+            if ( fmt_flag&B2B_INFO_IDV )
+                bcf_update_info_int32(hdr, rec, "IDV", &idv, 1);
+        }
     }
     bcf_update_info_int32(hdr, rec, "DP", &bc->ori_depth, 1);
     if ( fmt_flag&B2B_INFO_ADF )
diff --git a/bcftools/bam2bcf.c.pysam.c b/bcftools/bam2bcf.c.pysam.c
index 4a6fe4d0e..d0941b2eb 100644
--- a/bcftools/bam2bcf.c.pysam.c
+++ b/bcftools/bam2bcf.c.pysam.c
@@ -3,7 +3,7 @@
 /*  bam2bcf.c -- variant calling.
 
     Copyright (C) 2010-2012 Broad Institute.
-    Copyright (C) 2012-2022 Genome Research Ltd.
+    Copyright (C) 2012-2024 Genome Research Ltd.
 
     Author: Heng Li <lh3@sanger.ac.uk>
 
@@ -251,6 +251,10 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t
 {
     int i, n, ref4, is_indel, ori_depth = 0;
 
+#ifdef GLF_DEBUG
+    fprintf(bcftools_stderr, "Call GLFGEN\n");
+#endif
+
     // clean from previous run
     r->ori_depth = 0;
     r->mq0 = 0;
@@ -270,6 +274,15 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t
         bca->bases = (uint16_t*)realloc(bca->bases, 2 * bca->max_bases);
     }
 
+    // Detect if indel occurs anywhere in this sample
+    int indel_in_sample = 0;
+    if (bca->edlib) {
+        for (i = n = 0; i < _n; ++i) {
+            const bam_pileup1_t *p = pl + i;
+            if (p->indel) indel_in_sample = 1;
+        }
+    }
+
     // fill the bases array
     double nqual_over_60 = bca->nqual / 60.0;
     int ADR_ref_missed[4] = {0};
@@ -300,7 +313,19 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t
             b = p->aux>>16&0x3f;        // indel type
             seqQ = q = (p->aux & 0xff); // mp2 + builtin indel-bias
 
-            if ( !bca->indels_v20 )
+            if (bca->edlib) {
+                if (indel_in_sample) {
+                    seqQ = q = (p->aux & 0xff); // mp2 + builtin indel-bias
+                } else if (p->aux & 0xff) {
+                    // An indel in another sample, but not this.  So just use
+                    // basic sequence confidences.
+                    q = bam_get_qual(p->b)[p->qpos];
+                    if (q > bca->max_baseQ) q = bca->max_baseQ;
+                    seqQ = 99;
+                }
+            }
+
+            if ( !bca->indels_v20 && !bca->edlib )
             {
                 /*
                     This heuristics was introduced by e4e161068 and claims to fix #1446. However, we obtain
@@ -332,6 +357,10 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t
                 }
             }
 
+#ifdef GLF_DEBUG
+            fprintf(bcftools_stderr, "GLF %s\t%d\t%d\n", bam_get_qname(p->b),
+                    bca->indel_types[b], q);
+#endif
             if (q < bca->min_baseQ)
             {
                 if (!p->indel && b < 4) // not an indel read
@@ -343,6 +372,50 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t
                 }
                 continue;
             }
+
+#ifndef MIN
+#define MIN(a,b) ((a)<(b)?(a):(b))
+#endif
+
+#ifndef MAX
+#define MAX(a,b) ((a)>(b)?(a):(b))
+#endif
+
+#if 1 // TEST 6
+            if (bca->edlib) {
+                // Deeper data should rely more heavily on counts of data
+                // than quality, as quality can be unreliable and prone to
+                // miscalculations through BAQ, STR analysis, etc.
+                // So we put a cap on how good seqQ can be.
+                //
+                // Is it simply the equivalent of increasing -F filter?
+                // Not quite, as the latter removes many real variants upfront.
+                // This calls them and then post-adjusts quality, potentially
+                // dropping it later or changing genotype. So we still get
+                // calls, but lower qual.
+                seqQ = MIN(seqQ, bca->seqQ_offset-(MIN(20,_n)*5));
+
+                if (indel_in_sample && p->indel == 0 && b != 0) {
+                    // This read doesn't contain an indel in CIGAR, but it
+                    // is assigned to an indel now (b != 0),  These are
+                    // reads we've corrected with realignment, but they're
+                    // also enriched for FPs so at high depth we reduce their
+                    // confidence and let the depth do the talking.  If it's
+                    // real and deep, then we don't need every read aligning.
+                    // We also reduce base quality too to reflect the
+                    // chance of our realignment being incorrect.
+
+                    seqQ = MIN(seqQ, seqQ/2 + 5); // q2p5
+
+                    // Finally reduce indel quality.
+                    // This is a blend of indelQ and base QUAL.
+                    q = MIN((int)bam_get_qual(p->b)[p->qpos]/4+10, q/4+1);
+                }
+            }
+#endif
+
+            // Note baseQ changes some output fields such as I16, but has no
+            // significant affect on "call".
             baseQ  = p->aux>>8&0xff;
         }
         else
@@ -377,6 +450,11 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t
         }
         mapQ  = p->b->core.qual < 255? p->b->core.qual : DEF_MAPQ; // special case for mapQ==255
         if ( !mapQ ) r->mq0++;
+#ifdef GLF_DEBUG
+        fprintf(bcftools_stderr, "GLF2 %s\t%d\t%d\t%d,%d\n",
+                bam_get_qname(p->b), b, q,
+                seqQ, mapQ);
+#endif
         if (q > seqQ) q = seqQ;
         mapQ = mapQ < bca->capQ? mapQ : bca->capQ;
         if (q > mapQ) q = mapQ;
@@ -480,9 +558,19 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t
             for (i=0; i<4; i++) r->ADF[i] += lroundf((float)dp_ambig * r->ADF[i]/dp);
     }
 
+    // Else consider downgrading bca->bases[] scores by AD vs AD_ref_missed
+    // ratios.  This is detrimental on Illumina, but beneficial on PacBio CCS.
+    // It's possibly related to the homopolyer error likelihoods or overall
+    // Indel accuracy.  Maybe tie this in to the -h option?
+
     r->ori_depth = ori_depth;
     // glfgen
     errmod_cal(bca->e, n, 5, bca->bases, r->p); // calculate PL of each genotype
+
+    // TODO: account for the number of unassigned reads.  If depth is 50,
+    // but AD is 5,7 then it may look like a variant but it probably
+    // should be low quality.
+
     return n;
 }
 
@@ -1149,10 +1237,30 @@ int bcf_call2bcf(bcf_call_t *bc, bcf1_t *rec, bcf_callret1_t *bcr, int fmt_flag,
     if ( bc->ori_ref < 0 )
     {
         bcf_update_info_flag(hdr, rec, "INDEL", NULL, 1);
-        if ( fmt_flag&B2B_INFO_IDV )
-            bcf_update_info_int32(hdr, rec, "IDV", &bca->max_support, 1);
-        if ( fmt_flag&B2B_INFO_IMF )
-            bcf_update_info_float(hdr, rec, "IMF", &bca->max_frac, 1);
+        uint32_t idv = bca->max_support;
+        if ( fmt_flag&B2B_INFO_IMF) {
+            float max_frac;
+            // Recompute IDV and IMF based on alignment results for more
+            // accurate counts, but only when in new "--indels-cns" mode.
+            if (bc->ADF && bc->ADR && bca->edlib) {
+                int max_ad = 0;
+                for (int k = 1; k < rec->n_allele; k++) {
+                    if (max_ad < bc->ADF[k] + bc->ADR[k])
+                        max_ad = bc->ADF[k] + bc->ADR[k];
+                }
+                max_frac = (double)(max_ad) / bc->ori_depth;
+                idv = max_ad;
+            } else {
+                max_frac = bca->max_frac;
+            }
+            // Copied here to maintain order for consistency of "make check"
+            if ( fmt_flag&B2B_INFO_IDV )
+                bcf_update_info_int32(hdr, rec, "IDV", &idv, 1);
+            bcf_update_info_float(hdr, rec, "IMF", &max_frac, 1);
+        } else {
+            if ( fmt_flag&B2B_INFO_IDV )
+                bcf_update_info_int32(hdr, rec, "IDV", &idv, 1);
+        }
     }
     bcf_update_info_int32(hdr, rec, "DP", &bc->ori_depth, 1);
     if ( fmt_flag&B2B_INFO_ADF )
diff --git a/bcftools/bam2bcf.h b/bcftools/bam2bcf.h
index 955c022bf..8f8f8db5a 100644
--- a/bcftools/bam2bcf.h
+++ b/bcftools/bam2bcf.h
@@ -122,14 +122,17 @@ typedef struct __bcf_callaux_t {
     // for internal uses
     int max_bases;
     int indel_types[4];     // indel lengths
-    int indel_win_size, indels_v20;
-    int maxins, indelreg;
+    int indel_win_size, indels_v20, edlib;
+    int seqQ_offset; // edlib mode, seqQ=MIN(seqQ, offset - MIN(20,depth)*5);
+    int maxins, indelreg, poly_mqual;
     int read_len;
     char *inscns;
     uint16_t *bases;        // 5bit: unused, 6:quality, 1:is_rev, 4:2-bit base or indel allele (index to bcf_callaux_t.indel_types)
     errmod_t *e;
     void *rghash;
     float indel_bias;  // adjusts indel score threshold; lower => call more.
+    float del_bias;    // (-.9 < x < .9) error profile; >0 => more del, <0 => more ins
+    float vs_ref;      // 0 to 1.  0: score vs next-best. 1: score vs ref
     int32_t *ref_nm, *alt_nm;   // pointers to bcf_call_t.{ref_nm,alt_nm}
     unsigned int nnm[2];        // number of nm observations
     float nm[2];                // cumulative count of mismatches in ref and alt reads
@@ -193,11 +196,35 @@ extern "C" {
                      const bcf_callaux_t *bca, const char *ref);
     int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_callaux_t *bca, const char *ref);
     int bcf_iaux_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_callaux_t *bca, const char *ref);
+    int bcf_edlib_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos,
+                           bcf_callaux_t *bca, const char *ref, int ref_len);
+
     void bcf_callaux_clean(bcf_callaux_t *bca, bcf_call_t *call);
 
     int bcf_cgp_l_run(const char *ref, int pos);
     int est_indelreg(int pos, const char *ref, int l, char *ins4);
 
+/* ----------------------------------------------------------------------
+ * Shared between bam2bcf_indel.c and bam2bcf_edlib.c
+ */
+
+// Take a reference position tpos and convert to a query position (returned).
+// This uses the CIGAR string plus alignment c->pos to do the mapping.
+//
+// *_tpos is returned as tpos if query overlaps tpos, but for deletions
+// it'll be either the start (is_left) or end (!is_left) ref position.
+int tpos2qpos(const bam1_core_t *c, const uint32_t *cigar, int32_t tpos, int is_left, int32_t *_tpos);
+
+// Identify spft-clip length, position in seq, and clipped seq len
+void get_pos(const bcf_callaux_t *bca, bam_pileup1_t *p,
+             int *sc_len_r, int *slen_r, int *epos_r, int *end);
+
+// Compute the consensus for this sample 's', minus indels which
+// get added later.
+char *bcf_cgp_calc_cons(int n, int *n_plp, bam_pileup1_t **plp,
+                        int pos, int *types, int n_types,
+                        int max_ins, int s);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/bcftools/bam2bcf_edlib.c b/bcftools/bam2bcf_edlib.c
new file mode 100644
index 000000000..4e0a38c33
--- /dev/null
+++ b/bcftools/bam2bcf_edlib.c
@@ -0,0 +1,1704 @@
+/*  bam2bcf_indel.c -- indel caller.
+
+    Copyright (C) 2010, 2011 Broad Institute.
+    Copyright (C) 2012-2014,2016-2017, 2021-2024 Genome Research Ltd.
+
+    Author: Heng Li <lh3@sanger.ac.uk>
+            Petr Danecek <pd3@sanger.ac.uk>
+	    James Bonfield <jkb@sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.  */
+
+// Show consensus
+//#define CONS_DEBUG
+
+// Show alignments to consensus
+//#define ALIGN_DEBUG
+
+#include <assert.h>
+#include <ctype.h>
+#include <string.h>
+#include <math.h>
+#include <htslib/hts.h>
+#include <htslib/sam.h>
+#include <htslib/khash_str2int.h>
+#include "bam2bcf.h"
+#include "str_finder.h"
+
+#include <htslib/ksort.h>
+// Is there no way to share these between the 3 implementations?
+KSORT_INIT_STATIC_GENERIC(uint32_t)
+
+#define MINUS_CONST 0x10000000
+
+#define MAX_TYPES 64
+
+#ifndef MIN
+#  define MIN(a,b) ((a)<(b)?(a):(b))
+#endif
+
+#ifndef ABS
+#  define ABS(a) ((a)<0?-(a):(a))
+#endif
+
+#ifndef MAX
+#  define MAX(a,b) ((a)>(b)?(a):(b))
+#endif
+
+// l is the relative gap length and l_run is the length of the homopolymer
+// on the reference.
+//
+// Larger seqQ is good, so increasing tandemQ calls more indels,
+// and longer l_run means fewer calls.  It is capped later at 255.
+// For short l_runs, the qual is simply based on size of indel
+// larger ones being considered more likely to be real.
+// Longer indels get assigned a score based on the relative indel size
+// to homopolymer, where l_run base will have already been verified by
+// the caller to ensure it's compatible.
+static inline int est_seqQ(const bcf_callaux_t *bca, int l, int l_run, int str_len)
+{
+    int q, qh;
+    // Short indels are more likely sequencing error than large ones.
+    // So "seqQ" scales with size of observation "l".
+    //
+    // Note openQ and extQ are error likelihoods in Phred scale.  Hence high
+    // openQ means we're very unlikely to miscall an indel.
+    // Ie it's not the open/ext "costs" normally used in alignment; more the reverse.
+    //
+    // We use MIN(q,qh) below, so we can remove the q component by specifying
+    // a large -o parameter in mpileup.
+    q = bca->openQ + bca->extQ * (abs(l) - 1);
+
+    // Orig method; best with Illumina (high openQ)
+//    qh = bca->tandemQ * (double)abs(l) / l_run + .499;
+
+    // Penalise longer homopolymers quadratically more, but boost shorter ones.
+    // Best with CCS (low openQ)
+    //qh = 2 * bca->tandemQ * pow((double)abs(l) / l_run, 1.5) + .499;
+
+    // (l/l_run)^1.26 for openQ=25 or ^1 for openQ=40.
+//    double openQ = MIN(40, bca->openQ);
+//    qh = (30/openQ) * bca->tandemQ
+//        * pow((double)abs(l) / l_run, 1/sqrt(openQ/40)) + .499;
+
+    // Linear scaled on openQ too
+    qh = bca->tandemQ * (double)abs(l) / l_run + .499;
+
+    // Generic maybe ?
+    // power = 1/sqrt(MIN(40,bca->openQ)/40.);
+    // qh = ... * pow((double)abs(l)/l_run, power)
+
+    // bam2bcf.c caps has "if q>seqQ) q=seqQ" so it caps base qual 'q'.
+    // A 1bp indel would therefore have a maximum qual it could be considered based
+    // on open+ext.  Hence why openQ is phred score indicating if the base is real
+    // or an over/under-call. (high openQ means high trust in base)
+    return q < qh? q : qh;
+}
+
+// Part of bcf_call_gap_prep.
+//
+// Scans the pileup to identify all the different sizes of indels
+// present.
+// types[] returned is sorted by size, from smallest (maybe negative) to largest.
+//
+// Returns types and fills out n_types_r,  max_rd_len_r and ref_type_r,
+//         or NULL on error.
+static int *bcf_cgp_find_types(int n, int *n_plp, bam_pileup1_t **plp,
+                               int pos, bcf_callaux_t *bca, const char *ref,
+                               int *max_rd_len_r, int *n_types_r,
+                               int *ref_type_r, int *N_r) {
+    int i, j, t, s, N, m, max_rd_len, n_types;
+    int n_alt = 0, n_tot = 0, indel_support_ok = 0;
+    uint32_t *aux;
+    int *types;
+
+    // N is the total number of reads
+    for (s = N = 0; s < n; ++s)
+        N += n_plp[s];
+
+    bca->max_support = bca->max_frac = 0;
+    aux = (uint32_t*) calloc(N + 1, 4);
+    if (!aux)
+        return NULL;
+
+    m = max_rd_len = 0;
+    aux[m++] = MINUS_CONST; // zero indel is always a type (REF)
+
+    // Fill out aux[] array with all the non-zero indel sizes.
+    // Also tally number with indels (n_alt) and total (n_tot).
+    for (s = 0; s < n; ++s) {
+        int na = 0, nt = 0;
+        for (i = 0; i < n_plp[s]; ++i) {
+            const bam_pileup1_t *p = plp[s] + i;
+            ++nt;
+            if (p->indel != 0) {
+                ++na;
+                aux[m++] = MINUS_CONST + p->indel;
+            }
+
+            // FIXME: cache me in pileup struct.
+            j = bam_cigar2qlen(p->b->core.n_cigar, bam_get_cigar(p->b));
+            if (j > max_rd_len) max_rd_len = j;
+        }
+        double frac = (double)na/nt;
+        if ( !indel_support_ok && na >= bca->min_support
+             && frac >= bca->min_frac )
+            indel_support_ok = 1;
+        if ( na > bca->max_support && frac > 0 )
+            bca->max_support = na, bca->max_frac = frac;
+
+        n_alt += na;
+        n_tot += nt;
+    }
+
+    // Sort aux[] and dedup
+    ks_introsort(uint32_t, m, aux);
+    for (i = 1, n_types = 1; i < m; ++i)
+        if (aux[i] != aux[i-1]) ++n_types;
+
+    // Taking totals makes it hard to call rare indels (IMF filter)
+    if ( !bca->per_sample_flt )
+        indel_support_ok = ( (double)n_alt / n_tot < bca->min_frac
+                             || n_alt < bca->min_support )
+            ? 0 : 1;
+    if ( n_types == 1 || !indel_support_ok ) { // then skip
+        free(aux);
+        return NULL;
+    }
+
+    // Bail out if we have far too many types of indel
+    if (n_types >= MAX_TYPES) {
+        free(aux);
+        // TODO revisit how/whether to control printing this warning
+        if (hts_verbose >= 2)
+            fprintf(stderr, "[%s] excessive INDEL alleles at position %d. "
+                    "Skip the position.\n", __func__, pos + 1);
+        return NULL;
+    }
+
+    // To prevent long stretches of N's to be mistaken for indels
+    // (sometimes thousands of bases), check the number of N's in the
+    // sequence and skip places where half or more reference bases are Ns.
+    int nN=0, i_end = pos + (2*bca->indel_win_size < max_rd_len
+                            ?2*bca->indel_win_size : max_rd_len);
+    for (i=pos; i<i_end && ref[i]; i++)
+        nN += ref[i] == 'N';
+    if ( nN*2>(i-pos) ) {
+        free(aux);
+        return NULL;
+    }
+
+    // Finally fill out the types[] array detailing the size of insertion
+    // or deletion.
+    types = (int*)calloc(n_types, sizeof(int));
+    if (!types) {
+        free(aux);
+        return NULL;
+    }
+    t = 0;
+    for (i = 0; i < m; ++i) {
+        int sz = (int32_t)(aux[i] - MINUS_CONST);
+        int j;
+        for (j = i+1; j < m; j++)
+            if (aux[j] != aux[i])
+                break;
+
+        if (sz == 0
+            || (j-i >= bca->min_support &&
+                // Note, doesn't handle bca->per_sample_flt yet
+                (bca->per_sample_flt
+                 || (double)(j-i) / n_tot >= bca->min_frac)))
+            types[t++] = sz;
+        i = j-1;
+    }
+    free(aux);
+
+    if (t <= 1) {
+        free(types);
+        return NULL;
+    }
+    n_types = t;
+
+    // Find reference type; types[?] == 0)
+    for (t = 0; t < n_types; ++t)
+        if (types[t] == 0) break;
+
+    *ref_type_r   = t;
+    *n_types_r    = n_types;
+    *max_rd_len_r = max_rd_len;
+    *N_r          = N;
+
+    return types;
+}
+
+// Increment ins["str"] and freq["str"]
+#define NI 100 // number of alternative insertion sequences
+// Could use a hash table too, but expectation is a tiny number of alternatives
+typedef struct {
+    char *str[NI];
+    int len[NI];
+    int freq[NI];
+} str_freq;
+
+static int bcf_cgp_append_cons(str_freq *sf, char *str, int len, int freq) {
+    int j;
+
+    for (j = 0; j < NI && sf->str[j]; j++) {
+        if (sf->len[j] == len && memcmp(sf->str[j], str, len) == 0)
+            break;
+    }
+    if (j >= NI)
+        return 0; // too many choices; discard
+
+    sf->freq[j]+=freq;
+    if (!sf->str[j]) {
+        // new insertion
+        if (!(sf->str[j] = malloc(len+1)))
+            return -1;
+        memcpy(sf->str[j], str, len);
+        sf->len[j] = len;
+    }
+
+    return 0;
+}
+
+/*
+ * Compute the consensus for a specific indel type at pos.
+ *
+ * left_shift is the number of inserted(+) or deleted(-) bases added to
+ * the consensus before we get to pos.  This is necessary so the alignment
+ * band is correct as it's expected to start at left/right edges in
+ * sync
+ *
+ * We accumulate into several buffers for counting base types:
+ * cons_base   - consensus of data with p->indel == type, bases or gap
+ * ref_base    - consensus of data with p->indel != type, bases or gap
+ * cons_ins    - consensus of data with p->indel == type, insertions
+ * ref_ins     - consensus of data with p->indel == type, bases or gap
+ *
+ * The purpose of cons_ins vs cons_base is if we have very low
+ * coverage due to nearly all reads being another type, then we can
+ * still get a robust consensus using the other data.  If we don't
+ * have shallow data, then we'll not use as much of ref_base as we may
+ * have correlated variants.
+ *
+ * Eg:
+ * REF: AGCTATGAGGCTGATA
+ * SEQ: AGGTAGGAGGGTGATA (x1)
+ * SEQ: AGCTACGAGG*TGATA (x24)
+ * SEQ: AGCTACTAGG*TGATA (x24)
+ *
+ * Cons for no-del is Cs not Gs.  Cannot trust it, so use N if shallow.
+ * CON: AGCTACNAGGGTGATA
+ *
+ * There are still some problems in cons_ins vs ref_ins assignment.
+ * We sometimes seem multiple similar-length insertions added at
+ * different locations.  Ideally we'd like to consider these as all
+ * the same insertion if the size is the same and it's comparable seq.
+ */
+#define MAX_INS 8192
+static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp,
+                                int pos, bcf_callaux_t *bca, const char *ref,
+                                int ref_len, int left, int right,
+                                int sample, int type, int biggest_del,
+                                int *left_shift, int *right_shift,
+                                int *band, int *tcon_len, int *cpos_pos,
+                                int pos_l, int pos_r) {
+    // Map ASCII ACGTN* to 012345
+    static uint8_t base6[256] = {
+        4,4,4,4,4,4,4,4,  4,4,4,4,4,4,4,4,  4,4,4,4,4,4,4,4,  4,4,4,4,4,4,4,4,
+        4,4,4,4,4,4,4,4,  4,4,5,4,4,4,4,4,  4,4,4,4,4,4,4,4,  4,4,4,4,4,4,4,4,
+        //A   C       G       *^                     T
+        4,0,4,1,4,4,4,2,  4,4,4,4,4,4,4,4,  4,4,4,4,3,3,4,4,  4,4,4,4,4,4,4,4,
+        4,0,4,1,4,4,4,2,  4,4,4,4,4,4,4,4,  4,4,4,4,3,3,4,4,  4,4,4,4,4,4,4,4,
+
+        4,4,4,4,4,4,4,4,  4,4,4,4,4,4,4,4,  4,4,4,4,4,4,4,4,  4,4,4,4,4,4,4,4,
+        4,4,4,4,4,4,4,4,  4,4,4,4,4,4,4,4,  4,4,4,4,4,4,4,4,  4,4,4,4,4,4,4,4,
+        4,4,4,4,4,4,4,4,  4,4,4,4,4,4,4,4,  4,4,4,4,4,4,4,4,  4,4,4,4,4,4,4,4,
+        4,4,4,4,4,4,4,4,  4,4,4,4,4,4,4,4,  4,4,4,4,4,4,4,4,  4,4,4,4,4,4,4,4,
+    };
+
+    // single base or del
+    int (*cons_base)[6] = calloc(right - left + 1, sizeof(*cons_base));
+    // multi-base insertions
+    str_freq *cons_ins  = calloc(right - left + 1, sizeof(*cons_ins));
+
+    // non-indel ref for all reads on this sample, rather than those just
+    // matching type.  We use this for handling the case where we have a
+    // homozygous deletion being studied, but with 1 or 2 reads misaligned
+    // and containing a base there.
+    //
+    // Eg if the type[]=0 consensus is made up of a very small sample size,
+    // which is also enriched for highly error prone data.  We can use
+    // the other reads from type[] != 0 to flesh out the consensus and
+    // improve accuracy.
+    int (*ref_base)[6]  = calloc(right - left + 1, sizeof(*ref_base));
+    str_freq *ref_ins   = calloc(right - left + 1, sizeof(*ref_ins));
+    int i, j, k, s = sample;
+    char **cons = NULL;
+
+    if (!cons_base || !cons_ins || !ref_base || !ref_ins)
+        goto err;
+
+    //--------------------------------------------------
+    // Accumulate sequences into cons_base and cons_ins arrays
+    int local_band_max = 0; // maximum absolute deviation from diagonal
+    int total_span_str = 0;
+    int type_depth = 0;
+    for (i = 0; i < n_plp[s]; i++) {
+        const bam_pileup1_t *p = plp[s] + i;
+        bam1_t *b = p->b;
+        int x = b->core.pos;  // ref coordinate
+        int y = 0;            // seq coordinate
+        uint32_t *cigar = bam_get_cigar(b);
+        uint8_t *seq = bam_get_seq(b);
+
+        int local_band = 0; // current deviation from diagonal
+        for (k = 0; k < b->core.n_cigar; ++k) {
+            int op  = cigar[k] &  BAM_CIGAR_MASK;
+            int len = cigar[k] >> BAM_CIGAR_SHIFT;
+            int base;
+            int skip_to = 0;
+
+            switch(op) {
+            case BAM_CSOFT_CLIP:
+                y += len;
+                break;
+
+            case BAM_CMATCH:
+            case BAM_CEQUAL:
+            case BAM_CDIFF: {
+                // Can short-cut this with j_start and j_end based on
+                // x+len and left,right
+                for (j = 0; j < len; j++, x++, y++) {
+                    if (x < left) continue;
+                    if (x >= right) break;
+
+                    base = bam_seqi(seq, y);
+                    if (p->indel == type)
+                        // Convert 4-bit base ambig code to 0,1,2,3,4 range
+                        cons_base[x-left][seq_nt16_int[base]]++;
+                    else if (x != pos+1) // indel being assessed question
+                        ref_base[x-left][seq_nt16_int[base]]++;
+                }
+                break;
+            }
+
+            case BAM_CINS: {
+                if (x >= left && x < right) {
+                    local_band += p->indel;
+                    if (local_band_max < local_band)
+                        local_band_max = local_band;
+                }
+
+                char ins[MAX_INS];
+                for (j = 0; j < len; j++, y++) {
+                    if (x < left) continue;
+                    if (x >= right)
+                        break;
+                    base = bam_seqi(seq, y);
+                    if (j < MAX_INS)
+                        ins[j] = seq_nt16_int[base];
+                }
+
+                // Insertions come before a ref match.
+                // 5I 5M is IIIIIM M M M M events, not
+                // {IIIII,M} M M M M choice.  So we need to include the
+                // next match in our sequence when choosing the consensus.
+                if (x >= left && x < right) {
+                    int ilen = j<MAX_INS?j:MAX_INS;
+                    if (p->indel == type /*&& x == pos+1*/) {
+                        // Assume any ins of the same size is the same ins.
+                        // (This rescues misaligned insertions.)
+                        if (bcf_cgp_append_cons(&cons_ins[x-left], ins,
+                                                ilen, 1) < 0)
+                            goto err;
+                        type_depth += (x == pos+1);
+                    } else  if (x != pos+1){
+                        if (bcf_cgp_append_cons(&ref_ins[x-left],  ins,
+                                                ilen, 1) < 0)
+                            goto err;
+                    }
+                }
+                break;
+            }
+
+            case BAM_CDEL:
+                if (x >= left && x < right) {
+                    local_band += p->indel;
+                    if (local_band_max < -local_band)
+                        local_band_max = -local_band;
+                }
+
+                // Maybe not perfect for I/D combos, but likely sufficient.
+                for (j = 0; j < len; j++, x++) {
+                    if (x < left) continue;
+                    if (x >= right) break;
+                    if ((p->indel == type && !p->is_del) ||  // starts here
+                        (p->indel == 0 && p->is_del && len == -type)) { // left
+                        cons_base[x-left][5]++;
+                        type_depth += (x == pos+1);
+                    } else if (x+len <= pos+1 || (skip_to && x > skip_to))
+                        ref_base[x-left][5]++;
+                    else if (x <= pos && x+len > pos+1) {
+                        // we have a deletion which overlaps pos, but
+                        // isn't the same "type".  We don't wish to
+                        // include these as they may bias the
+                        // evaluation by confirming against a
+                        // secondary consensus produced with the other
+                        // deletion.  We set a marker for how long to
+                        // skip adding to ref_base.
+                        if (x > skip_to)
+                            skip_to = x+len;
+                    }
+                }
+                break;
+            }
+        }
+
+        if (b->core.pos <= pos_l && x >= pos_r)
+            total_span_str++;
+
+        // Also track the biggest deviation +/- from diagonal.  We use
+        // this band observation in our BAQ alignment step.
+        if (*band < local_band_max)
+            *band = local_band_max;
+    }
+
+    //--------------------------------------------------
+    // Expand cons_base to include depth from ref_base/ref_ins
+    // Caveat: except at pos itself, where true ref is used if type != 0
+
+#if 1 // TEST 1
+    // We could retest this heuristic further maybe.
+    for (i = 0; i < right-left; i++) {
+        // Total observed depth
+        int t = cons_base[i][0] + cons_base[i][1] + cons_base[i][2] +
+            cons_base[i][3] + cons_base[i][4] + cons_base[i][5];
+        for (j = 0; j < NI; j++) {
+            if (!cons_ins[i].str[j])
+                break;
+            t += cons_ins[i].freq[j];
+        }
+
+        // Similarly for depth on the non-ALT calls (NB: not necessarily
+        // REF as maybe it's other ALTs).
+        int r = ref_base[i][0] + ref_base[i][1] + ref_base[i][2] +
+            ref_base[i][3] + ref_base[i][4] + ref_base[i][5];
+        for (j = 0; j < NI; j++) {
+            if (!ref_ins[i].str[j])
+                break;
+            r += ref_ins[i].freq[j];
+        }
+
+        // When evaluating this particular indel, we don't want to
+        // penalise alignments by SNP errors elsewhere.  This can
+        // happen when we have low depth for a particular 'type'.
+        //
+        // So add in a little data from ref_base/ref_ins.
+        double rfract = (r - t*2)*.75 / (r+1);
+
+        if (rfract < 1.01 / (r+1e-10))
+            rfract = 1.01 / (r+1e-10); // low depth compensation
+//        if (rfract > 0.2)
+//            rfract = 0.2;
+
+        // TODO: consider limiting rfract so we never drown out the
+        // signal.  We want to use the remaining data only to correct
+        // for sequencing errors in low depth alleles.  If we get
+        // conflicts, it's better to use N than to change a base
+        // incase that variant is genuine.
+        if (i+left >= pos+1 && i+left < pos+1-biggest_del) {
+            // We're overlapping the current indel region, so
+            // we don't wish to bring in evidence from the other
+            // "type" data as it'll harm calling.
+            continue;
+        } else {
+            // Otherwise add in a portion of other data to
+            // boost low population numbers.
+            cons_base[i][0] += rfract * ref_base[i][0];
+            cons_base[i][1] += rfract * ref_base[i][1];
+            cons_base[i][2] += rfract * ref_base[i][2];
+            cons_base[i][3] += rfract * ref_base[i][3];
+            cons_base[i][4] += rfract * ref_base[i][4];
+            cons_base[i][5] += rfract * ref_base[i][5];
+        }
+
+        // Similarly for insertions too; consider a different rfract here?
+        for (j = 0; j < NI; j++) {
+            if (!ref_ins[i].str[j])
+                break;
+            if (bcf_cgp_append_cons(&cons_ins[i],
+                                    ref_ins[i].str[j], ref_ins[i].len[j],
+                                    rfract * ref_ins[i].freq[j]) < 0)
+                goto err;
+        }
+    }
+#endif
+
+    //--------------------------------------------------
+    // Allocate consensus buffer, to worst case length
+    int max_len = right-left;
+    for (i = 0; i < right-left; i++) {
+        if (!cons_ins[i].str[0])
+            continue;
+
+        int ins = 0;
+        for (j = 0; j < NI; j++) {
+            if (!cons_ins[i].str[j])
+                break;
+            if (cons_ins[i].str[j] && ins < cons_ins[i].len[j])
+                ins = cons_ins[i].len[j];
+        }
+        max_len += ins;
+    }
+    max_len += MAX(0, type); // incase type inserted bases never occur
+    cons = malloc((max_len+1)*2 + sizeof(char *)*2);
+    if (!cons)
+        goto err;
+    cons[0] = (char *)&cons[2];
+    cons[1] = cons[0] + max_len+1;
+
+    //--------------------------------------------------
+    // Merge insertions where they are the same length but different
+    // sequences.
+    // NB: we could just index by length and have accumulators for each,
+    // instead of storing separately and merging later (here).
+    // Ie str_freq.str is [NI][5] instead.
+    for (i = 0; i < right-left; i++) {
+        int ins[MAX_INS][5];
+        for (j = 0; j < NI; j++) {
+            if (!cons_ins[i].str[j])
+                break;
+
+            if (cons_ins[i].freq[j] == 0)
+                continue; // already merged
+
+            int l;
+            for (l = 0; l < cons_ins[i].len[j]; l++) {
+                // Append to relevant frequency counter, zero all others
+                ins[l][0] = ins[l][1] = ins[l][2] = ins[l][3] = ins[l][4] = 0;
+                uint8_t b = cons_ins[i].str[j][l];
+                ins[l][b] = cons_ins[i].freq[j];
+            }
+
+            // Merge other insertions of the same length to ins[] counters
+            for (k = j+1; k < NI; k++) {
+                if (!cons_ins[i].str[k])
+                    break;
+                if (cons_ins[i].len[k] != cons_ins[i].len[j])
+                    continue;
+                if (cons_ins[i].freq[k] == 0)
+                    continue; // redundant?
+
+                // Merge str[j] and str[k]
+                for (l = 0; l < cons_ins[i].len[k]; l++) {
+                    uint8_t b = cons_ins[i].str[k][l];
+                    ins[l][b] += cons_ins[i].freq[k];
+                }
+                cons_ins[i].freq[j] += cons_ins[i].freq[k];
+                cons_ins[i].freq[k] = 0;
+            }
+
+            // Now replace ins[j] with the consensus insertion of this len.
+            for (l = 0; l < cons_ins[i].len[j]; l++) {
+                int max_v = 0, base = 0;
+                int tot = ins[l][0] + ins[l][1] + ins[l][2]
+                        + ins[l][3] + ins[l][4];
+                if (max_v < ins[l][0]) max_v = ins[l][0], base = 0;
+                if (max_v < ins[l][1]) max_v = ins[l][1], base = 1;
+                if (max_v < ins[l][2]) max_v = ins[l][2], base = 2;
+                if (max_v < ins[l][3]) max_v = ins[l][3], base = 3;
+                if (max_v < ins[l][4]) max_v = ins[l][4], base = 4;
+
+                cons_ins[i].str[j][l] = (max_v > 0.6*tot) ? base : 4;
+            }
+        }
+    }
+
+#define CONS_CUTOFF      .40 // % needed for base vs N
+#define CONS_CUTOFF_DEL  .35 // % to include any het del
+#define CONS_CUTOFF2     .80 // % needed for gap in cons[1]
+#define CONS_CUTOFF_INC  .35 // % to include any insertion cons[0]
+#define CONS_CUTOFF_INC2 .80 // % to include any insertion cons[1] HOM
+#define CONS_CUTOFF_INS  .60 // and then 60% needed for it to be bases vs N
+
+    //--------------------------------------------------
+    // Walk through the frequency arrays to call the consensus.
+    // We produce cons[0] and cons[1].  Both include strongly
+    // homozygous indels.  Both also include the indel at 'pos'.
+    // However for heterozygous indels we call the most likely event
+    // for cons[0] and the less-likely alternative in cons[1].
+    // TODO: a proper phase analysis so multiple events end up
+    // combining together into the correct consensus.
+    *left_shift = 0;
+    *right_shift = 0;
+    int cnum;
+
+    // Het call filled out in cnum==0 (+ve or -ve).
+    // Used in cnum==1 to do the opposite of whichever way we did before.
+    int heti[MAX_INS] = {0}, hetd[MAX_INS] = {0};
+
+    *cpos_pos = -1;
+    for (cnum = 0; cnum < 2; cnum++) {
+        for (i = k = 0; i < right-left; i++) {
+            // Location in consensus matching the indel itself
+            if (i >= pos-left+1 && *cpos_pos == -1)
+                *cpos_pos = k;
+
+            int max_v = 0, max_v2 = 0, max_j = 4, max_j2 = 4, tot = 0;
+            for (j = 0; j < 6; j++) {
+                // Top 2 consensus calls
+                if (max_v < cons_base[i][j]) {
+                    max_v2 = max_v, max_j2 = max_j;
+                    max_v = cons_base[i][j], max_j = j;
+                } else if (max_v2 < cons_base[i][j]) {
+                    max_v2 = cons_base[i][j], max_j2 = j;
+                }
+                tot += cons_base[i][j];
+            }
+
+            // +INS
+            int max_v_ins = 0, max_j_ins = 0;
+            int tot_ins = 0;
+            for (j = 0; j < NI; j++) {
+                if (i+left==pos+1)
+                if (type > 0 && i+left == pos+1
+                    && cons_ins[i].len[j] < type && j == 0) {
+                    cons_ins[i].str[j] = realloc(cons_ins[i].str[j], type);
+                    if (!cons_ins[i].str[j])
+                        goto err;
+                    memset(cons_ins[i].str[j] + cons_ins[i].len[j],
+                           4, type - cons_ins[i].len[j]);
+                    cons_ins[i].len[j] = type;
+                }
+                if (!cons_ins[i].str[j])
+                    break;
+                if (cons_ins[i].freq[j] == 0)
+                    continue; // previously merged
+
+                if (max_v_ins < cons_ins[i].freq[j])
+                    //if (i != pos-left+1 || cons_ins[i].len[j] == type)
+                    max_v_ins = cons_ins[i].freq[j], max_j_ins = j;
+                tot_ins += cons_ins[i].freq[j];
+            }
+
+            // NB: tot is based on next matching base, so it includes
+            // everything with or without the insertion.
+            int tot_sum = tot;
+            int always_ins =
+                (i == pos-left+1 && type>0) ||       // current eval
+                max_v_ins > CONS_CUTOFF_INC2*tot_sum;// HOM
+            int het_ins = 0;
+            if (!always_ins && max_v_ins >= bca->min_support) {
+                // Candidate HET ins.
+                if (cnum == 0) {
+                    het_ins = max_v_ins > CONS_CUTOFF_INC * tot_sum;
+                    if (i < MAX_INS) heti[i] = het_ins
+                                      ? 1
+                                      : (max_v_ins > .3*tot_sum ? -1:0);
+                } else {
+                    // HET but uncalled before
+                    het_ins = i < MAX_INS ? (heti[i] == -1) : 0;
+                }
+            }
+
+            if (always_ins || het_ins) {
+                if (max_v_ins > CONS_CUTOFF_INS*tot_ins) {
+                    // Insert bases
+                    for (j = 0; j < cons_ins[i].len[max_j_ins]; j++) {
+                        if (cnum == 0) {
+                            if (k < pos-left+*left_shift)
+                                (*left_shift)++;
+                            else
+                                (*right_shift)++;
+                        }
+                        cons[cnum][k++] = cons_ins[i].str[max_j_ins][j];
+                    }
+                } else {
+                    for (j = 0; j < cons_ins[i].len[max_j_ins]; j++)
+                        cons[cnum][k++] = 4; // 'N';
+                }
+            }
+
+            // Call deletions & bases
+            int always_del = (type < 0 && i > pos-left && i <= pos-left-type)
+                || cons_base[i][5] > CONS_CUTOFF2 * tot; // HOM del
+            int het_del = 0;
+            if (!always_del && cons_base[i][5] >= bca->min_support) {
+                // Candidate HET del.
+                if (cnum == 0) {
+                    int tot2 = tot;
+                    if (i > pos-left && i <= pos-left-biggest_del)
+                        tot2 = total_span_str - type_depth;
+                    het_del = cons_base[i][5] >= CONS_CUTOFF_DEL * tot2;
+
+                    if (i < MAX_INS) {
+                        if (i > pos-left && i <= pos-left-biggest_del)
+                            hetd[i] = 0;
+                        else
+                            hetd[i] = het_del
+                                ? 1
+                                : (cons_base[i][5] >= .3 * tot2 ? -1 : 0);
+                    }
+                } else {
+                    // HET del uncalled on cnum 0
+                    het_del = i < MAX_INS ? (hetd[i] == -1) : 0;
+                    if (max_j == 5 && het_del == 0) {
+                        max_v = max_v2;
+                        max_j = max_j2;
+                    }
+                }
+            }
+            if (always_del || het_del) {
+                // Deletion
+                if (k < pos-left+*left_shift)
+                    (*left_shift)--;
+                else
+                    (*right_shift)++;
+            } else {
+                // Finally the easy case - a non-indel base or an N
+                if (max_v > CONS_CUTOFF*tot)
+                    cons[cnum][k++] = max_j; // "ACGTN*"
+                else if (max_v > 0)
+                    cons[cnum][k++] = 4;     // 'N';
+                else {
+                    cons[cnum][k] = left+k < ref_len
+                        ? base6[(uint8_t)ref[left+k]]
+                        : 4;
+                    k++;
+                }
+            }
+        }
+
+        tcon_len[cnum] = k;
+    }
+
+    // TODO: replace by io_lib's string pool for rapid tidying.
+    // For now this isn't the bottleneck though.
+    for (i = 0; i < right-left; i++) {
+        for (j = 0; j < NI; j++) {
+            if (cons_ins[i].str[j])
+                free(cons_ins[i].str[j]);
+            if (ref_ins[i].str[j])
+                free(ref_ins[i].str[j]);
+        }
+    }
+
+ err:
+    free(cons_base);
+    free(ref_base);
+    free(cons_ins);
+    free(ref_ins);
+
+    return cons;
+}
+
+// A rename of bcf_cgp_calc_cons from bam2bcf_indel.c
+//
+// Compute the insertion consensus for this sample 's' via a basic
+// majority rule.
+//
+// TODO: merge this into bcf_cgp_consensus as another return value?
+static char *bcf_cgp_calc_ins_cons(int n, int *n_plp, bam_pileup1_t **plp,
+                                   int pos, int *types, int n_types,
+                                   int max_ins, int s) {
+    return bcf_cgp_calc_cons(n, n_plp, plp, pos, types, n_types, max_ins, s);
+}
+
+#define MAX(a,b) ((a)>(b)?(a):(b))
+#define MIN(a,b) ((a)<(b)?(a):(b))
+
+// Compile with LIBS="-L. -ldl -ledlib" CLD=g++
+
+// This is faster than ksw and BAQ, meaning we can use larger --indel-size and
+// get a more accurate context, improving alignments further.  This *may*
+// compensate for reduced sensitivity.
+#include "edlib.h"
+int edlib_glocal(uint8_t *ref, int l_ref, uint8_t *query, int l_query,
+                 double m, double del_bias)
+{
+    EdlibAlignConfig cfg = 
+        edlibNewAlignConfig(
+                            //ABS(type)+ABS(l_ref-l_query)+10,
+                            -1, // k; use small positive for faster alignment
+                            EDLIB_MODE_HW, // mode
+#ifdef ALIGN_DEBUG
+                            EDLIB_TASK_PATH,
+#else
+                            EDLIB_TASK_LOC,
+#endif
+                            NULL, // additionalEqualities
+                            0); // additionalEqualitiesLength
+    EdlibAlignResult r = 
+        edlibAlign((char *)query, l_query, (char *)ref, l_ref, cfg);
+
+    if (r.status != EDLIB_STATUS_OK || r.numLocations < 1 ||
+        !r.endLocations || !r.startLocations) {
+        edlibFreeAlignResult(r);
+        return INT_MAX;
+    }
+
+#ifdef ALIGN_DEBUG
+    // NB: Needs linking against the C++ libedlib.a as our cut-down C
+    // implementation misses the alignment generation code.
+    {
+        int i, j = 0, pt = r.startLocations[0], pq = 0;
+        char line1[80];
+        char line2[80];
+        char line3[80];
+        for (i = 0; i < r.alignmentLength && pt < r.endLocations[0]; i++) {
+            int n;
+            switch (n = r.alignment[i]) {
+            case 0: // match
+            case 3: // mismatch
+                line1[j] = "ACGTN"[ref[pt++]];
+                line2[j] = "ACGTN"[query[pq++]];
+                line3[j] = " x"[n==3];
+                break;
+            case 2: // insertion to ref
+                line1[j] = "ACGTN"[ref[pt++]];
+                line2[j] = '-';
+                line3[j] = '-';
+                break;
+            case 1: // insertion to query
+                line1[j] = '-';
+                line2[j] = "ACGTN"[query[pq++]];
+                line3[j] = '+';
+                break;
+            }
+
+            if (++j == sizeof(line1)) {
+                fprintf(stderr, "%.*s\n", j, line1);
+                fprintf(stderr, "%.*s\n", j, line2);
+                fprintf(stderr, "%.*s\n", j, line3);
+                j = 0;
+            }
+        }
+        if (j) {
+            fprintf(stderr, "%.*s\n", j, line1);
+            fprintf(stderr, "%.*s\n", j, line2);
+            fprintf(stderr, "%.*s\n", j, line3);
+        }
+    }
+#endif
+
+    // Aligned target length minus query length is an indication of the number
+    // of insertions and/or deletions.
+    // 
+    // For CIGAR 10M1I10M t_len > l_query ("AC"  / "ATC")
+    // For CIGAR 10M1D10M t_len < l_query ("ATC" / "AC")
+    // Hence t_len-l_query is -ve for net insertions and +ve for net deletions.
+    // If we compute nins and ndel directly via walking though EDLIB_TASK_PATH
+    // we'll see t_len-l_query == ndel-nins.
+    // 
+    // If a technology has a significantly higher chance of making deletion
+    // errors than insertion errors, then we would view deletions as less
+    // indicative of this sequence not coming from this candidate allele than
+    // if it had insertion (as the deletions are more likely to be errors
+    // rather than real, relative to the insertions).  Hence we can skew the
+    // score by the net delta of num_del - num_ins.
+    //
+    // Note this is an approximation that doesn't account for multiple
+    // insertions and deletions within the same sequence, but it is much faster
+    // as it doesn't require EDLIB_TASK_PATH to be computed.
+    //
+    // Given editDistance is +1 for every mismatch, insertion and deletion,
+    // provided the t_len-l_query multiplier < 1 then this is always +ve.
+
+    int t_len = *r.endLocations - *r.startLocations + 1;
+    int score = m*(r.editDistance - del_bias*(t_len - l_query));
+
+    edlibFreeAlignResult(r);
+    return score;
+}
+
+// Part of bcf_call_gap_prep.
+//
+// Realign using BAQ to get an alignment score of a single read vs
+// a haplotype consensus.  TODO: replace BAQ with something more robust.
+//
+// There are many coordinates, so let's explain them.
+// - left, right, tbeg, tend, r_start and r_end are in aligned reference
+//   coordinates.
+//   left/right start from pos +/- indel_win_size.
+//   r_start/r_end are the BAM first and last mapped coord on the reference.
+//   tbeg and tend are the intersection of the two.
+// - qbeg and qend are in BAM sequence coordinates
+// - qpos is in sequence coordinates, relative to qbeg.
+//
+// To see what this means, we have illustrations with coordinates
+// above the seqs in reference space and below the seqs in BAM seq space.
+//
+// Overlap left:
+//                     tbeg                        tend
+//      r_start        left                 pos    r_end          right
+// REF  :..............|--------------------#------:--------------|...
+// SEQ  :..............|--------------------#------|
+//      0              qbeg                 qpos   qend
+//
+// Overlap right:
+//                        r_start                     tend
+//         left           tbeg  pos                   right       r_end
+// REF  ...|--------------:-----#---------------------|...........:
+// SEQ                    |-----#---------------------|...........:
+//                        qbeg  qpos                  qend
+//                        0
+//
+// The "-" sequence is the bit passed in.
+// Ie ref2 spans left..right and query spans qbeg..qend.
+// We need to adjust ref2 therefore to tbeg..tend.
+//
+// Fills out score
+// Returns 0 on success,
+//        <0 on error
+static int bcf_cgp_align_score(bam_pileup1_t *p, bcf_callaux_t *bca,
+                               int type, int band,
+                               uint8_t *ref1, uint8_t *ref2, uint8_t *query,
+                               int r_start, int r_end,
+                               int tbeg, int tend1, int tend2,
+                               int left, int right,
+                               int qbeg, int qend,
+                               int pos, int qpos, int max_deletion,
+                               double qavg, double del_bias, int *score,
+                               int *str_len1_p, int *str_len2_p) {
+    int atype = abs(type);
+    int l, sc1, sc2;
+
+    // Trim poly_Ns at ends of ref.
+    // This helps to keep len(ref) and len(query) similar, to reduce
+    // band size and reduce the chance of -ve BAQ scores.
+    for (l = 0; l < tend1-tbeg && l < tend2-tbeg; l++)
+        if (ref1[l + tbeg-left] != 4 || ref2[l + tbeg-left] != 4)
+            break;
+    if (l > atype)
+        tbeg += l-atype;
+
+    for (l = tend1-tbeg-1; l >= 0; l--)
+        if (ref1[l + tbeg-left] != 4)
+            break;
+    l = tend1-tbeg-1 - l;
+    if (l > atype)
+        tend1 -= l-atype;
+
+    for (l = tend2-tbeg-1; l >= 0; l--)
+        if (ref2[l + tbeg-left] != 4)
+            break;
+    l = tend2-tbeg-1 - l;
+    if (l > atype) {
+        tend2 -= l-atype;
+    }
+
+    // The bottom 8 bits are length-normalised score while
+    // the top bits are unnormalised.
+    //
+    // Try original cons and new cons and pick best.
+    // This doesn't reduce FN much (infact maybe adds very slightly),
+    // but it does reduce GT errors and is a slight reduction to FP.
+
+    double mm = 30; // a const average qual for now. Could tune
+    sc2 = edlib_glocal(ref2 + tbeg - left, tend2 - tbeg,
+                       query, qend - qbeg, mm, del_bias);
+
+    if (tend1 != tend2 ||
+        memcmp((char *)ref1 + tbeg - left, (char *)ref2 + tbeg - left,
+               tend1 - tbeg) != 0)
+        sc1 = edlib_glocal(ref1 + tbeg - left, tend1 - tbeg,
+                           query, qend - qbeg, mm, del_bias);
+    else
+        sc1 = INT_MAX; // skip
+
+    // Find the best of the two alignments
+    if (sc1 < 0 && sc2 < 0) {
+        *score = 0xffffff;
+        return 0;
+    }
+    if (sc1 < 0) {
+        // sc2 is already correct
+    } else if (sc2 < 0) {
+        sc2 = sc1;
+    } else {
+        // sc1 and sc2 both pass, so use best
+        if (sc2 > sc1)
+            sc2 = sc1;
+    }
+
+    // Sc is overall alignment score, in top 24 bits (SeqQ). It's based
+    // purely on the scores for the whole alignment.
+    // We also have a separate indel score in bottom 8 bits (IndelQ).
+    // This is a function of all sorts of attributes of the candidate indel
+    // itself, such as STR length and the presence of poor quality bases.
+
+    // Used for adjusting indelQ below.  Lower l is more likely to call
+    // (--FN, ++FP).  (NB CLI --indel_bias is 1/indel_bias var).
+    // Starts as average score per base, and then adjusted based on seq
+    // complexity / quality.
+
+    l = .5*(100. * sc2 / (qend - qbeg) + .499);
+
+    *score = (sc2<<8) | (int)MIN(255, l * bca->indel_bias * .5);
+
+    return 0;
+}
+
+// Part of bcf_call_gap_prep.
+//
+// Returns n_alt on success
+//         -1 on failure
+
+// TODO: almost identical to bam2bcf_indel.c's copy, so we could share
+// the code and add a check on bca->edlib.
+static int bcf_cgp_compute_indelQ(int n, int *n_plp, bam_pileup1_t **plp,
+                                  bcf_callaux_t *bca, char *inscns,
+                                  int l_run, int max_ins,
+                                  int ref_type, int *types, int n_types,
+                                  double qavg, int *score,
+                                  int str_len1, int str_len2) {
+    // FIXME: n_types has a maximum; no need to alloc - use a #define?
+    int sc[MAX_TYPES], sumq[MAX_TYPES], s, i, j, t, K, n_alt, tmp;
+    memset(sumq, 0, n_types * sizeof(int));
+    int sum_indelQ1[100] = {0}; // n
+    int sum_indelQ2[100] = {0}; // n
+
+    // Confusing variable naming and bit usage.
+    //
+    // score[] is low 8  bits normalised (by len) alignment score
+    //            top 24 bits full alignment score
+    // This gets cast into "sct"; mnemonic score-per-indel-type.
+    //
+    // sc = (score<<6) | type  (index to types[] array for indel size)
+    // So sc>>14 = score>>(14-6) = score>>8.  Ie full alignment score
+    for (s = K = 0; s < n; ++s) {
+        for (i = 0; i < n_plp[s]; ++i, ++K) {
+            bam_pileup1_t *p = plp[s] + i;
+            // Labelling is confusing here.
+            //    sct is short for score.
+            //    sc is score + t(type)
+            // Why aren't these variable names reversed?
+            int *sct = &score[K*n_types], seqQ, indelQ1=0, indelQ2=0, indelQ=0;
+            for (t = 0; t < n_types; ++t) sc[t] = sct[t]<<6 | t;
+            for (t = 1; t < n_types; ++t) // insertion sort
+                for (j = t; j > 0 && sc[j] < sc[j-1]; --j)
+                    tmp = sc[j], sc[j] = sc[j-1], sc[j-1] = tmp;
+
+#ifdef ALIGN_DEBUG
+            fprintf(stderr, "READ %s\tscores ", bam_get_qname(p->b));
+            for (t = 0; t < n_types; ++t) {
+                fprintf(stderr, "%+2d/%-3d ", types[sc[t]&0x3f], sc[t]>>14);
+            }
+#endif
+
+            /* errmod_cal() assumes that if the call is wrong, the
+             * likelihoods of other events are equal. This is about
+             * right for substitutions, but is not desired for
+             * indels. To reuse errmod_cal(), I have to make
+             * compromise for multi-allelic indels.
+             */
+            if ((sc[0]&0x3f) == ref_type) {
+                // sc >> 14 is the total score.  It's been shifted by 8
+                // from normalised score and 6 from type.
+                // &0x3f is type number
+
+                // Best call is REF.  Compare vs best indel
+                indelQ = (sc[1]>>14) - (sc[0]>>14);
+                seqQ = est_seqQ(bca, types[sc[1]&0x3f], l_run, str_len1);
+            } else {
+                // look for the reference type
+                for (t = 0; t < n_types; ++t) {
+                    if ((sc[t]&0x3f) == ref_type)
+                        break;
+                }
+                indelQ = indelQ1 = (sc[t]>>14) - (sc[0]>>14);
+//                fprintf(stderr, "IndelQ = %d: %d-%d",
+//                        indelQ, (sc[t]>>14), (sc[0]>>14));
+
+                // Best call is non-ref, compare vs next best non-ref,
+                // or ref if it's just 2 choices (most common case).
+                for (t = 1; t < n_types; t++)
+                    if ((sc[t]&0x3f) == ref_type)
+                        continue;
+                    else break;
+                if (t == n_types)
+                    t--; // it's ref, but it'll do as next best.
+                indelQ2 = (sc[t]>>14) - (sc[0]>>14);
+                seqQ = est_seqQ(bca, types[sc[0]&0x3f], l_run, str_len1);
+
+#if 1 // TEST 3
+                indelQ = bca->vs_ref*indelQ1 + (1-bca->vs_ref)*indelQ2;
+#endif
+            }
+
+            // So we lower qual in some, but raise the average to keep FN/FP
+            // ratios up.
+            // Is this key diff for PacBio old vs new HiFi?
+            indelQ  /= bca->indel_bias*0.5;
+            indelQ1 /= bca->indel_bias*0.5;
+
+            // Or maybe just *2 if bca->poly_mqual and be done with it?
+            // Or perhaps adjust the MIN(qavg/20, ...) to MIN(qavg/10) ?
+
+            // Skew SeqQ and IndelQ based on a portion of the minimum quality
+            // found within a homopolymer.  This is useful where the quality
+            // values are a bit mutable and move around in such data, but less
+            // so on clocked sequencing technologies.
+            //
+            // Enabling this causes lots of GT errors on Illumina.
+            // However on PacBio it's key to removal of false positives.
+            // ONT and UG seem somewhere inbetween.
+            if (bca->poly_mqual) { // TEST 4
+                int qpos = p->qpos, l;
+                uint8_t *seq = bam_get_seq(p->b);
+                uint8_t *qual = bam_get_qual(p->b);
+                int min_q = qual[qpos];
+
+                // scan homopolymer left
+                char baseL = bam_seqi(seq, qpos+1 < p->b->core.l_qseq
+                                      ? qpos+1 : qpos);
+                for (l = qpos; l >= 0; l--) {
+                    if (bam_seqi(seq, l) != baseL)
+                        break;
+                    if (min_q > qual[l])
+                        min_q = qual[l];
+                }
+
+                // scan homo-polymer right (including site of indel)
+                char base = bam_seqi(seq, qpos+1);
+                for (l = qpos+1; l < p->b->core.l_qseq; l++) {
+                    if (min_q > qual[l])
+                        min_q = qual[l];
+                    if (bam_seqi(seq, l) != base)
+                        break;
+                }
+
+                // We reduce -h so homopolymers get reduced likelihood of being
+                // called, but then optionally increase or decrease from there
+                // based on base quality.  Hence lack of low quality bases in
+                // homopolymer will rescue the score back again, reducing FNs.
+
+                // The score factors here may also be machine specific, but for
+                // now these work well (tuned on PB HiFi).
+                seqQ   += MIN(qavg/20,  min_q - qavg/10);
+                indelQ += MIN(qavg/20,  min_q - qavg/5);
+                indelQ1+= MIN(qavg/20,  min_q - qavg/5);
+
+                if (seqQ   < 0) seqQ   = 0;
+                if (indelQ < 0) indelQ = 0;
+                if (indelQ1< 0) indelQ1= 0;
+            }
+
+            // This is the length-normalised score from bcf_cgp_align_score
+            tmp = sc[0]>>6 & 0xff;
+
+            // reduce indelQ
+            // high score = bad, low score = good; flip for indelQ
+            // low normalised scores leave indelQ unmodified
+            // high normalised scores set indelQ to 0
+            // inbetween scores have a linear scale from indelQ to 0
+// Altering the MAGIC value below (originally 111, but chosen for unknown
+// reasons) is comparable to altering --indel-bias.
+#define TMP_MAGIC 255.0
+
+            indelQ = tmp > TMP_MAGIC? 0 : (int)((1. - tmp/TMP_MAGIC) * indelQ + .499);
+            indelQ1= tmp > TMP_MAGIC? 0 : (int)((1. - tmp/TMP_MAGIC) * indelQ1+ .499);
+
+            indelQ  = MIN(indelQ,  255);
+            indelQ1 = MIN(indelQ1, 255);
+
+            // Doesn't really help accuracy, but permits -h to take
+            // affect still.
+            if (indelQ > seqQ) indelQ = seqQ;
+            if (indelQ > 255) indelQ = 255;
+            if (indelQ1> 255) indelQ1= 255;
+            if (seqQ > 255) seqQ = 255;
+
+            // Use 22 bits in total.
+            // 0-7   IndelQ
+            // 8-15  SeqQ
+            // 16-22 Score-per-base
+            p->aux = (sc[0]&0x3f)<<16 | seqQ<<8 | indelQ;
+            sumq[sc[0]&0x3f] += indelQ;
+
+#ifdef ALIGN_DEBUG
+            fprintf(stderr, "\t%d\t%d\n", indelQ, seqQ);
+#endif
+
+            // Experiment in p->aux vs sumq.
+            // One gives likelihood of an indel being here, while the other
+            // is likelihood of a specific genotype?  But which is which?
+
+            sum_indelQ1[s] += indelQ1;
+            sum_indelQ2[s] += indelQ;
+        }
+    }
+
+    // Determine bca->indel_types[] and bca->inscns.
+    // Sumq[0] is always reference.
+    // Sumq[1] is best non-ref (and maybe better than ref)
+    bca->maxins = max_ins;
+    bca->inscns = (char*) realloc(bca->inscns, bca->maxins * 4);
+    if (bca->maxins && !bca->inscns)
+        return -1;
+    for (t = 0; t < n_types; ++t)
+        sumq[t] = sumq[t]<<6 | t;
+    for (t = 1; t < n_types; ++t) // insertion sort
+        for (j = t; j > 0 && sumq[j] > sumq[j-1]; --j)
+            tmp = sumq[j], sumq[j] = sumq[j-1], sumq[j-1] = tmp;
+    for (t = 0; t < n_types; ++t) // look for the reference type
+        if ((sumq[t]&0x3f) == ref_type) break;
+
+    if (t) { // then move the reference type to the first
+        tmp = sumq[t];
+        for (; t > 0; --t) sumq[t] = sumq[t-1];
+        sumq[0] = tmp;
+    }
+
+    for (t = 0; t < 4; ++t) bca->indel_types[t] = B2B_INDEL_NULL;
+    for (t = 0; t < 4 && t < n_types; ++t) {
+        bca->indel_types[t] = types[sumq[t]&0x3f];
+#ifdef ALIGN_DEBUG
+        fprintf(stderr, "TYPE %+2d %d\n", types[t], sumq[t]>>6);
+#endif
+        if (bca->maxins) // potentially an insertion
+            memcpy(&bca->inscns[t * bca->maxins],
+                   &inscns[(sumq[t]&0x3f) * max_ins], bca->maxins);
+    }
+
+    // Update p->aux.
+    // If per-alignment type isn't found, then indelQ/seqQ is 0,
+    // otherwise unchanged.
+    for (s = n_alt = 0; s < n; ++s) {
+        for (i = 0; i < n_plp[s]; ++i) {
+            bam_pileup1_t *p = plp[s] + i;
+            int x = types[p->aux>>16&0x3f];
+            for (j = 0; j < 4; ++j)
+                if (x == bca->indel_types[j]) break;
+            p->aux = j<<16 | (j == 4? 0 : (p->aux&0xffff));
+            if ((p->aux>>16&0x3f) > 0) ++n_alt;
+#ifdef ALIGN_DEBUG
+            fprintf(stderr, "FIN %s\t%d\t%d\t%d\n",
+                    bam_get_qname(p->b), (p->aux>>16)&0x3f,
+                    bca->indel_types[(p->aux>>16)&0x3f], p->aux&0xff);
+#endif
+        }
+    }
+
+    return n_alt;
+}
+
+/*
+FIXME: with high number of samples, do we handle IMF correctly?  Is it
+fraction of indels across entire data set, or just fraction for this
+specific sample? Needs to check bca->per_sample_flt (--per-sample-mF) opt.
+ */
+
+/*
+    notes:
+    - n .. number of samples
+    - the routine sets bam_pileup1_t.aux of each read as follows:
+        - 6: unused
+        - 6: the call; index to bcf_callaux_t.indel_types   .. (aux>>16)&0x3f
+        - 8: estimated sequence quality                     .. (aux>>8)&0xff
+        - 8: indel quality                                  .. aux&0xff
+ */
+int bcf_edlib_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos,
+		       bcf_callaux_t *bca, const char *ref, int ref_len)
+{
+    if (ref == 0 || bca == 0) return -1;
+
+    int i, s, t, n_types, *types = NULL, max_rd_len, left, right, max_ins;
+    int *score = NULL;
+    int N, K, l_run, ref_type, n_alt = -1;
+    char *inscns = NULL, *query = NULL;
+
+    // determine if there is a gap
+    for (s = N = 0; s < n; ++s) {
+        for (i = 0; i < n_plp[s]; ++i)
+            if (plp[s][i].indel != 0) break;
+        if (i < n_plp[s]) break;
+    }
+    if (s == n)
+        // there is no indel at this position.
+        return -1;
+
+    // Find average base quality over this region
+    double qavg = 30, qsum = 0, qcount = 0;
+    int qmax = 0;
+    for (s = 0; s < n; s++) {
+        for (i = 0; i < n_plp[s]; i++) {
+#define QWIN 50
+            bam_pileup1_t *p = plp[s] + i;
+            int kstart = p->qpos - QWIN > 0 ? p->qpos - QWIN : 0;
+            int kend = p->qpos + QWIN < p->b->core.l_qseq
+                ? p->qpos + QWIN : p->b->core.l_qseq;
+            uint8_t *qual = bam_get_qual(p->b);
+            int k;
+            for (k = kstart; k < kend; k++) {
+                qsum += qual[k];
+                qcount++;
+                if (qmax < qual[k])
+                    qmax = qual[k];
+            }
+        }
+    }
+    qavg = (qsum+1) / (qcount+1);
+
+    // find out how many types of indels are present
+    types = bcf_cgp_find_types(n, n_plp, plp, pos, bca, ref,
+                               &max_rd_len, &n_types, &ref_type, &N);
+    if (!types)
+        goto err;
+
+
+    // calculate left and right boundary, based on type size for a bit more
+    // speed.
+    int max_indel = 20*MAX(ABS(types[0]), ABS(types[n_types-1]))
+                  + bca->indel_win_size/4;
+    if (max_indel > bca->indel_win_size)
+        max_indel = bca->indel_win_size;
+    left = pos > max_indel ? pos - max_indel : 0;
+    right = pos + max_indel;
+
+    int del_size = types[0]<0 ? -types[0] : 0;
+    right += del_size;
+
+    // in case the alignments stand out the reference
+    for (i = pos; i < right; ++i)
+        if (ref[i] == 0) break;
+    right = i;
+
+    // compute the likelihood given each type of indel for each read
+    max_ins = types[n_types - 1];   // max_ins is at least 0
+
+    // The length of the homopolymer run around the current position
+    l_run = bcf_cgp_l_run(ref, pos);
+    int l_run_base = seq_nt16_table[(uint8_t)ref[pos+1]];
+    int l_run_ins = 0;
+
+    // construct the consensus sequence (minus indels, which are added later)
+    if (max_ins > 0) {
+        // TODO: replace filling inscns[] with calc_consensus return
+        // so the merges of the insertion consensus for type[t] is
+        // reported directly.  (It may need adjustment to avoid N)
+        inscns = bcf_cgp_calc_ins_cons(n, n_plp, plp, pos,
+                                       types, n_types, max_ins, s);
+        if (!inscns)
+            return -1;
+    }
+
+    query = (char*) calloc(right - left + max_rd_len + max_ins + 2, 1);
+    score = (int*) calloc(N * n_types, sizeof(int));
+    bca->indelreg = 0;
+    double nqual_over_60 = bca->nqual / 60.0;
+
+    int biggest_del = 0;
+    int biggest_ins = 0;
+    for (t = 0; t < n_types; t++) {
+        if (biggest_del > types[t])
+            biggest_del = types[t];
+        if (biggest_ins < types[t])
+            biggest_ins = types[t];
+    }
+    int band = biggest_ins - biggest_del; // NB del is -ve
+
+    // Find left & right extents of STR covering pos, from ref
+    int pos_l = pos, pos_r = pos;
+    {
+        rep_ele *reps, *elt, *tmp;
+        int pstart = MAX(0, pos-30);
+        int pmid = pos-pstart;
+        int pend = MIN(ref_len, pos+30);
+        reps = find_STR((char *)&ref[pstart], pend-pstart, 0);
+        DL_FOREACH_SAFE(reps, elt, tmp) {
+            if (elt->end >= pmid && elt->start <= pmid) {
+                if (pos_l > pstart + elt->start)
+                    pos_l = pstart + elt->start;
+                if (pos_r < pstart + elt->end)
+                    pos_r = pstart + elt->end;
+            }
+            DL_DELETE(reps, elt);
+            free(elt);
+        }
+    }
+
+    int str_len1 = l_run, str_len2 = l_run/4;
+    for (t = 0; t < n_types; ++t) {
+        int l, ir;
+
+        // Compute indelreg.  This is the context in the reference.  Eg:
+        //
+        // REF:  AG--TTTC  Inscns   is "TT".
+        // SEQ:  AGTTTTTC  Indelreg is 3; next 3 "TTT" bases
+        //
+        // => GTTT GTTTTT is call.
+        if (types[t] == 0)
+            ir = 0;
+        else if (types[t] > 0)
+            ir = est_indelreg(pos, ref, types[t], &inscns[t*max_ins]);
+        else
+            ir = est_indelreg(pos, ref, -types[t], 0);
+
+        if (ir > bca->indelreg)
+            bca->indelreg = ir;
+
+        // Realignment score, computed via BAQ
+        for (s = K = 0; s < n; ++s) {
+            char **tcons;
+            int left_shift, right_shift;
+            int tcon_len[2];
+            int cpos_pos;
+            tcons = bcf_cgp_consensus(n, n_plp, plp, pos, bca, ref, ref_len,
+                                      left, right, s, types[t], biggest_del,
+                                      &left_shift, &right_shift, &band,
+                                      tcon_len, &cpos_pos, pos_l, pos_r);
+            // TODO: Consensus for a deletion shouldn't match the
+            // consensus for type 0.  Eg consider
+            //         vv                          vv
+            // REF:  AATGTGTGAACAA        REF:   AATGTG--AACAA
+            // T0:   AATGTG--AACAA        T0:    AATGTG--AACAA
+            // T-2:  AA--TGTGAATAA        T-2:   AA--TGTGAATAA:
+            //
+            // On left: both T0 and T-2 are the same length, as it's
+            // just a deletion that moved.  We may end up assigning
+            // reads to an indel allele based on the SNP they have and
+            // not the actual indel.
+            // There *is* a deletion here though, but only 1.  How do
+            // we call it once only?  Need to replace entire region
+            // with a reassembly.
+            //
+            // On right: T0 and T-2 have same length again, but there
+            // isn't an indel as it's ins+del vs del+ins. They're
+            // also the same length as the REF for this region.
+            // Hence likelihood of this variant existing is tied in
+            // with their equal and high similarity with/to the ref.
+            //
+            // We could do an alignment of tcons[0] and tcons[1] and check
+            // whether their differences are consistent with (ie the
+            // hamming distance is at least ABS(types[t]/2).  I don't think
+            // it'll rescue many FPs though.
+
+#ifdef CONS_DEBUG
+            {
+                int j;
+                for (j = 0; j < 2; j++) {
+                    int k;
+                    fprintf(stderr, "Cons%d @ %d %4d/%4d ",
+                            j, pos, types[t], left_shift);
+                    for (k = 0; k < tcon_len[j]; k++) {
+                        if (k == cpos_pos)
+                            putc('#', stderr);
+                        putc("ACGTN"[(uint8_t)tcons[j][k]], stderr);
+                    }
+                    putc('\n', stderr);
+                }
+            }
+#endif
+
+            // Scan for base-runs in the insertion.
+            // We use this to avoid over-correction in est_seqQ when the
+            // insertion is not part of the neighbouring homopolymer.
+            int k = tcons[0][cpos_pos], j;
+            for (j = 0; j < types[t]; j++)
+                if (tcons[0][cpos_pos+j] != k)
+                    break;
+            if (j && j == types[t])
+                l_run_ins |= "\x1\x2\x4\x8\xf"[k]; // ACGTN
+            if (types[t] < 0)
+                l_run_ins |= 0xff;
+
+            // align each read to consensus(es)
+            for (i = 0; i < n_plp[s]; ++i, ++K) {
+                bam_pileup1_t *p = plp[s] + i;
+
+                // Some basic ref vs alt stats.
+                int imq = p->b->core.qual > 59 ? 59 : p->b->core.qual;
+                imq *= nqual_over_60;
+
+                int sc_len, slen, epos, sc_end;
+
+                // Only need to gather stats on one type, as it's
+                // identical calculation for all the subsequent ones
+                // and we're sharing the same stats array
+                if (t == 0) {
+                    // Gather stats for INFO field to aid filtering.
+                    // mq and sc_len not very helpful for filtering, but could
+                    // help in assigning a better QUAL value.
+                    //
+                    // Pos is slightly useful.
+                    // Base qual can be useful, but need qual prior to BAQ?
+                    // May need to cache orig quals in aux tag so we can fetch
+                    // them even after mpileup step.
+                    get_pos(bca, p, &sc_len, &slen, &epos, &sc_end);
+
+                    assert(imq >= 0 && imq < bca->nqual);
+                    assert(epos >= 0 && epos < bca->npos);
+                    assert(sc_len >= 0 && sc_len < 100);
+                    if (p->indel) {
+                        bca->ialt_mq[imq]++;
+                        bca->ialt_scl[sc_len]++;
+                        bca->ialt_pos[epos]++;
+                    } else {
+                        bca->iref_mq[imq]++;
+                        bca->iref_scl[sc_len]++;
+                        bca->iref_pos[epos]++;
+                    }
+                }
+
+                int qbeg, qpos, qend, tbeg, tend, kk;
+                uint8_t *seq = bam_get_seq(p->b);
+                uint32_t *cigar = bam_get_cigar(p->b);
+                if (p->b->core.flag & BAM_FUNMAP) continue;
+
+                // FIXME: the following loop should be better moved outside;
+                // nonetheless, realignment should be much slower anyway.
+                for (kk = 0; kk < p->b->core.n_cigar; ++kk)
+                    if ((cigar[kk]&BAM_CIGAR_MASK) == BAM_CREF_SKIP)
+                        break;
+                if (kk < p->b->core.n_cigar)
+                    continue;
+
+                // determine the start and end of sequences for alignment
+                int left2 = left, right2 = right;
+                int min_win_size = MAX(-biggest_del, biggest_ins);
+                min_win_size += ABS(left_shift) + ABS(right_shift);
+                {
+                    rep_ele *reps, *elt, *tmp;
+                    reps = find_STR(tcons[0], tcon_len[0], 0);
+                    //int max_str = 0;
+                    int tot_str = 0;
+                    DL_FOREACH_SAFE(reps, elt, tmp) {
+                        // if (max_str < elt->end - elt->start)
+                        //     max_str = elt->end - elt->start;
+                        tot_str += elt->end - elt->start;
+                        DL_DELETE(reps, elt);
+                        free(elt);
+                    }
+
+                    // Ideally max_str should be enough, but it's still not
+                    // sufficient in longer range some repeats.
+                    //min_win_size += max_str;
+                    min_win_size += tot_str;
+                }
+                min_win_size += 10;
+
+// TEST 8
+                if (p->b->core.l_qseq > 1000) {
+                    // long read data needs less context.  It also tends to
+                    // have many more candidate indels to investigate so
+                    // speed here matters more.
+                    if (pos - left >= min_win_size)
+                        left2 = MAX(left2, pos - min_win_size);
+                    if (right-pos >= min_win_size)
+                        right2 = MIN(right2, pos + min_win_size);
+                }
+
+                // Genomic coords for first and last base of query
+                // alignment.  This is only used in bcf_cgp_align_score
+                // for computing scores by looking for the proximity
+                // of STRs with the end of the query alignment.
+                int r_start = p->b->core.pos;
+                int r_end = bam_cigar2rlen(p->b->core.n_cigar,
+                                           bam_get_cigar(p->b));
+                r_end += -1 + r_start;
+
+
+                // Map left2/right2 genomic coordinates to qbeg/qend
+                // query coordinates.  The query may not span the
+                // entire left/right region, so this also returns the
+                // equivalent genomic coords for qbeg/qend in tbeg/tend.
+                qbeg = tpos2qpos(&p->b->core, bam_get_cigar(p->b),
+                                 left2, 0, &tbeg);
+                qpos = tpos2qpos(&p->b->core, bam_get_cigar(p->b), pos,
+                                     0, &tend) - qbeg;
+                qend = tpos2qpos(&p->b->core, bam_get_cigar(p->b),
+                                 right2, 1, &tend);
+
+                int old_tend = tend;
+                int old_tbeg = tbeg;
+
+                // write the query sequence
+                for (l = qbeg; l < qend; ++l)
+                    query[l - qbeg] = seq_nt16_int[bam_seqi(seq, l)];
+
+                // tbeg and tend are the genomic locations equivalent
+                // to qbeg and qend on the sequence.
+                // These may being entirely within our left/right
+                // coordinates over which we've computed the
+                // consensus, or overlapping to left/right.
+                //
+                // We know an estimation of band, plus biggest indel,
+                // so we can trim tbeg/tend to a smaller region if we
+                // wish here.  This speeds up BAQ scoring.
+                int wband = band + MAX(-biggest_del, biggest_ins)*2 + 20;
+                int tend1 = left + tcon_len[0] - (left2-left);
+                int tend2 = left + tcon_len[1] - (left2-left);
+                tend1 = MIN(tend1, old_tend + wband);
+                tend2 = MIN(tend2, old_tend + wband);
+                tbeg = MAX(left2, old_tbeg - wband);
+
+                // do realignment; this is the bottleneck.
+                //
+                // Note low score = good, high score = bad.
+                if (tend1 > tbeg && tend2 > tbeg) {
+                    //fprintf(stderr, "Num %d\n", i);
+                    if (bcf_cgp_align_score(p, bca, types[t], band,
+                                            (uint8_t *)tcons[0] + left2-left,
+                                            (uint8_t *)tcons[1] + left2-left,
+                                            (uint8_t *)query,
+                                            r_start, r_end,
+                                            tbeg, tend1, tend2,
+                                            left2, left + tcon_len[0],
+                                            qbeg, qend, pos,qpos, -biggest_del,
+                                            qavg, bca->del_bias,
+                                            &score[K*n_types + t],
+                                            &str_len1, &str_len2) < 0) {
+                        goto err;
+                    }
+#ifdef ALIGN_DEBUG
+                    fprintf(stderr, "type %d %x / %x\t%s\n",
+                            types[t],
+                            score[K*n_types + t] >> 8,
+                            score[K*n_types + t] & 0xff,
+                            bam_get_qname(p->b));
+#endif
+                } else {
+                    // place holder large cost for reads that cover the
+                    // region entirely within a deletion (thus tend < tbeg).
+                    score[K*n_types + t] = 0xffffff;
+                }
+            }
+            free(tcons);
+        }
+    }
+
+    // compute indelQ
+    if (!(l_run_base & l_run_ins))
+        l_run = 1; // different base type in ins to flanking region.
+    n_alt = bcf_cgp_compute_indelQ(n, n_plp, plp, bca, inscns, l_run, max_ins,
+                                   ref_type, types, n_types, qavg, score,
+                                   str_len1, str_len2);
+
+ err:
+    // free
+    free(query);
+    free(score);
+    free(types);
+    free(inscns);
+
+    return n_alt > 0? 0 : -1;
+}
diff --git a/bcftools/bam2bcf_edlib.c.pysam.c b/bcftools/bam2bcf_edlib.c.pysam.c
new file mode 100644
index 000000000..fa009ff9d
--- /dev/null
+++ b/bcftools/bam2bcf_edlib.c.pysam.c
@@ -0,0 +1,1706 @@
+#include "bcftools.pysam.h"
+
+/*  bam2bcf_indel.c -- indel caller.
+
+    Copyright (C) 2010, 2011 Broad Institute.
+    Copyright (C) 2012-2014,2016-2017, 2021-2024 Genome Research Ltd.
+
+    Author: Heng Li <lh3@sanger.ac.uk>
+            Petr Danecek <pd3@sanger.ac.uk>
+	    James Bonfield <jkb@sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.  */
+
+// Show consensus
+//#define CONS_DEBUG
+
+// Show alignments to consensus
+//#define ALIGN_DEBUG
+
+#include <assert.h>
+#include <ctype.h>
+#include <string.h>
+#include <math.h>
+#include <htslib/hts.h>
+#include <htslib/sam.h>
+#include <htslib/khash_str2int.h>
+#include "bam2bcf.h"
+#include "str_finder.h"
+
+#include <htslib/ksort.h>
+// Is there no way to share these between the 3 implementations?
+KSORT_INIT_STATIC_GENERIC(uint32_t)
+
+#define MINUS_CONST 0x10000000
+
+#define MAX_TYPES 64
+
+#ifndef MIN
+#  define MIN(a,b) ((a)<(b)?(a):(b))
+#endif
+
+#ifndef ABS
+#  define ABS(a) ((a)<0?-(a):(a))
+#endif
+
+#ifndef MAX
+#  define MAX(a,b) ((a)>(b)?(a):(b))
+#endif
+
+// l is the relative gap length and l_run is the length of the homopolymer
+// on the reference.
+//
+// Larger seqQ is good, so increasing tandemQ calls more indels,
+// and longer l_run means fewer calls.  It is capped later at 255.
+// For short l_runs, the qual is simply based on size of indel
+// larger ones being considered more likely to be real.
+// Longer indels get assigned a score based on the relative indel size
+// to homopolymer, where l_run base will have already been verified by
+// the caller to ensure it's compatible.
+static inline int est_seqQ(const bcf_callaux_t *bca, int l, int l_run, int str_len)
+{
+    int q, qh;
+    // Short indels are more likely sequencing error than large ones.
+    // So "seqQ" scales with size of observation "l".
+    //
+    // Note openQ and extQ are error likelihoods in Phred scale.  Hence high
+    // openQ means we're very unlikely to miscall an indel.
+    // Ie it's not the open/ext "costs" normally used in alignment; more the reverse.
+    //
+    // We use MIN(q,qh) below, so we can remove the q component by specifying
+    // a large -o parameter in mpileup.
+    q = bca->openQ + bca->extQ * (abs(l) - 1);
+
+    // Orig method; best with Illumina (high openQ)
+//    qh = bca->tandemQ * (double)abs(l) / l_run + .499;
+
+    // Penalise longer homopolymers quadratically more, but boost shorter ones.
+    // Best with CCS (low openQ)
+    //qh = 2 * bca->tandemQ * pow((double)abs(l) / l_run, 1.5) + .499;
+
+    // (l/l_run)^1.26 for openQ=25 or ^1 for openQ=40.
+//    double openQ = MIN(40, bca->openQ);
+//    qh = (30/openQ) * bca->tandemQ
+//        * pow((double)abs(l) / l_run, 1/sqrt(openQ/40)) + .499;
+
+    // Linear scaled on openQ too
+    qh = bca->tandemQ * (double)abs(l) / l_run + .499;
+
+    // Generic maybe ?
+    // power = 1/sqrt(MIN(40,bca->openQ)/40.);
+    // qh = ... * pow((double)abs(l)/l_run, power)
+
+    // bam2bcf.c caps has "if q>seqQ) q=seqQ" so it caps base qual 'q'.
+    // A 1bp indel would therefore have a maximum qual it could be considered based
+    // on open+ext.  Hence why openQ is phred score indicating if the base is real
+    // or an over/under-call. (high openQ means high trust in base)
+    return q < qh? q : qh;
+}
+
+// Part of bcf_call_gap_prep.
+//
+// Scans the pileup to identify all the different sizes of indels
+// present.
+// types[] returned is sorted by size, from smallest (maybe negative) to largest.
+//
+// Returns types and fills out n_types_r,  max_rd_len_r and ref_type_r,
+//         or NULL on error.
+static int *bcf_cgp_find_types(int n, int *n_plp, bam_pileup1_t **plp,
+                               int pos, bcf_callaux_t *bca, const char *ref,
+                               int *max_rd_len_r, int *n_types_r,
+                               int *ref_type_r, int *N_r) {
+    int i, j, t, s, N, m, max_rd_len, n_types;
+    int n_alt = 0, n_tot = 0, indel_support_ok = 0;
+    uint32_t *aux;
+    int *types;
+
+    // N is the total number of reads
+    for (s = N = 0; s < n; ++s)
+        N += n_plp[s];
+
+    bca->max_support = bca->max_frac = 0;
+    aux = (uint32_t*) calloc(N + 1, 4);
+    if (!aux)
+        return NULL;
+
+    m = max_rd_len = 0;
+    aux[m++] = MINUS_CONST; // zero indel is always a type (REF)
+
+    // Fill out aux[] array with all the non-zero indel sizes.
+    // Also tally number with indels (n_alt) and total (n_tot).
+    for (s = 0; s < n; ++s) {
+        int na = 0, nt = 0;
+        for (i = 0; i < n_plp[s]; ++i) {
+            const bam_pileup1_t *p = plp[s] + i;
+            ++nt;
+            if (p->indel != 0) {
+                ++na;
+                aux[m++] = MINUS_CONST + p->indel;
+            }
+
+            // FIXME: cache me in pileup struct.
+            j = bam_cigar2qlen(p->b->core.n_cigar, bam_get_cigar(p->b));
+            if (j > max_rd_len) max_rd_len = j;
+        }
+        double frac = (double)na/nt;
+        if ( !indel_support_ok && na >= bca->min_support
+             && frac >= bca->min_frac )
+            indel_support_ok = 1;
+        if ( na > bca->max_support && frac > 0 )
+            bca->max_support = na, bca->max_frac = frac;
+
+        n_alt += na;
+        n_tot += nt;
+    }
+
+    // Sort aux[] and dedup
+    ks_introsort(uint32_t, m, aux);
+    for (i = 1, n_types = 1; i < m; ++i)
+        if (aux[i] != aux[i-1]) ++n_types;
+
+    // Taking totals makes it hard to call rare indels (IMF filter)
+    if ( !bca->per_sample_flt )
+        indel_support_ok = ( (double)n_alt / n_tot < bca->min_frac
+                             || n_alt < bca->min_support )
+            ? 0 : 1;
+    if ( n_types == 1 || !indel_support_ok ) { // then skip
+        free(aux);
+        return NULL;
+    }
+
+    // Bail out if we have far too many types of indel
+    if (n_types >= MAX_TYPES) {
+        free(aux);
+        // TODO revisit how/whether to control printing this warning
+        if (hts_verbose >= 2)
+            fprintf(bcftools_stderr, "[%s] excessive INDEL alleles at position %d. "
+                    "Skip the position.\n", __func__, pos + 1);
+        return NULL;
+    }
+
+    // To prevent long stretches of N's to be mistaken for indels
+    // (sometimes thousands of bases), check the number of N's in the
+    // sequence and skip places where half or more reference bases are Ns.
+    int nN=0, i_end = pos + (2*bca->indel_win_size < max_rd_len
+                            ?2*bca->indel_win_size : max_rd_len);
+    for (i=pos; i<i_end && ref[i]; i++)
+        nN += ref[i] == 'N';
+    if ( nN*2>(i-pos) ) {
+        free(aux);
+        return NULL;
+    }
+
+    // Finally fill out the types[] array detailing the size of insertion
+    // or deletion.
+    types = (int*)calloc(n_types, sizeof(int));
+    if (!types) {
+        free(aux);
+        return NULL;
+    }
+    t = 0;
+    for (i = 0; i < m; ++i) {
+        int sz = (int32_t)(aux[i] - MINUS_CONST);
+        int j;
+        for (j = i+1; j < m; j++)
+            if (aux[j] != aux[i])
+                break;
+
+        if (sz == 0
+            || (j-i >= bca->min_support &&
+                // Note, doesn't handle bca->per_sample_flt yet
+                (bca->per_sample_flt
+                 || (double)(j-i) / n_tot >= bca->min_frac)))
+            types[t++] = sz;
+        i = j-1;
+    }
+    free(aux);
+
+    if (t <= 1) {
+        free(types);
+        return NULL;
+    }
+    n_types = t;
+
+    // Find reference type; types[?] == 0)
+    for (t = 0; t < n_types; ++t)
+        if (types[t] == 0) break;
+
+    *ref_type_r   = t;
+    *n_types_r    = n_types;
+    *max_rd_len_r = max_rd_len;
+    *N_r          = N;
+
+    return types;
+}
+
+// Increment ins["str"] and freq["str"]
+#define NI 100 // number of alternative insertion sequences
+// Could use a hash table too, but expectation is a tiny number of alternatives
+typedef struct {
+    char *str[NI];
+    int len[NI];
+    int freq[NI];
+} str_freq;
+
+static int bcf_cgp_append_cons(str_freq *sf, char *str, int len, int freq) {
+    int j;
+
+    for (j = 0; j < NI && sf->str[j]; j++) {
+        if (sf->len[j] == len && memcmp(sf->str[j], str, len) == 0)
+            break;
+    }
+    if (j >= NI)
+        return 0; // too many choices; discard
+
+    sf->freq[j]+=freq;
+    if (!sf->str[j]) {
+        // new insertion
+        if (!(sf->str[j] = malloc(len+1)))
+            return -1;
+        memcpy(sf->str[j], str, len);
+        sf->len[j] = len;
+    }
+
+    return 0;
+}
+
+/*
+ * Compute the consensus for a specific indel type at pos.
+ *
+ * left_shift is the number of inserted(+) or deleted(-) bases added to
+ * the consensus before we get to pos.  This is necessary so the alignment
+ * band is correct as it's expected to start at left/right edges in
+ * sync
+ *
+ * We accumulate into several buffers for counting base types:
+ * cons_base   - consensus of data with p->indel == type, bases or gap
+ * ref_base    - consensus of data with p->indel != type, bases or gap
+ * cons_ins    - consensus of data with p->indel == type, insertions
+ * ref_ins     - consensus of data with p->indel == type, bases or gap
+ *
+ * The purpose of cons_ins vs cons_base is if we have very low
+ * coverage due to nearly all reads being another type, then we can
+ * still get a robust consensus using the other data.  If we don't
+ * have shallow data, then we'll not use as much of ref_base as we may
+ * have correlated variants.
+ *
+ * Eg:
+ * REF: AGCTATGAGGCTGATA
+ * SEQ: AGGTAGGAGGGTGATA (x1)
+ * SEQ: AGCTACGAGG*TGATA (x24)
+ * SEQ: AGCTACTAGG*TGATA (x24)
+ *
+ * Cons for no-del is Cs not Gs.  Cannot trust it, so use N if shallow.
+ * CON: AGCTACNAGGGTGATA
+ *
+ * There are still some problems in cons_ins vs ref_ins assignment.
+ * We sometimes seem multiple similar-length insertions added at
+ * different locations.  Ideally we'd like to consider these as all
+ * the same insertion if the size is the same and it's comparable seq.
+ */
+#define MAX_INS 8192
+static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp,
+                                int pos, bcf_callaux_t *bca, const char *ref,
+                                int ref_len, int left, int right,
+                                int sample, int type, int biggest_del,
+                                int *left_shift, int *right_shift,
+                                int *band, int *tcon_len, int *cpos_pos,
+                                int pos_l, int pos_r) {
+    // Map ASCII ACGTN* to 012345
+    static uint8_t base6[256] = {
+        4,4,4,4,4,4,4,4,  4,4,4,4,4,4,4,4,  4,4,4,4,4,4,4,4,  4,4,4,4,4,4,4,4,
+        4,4,4,4,4,4,4,4,  4,4,5,4,4,4,4,4,  4,4,4,4,4,4,4,4,  4,4,4,4,4,4,4,4,
+        //A   C       G       *^                     T
+        4,0,4,1,4,4,4,2,  4,4,4,4,4,4,4,4,  4,4,4,4,3,3,4,4,  4,4,4,4,4,4,4,4,
+        4,0,4,1,4,4,4,2,  4,4,4,4,4,4,4,4,  4,4,4,4,3,3,4,4,  4,4,4,4,4,4,4,4,
+
+        4,4,4,4,4,4,4,4,  4,4,4,4,4,4,4,4,  4,4,4,4,4,4,4,4,  4,4,4,4,4,4,4,4,
+        4,4,4,4,4,4,4,4,  4,4,4,4,4,4,4,4,  4,4,4,4,4,4,4,4,  4,4,4,4,4,4,4,4,
+        4,4,4,4,4,4,4,4,  4,4,4,4,4,4,4,4,  4,4,4,4,4,4,4,4,  4,4,4,4,4,4,4,4,
+        4,4,4,4,4,4,4,4,  4,4,4,4,4,4,4,4,  4,4,4,4,4,4,4,4,  4,4,4,4,4,4,4,4,
+    };
+
+    // single base or del
+    int (*cons_base)[6] = calloc(right - left + 1, sizeof(*cons_base));
+    // multi-base insertions
+    str_freq *cons_ins  = calloc(right - left + 1, sizeof(*cons_ins));
+
+    // non-indel ref for all reads on this sample, rather than those just
+    // matching type.  We use this for handling the case where we have a
+    // homozygous deletion being studied, but with 1 or 2 reads misaligned
+    // and containing a base there.
+    //
+    // Eg if the type[]=0 consensus is made up of a very small sample size,
+    // which is also enriched for highly error prone data.  We can use
+    // the other reads from type[] != 0 to flesh out the consensus and
+    // improve accuracy.
+    int (*ref_base)[6]  = calloc(right - left + 1, sizeof(*ref_base));
+    str_freq *ref_ins   = calloc(right - left + 1, sizeof(*ref_ins));
+    int i, j, k, s = sample;
+    char **cons = NULL;
+
+    if (!cons_base || !cons_ins || !ref_base || !ref_ins)
+        goto err;
+
+    //--------------------------------------------------
+    // Accumulate sequences into cons_base and cons_ins arrays
+    int local_band_max = 0; // maximum absolute deviation from diagonal
+    int total_span_str = 0;
+    int type_depth = 0;
+    for (i = 0; i < n_plp[s]; i++) {
+        const bam_pileup1_t *p = plp[s] + i;
+        bam1_t *b = p->b;
+        int x = b->core.pos;  // ref coordinate
+        int y = 0;            // seq coordinate
+        uint32_t *cigar = bam_get_cigar(b);
+        uint8_t *seq = bam_get_seq(b);
+
+        int local_band = 0; // current deviation from diagonal
+        for (k = 0; k < b->core.n_cigar; ++k) {
+            int op  = cigar[k] &  BAM_CIGAR_MASK;
+            int len = cigar[k] >> BAM_CIGAR_SHIFT;
+            int base;
+            int skip_to = 0;
+
+            switch(op) {
+            case BAM_CSOFT_CLIP:
+                y += len;
+                break;
+
+            case BAM_CMATCH:
+            case BAM_CEQUAL:
+            case BAM_CDIFF: {
+                // Can short-cut this with j_start and j_end based on
+                // x+len and left,right
+                for (j = 0; j < len; j++, x++, y++) {
+                    if (x < left) continue;
+                    if (x >= right) break;
+
+                    base = bam_seqi(seq, y);
+                    if (p->indel == type)
+                        // Convert 4-bit base ambig code to 0,1,2,3,4 range
+                        cons_base[x-left][seq_nt16_int[base]]++;
+                    else if (x != pos+1) // indel being assessed question
+                        ref_base[x-left][seq_nt16_int[base]]++;
+                }
+                break;
+            }
+
+            case BAM_CINS: {
+                if (x >= left && x < right) {
+                    local_band += p->indel;
+                    if (local_band_max < local_band)
+                        local_band_max = local_band;
+                }
+
+                char ins[MAX_INS];
+                for (j = 0; j < len; j++, y++) {
+                    if (x < left) continue;
+                    if (x >= right)
+                        break;
+                    base = bam_seqi(seq, y);
+                    if (j < MAX_INS)
+                        ins[j] = seq_nt16_int[base];
+                }
+
+                // Insertions come before a ref match.
+                // 5I 5M is IIIIIM M M M M events, not
+                // {IIIII,M} M M M M choice.  So we need to include the
+                // next match in our sequence when choosing the consensus.
+                if (x >= left && x < right) {
+                    int ilen = j<MAX_INS?j:MAX_INS;
+                    if (p->indel == type /*&& x == pos+1*/) {
+                        // Assume any ins of the same size is the same ins.
+                        // (This rescues misaligned insertions.)
+                        if (bcf_cgp_append_cons(&cons_ins[x-left], ins,
+                                                ilen, 1) < 0)
+                            goto err;
+                        type_depth += (x == pos+1);
+                    } else  if (x != pos+1){
+                        if (bcf_cgp_append_cons(&ref_ins[x-left],  ins,
+                                                ilen, 1) < 0)
+                            goto err;
+                    }
+                }
+                break;
+            }
+
+            case BAM_CDEL:
+                if (x >= left && x < right) {
+                    local_band += p->indel;
+                    if (local_band_max < -local_band)
+                        local_band_max = -local_band;
+                }
+
+                // Maybe not perfect for I/D combos, but likely sufficient.
+                for (j = 0; j < len; j++, x++) {
+                    if (x < left) continue;
+                    if (x >= right) break;
+                    if ((p->indel == type && !p->is_del) ||  // starts here
+                        (p->indel == 0 && p->is_del && len == -type)) { // left
+                        cons_base[x-left][5]++;
+                        type_depth += (x == pos+1);
+                    } else if (x+len <= pos+1 || (skip_to && x > skip_to))
+                        ref_base[x-left][5]++;
+                    else if (x <= pos && x+len > pos+1) {
+                        // we have a deletion which overlaps pos, but
+                        // isn't the same "type".  We don't wish to
+                        // include these as they may bias the
+                        // evaluation by confirming against a
+                        // secondary consensus produced with the other
+                        // deletion.  We set a marker for how long to
+                        // skip adding to ref_base.
+                        if (x > skip_to)
+                            skip_to = x+len;
+                    }
+                }
+                break;
+            }
+        }
+
+        if (b->core.pos <= pos_l && x >= pos_r)
+            total_span_str++;
+
+        // Also track the biggest deviation +/- from diagonal.  We use
+        // this band observation in our BAQ alignment step.
+        if (*band < local_band_max)
+            *band = local_band_max;
+    }
+
+    //--------------------------------------------------
+    // Expand cons_base to include depth from ref_base/ref_ins
+    // Caveat: except at pos itself, where true ref is used if type != 0
+
+#if 1 // TEST 1
+    // We could retest this heuristic further maybe.
+    for (i = 0; i < right-left; i++) {
+        // Total observed depth
+        int t = cons_base[i][0] + cons_base[i][1] + cons_base[i][2] +
+            cons_base[i][3] + cons_base[i][4] + cons_base[i][5];
+        for (j = 0; j < NI; j++) {
+            if (!cons_ins[i].str[j])
+                break;
+            t += cons_ins[i].freq[j];
+        }
+
+        // Similarly for depth on the non-ALT calls (NB: not necessarily
+        // REF as maybe it's other ALTs).
+        int r = ref_base[i][0] + ref_base[i][1] + ref_base[i][2] +
+            ref_base[i][3] + ref_base[i][4] + ref_base[i][5];
+        for (j = 0; j < NI; j++) {
+            if (!ref_ins[i].str[j])
+                break;
+            r += ref_ins[i].freq[j];
+        }
+
+        // When evaluating this particular indel, we don't want to
+        // penalise alignments by SNP errors elsewhere.  This can
+        // happen when we have low depth for a particular 'type'.
+        //
+        // So add in a little data from ref_base/ref_ins.
+        double rfract = (r - t*2)*.75 / (r+1);
+
+        if (rfract < 1.01 / (r+1e-10))
+            rfract = 1.01 / (r+1e-10); // low depth compensation
+//        if (rfract > 0.2)
+//            rfract = 0.2;
+
+        // TODO: consider limiting rfract so we never drown out the
+        // signal.  We want to use the remaining data only to correct
+        // for sequencing errors in low depth alleles.  If we get
+        // conflicts, it's better to use N than to change a base
+        // incase that variant is genuine.
+        if (i+left >= pos+1 && i+left < pos+1-biggest_del) {
+            // We're overlapping the current indel region, so
+            // we don't wish to bring in evidence from the other
+            // "type" data as it'll harm calling.
+            continue;
+        } else {
+            // Otherwise add in a portion of other data to
+            // boost low population numbers.
+            cons_base[i][0] += rfract * ref_base[i][0];
+            cons_base[i][1] += rfract * ref_base[i][1];
+            cons_base[i][2] += rfract * ref_base[i][2];
+            cons_base[i][3] += rfract * ref_base[i][3];
+            cons_base[i][4] += rfract * ref_base[i][4];
+            cons_base[i][5] += rfract * ref_base[i][5];
+        }
+
+        // Similarly for insertions too; consider a different rfract here?
+        for (j = 0; j < NI; j++) {
+            if (!ref_ins[i].str[j])
+                break;
+            if (bcf_cgp_append_cons(&cons_ins[i],
+                                    ref_ins[i].str[j], ref_ins[i].len[j],
+                                    rfract * ref_ins[i].freq[j]) < 0)
+                goto err;
+        }
+    }
+#endif
+
+    //--------------------------------------------------
+    // Allocate consensus buffer, to worst case length
+    int max_len = right-left;
+    for (i = 0; i < right-left; i++) {
+        if (!cons_ins[i].str[0])
+            continue;
+
+        int ins = 0;
+        for (j = 0; j < NI; j++) {
+            if (!cons_ins[i].str[j])
+                break;
+            if (cons_ins[i].str[j] && ins < cons_ins[i].len[j])
+                ins = cons_ins[i].len[j];
+        }
+        max_len += ins;
+    }
+    max_len += MAX(0, type); // incase type inserted bases never occur
+    cons = malloc((max_len+1)*2 + sizeof(char *)*2);
+    if (!cons)
+        goto err;
+    cons[0] = (char *)&cons[2];
+    cons[1] = cons[0] + max_len+1;
+
+    //--------------------------------------------------
+    // Merge insertions where they are the same length but different
+    // sequences.
+    // NB: we could just index by length and have accumulators for each,
+    // instead of storing separately and merging later (here).
+    // Ie str_freq.str is [NI][5] instead.
+    for (i = 0; i < right-left; i++) {
+        int ins[MAX_INS][5];
+        for (j = 0; j < NI; j++) {
+            if (!cons_ins[i].str[j])
+                break;
+
+            if (cons_ins[i].freq[j] == 0)
+                continue; // already merged
+
+            int l;
+            for (l = 0; l < cons_ins[i].len[j]; l++) {
+                // Append to relevant frequency counter, zero all others
+                ins[l][0] = ins[l][1] = ins[l][2] = ins[l][3] = ins[l][4] = 0;
+                uint8_t b = cons_ins[i].str[j][l];
+                ins[l][b] = cons_ins[i].freq[j];
+            }
+
+            // Merge other insertions of the same length to ins[] counters
+            for (k = j+1; k < NI; k++) {
+                if (!cons_ins[i].str[k])
+                    break;
+                if (cons_ins[i].len[k] != cons_ins[i].len[j])
+                    continue;
+                if (cons_ins[i].freq[k] == 0)
+                    continue; // redundant?
+
+                // Merge str[j] and str[k]
+                for (l = 0; l < cons_ins[i].len[k]; l++) {
+                    uint8_t b = cons_ins[i].str[k][l];
+                    ins[l][b] += cons_ins[i].freq[k];
+                }
+                cons_ins[i].freq[j] += cons_ins[i].freq[k];
+                cons_ins[i].freq[k] = 0;
+            }
+
+            // Now replace ins[j] with the consensus insertion of this len.
+            for (l = 0; l < cons_ins[i].len[j]; l++) {
+                int max_v = 0, base = 0;
+                int tot = ins[l][0] + ins[l][1] + ins[l][2]
+                        + ins[l][3] + ins[l][4];
+                if (max_v < ins[l][0]) max_v = ins[l][0], base = 0;
+                if (max_v < ins[l][1]) max_v = ins[l][1], base = 1;
+                if (max_v < ins[l][2]) max_v = ins[l][2], base = 2;
+                if (max_v < ins[l][3]) max_v = ins[l][3], base = 3;
+                if (max_v < ins[l][4]) max_v = ins[l][4], base = 4;
+
+                cons_ins[i].str[j][l] = (max_v > 0.6*tot) ? base : 4;
+            }
+        }
+    }
+
+#define CONS_CUTOFF      .40 // % needed for base vs N
+#define CONS_CUTOFF_DEL  .35 // % to include any het del
+#define CONS_CUTOFF2     .80 // % needed for gap in cons[1]
+#define CONS_CUTOFF_INC  .35 // % to include any insertion cons[0]
+#define CONS_CUTOFF_INC2 .80 // % to include any insertion cons[1] HOM
+#define CONS_CUTOFF_INS  .60 // and then 60% needed for it to be bases vs N
+
+    //--------------------------------------------------
+    // Walk through the frequency arrays to call the consensus.
+    // We produce cons[0] and cons[1].  Both include strongly
+    // homozygous indels.  Both also include the indel at 'pos'.
+    // However for heterozygous indels we call the most likely event
+    // for cons[0] and the less-likely alternative in cons[1].
+    // TODO: a proper phase analysis so multiple events end up
+    // combining together into the correct consensus.
+    *left_shift = 0;
+    *right_shift = 0;
+    int cnum;
+
+    // Het call filled out in cnum==0 (+ve or -ve).
+    // Used in cnum==1 to do the opposite of whichever way we did before.
+    int heti[MAX_INS] = {0}, hetd[MAX_INS] = {0};
+
+    *cpos_pos = -1;
+    for (cnum = 0; cnum < 2; cnum++) {
+        for (i = k = 0; i < right-left; i++) {
+            // Location in consensus matching the indel itself
+            if (i >= pos-left+1 && *cpos_pos == -1)
+                *cpos_pos = k;
+
+            int max_v = 0, max_v2 = 0, max_j = 4, max_j2 = 4, tot = 0;
+            for (j = 0; j < 6; j++) {
+                // Top 2 consensus calls
+                if (max_v < cons_base[i][j]) {
+                    max_v2 = max_v, max_j2 = max_j;
+                    max_v = cons_base[i][j], max_j = j;
+                } else if (max_v2 < cons_base[i][j]) {
+                    max_v2 = cons_base[i][j], max_j2 = j;
+                }
+                tot += cons_base[i][j];
+            }
+
+            // +INS
+            int max_v_ins = 0, max_j_ins = 0;
+            int tot_ins = 0;
+            for (j = 0; j < NI; j++) {
+                if (i+left==pos+1)
+                if (type > 0 && i+left == pos+1
+                    && cons_ins[i].len[j] < type && j == 0) {
+                    cons_ins[i].str[j] = realloc(cons_ins[i].str[j], type);
+                    if (!cons_ins[i].str[j])
+                        goto err;
+                    memset(cons_ins[i].str[j] + cons_ins[i].len[j],
+                           4, type - cons_ins[i].len[j]);
+                    cons_ins[i].len[j] = type;
+                }
+                if (!cons_ins[i].str[j])
+                    break;
+                if (cons_ins[i].freq[j] == 0)
+                    continue; // previously merged
+
+                if (max_v_ins < cons_ins[i].freq[j])
+                    //if (i != pos-left+1 || cons_ins[i].len[j] == type)
+                    max_v_ins = cons_ins[i].freq[j], max_j_ins = j;
+                tot_ins += cons_ins[i].freq[j];
+            }
+
+            // NB: tot is based on next matching base, so it includes
+            // everything with or without the insertion.
+            int tot_sum = tot;
+            int always_ins =
+                (i == pos-left+1 && type>0) ||       // current eval
+                max_v_ins > CONS_CUTOFF_INC2*tot_sum;// HOM
+            int het_ins = 0;
+            if (!always_ins && max_v_ins >= bca->min_support) {
+                // Candidate HET ins.
+                if (cnum == 0) {
+                    het_ins = max_v_ins > CONS_CUTOFF_INC * tot_sum;
+                    if (i < MAX_INS) heti[i] = het_ins
+                                      ? 1
+                                      : (max_v_ins > .3*tot_sum ? -1:0);
+                } else {
+                    // HET but uncalled before
+                    het_ins = i < MAX_INS ? (heti[i] == -1) : 0;
+                }
+            }
+
+            if (always_ins || het_ins) {
+                if (max_v_ins > CONS_CUTOFF_INS*tot_ins) {
+                    // Insert bases
+                    for (j = 0; j < cons_ins[i].len[max_j_ins]; j++) {
+                        if (cnum == 0) {
+                            if (k < pos-left+*left_shift)
+                                (*left_shift)++;
+                            else
+                                (*right_shift)++;
+                        }
+                        cons[cnum][k++] = cons_ins[i].str[max_j_ins][j];
+                    }
+                } else {
+                    for (j = 0; j < cons_ins[i].len[max_j_ins]; j++)
+                        cons[cnum][k++] = 4; // 'N';
+                }
+            }
+
+            // Call deletions & bases
+            int always_del = (type < 0 && i > pos-left && i <= pos-left-type)
+                || cons_base[i][5] > CONS_CUTOFF2 * tot; // HOM del
+            int het_del = 0;
+            if (!always_del && cons_base[i][5] >= bca->min_support) {
+                // Candidate HET del.
+                if (cnum == 0) {
+                    int tot2 = tot;
+                    if (i > pos-left && i <= pos-left-biggest_del)
+                        tot2 = total_span_str - type_depth;
+                    het_del = cons_base[i][5] >= CONS_CUTOFF_DEL * tot2;
+
+                    if (i < MAX_INS) {
+                        if (i > pos-left && i <= pos-left-biggest_del)
+                            hetd[i] = 0;
+                        else
+                            hetd[i] = het_del
+                                ? 1
+                                : (cons_base[i][5] >= .3 * tot2 ? -1 : 0);
+                    }
+                } else {
+                    // HET del uncalled on cnum 0
+                    het_del = i < MAX_INS ? (hetd[i] == -1) : 0;
+                    if (max_j == 5 && het_del == 0) {
+                        max_v = max_v2;
+                        max_j = max_j2;
+                    }
+                }
+            }
+            if (always_del || het_del) {
+                // Deletion
+                if (k < pos-left+*left_shift)
+                    (*left_shift)--;
+                else
+                    (*right_shift)++;
+            } else {
+                // Finally the easy case - a non-indel base or an N
+                if (max_v > CONS_CUTOFF*tot)
+                    cons[cnum][k++] = max_j; // "ACGTN*"
+                else if (max_v > 0)
+                    cons[cnum][k++] = 4;     // 'N';
+                else {
+                    cons[cnum][k] = left+k < ref_len
+                        ? base6[(uint8_t)ref[left+k]]
+                        : 4;
+                    k++;
+                }
+            }
+        }
+
+        tcon_len[cnum] = k;
+    }
+
+    // TODO: replace by io_lib's string pool for rapid tidying.
+    // For now this isn't the bottleneck though.
+    for (i = 0; i < right-left; i++) {
+        for (j = 0; j < NI; j++) {
+            if (cons_ins[i].str[j])
+                free(cons_ins[i].str[j]);
+            if (ref_ins[i].str[j])
+                free(ref_ins[i].str[j]);
+        }
+    }
+
+ err:
+    free(cons_base);
+    free(ref_base);
+    free(cons_ins);
+    free(ref_ins);
+
+    return cons;
+}
+
+// A rename of bcf_cgp_calc_cons from bam2bcf_indel.c
+//
+// Compute the insertion consensus for this sample 's' via a basic
+// majority rule.
+//
+// TODO: merge this into bcf_cgp_consensus as another return value?
+static char *bcf_cgp_calc_ins_cons(int n, int *n_plp, bam_pileup1_t **plp,
+                                   int pos, int *types, int n_types,
+                                   int max_ins, int s) {
+    return bcf_cgp_calc_cons(n, n_plp, plp, pos, types, n_types, max_ins, s);
+}
+
+#define MAX(a,b) ((a)>(b)?(a):(b))
+#define MIN(a,b) ((a)<(b)?(a):(b))
+
+// Compile with LIBS="-L. -ldl -ledlib" CLD=g++
+
+// This is faster than ksw and BAQ, meaning we can use larger --indel-size and
+// get a more accurate context, improving alignments further.  This *may*
+// compensate for reduced sensitivity.
+#include "edlib.h"
+int edlib_glocal(uint8_t *ref, int l_ref, uint8_t *query, int l_query,
+                 double m, double del_bias)
+{
+    EdlibAlignConfig cfg = 
+        edlibNewAlignConfig(
+                            //ABS(type)+ABS(l_ref-l_query)+10,
+                            -1, // k; use small positive for faster alignment
+                            EDLIB_MODE_HW, // mode
+#ifdef ALIGN_DEBUG
+                            EDLIB_TASK_PATH,
+#else
+                            EDLIB_TASK_LOC,
+#endif
+                            NULL, // additionalEqualities
+                            0); // additionalEqualitiesLength
+    EdlibAlignResult r = 
+        edlibAlign((char *)query, l_query, (char *)ref, l_ref, cfg);
+
+    if (r.status != EDLIB_STATUS_OK || r.numLocations < 1 ||
+        !r.endLocations || !r.startLocations) {
+        edlibFreeAlignResult(r);
+        return INT_MAX;
+    }
+
+#ifdef ALIGN_DEBUG
+    // NB: Needs linking against the C++ libedlib.a as our cut-down C
+    // implementation misses the alignment generation code.
+    {
+        int i, j = 0, pt = r.startLocations[0], pq = 0;
+        char line1[80];
+        char line2[80];
+        char line3[80];
+        for (i = 0; i < r.alignmentLength && pt < r.endLocations[0]; i++) {
+            int n;
+            switch (n = r.alignment[i]) {
+            case 0: // match
+            case 3: // mismatch
+                line1[j] = "ACGTN"[ref[pt++]];
+                line2[j] = "ACGTN"[query[pq++]];
+                line3[j] = " x"[n==3];
+                break;
+            case 2: // insertion to ref
+                line1[j] = "ACGTN"[ref[pt++]];
+                line2[j] = '-';
+                line3[j] = '-';
+                break;
+            case 1: // insertion to query
+                line1[j] = '-';
+                line2[j] = "ACGTN"[query[pq++]];
+                line3[j] = '+';
+                break;
+            }
+
+            if (++j == sizeof(line1)) {
+                fprintf(bcftools_stderr, "%.*s\n", j, line1);
+                fprintf(bcftools_stderr, "%.*s\n", j, line2);
+                fprintf(bcftools_stderr, "%.*s\n", j, line3);
+                j = 0;
+            }
+        }
+        if (j) {
+            fprintf(bcftools_stderr, "%.*s\n", j, line1);
+            fprintf(bcftools_stderr, "%.*s\n", j, line2);
+            fprintf(bcftools_stderr, "%.*s\n", j, line3);
+        }
+    }
+#endif
+
+    // Aligned target length minus query length is an indication of the number
+    // of insertions and/or deletions.
+    // 
+    // For CIGAR 10M1I10M t_len > l_query ("AC"  / "ATC")
+    // For CIGAR 10M1D10M t_len < l_query ("ATC" / "AC")
+    // Hence t_len-l_query is -ve for net insertions and +ve for net deletions.
+    // If we compute nins and ndel directly via walking though EDLIB_TASK_PATH
+    // we'll see t_len-l_query == ndel-nins.
+    // 
+    // If a technology has a significantly higher chance of making deletion
+    // errors than insertion errors, then we would view deletions as less
+    // indicative of this sequence not coming from this candidate allele than
+    // if it had insertion (as the deletions are more likely to be errors
+    // rather than real, relative to the insertions).  Hence we can skew the
+    // score by the net delta of num_del - num_ins.
+    //
+    // Note this is an approximation that doesn't account for multiple
+    // insertions and deletions within the same sequence, but it is much faster
+    // as it doesn't require EDLIB_TASK_PATH to be computed.
+    //
+    // Given editDistance is +1 for every mismatch, insertion and deletion,
+    // provided the t_len-l_query multiplier < 1 then this is always +ve.
+
+    int t_len = *r.endLocations - *r.startLocations + 1;
+    int score = m*(r.editDistance - del_bias*(t_len - l_query));
+
+    edlibFreeAlignResult(r);
+    return score;
+}
+
+// Part of bcf_call_gap_prep.
+//
+// Realign using BAQ to get an alignment score of a single read vs
+// a haplotype consensus.  TODO: replace BAQ with something more robust.
+//
+// There are many coordinates, so let's explain them.
+// - left, right, tbeg, tend, r_start and r_end are in aligned reference
+//   coordinates.
+//   left/right start from pos +/- indel_win_size.
+//   r_start/r_end are the BAM first and last mapped coord on the reference.
+//   tbeg and tend are the intersection of the two.
+// - qbeg and qend are in BAM sequence coordinates
+// - qpos is in sequence coordinates, relative to qbeg.
+//
+// To see what this means, we have illustrations with coordinates
+// above the seqs in reference space and below the seqs in BAM seq space.
+//
+// Overlap left:
+//                     tbeg                        tend
+//      r_start        left                 pos    r_end          right
+// REF  :..............|--------------------#------:--------------|...
+// SEQ  :..............|--------------------#------|
+//      0              qbeg                 qpos   qend
+//
+// Overlap right:
+//                        r_start                     tend
+//         left           tbeg  pos                   right       r_end
+// REF  ...|--------------:-----#---------------------|...........:
+// SEQ                    |-----#---------------------|...........:
+//                        qbeg  qpos                  qend
+//                        0
+//
+// The "-" sequence is the bit passed in.
+// Ie ref2 spans left..right and query spans qbeg..qend.
+// We need to adjust ref2 therefore to tbeg..tend.
+//
+// Fills out score
+// Returns 0 on success,
+//        <0 on error
+static int bcf_cgp_align_score(bam_pileup1_t *p, bcf_callaux_t *bca,
+                               int type, int band,
+                               uint8_t *ref1, uint8_t *ref2, uint8_t *query,
+                               int r_start, int r_end,
+                               int tbeg, int tend1, int tend2,
+                               int left, int right,
+                               int qbeg, int qend,
+                               int pos, int qpos, int max_deletion,
+                               double qavg, double del_bias, int *score,
+                               int *str_len1_p, int *str_len2_p) {
+    int atype = abs(type);
+    int l, sc1, sc2;
+
+    // Trim poly_Ns at ends of ref.
+    // This helps to keep len(ref) and len(query) similar, to reduce
+    // band size and reduce the chance of -ve BAQ scores.
+    for (l = 0; l < tend1-tbeg && l < tend2-tbeg; l++)
+        if (ref1[l + tbeg-left] != 4 || ref2[l + tbeg-left] != 4)
+            break;
+    if (l > atype)
+        tbeg += l-atype;
+
+    for (l = tend1-tbeg-1; l >= 0; l--)
+        if (ref1[l + tbeg-left] != 4)
+            break;
+    l = tend1-tbeg-1 - l;
+    if (l > atype)
+        tend1 -= l-atype;
+
+    for (l = tend2-tbeg-1; l >= 0; l--)
+        if (ref2[l + tbeg-left] != 4)
+            break;
+    l = tend2-tbeg-1 - l;
+    if (l > atype) {
+        tend2 -= l-atype;
+    }
+
+    // The bottom 8 bits are length-normalised score while
+    // the top bits are unnormalised.
+    //
+    // Try original cons and new cons and pick best.
+    // This doesn't reduce FN much (infact maybe adds very slightly),
+    // but it does reduce GT errors and is a slight reduction to FP.
+
+    double mm = 30; // a const average qual for now. Could tune
+    sc2 = edlib_glocal(ref2 + tbeg - left, tend2 - tbeg,
+                       query, qend - qbeg, mm, del_bias);
+
+    if (tend1 != tend2 ||
+        memcmp((char *)ref1 + tbeg - left, (char *)ref2 + tbeg - left,
+               tend1 - tbeg) != 0)
+        sc1 = edlib_glocal(ref1 + tbeg - left, tend1 - tbeg,
+                           query, qend - qbeg, mm, del_bias);
+    else
+        sc1 = INT_MAX; // skip
+
+    // Find the best of the two alignments
+    if (sc1 < 0 && sc2 < 0) {
+        *score = 0xffffff;
+        return 0;
+    }
+    if (sc1 < 0) {
+        // sc2 is already correct
+    } else if (sc2 < 0) {
+        sc2 = sc1;
+    } else {
+        // sc1 and sc2 both pass, so use best
+        if (sc2 > sc1)
+            sc2 = sc1;
+    }
+
+    // Sc is overall alignment score, in top 24 bits (SeqQ). It's based
+    // purely on the scores for the whole alignment.
+    // We also have a separate indel score in bottom 8 bits (IndelQ).
+    // This is a function of all sorts of attributes of the candidate indel
+    // itself, such as STR length and the presence of poor quality bases.
+
+    // Used for adjusting indelQ below.  Lower l is more likely to call
+    // (--FN, ++FP).  (NB CLI --indel_bias is 1/indel_bias var).
+    // Starts as average score per base, and then adjusted based on seq
+    // complexity / quality.
+
+    l = .5*(100. * sc2 / (qend - qbeg) + .499);
+
+    *score = (sc2<<8) | (int)MIN(255, l * bca->indel_bias * .5);
+
+    return 0;
+}
+
+// Part of bcf_call_gap_prep.
+//
+// Returns n_alt on success
+//         -1 on failure
+
+// TODO: almost identical to bam2bcf_indel.c's copy, so we could share
+// the code and add a check on bca->edlib.
+static int bcf_cgp_compute_indelQ(int n, int *n_plp, bam_pileup1_t **plp,
+                                  bcf_callaux_t *bca, char *inscns,
+                                  int l_run, int max_ins,
+                                  int ref_type, int *types, int n_types,
+                                  double qavg, int *score,
+                                  int str_len1, int str_len2) {
+    // FIXME: n_types has a maximum; no need to alloc - use a #define?
+    int sc[MAX_TYPES], sumq[MAX_TYPES], s, i, j, t, K, n_alt, tmp;
+    memset(sumq, 0, n_types * sizeof(int));
+    int sum_indelQ1[100] = {0}; // n
+    int sum_indelQ2[100] = {0}; // n
+
+    // Confusing variable naming and bit usage.
+    //
+    // score[] is low 8  bits normalised (by len) alignment score
+    //            top 24 bits full alignment score
+    // This gets cast into "sct"; mnemonic score-per-indel-type.
+    //
+    // sc = (score<<6) | type  (index to types[] array for indel size)
+    // So sc>>14 = score>>(14-6) = score>>8.  Ie full alignment score
+    for (s = K = 0; s < n; ++s) {
+        for (i = 0; i < n_plp[s]; ++i, ++K) {
+            bam_pileup1_t *p = plp[s] + i;
+            // Labelling is confusing here.
+            //    sct is short for score.
+            //    sc is score + t(type)
+            // Why aren't these variable names reversed?
+            int *sct = &score[K*n_types], seqQ, indelQ1=0, indelQ2=0, indelQ=0;
+            for (t = 0; t < n_types; ++t) sc[t] = sct[t]<<6 | t;
+            for (t = 1; t < n_types; ++t) // insertion sort
+                for (j = t; j > 0 && sc[j] < sc[j-1]; --j)
+                    tmp = sc[j], sc[j] = sc[j-1], sc[j-1] = tmp;
+
+#ifdef ALIGN_DEBUG
+            fprintf(bcftools_stderr, "READ %s\tscores ", bam_get_qname(p->b));
+            for (t = 0; t < n_types; ++t) {
+                fprintf(bcftools_stderr, "%+2d/%-3d ", types[sc[t]&0x3f], sc[t]>>14);
+            }
+#endif
+
+            /* errmod_cal() assumes that if the call is wrong, the
+             * likelihoods of other events are equal. This is about
+             * right for substitutions, but is not desired for
+             * indels. To reuse errmod_cal(), I have to make
+             * compromise for multi-allelic indels.
+             */
+            if ((sc[0]&0x3f) == ref_type) {
+                // sc >> 14 is the total score.  It's been shifted by 8
+                // from normalised score and 6 from type.
+                // &0x3f is type number
+
+                // Best call is REF.  Compare vs best indel
+                indelQ = (sc[1]>>14) - (sc[0]>>14);
+                seqQ = est_seqQ(bca, types[sc[1]&0x3f], l_run, str_len1);
+            } else {
+                // look for the reference type
+                for (t = 0; t < n_types; ++t) {
+                    if ((sc[t]&0x3f) == ref_type)
+                        break;
+                }
+                indelQ = indelQ1 = (sc[t]>>14) - (sc[0]>>14);
+//                fprintf(bcftools_stderr, "IndelQ = %d: %d-%d",
+//                        indelQ, (sc[t]>>14), (sc[0]>>14));
+
+                // Best call is non-ref, compare vs next best non-ref,
+                // or ref if it's just 2 choices (most common case).
+                for (t = 1; t < n_types; t++)
+                    if ((sc[t]&0x3f) == ref_type)
+                        continue;
+                    else break;
+                if (t == n_types)
+                    t--; // it's ref, but it'll do as next best.
+                indelQ2 = (sc[t]>>14) - (sc[0]>>14);
+                seqQ = est_seqQ(bca, types[sc[0]&0x3f], l_run, str_len1);
+
+#if 1 // TEST 3
+                indelQ = bca->vs_ref*indelQ1 + (1-bca->vs_ref)*indelQ2;
+#endif
+            }
+
+            // So we lower qual in some, but raise the average to keep FN/FP
+            // ratios up.
+            // Is this key diff for PacBio old vs new HiFi?
+            indelQ  /= bca->indel_bias*0.5;
+            indelQ1 /= bca->indel_bias*0.5;
+
+            // Or maybe just *2 if bca->poly_mqual and be done with it?
+            // Or perhaps adjust the MIN(qavg/20, ...) to MIN(qavg/10) ?
+
+            // Skew SeqQ and IndelQ based on a portion of the minimum quality
+            // found within a homopolymer.  This is useful where the quality
+            // values are a bit mutable and move around in such data, but less
+            // so on clocked sequencing technologies.
+            //
+            // Enabling this causes lots of GT errors on Illumina.
+            // However on PacBio it's key to removal of false positives.
+            // ONT and UG seem somewhere inbetween.
+            if (bca->poly_mqual) { // TEST 4
+                int qpos = p->qpos, l;
+                uint8_t *seq = bam_get_seq(p->b);
+                uint8_t *qual = bam_get_qual(p->b);
+                int min_q = qual[qpos];
+
+                // scan homopolymer left
+                char baseL = bam_seqi(seq, qpos+1 < p->b->core.l_qseq
+                                      ? qpos+1 : qpos);
+                for (l = qpos; l >= 0; l--) {
+                    if (bam_seqi(seq, l) != baseL)
+                        break;
+                    if (min_q > qual[l])
+                        min_q = qual[l];
+                }
+
+                // scan homo-polymer right (including site of indel)
+                char base = bam_seqi(seq, qpos+1);
+                for (l = qpos+1; l < p->b->core.l_qseq; l++) {
+                    if (min_q > qual[l])
+                        min_q = qual[l];
+                    if (bam_seqi(seq, l) != base)
+                        break;
+                }
+
+                // We reduce -h so homopolymers get reduced likelihood of being
+                // called, but then optionally increase or decrease from there
+                // based on base quality.  Hence lack of low quality bases in
+                // homopolymer will rescue the score back again, reducing FNs.
+
+                // The score factors here may also be machine specific, but for
+                // now these work well (tuned on PB HiFi).
+                seqQ   += MIN(qavg/20,  min_q - qavg/10);
+                indelQ += MIN(qavg/20,  min_q - qavg/5);
+                indelQ1+= MIN(qavg/20,  min_q - qavg/5);
+
+                if (seqQ   < 0) seqQ   = 0;
+                if (indelQ < 0) indelQ = 0;
+                if (indelQ1< 0) indelQ1= 0;
+            }
+
+            // This is the length-normalised score from bcf_cgp_align_score
+            tmp = sc[0]>>6 & 0xff;
+
+            // reduce indelQ
+            // high score = bad, low score = good; flip for indelQ
+            // low normalised scores leave indelQ unmodified
+            // high normalised scores set indelQ to 0
+            // inbetween scores have a linear scale from indelQ to 0
+// Altering the MAGIC value below (originally 111, but chosen for unknown
+// reasons) is comparable to altering --indel-bias.
+#define TMP_MAGIC 255.0
+
+            indelQ = tmp > TMP_MAGIC? 0 : (int)((1. - tmp/TMP_MAGIC) * indelQ + .499);
+            indelQ1= tmp > TMP_MAGIC? 0 : (int)((1. - tmp/TMP_MAGIC) * indelQ1+ .499);
+
+            indelQ  = MIN(indelQ,  255);
+            indelQ1 = MIN(indelQ1, 255);
+
+            // Doesn't really help accuracy, but permits -h to take
+            // affect still.
+            if (indelQ > seqQ) indelQ = seqQ;
+            if (indelQ > 255) indelQ = 255;
+            if (indelQ1> 255) indelQ1= 255;
+            if (seqQ > 255) seqQ = 255;
+
+            // Use 22 bits in total.
+            // 0-7   IndelQ
+            // 8-15  SeqQ
+            // 16-22 Score-per-base
+            p->aux = (sc[0]&0x3f)<<16 | seqQ<<8 | indelQ;
+            sumq[sc[0]&0x3f] += indelQ;
+
+#ifdef ALIGN_DEBUG
+            fprintf(bcftools_stderr, "\t%d\t%d\n", indelQ, seqQ);
+#endif
+
+            // Experiment in p->aux vs sumq.
+            // One gives likelihood of an indel being here, while the other
+            // is likelihood of a specific genotype?  But which is which?
+
+            sum_indelQ1[s] += indelQ1;
+            sum_indelQ2[s] += indelQ;
+        }
+    }
+
+    // Determine bca->indel_types[] and bca->inscns.
+    // Sumq[0] is always reference.
+    // Sumq[1] is best non-ref (and maybe better than ref)
+    bca->maxins = max_ins;
+    bca->inscns = (char*) realloc(bca->inscns, bca->maxins * 4);
+    if (bca->maxins && !bca->inscns)
+        return -1;
+    for (t = 0; t < n_types; ++t)
+        sumq[t] = sumq[t]<<6 | t;
+    for (t = 1; t < n_types; ++t) // insertion sort
+        for (j = t; j > 0 && sumq[j] > sumq[j-1]; --j)
+            tmp = sumq[j], sumq[j] = sumq[j-1], sumq[j-1] = tmp;
+    for (t = 0; t < n_types; ++t) // look for the reference type
+        if ((sumq[t]&0x3f) == ref_type) break;
+
+    if (t) { // then move the reference type to the first
+        tmp = sumq[t];
+        for (; t > 0; --t) sumq[t] = sumq[t-1];
+        sumq[0] = tmp;
+    }
+
+    for (t = 0; t < 4; ++t) bca->indel_types[t] = B2B_INDEL_NULL;
+    for (t = 0; t < 4 && t < n_types; ++t) {
+        bca->indel_types[t] = types[sumq[t]&0x3f];
+#ifdef ALIGN_DEBUG
+        fprintf(bcftools_stderr, "TYPE %+2d %d\n", types[t], sumq[t]>>6);
+#endif
+        if (bca->maxins) // potentially an insertion
+            memcpy(&bca->inscns[t * bca->maxins],
+                   &inscns[(sumq[t]&0x3f) * max_ins], bca->maxins);
+    }
+
+    // Update p->aux.
+    // If per-alignment type isn't found, then indelQ/seqQ is 0,
+    // otherwise unchanged.
+    for (s = n_alt = 0; s < n; ++s) {
+        for (i = 0; i < n_plp[s]; ++i) {
+            bam_pileup1_t *p = plp[s] + i;
+            int x = types[p->aux>>16&0x3f];
+            for (j = 0; j < 4; ++j)
+                if (x == bca->indel_types[j]) break;
+            p->aux = j<<16 | (j == 4? 0 : (p->aux&0xffff));
+            if ((p->aux>>16&0x3f) > 0) ++n_alt;
+#ifdef ALIGN_DEBUG
+            fprintf(bcftools_stderr, "FIN %s\t%d\t%d\t%d\n",
+                    bam_get_qname(p->b), (p->aux>>16)&0x3f,
+                    bca->indel_types[(p->aux>>16)&0x3f], p->aux&0xff);
+#endif
+        }
+    }
+
+    return n_alt;
+}
+
+/*
+FIXME: with high number of samples, do we handle IMF correctly?  Is it
+fraction of indels across entire data set, or just fraction for this
+specific sample? Needs to check bca->per_sample_flt (--per-sample-mF) opt.
+ */
+
+/*
+    notes:
+    - n .. number of samples
+    - the routine sets bam_pileup1_t.aux of each read as follows:
+        - 6: unused
+        - 6: the call; index to bcf_callaux_t.indel_types   .. (aux>>16)&0x3f
+        - 8: estimated sequence quality                     .. (aux>>8)&0xff
+        - 8: indel quality                                  .. aux&0xff
+ */
+int bcf_edlib_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos,
+		       bcf_callaux_t *bca, const char *ref, int ref_len)
+{
+    if (ref == 0 || bca == 0) return -1;
+
+    int i, s, t, n_types, *types = NULL, max_rd_len, left, right, max_ins;
+    int *score = NULL;
+    int N, K, l_run, ref_type, n_alt = -1;
+    char *inscns = NULL, *query = NULL;
+
+    // determine if there is a gap
+    for (s = N = 0; s < n; ++s) {
+        for (i = 0; i < n_plp[s]; ++i)
+            if (plp[s][i].indel != 0) break;
+        if (i < n_plp[s]) break;
+    }
+    if (s == n)
+        // there is no indel at this position.
+        return -1;
+
+    // Find average base quality over this region
+    double qavg = 30, qsum = 0, qcount = 0;
+    int qmax = 0;
+    for (s = 0; s < n; s++) {
+        for (i = 0; i < n_plp[s]; i++) {
+#define QWIN 50
+            bam_pileup1_t *p = plp[s] + i;
+            int kstart = p->qpos - QWIN > 0 ? p->qpos - QWIN : 0;
+            int kend = p->qpos + QWIN < p->b->core.l_qseq
+                ? p->qpos + QWIN : p->b->core.l_qseq;
+            uint8_t *qual = bam_get_qual(p->b);
+            int k;
+            for (k = kstart; k < kend; k++) {
+                qsum += qual[k];
+                qcount++;
+                if (qmax < qual[k])
+                    qmax = qual[k];
+            }
+        }
+    }
+    qavg = (qsum+1) / (qcount+1);
+
+    // find out how many types of indels are present
+    types = bcf_cgp_find_types(n, n_plp, plp, pos, bca, ref,
+                               &max_rd_len, &n_types, &ref_type, &N);
+    if (!types)
+        goto err;
+
+
+    // calculate left and right boundary, based on type size for a bit more
+    // speed.
+    int max_indel = 20*MAX(ABS(types[0]), ABS(types[n_types-1]))
+                  + bca->indel_win_size/4;
+    if (max_indel > bca->indel_win_size)
+        max_indel = bca->indel_win_size;
+    left = pos > max_indel ? pos - max_indel : 0;
+    right = pos + max_indel;
+
+    int del_size = types[0]<0 ? -types[0] : 0;
+    right += del_size;
+
+    // in case the alignments stand out the reference
+    for (i = pos; i < right; ++i)
+        if (ref[i] == 0) break;
+    right = i;
+
+    // compute the likelihood given each type of indel for each read
+    max_ins = types[n_types - 1];   // max_ins is at least 0
+
+    // The length of the homopolymer run around the current position
+    l_run = bcf_cgp_l_run(ref, pos);
+    int l_run_base = seq_nt16_table[(uint8_t)ref[pos+1]];
+    int l_run_ins = 0;
+
+    // construct the consensus sequence (minus indels, which are added later)
+    if (max_ins > 0) {
+        // TODO: replace filling inscns[] with calc_consensus return
+        // so the merges of the insertion consensus for type[t] is
+        // reported directly.  (It may need adjustment to avoid N)
+        inscns = bcf_cgp_calc_ins_cons(n, n_plp, plp, pos,
+                                       types, n_types, max_ins, s);
+        if (!inscns)
+            return -1;
+    }
+
+    query = (char*) calloc(right - left + max_rd_len + max_ins + 2, 1);
+    score = (int*) calloc(N * n_types, sizeof(int));
+    bca->indelreg = 0;
+    double nqual_over_60 = bca->nqual / 60.0;
+
+    int biggest_del = 0;
+    int biggest_ins = 0;
+    for (t = 0; t < n_types; t++) {
+        if (biggest_del > types[t])
+            biggest_del = types[t];
+        if (biggest_ins < types[t])
+            biggest_ins = types[t];
+    }
+    int band = biggest_ins - biggest_del; // NB del is -ve
+
+    // Find left & right extents of STR covering pos, from ref
+    int pos_l = pos, pos_r = pos;
+    {
+        rep_ele *reps, *elt, *tmp;
+        int pstart = MAX(0, pos-30);
+        int pmid = pos-pstart;
+        int pend = MIN(ref_len, pos+30);
+        reps = find_STR((char *)&ref[pstart], pend-pstart, 0);
+        DL_FOREACH_SAFE(reps, elt, tmp) {
+            if (elt->end >= pmid && elt->start <= pmid) {
+                if (pos_l > pstart + elt->start)
+                    pos_l = pstart + elt->start;
+                if (pos_r < pstart + elt->end)
+                    pos_r = pstart + elt->end;
+            }
+            DL_DELETE(reps, elt);
+            free(elt);
+        }
+    }
+
+    int str_len1 = l_run, str_len2 = l_run/4;
+    for (t = 0; t < n_types; ++t) {
+        int l, ir;
+
+        // Compute indelreg.  This is the context in the reference.  Eg:
+        //
+        // REF:  AG--TTTC  Inscns   is "TT".
+        // SEQ:  AGTTTTTC  Indelreg is 3; next 3 "TTT" bases
+        //
+        // => GTTT GTTTTT is call.
+        if (types[t] == 0)
+            ir = 0;
+        else if (types[t] > 0)
+            ir = est_indelreg(pos, ref, types[t], &inscns[t*max_ins]);
+        else
+            ir = est_indelreg(pos, ref, -types[t], 0);
+
+        if (ir > bca->indelreg)
+            bca->indelreg = ir;
+
+        // Realignment score, computed via BAQ
+        for (s = K = 0; s < n; ++s) {
+            char **tcons;
+            int left_shift, right_shift;
+            int tcon_len[2];
+            int cpos_pos;
+            tcons = bcf_cgp_consensus(n, n_plp, plp, pos, bca, ref, ref_len,
+                                      left, right, s, types[t], biggest_del,
+                                      &left_shift, &right_shift, &band,
+                                      tcon_len, &cpos_pos, pos_l, pos_r);
+            // TODO: Consensus for a deletion shouldn't match the
+            // consensus for type 0.  Eg consider
+            //         vv                          vv
+            // REF:  AATGTGTGAACAA        REF:   AATGTG--AACAA
+            // T0:   AATGTG--AACAA        T0:    AATGTG--AACAA
+            // T-2:  AA--TGTGAATAA        T-2:   AA--TGTGAATAA:
+            //
+            // On left: both T0 and T-2 are the same length, as it's
+            // just a deletion that moved.  We may end up assigning
+            // reads to an indel allele based on the SNP they have and
+            // not the actual indel.
+            // There *is* a deletion here though, but only 1.  How do
+            // we call it once only?  Need to replace entire region
+            // with a reassembly.
+            //
+            // On right: T0 and T-2 have same length again, but there
+            // isn't an indel as it's ins+del vs del+ins. They're
+            // also the same length as the REF for this region.
+            // Hence likelihood of this variant existing is tied in
+            // with their equal and high similarity with/to the ref.
+            //
+            // We could do an alignment of tcons[0] and tcons[1] and check
+            // whether their differences are consistent with (ie the
+            // hamming distance is at least ABS(types[t]/2).  I don't think
+            // it'll rescue many FPs though.
+
+#ifdef CONS_DEBUG
+            {
+                int j;
+                for (j = 0; j < 2; j++) {
+                    int k;
+                    fprintf(bcftools_stderr, "Cons%d @ %d %4d/%4d ",
+                            j, pos, types[t], left_shift);
+                    for (k = 0; k < tcon_len[j]; k++) {
+                        if (k == cpos_pos)
+                            putc('#', bcftools_stderr);
+                        putc("ACGTN"[(uint8_t)tcons[j][k]], bcftools_stderr);
+                    }
+                    putc('\n', bcftools_stderr);
+                }
+            }
+#endif
+
+            // Scan for base-runs in the insertion.
+            // We use this to avoid over-correction in est_seqQ when the
+            // insertion is not part of the neighbouring homopolymer.
+            int k = tcons[0][cpos_pos], j;
+            for (j = 0; j < types[t]; j++)
+                if (tcons[0][cpos_pos+j] != k)
+                    break;
+            if (j && j == types[t])
+                l_run_ins |= "\x1\x2\x4\x8\xf"[k]; // ACGTN
+            if (types[t] < 0)
+                l_run_ins |= 0xff;
+
+            // align each read to consensus(es)
+            for (i = 0; i < n_plp[s]; ++i, ++K) {
+                bam_pileup1_t *p = plp[s] + i;
+
+                // Some basic ref vs alt stats.
+                int imq = p->b->core.qual > 59 ? 59 : p->b->core.qual;
+                imq *= nqual_over_60;
+
+                int sc_len, slen, epos, sc_end;
+
+                // Only need to gather stats on one type, as it's
+                // identical calculation for all the subsequent ones
+                // and we're sharing the same stats array
+                if (t == 0) {
+                    // Gather stats for INFO field to aid filtering.
+                    // mq and sc_len not very helpful for filtering, but could
+                    // help in assigning a better QUAL value.
+                    //
+                    // Pos is slightly useful.
+                    // Base qual can be useful, but need qual prior to BAQ?
+                    // May need to cache orig quals in aux tag so we can fetch
+                    // them even after mpileup step.
+                    get_pos(bca, p, &sc_len, &slen, &epos, &sc_end);
+
+                    assert(imq >= 0 && imq < bca->nqual);
+                    assert(epos >= 0 && epos < bca->npos);
+                    assert(sc_len >= 0 && sc_len < 100);
+                    if (p->indel) {
+                        bca->ialt_mq[imq]++;
+                        bca->ialt_scl[sc_len]++;
+                        bca->ialt_pos[epos]++;
+                    } else {
+                        bca->iref_mq[imq]++;
+                        bca->iref_scl[sc_len]++;
+                        bca->iref_pos[epos]++;
+                    }
+                }
+
+                int qbeg, qpos, qend, tbeg, tend, kk;
+                uint8_t *seq = bam_get_seq(p->b);
+                uint32_t *cigar = bam_get_cigar(p->b);
+                if (p->b->core.flag & BAM_FUNMAP) continue;
+
+                // FIXME: the following loop should be better moved outside;
+                // nonetheless, realignment should be much slower anyway.
+                for (kk = 0; kk < p->b->core.n_cigar; ++kk)
+                    if ((cigar[kk]&BAM_CIGAR_MASK) == BAM_CREF_SKIP)
+                        break;
+                if (kk < p->b->core.n_cigar)
+                    continue;
+
+                // determine the start and end of sequences for alignment
+                int left2 = left, right2 = right;
+                int min_win_size = MAX(-biggest_del, biggest_ins);
+                min_win_size += ABS(left_shift) + ABS(right_shift);
+                {
+                    rep_ele *reps, *elt, *tmp;
+                    reps = find_STR(tcons[0], tcon_len[0], 0);
+                    //int max_str = 0;
+                    int tot_str = 0;
+                    DL_FOREACH_SAFE(reps, elt, tmp) {
+                        // if (max_str < elt->end - elt->start)
+                        //     max_str = elt->end - elt->start;
+                        tot_str += elt->end - elt->start;
+                        DL_DELETE(reps, elt);
+                        free(elt);
+                    }
+
+                    // Ideally max_str should be enough, but it's still not
+                    // sufficient in longer range some repeats.
+                    //min_win_size += max_str;
+                    min_win_size += tot_str;
+                }
+                min_win_size += 10;
+
+// TEST 8
+                if (p->b->core.l_qseq > 1000) {
+                    // long read data needs less context.  It also tends to
+                    // have many more candidate indels to investigate so
+                    // speed here matters more.
+                    if (pos - left >= min_win_size)
+                        left2 = MAX(left2, pos - min_win_size);
+                    if (right-pos >= min_win_size)
+                        right2 = MIN(right2, pos + min_win_size);
+                }
+
+                // Genomic coords for first and last base of query
+                // alignment.  This is only used in bcf_cgp_align_score
+                // for computing scores by looking for the proximity
+                // of STRs with the end of the query alignment.
+                int r_start = p->b->core.pos;
+                int r_end = bam_cigar2rlen(p->b->core.n_cigar,
+                                           bam_get_cigar(p->b));
+                r_end += -1 + r_start;
+
+
+                // Map left2/right2 genomic coordinates to qbeg/qend
+                // query coordinates.  The query may not span the
+                // entire left/right region, so this also returns the
+                // equivalent genomic coords for qbeg/qend in tbeg/tend.
+                qbeg = tpos2qpos(&p->b->core, bam_get_cigar(p->b),
+                                 left2, 0, &tbeg);
+                qpos = tpos2qpos(&p->b->core, bam_get_cigar(p->b), pos,
+                                     0, &tend) - qbeg;
+                qend = tpos2qpos(&p->b->core, bam_get_cigar(p->b),
+                                 right2, 1, &tend);
+
+                int old_tend = tend;
+                int old_tbeg = tbeg;
+
+                // write the query sequence
+                for (l = qbeg; l < qend; ++l)
+                    query[l - qbeg] = seq_nt16_int[bam_seqi(seq, l)];
+
+                // tbeg and tend are the genomic locations equivalent
+                // to qbeg and qend on the sequence.
+                // These may being entirely within our left/right
+                // coordinates over which we've computed the
+                // consensus, or overlapping to left/right.
+                //
+                // We know an estimation of band, plus biggest indel,
+                // so we can trim tbeg/tend to a smaller region if we
+                // wish here.  This speeds up BAQ scoring.
+                int wband = band + MAX(-biggest_del, biggest_ins)*2 + 20;
+                int tend1 = left + tcon_len[0] - (left2-left);
+                int tend2 = left + tcon_len[1] - (left2-left);
+                tend1 = MIN(tend1, old_tend + wband);
+                tend2 = MIN(tend2, old_tend + wband);
+                tbeg = MAX(left2, old_tbeg - wband);
+
+                // do realignment; this is the bottleneck.
+                //
+                // Note low score = good, high score = bad.
+                if (tend1 > tbeg && tend2 > tbeg) {
+                    //fprintf(bcftools_stderr, "Num %d\n", i);
+                    if (bcf_cgp_align_score(p, bca, types[t], band,
+                                            (uint8_t *)tcons[0] + left2-left,
+                                            (uint8_t *)tcons[1] + left2-left,
+                                            (uint8_t *)query,
+                                            r_start, r_end,
+                                            tbeg, tend1, tend2,
+                                            left2, left + tcon_len[0],
+                                            qbeg, qend, pos,qpos, -biggest_del,
+                                            qavg, bca->del_bias,
+                                            &score[K*n_types + t],
+                                            &str_len1, &str_len2) < 0) {
+                        goto err;
+                    }
+#ifdef ALIGN_DEBUG
+                    fprintf(bcftools_stderr, "type %d %x / %x\t%s\n",
+                            types[t],
+                            score[K*n_types + t] >> 8,
+                            score[K*n_types + t] & 0xff,
+                            bam_get_qname(p->b));
+#endif
+                } else {
+                    // place holder large cost for reads that cover the
+                    // region entirely within a deletion (thus tend < tbeg).
+                    score[K*n_types + t] = 0xffffff;
+                }
+            }
+            free(tcons);
+        }
+    }
+
+    // compute indelQ
+    if (!(l_run_base & l_run_ins))
+        l_run = 1; // different base type in ins to flanking region.
+    n_alt = bcf_cgp_compute_indelQ(n, n_plp, plp, bca, inscns, l_run, max_ins,
+                                   ref_type, types, n_types, qavg, score,
+                                   str_len1, str_len2);
+
+ err:
+    // free
+    free(query);
+    free(score);
+    free(types);
+    free(inscns);
+
+    return n_alt > 0? 0 : -1;
+}
diff --git a/bcftools/bam2bcf_iaux.c b/bcftools/bam2bcf_iaux.c
index 2e0add15a..3fe4fdea7 100644
--- a/bcftools/bam2bcf_iaux.c
+++ b/bcftools/bam2bcf_iaux.c
@@ -396,7 +396,7 @@ static int iaux_set_consensus(indel_aux_t *iaux, int ismpl)
 // Finds the smallest index in the seq_pos array holding value equal to pos, or if there is no
 // such value, the largest index with value smaller than pos. Starts at initial guess ioff.
 // This could use a binary search but the assumption is that the initial guess is indel-size close
-// to the actuall coordinate.
+// to the actual coordinate.
 //
 // TODO: remove this function and seq_pos from cns creation as it seems unnecessary
 static int find_ref_offset(hts_pos_t pos, hts_pos_t *seq_pos, int nseq_pos, int ioff)
diff --git a/bcftools/bam2bcf_iaux.c.pysam.c b/bcftools/bam2bcf_iaux.c.pysam.c
index c8bea99cb..5309dca3b 100644
--- a/bcftools/bam2bcf_iaux.c.pysam.c
+++ b/bcftools/bam2bcf_iaux.c.pysam.c
@@ -398,7 +398,7 @@ static int iaux_set_consensus(indel_aux_t *iaux, int ismpl)
 // Finds the smallest index in the seq_pos array holding value equal to pos, or if there is no
 // such value, the largest index with value smaller than pos. Starts at initial guess ioff.
 // This could use a binary search but the assumption is that the initial guess is indel-size close
-// to the actuall coordinate.
+// to the actual coordinate.
 //
 // TODO: remove this function and seq_pos from cns creation as it seems unnecessary
 static int find_ref_offset(hts_pos_t pos, hts_pos_t *seq_pos, int nseq_pos, int ioff)
diff --git a/bcftools/bam2bcf_indel.c b/bcftools/bam2bcf_indel.c
index faedc3fef..975504f8a 100644
--- a/bcftools/bam2bcf_indel.c
+++ b/bcftools/bam2bcf_indel.c
@@ -45,7 +45,7 @@ KSORT_INIT_GENERIC(uint32_t)
 //
 // *_tpos is returned as tpos if query overlaps tpos, but for deletions
 // it'll be either the start (is_left) or end (!is_left) ref position.
-static int tpos2qpos(const bam1_core_t *c, const uint32_t *cigar, int32_t tpos, int is_left, int32_t *_tpos)
+int tpos2qpos(const bam1_core_t *c, const uint32_t *cigar, int32_t tpos, int is_left, int32_t *_tpos)
 {
     // x = pos in ref, y = pos in query seq
     int k, x = c->pos, y = 0, last_y = 0;
@@ -98,8 +98,8 @@ inline int est_indelreg(int pos, const char *ref, int l, char *ins4)
 }
 
 // Identify spft-clip length, position in seq, and clipped seq len
-static inline void get_pos(const bcf_callaux_t *bca, bam_pileup1_t *p,
-                           int *sc_len_r, int *slen_r, int *epos_r, int *end) {
+void get_pos(const bcf_callaux_t *bca, bam_pileup1_t *p,
+             int *sc_len_r, int *slen_r, int *epos_r, int *end) {
     bam1_t *b = p->b;
     int sc_len = 0, sc_dist = -1, at_left = 1;
     int epos = p->qpos, slen = b->core.l_qseq;
@@ -155,6 +155,7 @@ static inline void get_pos(const bcf_callaux_t *bca, bam_pileup1_t *p,
 //
 // Scans the pileup to identify all the different sizes of indels
 // present.
+// types[] returned is sorted by size, from smallest (maybe negative) to largest.
 //
 // Returns types and fills out n_types_r,  max_rd_len_r and ref_type_r,
 //         or NULL on error.
@@ -429,9 +430,9 @@ int bcf_cgp_l_run(const char *ref, int pos) {
 
 // Compute the consensus for this sample 's', minus indels which
 // get added later.
-static char *bcf_cgp_calc_cons(int n, int *n_plp, bam_pileup1_t **plp,
-                               int pos, int *types, int n_types,
-                               int max_ins, int s) {
+char *bcf_cgp_calc_cons(int n, int *n_plp, bam_pileup1_t **plp,
+                        int pos, int *types, int n_types,
+                        int max_ins, int s) {
     int i, j, t, k;
     int *inscns_aux = (int*)calloc(5 * n_types * max_ins, sizeof(int));
     if (!inscns_aux)
diff --git a/bcftools/bam2bcf_indel.c.pysam.c b/bcftools/bam2bcf_indel.c.pysam.c
index 65a7179df..aded528b3 100644
--- a/bcftools/bam2bcf_indel.c.pysam.c
+++ b/bcftools/bam2bcf_indel.c.pysam.c
@@ -47,7 +47,7 @@ KSORT_INIT_GENERIC(uint32_t)
 //
 // *_tpos is returned as tpos if query overlaps tpos, but for deletions
 // it'll be either the start (is_left) or end (!is_left) ref position.
-static int tpos2qpos(const bam1_core_t *c, const uint32_t *cigar, int32_t tpos, int is_left, int32_t *_tpos)
+int tpos2qpos(const bam1_core_t *c, const uint32_t *cigar, int32_t tpos, int is_left, int32_t *_tpos)
 {
     // x = pos in ref, y = pos in query seq
     int k, x = c->pos, y = 0, last_y = 0;
@@ -100,8 +100,8 @@ inline int est_indelreg(int pos, const char *ref, int l, char *ins4)
 }
 
 // Identify spft-clip length, position in seq, and clipped seq len
-static inline void get_pos(const bcf_callaux_t *bca, bam_pileup1_t *p,
-                           int *sc_len_r, int *slen_r, int *epos_r, int *end) {
+void get_pos(const bcf_callaux_t *bca, bam_pileup1_t *p,
+             int *sc_len_r, int *slen_r, int *epos_r, int *end) {
     bam1_t *b = p->b;
     int sc_len = 0, sc_dist = -1, at_left = 1;
     int epos = p->qpos, slen = b->core.l_qseq;
@@ -157,6 +157,7 @@ static inline void get_pos(const bcf_callaux_t *bca, bam_pileup1_t *p,
 //
 // Scans the pileup to identify all the different sizes of indels
 // present.
+// types[] returned is sorted by size, from smallest (maybe negative) to largest.
 //
 // Returns types and fills out n_types_r,  max_rd_len_r and ref_type_r,
 //         or NULL on error.
@@ -431,9 +432,9 @@ int bcf_cgp_l_run(const char *ref, int pos) {
 
 // Compute the consensus for this sample 's', minus indels which
 // get added later.
-static char *bcf_cgp_calc_cons(int n, int *n_plp, bam_pileup1_t **plp,
-                               int pos, int *types, int n_types,
-                               int max_ins, int s) {
+char *bcf_cgp_calc_cons(int n, int *n_plp, bam_pileup1_t **plp,
+                        int pos, int *types, int n_types,
+                        int max_ins, int s) {
     int i, j, t, k;
     int *inscns_aux = (int*)calloc(5 * n_types * max_ins, sizeof(int));
     if (!inscns_aux)
diff --git a/bcftools/bcftools.h b/bcftools/bcftools.h
index bba71e3b6..51c2d040f 100644
--- a/bcftools/bcftools.h
+++ b/bcftools/bcftools.h
@@ -1,6 +1,6 @@
 /*  bcftools.h -- utility function declarations.
 
-    Copyright (C) 2013-2023 Genome Research Ltd.
+    Copyright (C) 2013-2024 Genome Research Ltd.
 
     Author: Petr Danecek <pd3@sanger.ac.uk>
 
@@ -29,6 +29,7 @@ THE SOFTWARE.  */
 #include <htslib/hts_defs.h>
 #include <htslib/vcf.h>
 #include <htslib/synced_bcf_reader.h>
+#include <htslib/kfunc.h>
 #include <math.h>
 
 #define FT_TAB_TEXT 0       // custom tab-delimited text file
@@ -50,7 +51,13 @@ void error(const char *format, ...) HTS_NORETURN HTS_FORMAT(HTS_PRINTF_FMT, 1, 2
 void error_errno(const char *format, ...) HTS_NORETURN HTS_FORMAT(HTS_PRINTF_FMT, 1, 2);
 
 // For on the fly index creation with --write-index
-int init_index(htsFile *fh, bcf_hdr_t *hdr, char *fname, char **idx_fname);
+int init_index2(htsFile *fh, bcf_hdr_t *hdr, const char *fname, char **idx_fname, int idx_fmt);
+int init_index(htsFile *fh, bcf_hdr_t *hdr, const char *fname, char **idx_fname);
+
+// Used to set args->write_index in CLI.
+// It will be true if set correctly.
+// Note due to HTS_FMT_CSI being zero we have to use an additional bit.
+int write_index_parse(char *arg);
 
 void bcf_hdr_append_version(bcf_hdr_t *hdr, int argc, char **argv, const char *cmd);
 const char *hts_bcf_wmode(int file_type);
@@ -60,6 +67,10 @@ char *init_tmp_prefix(const char *prefix);
 int read_AF(bcf_sr_regions_t *tgt, bcf1_t *line, double *alt_freq);
 int parse_overlap_option(const char *arg);
 
+// Default sort order: chr,pos,alleles
+int cmp_bcf_pos(const void *aptr, const void *bptr);
+int cmp_bcf_pos_ref_alt(const void *aptr, const void *bptr);
+
 static inline int iupac2bitmask(char iupac)
 {
     const int A = 1;
@@ -121,6 +132,23 @@ static inline double phred_score(double prob)
     return prob>99 ? 99 : prob;
 }
 
+static inline double calc_binom_two_sided(int na, int nb, double aprob)
+{
+    if ( !na && !nb ) return -1;
+    if ( na==nb ) return 1;
+
+    // kfunc.h implements kf_betai, which is the regularized beta function  P(X<=k/N;p) = I_{1-p}(N-k,k+1)
+
+    double prob = na > nb ? 2 * kf_betai(na, nb+1, aprob) : 2 * kf_betai(nb, na+1, aprob);
+
+    if ( prob > 1 ) prob = 1;   // this can happen, machine precision error, eg. kf_betai(1,0,0.5)
+    return prob;
+}
+static inline double calc_binom_one_sided(int na, int nb, double aprob, int ge)
+{
+    return ge ? kf_betai(na, nb + 1, aprob) : kf_betai(nb, na + 1, 1 - aprob);
+}
+
 static const uint64_t bcf_double_missing    = 0x7ff0000000000001;
 static const uint64_t bcf_double_vector_end = 0x7ff0000000000002;
 static inline void bcf_double_set(double *ptr, uint64_t value)
@@ -141,4 +169,16 @@ static inline int bcf_double_test(double d, uint64_t value)
 #define bcf_double_is_missing(x)     bcf_double_test((x),bcf_double_missing)
 #define bcf_double_is_missing_or_vector_end(x)     (bcf_double_test((x),bcf_double_missing) || bcf_double_test((x),bcf_double_vector_end))
 
+static inline int get_unseen_allele(bcf1_t *line)
+{
+    int i;
+    for (i=1; i<line->n_allele; i++)
+    {
+        if ( !strcmp(line->d.allele[i],"<*>") ) return i;
+        if ( !strcmp(line->d.allele[i],"<NON_REF>") ) return i;
+        if ( !strcmp(line->d.allele[i],"<X>") ) return i;
+    }
+    return 0;
+}
+
 #endif
diff --git a/bcftools/bcftools.pysam.h b/bcftools/bcftools.pysam.h
index e6717bb72..a4e2a4232 100644
--- a/bcftools/bcftools.pysam.h
+++ b/bcftools/bcftools.pysam.h
@@ -69,6 +69,12 @@ extern int bcftools_main(int argc, char *argv[]);
 #define bam_smpl_destroy bcftools_bam_smpl_destroy
 #define read_file_list bcftools_read_file_list
 
+/*! A non-static error() function name is used in bcftools, which collides
+    with glibc's error() function and leads to the wrong function being called
+    on some platforms. #define this name with a prefix to avoid this collision.
+ */
+#define error bcftools_error
+
 #endif
 
 #endif
diff --git a/bcftools/call.h b/bcftools/call.h
index 16bf0b68e..090ac019a 100644
--- a/bcftools/call.h
+++ b/bcftools/call.h
@@ -33,7 +33,7 @@ THE SOFTWARE.  */
 #define CALL_VARONLY        (1<<1)
 #define CALL_CONSTR_TRIO    (1<<2)
 #define CALL_CONSTR_ALLELES (1<<3)
-//
+#define CALL_KEEP_UNSEEN    (1<<4)
 #define CALL_FMT_PV4        (1<<5)
 #define CALL_FMT_GQ         (1<<6)
 #define CALL_FMT_GP         (1<<7)
@@ -125,8 +125,7 @@ call_t;
 void error(const char *format, ...);
 
 /*
- *  call() - return -1 value on critical error; -2 to skip the site; or the number of non-reference
- *            alleles on success.
+ *  call() - return -1 value on critical error; -2 to skip the site; or the number of alleles on success
  */
 int mcall(call_t *call, bcf1_t *rec);    // multiallic and rare-variant calling model
 int ccall(call_t *call, bcf1_t *rec);    // the default consensus calling model
diff --git a/bcftools/consensus.c b/bcftools/consensus.c
index 2b58670c7..54f17c221 100644
--- a/bcftools/consensus.c
+++ b/bcftools/consensus.c
@@ -1,6 +1,6 @@
 /* The MIT License
 
-   Copyright (c) 2014-2023 Genome Research Ltd.
+   Copyright (c) 2014-2024 Genome Research Ltd.
 
    Author: Petr Danecek <pd3@sanger.ac.uk>
 
@@ -118,7 +118,7 @@ typedef struct
     char **argv;
     int argc, output_iupac, iupac_GTs, haplotype, allele, isample, napplied;
     uint8_t *iupac_bitmask, *iupac_als;
-    int miupac_bitmask, miupac_als;
+    int miupac_bitmask, miupac_als, regions_overlap;
     char *fname, *ref_fname, *sample, *sample_fname, *output_fname, *mask_fname, *chain_fname, missing_allele, absent_allele;
     char mark_del, mark_ins, mark_snv;
     smpl_ilist_t *smpl;
@@ -229,7 +229,14 @@ static void init_data(args_t *args)
     args->hdr = args->files->readers[0].header;
     args->isample = -1;
     if ( !args->sample )
+    {
         args->smpl = smpl_ilist_init(args->hdr,NULL,0,SMPL_NONE|SMPL_VERBOSE);
+        if ( !args->smpl->n )
+        {
+            smpl_ilist_destroy(args->smpl);
+            args->smpl = NULL;
+        }
+    }
     else if ( args->sample && strcmp("-",args->sample) )
     {
         args->smpl = smpl_ilist_init(args->hdr,args->sample,0,SMPL_NONE|SMPL_VERBOSE);
@@ -244,12 +251,22 @@ static void init_data(args_t *args)
     {
         if ( args->haplotype || args->allele )
         {
-            if ( args->smpl->n > 1 ) error("Too many samples, only one can be used with -H\n");
+            if ( args->smpl->n > 1 ) error("Too many samples, only one can be used with -H; check the -s,-S options\n");
             args->isample = args->smpl->idx[0];
         }
         else
+        {
             args->iupac_GTs = 1;
+            if ( args->smpl->n==1 )
+                fprintf(stderr,"Note: applying IUPAC codes based on FORMAT/GT in sample %s\n",bcf_hdr_int2id(args->hdr,BCF_DT_SAMPLE,args->smpl->idx[0]));
+            else
+                fprintf(stderr,"Note: applying IUPAC codes based on FORMAT/GT in %d samples\n",args->smpl->n);
+        }
     }
+    else if ( args->output_iupac )
+        fprintf(stderr,"Note: applying IUPAC codes based on REF,ALT%s\n",bcf_hdr_nsamples(args->hdr)?", ignoring samples":"");
+    else
+        fprintf(stderr,"Note: applying REF,ALT variants%s\n",bcf_hdr_nsamples(args->hdr)?", ignoring samples":"");
     int i;
     for (i=0; i<args->nmask; i++)
     {
@@ -272,7 +289,6 @@ static void init_data(args_t *args)
         if ( ! args->fp_out ) error("Failed to create %s: %s\n", args->output_fname, strerror(errno));
     }
     else args->fp_out = stdout;
-    if ( args->isample<0 && !args->iupac_GTs ) fprintf(stderr,"Note: the --samples option not given, applying all records regardless of the genotype\n");
     if ( args->filter_str )
         args->filter = filter_init(args->hdr, args->filter_str);
     args->rid = -1;
@@ -348,20 +364,28 @@ static void init_region(args_t *args, char *line)
     args->prev_base_pos = -1;
     args->fa_buf.l  = 0;
     args->fa_length = 0;
-    args->fa_end_pos = to;
-    args->fa_ori_pos = from;
-    args->fa_src_pos = from;
+    args->fa_end_pos = to;      // 0-based
+    args->fa_ori_pos = from;    // 0-based
+    args->fa_src_pos = from;    // 0-based
     args->fa_mod_off = 0;
     args->fa_frz_pos = -1;
     args->fa_frz_mod = -1;
     args->fa_case    = -1;
     args->vcf_rbuf.n = 0;
 
+
+    // bcf_sr_set_regions accepts 1-based coordinates
     kstring_t str = {0,0,0};
-    if ( from==0 ) from = 1;
-    if ( to==0 ) to = HTS_POS_MAX;
+    if ( !from ) from = 1;
+    else from++;
+#ifndef MAX_CSI_COOR
+#define MAX_CSI_COOR ((1LL << (14 + 30)) - 1)
+#endif
+    if ( to==0 ) to = MAX_CSI_COOR - 1;
+    else to++;
     ksprintf(&str,"%s:%"PRIhts_pos"-%"PRIhts_pos,line,from,to);
-    bcf_sr_set_regions(args->files,line,0);
+    bcf_sr_set_opt(args->files,BCF_SR_REGIONS_OVERLAP,args->regions_overlap);
+    bcf_sr_set_regions(args->files,str.s,0);
     free(str.s);
 
     if ( tmp_ptr ) *tmp_ptr = tmp;
@@ -773,6 +797,19 @@ static void apply_variant(args_t *args, bcf1_t *rec)
     else if ( (var_type & VCF_OTHER) && !strncasecmp(rec->d.allele[ialt],"<INS",4) ) trim_beg = 1;
 
     // Overlapping variant?
+    if ( ialt==0 && rec->pos <= args->fa_frz_pos && rec->pos + rec->rlen - 1 > args->fa_frz_pos )
+    {
+        // Applying the reference allele which overlaps a previous deletion. If we are here, it
+        // means it goes beyond the freezed position, hence the record can be trimmed and moved
+        // forward
+        int ntrim = args->fa_frz_pos - rec->pos + 1;
+        int nref  = strlen(rec->d.allele[0]);
+        assert( ntrim < nref );
+        rec->pos  += ntrim;
+        rec->rlen -= ntrim;
+        memmove(rec->d.allele[0],rec->d.allele[0]+ntrim,nref-ntrim);
+        rec->d.allele[0][nref-ntrim] = 0;
+    }
     if ( rec->pos <= args->fa_frz_pos )
     {
         // Can be still OK iff this is an insertion (and which does not follow another insertion, see #888).
@@ -787,18 +824,44 @@ static void apply_variant(args_t *args, bcf1_t *rec)
             fprintf(stderr,"The site %s:%"PRId64" overlaps with another variant, skipping...\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
             return;
         }
-
     }
 
+    char *ref_allele = rec->d.allele[0];
     char *alt_allele = rec->d.allele[ialt];
     int rmme_alt = 0;
 
     int len_diff = 0, alen = 0;
-    int idx = rec->pos - args->fa_ori_pos + args->fa_mod_off;
+    int idx = rec->pos - args->fa_ori_pos + args->fa_mod_off;   // position of the variant within the modified fasta sequence
     if ( idx<0 )
     {
-        fprintf(stderr,"Warning: ignoring overlapping variant starting at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
-        return;
+        if ( alt_allele[0]=='<' )   // symbolic allele
+        {
+            rec->pos  -= idx + 1;
+            rec->rlen += idx + 1;
+            idx = -1;
+        }
+        else if ( strlen(ref_allele) < -idx )   // the ref allele is shorter but overlaps the fa sequence? This should never happen
+        {
+            assert(0);
+            fprintf(stderr,"Warning: ignoring overlapping variant starting at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
+            return;
+        }
+        else if ( strlen(alt_allele) > -idx )  // the ref allele overlaps the fa and so does the alt allele
+        {
+            rec->pos   -= idx;
+            rec->rlen  += idx;
+            ref_allele -= idx;
+            alt_allele -= idx;
+            idx = 0;
+        }
+        else    // the ref allele overlaps the fa but alt allele does not: trim to leave one base before
+        {
+            rec->pos   -= idx + 1;
+            rec->rlen  += idx + 1;
+            ref_allele -= idx + 1;
+            alt_allele += strlen(alt_allele) - 1;
+            idx = -1;
+        }
     }
     if ( rec->rlen > args->fa_buf.l - idx )
     {
@@ -813,8 +876,9 @@ static void apply_variant(args_t *args, bcf1_t *rec)
             }
         }
     }
-    if ( idx>=args->fa_buf.l )
-        error("FIXME: %s:%"PRId64" .. idx=%d, ori_pos=%d, len=%"PRIu64", off=%d\n",bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,idx,args->fa_ori_pos,(uint64_t)args->fa_buf.l,args->fa_mod_off);
+
+    // the variant is beyond the available fasta sequence
+    if ( idx>0 && idx>=args->fa_buf.l ) return;
 
     // sanity check the reference base
     if ( alt_allele[0]=='<' )
@@ -828,7 +892,7 @@ static void apply_variant(args_t *args, bcf1_t *rec)
         if ( !strcasecmp(alt_allele,"<DEL>") )
         {
             static int multibase_ref_del_warned = 0;
-            if ( rec->d.allele[0][1]!=0 && !multibase_ref_del_warned )
+            if ( ref_allele[1]!=0 && !multibase_ref_del_warned )
             {
                 fprintf(stderr,
                     "Warning: one REF base is expected with <DEL>, assuming the actual deletion starts at POS+1 at %s:%"PRId64".\n"
@@ -837,7 +901,7 @@ static void apply_variant(args_t *args, bcf1_t *rec)
             }
             if ( args->mark_del )   // insert dashes instead of delete sequence
             {
-                alt_allele = mark_del(rec->d.allele[0], rec->rlen, NULL, args->mark_del);
+                alt_allele = mark_del(ref_allele, rec->rlen, NULL, args->mark_del);
                 alen = rec->rlen;
                 len_diff = 0;
                 rmme_alt = 1;
@@ -845,7 +909,7 @@ static void apply_variant(args_t *args, bcf1_t *rec)
             else
             {
                 len_diff = 1-rec->rlen;
-                alt_allele = rec->d.allele[0];     // according to VCF spec, the first REF base must precede the event
+                alt_allele = ref_allele;     // according to VCF spec, the first REF base must precede the event
                 alen = 1;
             }
         }
@@ -856,7 +920,7 @@ static void apply_variant(args_t *args, bcf1_t *rec)
             return;
         }
     }
-    else if ( strncasecmp(rec->d.allele[0],args->fa_buf.s+idx,rec->rlen) )
+    else if ( idx>=0 && strncasecmp(ref_allele,args->fa_buf.s+idx,rec->rlen) )
     {
         // This is hacky, handle a special case: if SNP or an insert follows a deletion (AAC>A, C>CAA),
         // the reference base in fa_buf is lost and the check fails. We do not keep a buffer
@@ -864,10 +928,10 @@ static void apply_variant(args_t *args, bcf1_t *rec)
         // one base overlap
 
         int fail = 1;
-        if ( args->prev_base_pos==rec->pos && toupper(rec->d.allele[0][0])==toupper(args->prev_base) )
+        if ( args->prev_base_pos==rec->pos && toupper(ref_allele[0])==toupper(args->prev_base) )
         {
             if ( rec->rlen==1 ) fail = 0;
-            else if ( !strncasecmp(rec->d.allele[0]+1,args->fa_buf.s+idx+1,rec->rlen-1) ) fail = 0;
+            else if ( !strncasecmp(ref_allele+1,args->fa_buf.s+idx+1,rec->rlen-1) ) fail = 0;
         }
 
         if ( fail )
@@ -883,7 +947,7 @@ static void apply_variant(args_t *args, bcf1_t *rec)
                     "   REF .vcf: [%s]\n"
                     "   ALT .vcf: [%s]\n"
                     "   REF .fa : [%s]%c%s\n",
-                    bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1, rec->d.allele[0], alt_allele, args->fa_buf.s+idx,
+                    bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1, ref_allele, alt_allele, args->fa_buf.s+idx,
                     tmp?tmp:' ',tmp?args->fa_buf.s+idx+rec->rlen+1:""
                  );
         }
@@ -892,7 +956,7 @@ static void apply_variant(args_t *args, bcf1_t *rec)
 
         if ( args->mark_del && len_diff<0 )
         {
-            alt_allele = mark_del(rec->d.allele[0], rec->rlen, alt_allele, args->mark_del);
+            alt_allele = mark_del(ref_allele, rec->rlen, alt_allele, args->mark_del);
             alen = rec->rlen;
             len_diff = 0;
             rmme_alt = 1;
@@ -905,23 +969,24 @@ static void apply_variant(args_t *args, bcf1_t *rec)
 
         if ( args->mark_del && len_diff<0 )
         {
-            alt_allele = mark_del(rec->d.allele[0], rec->rlen, alt_allele, args->mark_del);
+            alt_allele = mark_del(ref_allele, rec->rlen, alt_allele, args->mark_del);
             alen = rec->rlen;
             len_diff = 0;
             rmme_alt = 1;
         }
     }
 
-    args->fa_case = toupper(args->fa_buf.s[idx])==args->fa_buf.s[idx] ? TO_UPPER : TO_LOWER;
+    int safe_idx = idx<0 ? 0 : idx; // idx can be negative in case of overlapping deletion
+    args->fa_case = toupper(args->fa_buf.s[safe_idx])==args->fa_buf.s[safe_idx] ? TO_UPPER : TO_LOWER;
     if ( args->fa_case==TO_UPPER )
         for (i=0; i<alen; i++) alt_allele[i] = toupper(alt_allele[i]);
     else
         for (i=0; i<alen; i++) alt_allele[i] = tolower(alt_allele[i]);
 
     if ( args->mark_ins && len_diff>0 )
-        mark_ins(rec->d.allele[0], alt_allele, args->mark_ins);
+        mark_ins(ref_allele, alt_allele, args->mark_ins);
     if ( args->mark_snv )
-        mark_snv(rec->d.allele[0], alt_allele, args->mark_snv);
+        mark_snv(ref_allele, alt_allele, args->mark_snv);
 
     if ( len_diff <= 0 )
     {
@@ -947,13 +1012,13 @@ static void apply_variant(args_t *args, bcf1_t *rec)
         ks_resize(&args->fa_buf, args->fa_buf.l + len_diff);
         memmove(args->fa_buf.s + idx + rec->rlen + len_diff, args->fa_buf.s + idx + rec->rlen, args->fa_buf.l - idx - rec->rlen);
 
-        // This can get tricky, make sure the bases unchanged by the insertion do not overwrite preceeding variants.
+        // This can get tricky, make sure the bases unchanged by the insertion do not overwrite preceding variants.
         // For example, here we want to get TAA:
         //      POS REF ALT
         //      1   C   T
         //      1   C   CAA
         int ibeg = 0;
-        while ( ibeg<alen && rec->d.allele[0][ibeg]==alt_allele[ibeg] && rec->pos + ibeg <= args->prev_base_pos  ) ibeg++;
+        while ( ibeg<alen && ref_allele[ibeg]==alt_allele[ibeg] && rec->pos + ibeg <= args->prev_base_pos  ) ibeg++;
         for (i=ibeg; i<alen; i++)
             args->fa_buf.s[idx+i] = alt_allele[i];
 
@@ -962,7 +1027,7 @@ static void apply_variant(args_t *args, bcf1_t *rec)
     if (args->chain && len_diff != 0)
     {
         // If first nucleotide of both REF and ALT are the same... (indels typically include the nucleotide before the variant)
-        if ( strncasecmp(rec->d.allele[0],alt_allele,1) == 0)
+        if ( strncasecmp(ref_allele,alt_allele,1) == 0)
         {
             // ...extend the block by 1 bp: start is 1 bp further and alleles are 1 bp shorter
             push_chain_gap(args->chain, rec->pos + 1, rec->rlen - 1, rec->pos + 1 + args->fa_mod_off, alen - 1);
@@ -1135,6 +1200,7 @@ static void usage(args_t *args)
     fprintf(stderr, "    -M, --missing CHAR             Output CHAR instead of skipping a missing genotype \"./.\"\n");
     fprintf(stderr, "    -o, --output FILE              Write output to a file [standard output]\n");
     fprintf(stderr, "    -p, --prefix STRING            Prefix to add to output sequence names\n");
+    fprintf(stderr, "        --regions-overlap 0|1|2    Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n");
     fprintf(stderr, "    -s, --samples LIST             Comma-separated list of samples to include, \"-\" to ignore samples and use REF,ALT\n");
     fprintf(stderr, "    -S, --samples-file FILE        File of samples to include\n");
     fprintf(stderr, "Examples:\n");
@@ -1151,6 +1217,7 @@ int main_consensus(int argc, char *argv[])
 {
     args_t *args = (args_t*) calloc(1,sizeof(args_t));
     args->argc   = argc; args->argv = argv;
+    args->regions_overlap = 1;
 
     static struct option loptions[] =
     {
@@ -1172,6 +1239,7 @@ int main_consensus(int argc, char *argv[])
         {"absent",1,0,'a'},
         {"chain",1,0,'c'},
         {"prefix",required_argument,0,'p'},
+        {"regions-overlap",required_argument,0,5},
         {0,0,0,0}
     };
     int c;
@@ -1192,6 +1260,10 @@ int main_consensus(int argc, char *argv[])
                 else if ( !optarg[1] && optarg[0]>32 && optarg[0]<127 ) args->mark_snv = optarg[0];
                 else error("The argument is not recognised: --mark-snv %s\n",optarg);
                 break;
+            case  5 :
+                args->regions_overlap = parse_overlap_option(optarg);
+                if ( args->regions_overlap < 0 ) error("Could not parse: --regions-overlap %s\n",optarg);
+                break;
             case 'p': args->chr_prefix = optarg; break;
             case 's': args->sample = optarg; break;
             case 'S': args->sample_fname = optarg; break;
diff --git a/bcftools/consensus.c.pysam.c b/bcftools/consensus.c.pysam.c
index 9f0826b71..a004f0041 100644
--- a/bcftools/consensus.c.pysam.c
+++ b/bcftools/consensus.c.pysam.c
@@ -2,7 +2,7 @@
 
 /* The MIT License
 
-   Copyright (c) 2014-2023 Genome Research Ltd.
+   Copyright (c) 2014-2024 Genome Research Ltd.
 
    Author: Petr Danecek <pd3@sanger.ac.uk>
 
@@ -120,7 +120,7 @@ typedef struct
     char **argv;
     int argc, output_iupac, iupac_GTs, haplotype, allele, isample, napplied;
     uint8_t *iupac_bitmask, *iupac_als;
-    int miupac_bitmask, miupac_als;
+    int miupac_bitmask, miupac_als, regions_overlap;
     char *fname, *ref_fname, *sample, *sample_fname, *output_fname, *mask_fname, *chain_fname, missing_allele, absent_allele;
     char mark_del, mark_ins, mark_snv;
     smpl_ilist_t *smpl;
@@ -231,7 +231,14 @@ static void init_data(args_t *args)
     args->hdr = args->files->readers[0].header;
     args->isample = -1;
     if ( !args->sample )
+    {
         args->smpl = smpl_ilist_init(args->hdr,NULL,0,SMPL_NONE|SMPL_VERBOSE);
+        if ( !args->smpl->n )
+        {
+            smpl_ilist_destroy(args->smpl);
+            args->smpl = NULL;
+        }
+    }
     else if ( args->sample && strcmp("-",args->sample) )
     {
         args->smpl = smpl_ilist_init(args->hdr,args->sample,0,SMPL_NONE|SMPL_VERBOSE);
@@ -246,12 +253,22 @@ static void init_data(args_t *args)
     {
         if ( args->haplotype || args->allele )
         {
-            if ( args->smpl->n > 1 ) error("Too many samples, only one can be used with -H\n");
+            if ( args->smpl->n > 1 ) error("Too many samples, only one can be used with -H; check the -s,-S options\n");
             args->isample = args->smpl->idx[0];
         }
         else
+        {
             args->iupac_GTs = 1;
+            if ( args->smpl->n==1 )
+                fprintf(bcftools_stderr,"Note: applying IUPAC codes based on FORMAT/GT in sample %s\n",bcf_hdr_int2id(args->hdr,BCF_DT_SAMPLE,args->smpl->idx[0]));
+            else
+                fprintf(bcftools_stderr,"Note: applying IUPAC codes based on FORMAT/GT in %d samples\n",args->smpl->n);
+        }
     }
+    else if ( args->output_iupac )
+        fprintf(bcftools_stderr,"Note: applying IUPAC codes based on REF,ALT%s\n",bcf_hdr_nsamples(args->hdr)?", ignoring samples":"");
+    else
+        fprintf(bcftools_stderr,"Note: applying REF,ALT variants%s\n",bcf_hdr_nsamples(args->hdr)?", ignoring samples":"");
     int i;
     for (i=0; i<args->nmask; i++)
     {
@@ -274,7 +291,6 @@ static void init_data(args_t *args)
         if ( ! args->fp_out ) error("Failed to create %s: %s\n", args->output_fname, strerror(errno));
     }
     else args->fp_out = bcftools_stdout;
-    if ( args->isample<0 && !args->iupac_GTs ) fprintf(bcftools_stderr,"Note: the --samples option not given, applying all records regardless of the genotype\n");
     if ( args->filter_str )
         args->filter = filter_init(args->hdr, args->filter_str);
     args->rid = -1;
@@ -350,20 +366,28 @@ static void init_region(args_t *args, char *line)
     args->prev_base_pos = -1;
     args->fa_buf.l  = 0;
     args->fa_length = 0;
-    args->fa_end_pos = to;
-    args->fa_ori_pos = from;
-    args->fa_src_pos = from;
+    args->fa_end_pos = to;      // 0-based
+    args->fa_ori_pos = from;    // 0-based
+    args->fa_src_pos = from;    // 0-based
     args->fa_mod_off = 0;
     args->fa_frz_pos = -1;
     args->fa_frz_mod = -1;
     args->fa_case    = -1;
     args->vcf_rbuf.n = 0;
 
+
+    // bcf_sr_set_regions accepts 1-based coordinates
     kstring_t str = {0,0,0};
-    if ( from==0 ) from = 1;
-    if ( to==0 ) to = HTS_POS_MAX;
+    if ( !from ) from = 1;
+    else from++;
+#ifndef MAX_CSI_COOR
+#define MAX_CSI_COOR ((1LL << (14 + 30)) - 1)
+#endif
+    if ( to==0 ) to = MAX_CSI_COOR - 1;
+    else to++;
     ksprintf(&str,"%s:%"PRIhts_pos"-%"PRIhts_pos,line,from,to);
-    bcf_sr_set_regions(args->files,line,0);
+    bcf_sr_set_opt(args->files,BCF_SR_REGIONS_OVERLAP,args->regions_overlap);
+    bcf_sr_set_regions(args->files,str.s,0);
     free(str.s);
 
     if ( tmp_ptr ) *tmp_ptr = tmp;
@@ -775,6 +799,19 @@ static void apply_variant(args_t *args, bcf1_t *rec)
     else if ( (var_type & VCF_OTHER) && !strncasecmp(rec->d.allele[ialt],"<INS",4) ) trim_beg = 1;
 
     // Overlapping variant?
+    if ( ialt==0 && rec->pos <= args->fa_frz_pos && rec->pos + rec->rlen - 1 > args->fa_frz_pos )
+    {
+        // Applying the reference allele which overlaps a previous deletion. If we are here, it
+        // means it goes beyond the freezed position, hence the record can be trimmed and moved
+        // forward
+        int ntrim = args->fa_frz_pos - rec->pos + 1;
+        int nref  = strlen(rec->d.allele[0]);
+        assert( ntrim < nref );
+        rec->pos  += ntrim;
+        rec->rlen -= ntrim;
+        memmove(rec->d.allele[0],rec->d.allele[0]+ntrim,nref-ntrim);
+        rec->d.allele[0][nref-ntrim] = 0;
+    }
     if ( rec->pos <= args->fa_frz_pos )
     {
         // Can be still OK iff this is an insertion (and which does not follow another insertion, see #888).
@@ -789,18 +826,44 @@ static void apply_variant(args_t *args, bcf1_t *rec)
             fprintf(bcftools_stderr,"The site %s:%"PRId64" overlaps with another variant, skipping...\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
             return;
         }
-
     }
 
+    char *ref_allele = rec->d.allele[0];
     char *alt_allele = rec->d.allele[ialt];
     int rmme_alt = 0;
 
     int len_diff = 0, alen = 0;
-    int idx = rec->pos - args->fa_ori_pos + args->fa_mod_off;
+    int idx = rec->pos - args->fa_ori_pos + args->fa_mod_off;   // position of the variant within the modified fasta sequence
     if ( idx<0 )
     {
-        fprintf(bcftools_stderr,"Warning: ignoring overlapping variant starting at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
-        return;
+        if ( alt_allele[0]=='<' )   // symbolic allele
+        {
+            rec->pos  -= idx + 1;
+            rec->rlen += idx + 1;
+            idx = -1;
+        }
+        else if ( strlen(ref_allele) < -idx )   // the ref allele is shorter but overlaps the fa sequence? This should never happen
+        {
+            assert(0);
+            fprintf(bcftools_stderr,"Warning: ignoring overlapping variant starting at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
+            return;
+        }
+        else if ( strlen(alt_allele) > -idx )  // the ref allele overlaps the fa and so does the alt allele
+        {
+            rec->pos   -= idx;
+            rec->rlen  += idx;
+            ref_allele -= idx;
+            alt_allele -= idx;
+            idx = 0;
+        }
+        else    // the ref allele overlaps the fa but alt allele does not: trim to leave one base before
+        {
+            rec->pos   -= idx + 1;
+            rec->rlen  += idx + 1;
+            ref_allele -= idx + 1;
+            alt_allele += strlen(alt_allele) - 1;
+            idx = -1;
+        }
     }
     if ( rec->rlen > args->fa_buf.l - idx )
     {
@@ -815,8 +878,9 @@ static void apply_variant(args_t *args, bcf1_t *rec)
             }
         }
     }
-    if ( idx>=args->fa_buf.l )
-        error("FIXME: %s:%"PRId64" .. idx=%d, ori_pos=%d, len=%"PRIu64", off=%d\n",bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,idx,args->fa_ori_pos,(uint64_t)args->fa_buf.l,args->fa_mod_off);
+
+    // the variant is beyond the available fasta sequence
+    if ( idx>0 && idx>=args->fa_buf.l ) return;
 
     // sanity check the reference base
     if ( alt_allele[0]=='<' )
@@ -830,7 +894,7 @@ static void apply_variant(args_t *args, bcf1_t *rec)
         if ( !strcasecmp(alt_allele,"<DEL>") )
         {
             static int multibase_ref_del_warned = 0;
-            if ( rec->d.allele[0][1]!=0 && !multibase_ref_del_warned )
+            if ( ref_allele[1]!=0 && !multibase_ref_del_warned )
             {
                 fprintf(bcftools_stderr,
                     "Warning: one REF base is expected with <DEL>, assuming the actual deletion starts at POS+1 at %s:%"PRId64".\n"
@@ -839,7 +903,7 @@ static void apply_variant(args_t *args, bcf1_t *rec)
             }
             if ( args->mark_del )   // insert dashes instead of delete sequence
             {
-                alt_allele = mark_del(rec->d.allele[0], rec->rlen, NULL, args->mark_del);
+                alt_allele = mark_del(ref_allele, rec->rlen, NULL, args->mark_del);
                 alen = rec->rlen;
                 len_diff = 0;
                 rmme_alt = 1;
@@ -847,7 +911,7 @@ static void apply_variant(args_t *args, bcf1_t *rec)
             else
             {
                 len_diff = 1-rec->rlen;
-                alt_allele = rec->d.allele[0];     // according to VCF spec, the first REF base must precede the event
+                alt_allele = ref_allele;     // according to VCF spec, the first REF base must precede the event
                 alen = 1;
             }
         }
@@ -858,7 +922,7 @@ static void apply_variant(args_t *args, bcf1_t *rec)
             return;
         }
     }
-    else if ( strncasecmp(rec->d.allele[0],args->fa_buf.s+idx,rec->rlen) )
+    else if ( idx>=0 && strncasecmp(ref_allele,args->fa_buf.s+idx,rec->rlen) )
     {
         // This is hacky, handle a special case: if SNP or an insert follows a deletion (AAC>A, C>CAA),
         // the reference base in fa_buf is lost and the check fails. We do not keep a buffer
@@ -866,10 +930,10 @@ static void apply_variant(args_t *args, bcf1_t *rec)
         // one base overlap
 
         int fail = 1;
-        if ( args->prev_base_pos==rec->pos && toupper(rec->d.allele[0][0])==toupper(args->prev_base) )
+        if ( args->prev_base_pos==rec->pos && toupper(ref_allele[0])==toupper(args->prev_base) )
         {
             if ( rec->rlen==1 ) fail = 0;
-            else if ( !strncasecmp(rec->d.allele[0]+1,args->fa_buf.s+idx+1,rec->rlen-1) ) fail = 0;
+            else if ( !strncasecmp(ref_allele+1,args->fa_buf.s+idx+1,rec->rlen-1) ) fail = 0;
         }
 
         if ( fail )
@@ -885,7 +949,7 @@ static void apply_variant(args_t *args, bcf1_t *rec)
                     "   REF .vcf: [%s]\n"
                     "   ALT .vcf: [%s]\n"
                     "   REF .fa : [%s]%c%s\n",
-                    bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1, rec->d.allele[0], alt_allele, args->fa_buf.s+idx,
+                    bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1, ref_allele, alt_allele, args->fa_buf.s+idx,
                     tmp?tmp:' ',tmp?args->fa_buf.s+idx+rec->rlen+1:""
                  );
         }
@@ -894,7 +958,7 @@ static void apply_variant(args_t *args, bcf1_t *rec)
 
         if ( args->mark_del && len_diff<0 )
         {
-            alt_allele = mark_del(rec->d.allele[0], rec->rlen, alt_allele, args->mark_del);
+            alt_allele = mark_del(ref_allele, rec->rlen, alt_allele, args->mark_del);
             alen = rec->rlen;
             len_diff = 0;
             rmme_alt = 1;
@@ -907,23 +971,24 @@ static void apply_variant(args_t *args, bcf1_t *rec)
 
         if ( args->mark_del && len_diff<0 )
         {
-            alt_allele = mark_del(rec->d.allele[0], rec->rlen, alt_allele, args->mark_del);
+            alt_allele = mark_del(ref_allele, rec->rlen, alt_allele, args->mark_del);
             alen = rec->rlen;
             len_diff = 0;
             rmme_alt = 1;
         }
     }
 
-    args->fa_case = toupper(args->fa_buf.s[idx])==args->fa_buf.s[idx] ? TO_UPPER : TO_LOWER;
+    int safe_idx = idx<0 ? 0 : idx; // idx can be negative in case of overlapping deletion
+    args->fa_case = toupper(args->fa_buf.s[safe_idx])==args->fa_buf.s[safe_idx] ? TO_UPPER : TO_LOWER;
     if ( args->fa_case==TO_UPPER )
         for (i=0; i<alen; i++) alt_allele[i] = toupper(alt_allele[i]);
     else
         for (i=0; i<alen; i++) alt_allele[i] = tolower(alt_allele[i]);
 
     if ( args->mark_ins && len_diff>0 )
-        mark_ins(rec->d.allele[0], alt_allele, args->mark_ins);
+        mark_ins(ref_allele, alt_allele, args->mark_ins);
     if ( args->mark_snv )
-        mark_snv(rec->d.allele[0], alt_allele, args->mark_snv);
+        mark_snv(ref_allele, alt_allele, args->mark_snv);
 
     if ( len_diff <= 0 )
     {
@@ -949,13 +1014,13 @@ static void apply_variant(args_t *args, bcf1_t *rec)
         ks_resize(&args->fa_buf, args->fa_buf.l + len_diff);
         memmove(args->fa_buf.s + idx + rec->rlen + len_diff, args->fa_buf.s + idx + rec->rlen, args->fa_buf.l - idx - rec->rlen);
 
-        // This can get tricky, make sure the bases unchanged by the insertion do not overwrite preceeding variants.
+        // This can get tricky, make sure the bases unchanged by the insertion do not overwrite preceding variants.
         // For example, here we want to get TAA:
         //      POS REF ALT
         //      1   C   T
         //      1   C   CAA
         int ibeg = 0;
-        while ( ibeg<alen && rec->d.allele[0][ibeg]==alt_allele[ibeg] && rec->pos + ibeg <= args->prev_base_pos  ) ibeg++;
+        while ( ibeg<alen && ref_allele[ibeg]==alt_allele[ibeg] && rec->pos + ibeg <= args->prev_base_pos  ) ibeg++;
         for (i=ibeg; i<alen; i++)
             args->fa_buf.s[idx+i] = alt_allele[i];
 
@@ -964,7 +1029,7 @@ static void apply_variant(args_t *args, bcf1_t *rec)
     if (args->chain && len_diff != 0)
     {
         // If first nucleotide of both REF and ALT are the same... (indels typically include the nucleotide before the variant)
-        if ( strncasecmp(rec->d.allele[0],alt_allele,1) == 0)
+        if ( strncasecmp(ref_allele,alt_allele,1) == 0)
         {
             // ...extend the block by 1 bp: start is 1 bp further and alleles are 1 bp shorter
             push_chain_gap(args->chain, rec->pos + 1, rec->rlen - 1, rec->pos + 1 + args->fa_mod_off, alen - 1);
@@ -1137,6 +1202,7 @@ static void usage(args_t *args)
     fprintf(bcftools_stderr, "    -M, --missing CHAR             Output CHAR instead of skipping a missing genotype \"./.\"\n");
     fprintf(bcftools_stderr, "    -o, --output FILE              Write output to a file [standard output]\n");
     fprintf(bcftools_stderr, "    -p, --prefix STRING            Prefix to add to output sequence names\n");
+    fprintf(bcftools_stderr, "        --regions-overlap 0|1|2    Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n");
     fprintf(bcftools_stderr, "    -s, --samples LIST             Comma-separated list of samples to include, \"-\" to ignore samples and use REF,ALT\n");
     fprintf(bcftools_stderr, "    -S, --samples-file FILE        File of samples to include\n");
     fprintf(bcftools_stderr, "Examples:\n");
@@ -1153,6 +1219,7 @@ int main_consensus(int argc, char *argv[])
 {
     args_t *args = (args_t*) calloc(1,sizeof(args_t));
     args->argc   = argc; args->argv = argv;
+    args->regions_overlap = 1;
 
     static struct option loptions[] =
     {
@@ -1174,6 +1241,7 @@ int main_consensus(int argc, char *argv[])
         {"absent",1,0,'a'},
         {"chain",1,0,'c'},
         {"prefix",required_argument,0,'p'},
+        {"regions-overlap",required_argument,0,5},
         {0,0,0,0}
     };
     int c;
@@ -1194,6 +1262,10 @@ int main_consensus(int argc, char *argv[])
                 else if ( !optarg[1] && optarg[0]>32 && optarg[0]<127 ) args->mark_snv = optarg[0];
                 else error("The argument is not recognised: --mark-snv %s\n",optarg);
                 break;
+            case  5 :
+                args->regions_overlap = parse_overlap_option(optarg);
+                if ( args->regions_overlap < 0 ) error("Could not parse: --regions-overlap %s\n",optarg);
+                break;
             case 'p': args->chr_prefix = optarg; break;
             case 's': args->sample = optarg; break;
             case 'S': args->sample_fname = optarg; break;
diff --git a/bcftools/convert.c b/bcftools/convert.c
index 07ff01862..c459c8387 100644
--- a/bcftools/convert.c
+++ b/bcftools/convert.c
@@ -1,6 +1,6 @@
 /*  convert.c -- functions for converting between VCF/BCF and related formats.
 
-    Copyright (C) 2013-2023 Genome Research Ltd.
+    Copyright (C) 2013-2024 Genome Research Ltd.
 
     Author: Petr Danecek <pd3@sanger.ac.uk>
 
@@ -31,6 +31,7 @@ THE SOFTWARE.  */
 #include <errno.h>
 #include <sys/stat.h>
 #include <sys/types.h>
+#include <stdint.h>
 #define __STDC_FORMAT_MACROS
 #include <inttypes.h>
 #include <math.h>
@@ -39,6 +40,7 @@ THE SOFTWARE.  */
 #include <htslib/vcfutils.h>
 #include <htslib/kfunc.h>
 #include <htslib/khash_str2int.h>
+#include <htslib/hts_endian.h>
 #include "bcftools.h"
 #include "variantkey.h"
 #include "convert.h"
@@ -104,9 +106,12 @@ struct _convert_t
     char *undef_info_tag;
     void *used_tags_hash;
     char **used_tags_list;
+    char *print_filtered;
     int nused_tags;
     int allow_undef_tags;
     int force_newline;
+    int header_samples;
+    int no_hdr_indices;
     uint8_t **subset_samples;
 };
 
@@ -172,23 +177,23 @@ static void process_filter(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isa
     }
     else kputc('.', str);
 }
-static inline int32_t bcf_array_ivalue(void *bcf_array, int type, int idx)
+static inline int32_t bcf_array_ivalue(uint8_t *bcf_array, int type, int idx)
 {
     if ( type==BCF_BT_INT8 )
     {
-        int8_t val = ((int8_t*)bcf_array)[idx];
+        int8_t val = le_to_i8(&bcf_array[idx * sizeof(val)]);
         if ( val==bcf_int8_missing ) return bcf_int32_missing;
         if ( val==bcf_int8_vector_end ) return bcf_int32_vector_end;
         return val;
     }
     if ( type==BCF_BT_INT16 )
     {
-        int16_t val = ((int16_t*)bcf_array)[idx];
+        int16_t val = le_to_i16(&bcf_array[idx * sizeof(val)]);
         if ( val==bcf_int16_missing ) return bcf_int32_missing;
         if ( val==bcf_int16_vector_end ) return bcf_int32_vector_end;
         return val;
     }
-    return ((int32_t*)bcf_array)[idx];
+    return le_to_i32(&bcf_array[idx * sizeof(int32_t)]);
 }
 static inline void _copy_field(char *src, uint32_t len, int idx, kstring_t *str)
 {
@@ -286,17 +291,17 @@ static void process_info(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isamp
             kputc('.', str);
             return;
         }
-        #define BRANCH(type_t, is_missing, is_vector_end, kprint) { \
-            type_t val = ((type_t *) info->vptr)[fmt->subscript]; \
+        #define BRANCH(type_t, convert, is_missing, is_vector_end, kprint) { \
+            type_t val = convert(&info->vptr[fmt->subscript * sizeof(type_t)]); \
             if ( is_missing || is_vector_end ) kputc('.',str); \
             else kprint; \
         }
         switch (info->type)
         {
-            case BCF_BT_INT8:  BRANCH(int8_t,  val==bcf_int8_missing,  val==bcf_int8_vector_end,  kputw(val, str)); break;
-            case BCF_BT_INT16: BRANCH(int16_t, val==bcf_int16_missing, val==bcf_int16_vector_end, kputw(val, str)); break;
-            case BCF_BT_INT32: BRANCH(int32_t, val==bcf_int32_missing, val==bcf_int32_vector_end, kputw(val, str)); break;
-            case BCF_BT_FLOAT: BRANCH(float,   bcf_float_is_missing(val), bcf_float_is_vector_end(val), kputd(val, str)); break;
+            case BCF_BT_INT8:  BRANCH(int8_t,  le_to_i8,  val==bcf_int8_missing,  val==bcf_int8_vector_end,  kputw(val, str)); break;
+            case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, val==bcf_int16_missing, val==bcf_int16_vector_end, kputw(val, str)); break;
+            case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, val==bcf_int32_missing, val==bcf_int32_vector_end, kputw(val, str)); break;
+            case BCF_BT_FLOAT: BRANCH(float,   le_to_float, bcf_float_is_missing(val), bcf_float_is_vector_end(val), kputd(val, str)); break;
             case BCF_BT_CHAR:  _copy_field((char*)info->vptr, info->vptr_len, fmt->subscript, str); break;
             default: fprintf(stderr,"todo: type %d\n", info->type); exit(1); break;
         }
@@ -384,11 +389,12 @@ static void process_format(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isa
         }
         if ( fmt->fmt->type == BCF_BT_FLOAT )
         {
-            float *ptr = (float*)(fmt->fmt->p + isample*fmt->fmt->size);
-            if ( bcf_float_is_missing(ptr[fmt->subscript]) || bcf_float_is_vector_end(ptr[fmt->subscript]) )
+            uint8_t *ptr = fmt->fmt->p + isample*fmt->fmt->size;
+            float val = le_to_float(&ptr[fmt->subscript * sizeof(float)]);
+            if ( bcf_float_is_missing(val) || bcf_float_is_vector_end(val) )
                 kputc('.', str);
             else
-                kputd(ptr[fmt->subscript], str);
+                kputd(val, str);
         }
         else if ( fmt->fmt->type != BCF_BT_CHAR )
         {
@@ -501,14 +507,14 @@ static void process_tbcsq(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isam
 
     int mask = fmt->subscript==0 ? 3 : 1;   // merge both haplotypes if subscript==0
 
-    #define BRANCH(type_t, nbits) { \
-        type_t *x = (type_t*)(fmt->fmt->p + isample*fmt->fmt->size); \
+    #define BRANCH(type_t, convert, nbits) { \
+        uint8_t *x = fmt->fmt->p + isample*fmt->fmt->size; \
         int i,j; \
         if ( fmt->subscript<=0 || fmt->subscript==1 ) \
         { \
             for (j=0; j < fmt->fmt->n; j++) \
             { \
-                type_t val = x[j]; \
+                type_t val = convert(&x[j * sizeof(type_t)]); \
                 if ( !val ) continue; \
                 for (i=0; i<nbits; i+=2) \
                     if ( val & (mask<<i) ) { kputs(csq->str[(j*30+i)/2], &csq->hap1); kputc_(',', &csq->hap1); } \
@@ -518,7 +524,7 @@ static void process_tbcsq(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isam
         { \
             for (j=0; j < fmt->fmt->n; j++) \
             { \
-                type_t val = x[j]; \
+                type_t val = convert(&x[j * sizeof(type_t)]); \
                 if ( !val ) continue; \
                 for (i=1; i<nbits; i+=2) \
                     if ( val & (1<<i) ) { kputs(csq->str[(j*30+i)/2], &csq->hap2); kputc_(',', &csq->hap2); } \
@@ -527,9 +533,9 @@ static void process_tbcsq(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isam
     }
     switch (fmt->fmt->type)
     {
-        case BCF_BT_INT8:  BRANCH(uint8_t, 8); break;
-        case BCF_BT_INT16: BRANCH(uint16_t,16); break;
-        case BCF_BT_INT32: BRANCH(uint32_t,30); break;  // 2 bytes unused to account for the reserved BCF values
+        case BCF_BT_INT8:  BRANCH(uint8_t,  le_to_u8,   8); break;
+        case BCF_BT_INT16: BRANCH(uint16_t, le_to_u16, 16); break;
+        case BCF_BT_INT32: BRANCH(uint32_t, le_to_u32, 30); break;  // 2 bits unused to account for the reserved BCF values
         default: error("Unexpected type: %d\n", fmt->fmt->type); exit(1); break;
     }
     #undef BRANCH
@@ -1185,16 +1191,16 @@ static void process_pbinom(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isa
         int al = bcf_gt_allele(gt[i]);
         if ( al > line->n_allele || al >= fmt->fmt->n ) goto invalid;
 
-        #define BRANCH(type_t, missing, vector_end) { \
-            type_t val = ((type_t *) fmt->fmt->p)[al + isample*fmt->fmt->n]; \
+        #define BRANCH(type_t, convert, missing, vector_end) { \
+            type_t val = convert(&fmt->fmt->p[(al + isample*fmt->fmt->n)*sizeof(type_t)]); \
             if ( val==missing || val==vector_end ) goto invalid; \
             else n[i] = val; \
         }
         switch (fmt->fmt->type)
         {
-            case BCF_BT_INT8:  BRANCH(int8_t,  bcf_int8_missing,  bcf_int8_vector_end); break;
-            case BCF_BT_INT16: BRANCH(int16_t, bcf_int16_missing, bcf_int16_vector_end); break;
-            case BCF_BT_INT32: BRANCH(int32_t, bcf_int32_missing, bcf_int32_vector_end); break;
+            case BCF_BT_INT8:  BRANCH(int8_t,  le_to_i8,  bcf_int8_missing,  bcf_int8_vector_end); break;
+            case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, bcf_int16_missing, bcf_int16_vector_end); break;
+            case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, bcf_int32_missing, bcf_int32_vector_end); break;
             default: goto invalid; break;
         }
         #undef BRANCH
@@ -1203,11 +1209,11 @@ static void process_pbinom(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isa
     if ( n[0]==n[1] ) kputc(n[0]==0 ? '.':'0', str);
     else
     {
-        double pval = n[0] < n[1] ? kf_betai(n[1], n[0] + 1, 0.5) : kf_betai(n[0], n[1] + 1, 0.5);
-        pval *= 2;
-        if ( pval>=1 ) pval = 0;     // this can happen, machine precision error, eg. kf_betai(1,0,0.5)
-        else
-            pval = -4.34294481903*log(pval);
+        double pval = calc_binom_two_sided(n[0],n[1],0.5);
+
+        // convrt to phred
+        if ( pval>=1 ) pval = 0;
+        else pval = -4.34294481903*log(pval);
         kputd(pval, str);
     }
     return;
@@ -1550,6 +1556,7 @@ void convert_destroy(convert_t *convert)
         free(convert->used_tags_list);
     }
     khash_str2int_destroy(convert->used_tags_hash);
+    free(convert->print_filtered);
     free(convert->fmt);
     free(convert->undef_info_tag);
     free(convert->dat);
@@ -1562,8 +1569,9 @@ void convert_destroy(convert_t *convert)
 int convert_header(convert_t *convert, kstring_t *str)
 {
     int i, icol = 0, l_ori = str->l;
+    bcf_hdr_t *hdr = convert->header;
 
-    // Supress the header output if LINE is present
+    // Suppress the header output if LINE is present
     for (i=0; i<convert->nfmt; i++)
         if ( convert->fmt[i].type == T_LINE ) break;
     if ( i!=convert->nfmt )
@@ -1585,6 +1593,7 @@ int convert_header(convert_t *convert, kstring_t *str)
             while ( convert->fmt[j].is_gt_field ) j++;
             for (js=0; js<convert->nsamples; js++)
             {
+                int ks = convert->samples[js];
                 for (k=i; k<j; k++)
                 {
                     if ( convert->fmt[k].type == T_SEP )
@@ -1600,10 +1609,29 @@ int convert_header(convert_t *convert, kstring_t *str)
                             }
                         }
                     }
+                    else if ( convert->header_samples )
+                    {
+                        icol++;
+                        if ( !convert->no_hdr_indices ) ksprintf(str,"[%d]",icol);
+                        ksprintf(str,"%s:%s", hdr->samples[ks], convert->fmt[k].key);
+                    }
                     else
-                        ksprintf(str, "[%d]%s", ++icol, convert->fmt[k].key);
+                    {
+                        icol++;
+                        if ( !convert->no_hdr_indices ) ksprintf(str,"[%d]",icol);
+                        ksprintf(str,"%s", convert->fmt[k].key);
+                    }
+                }
+                if ( has_fmt_newline )
+                {
+                    if ( !convert->header_samples ) break;
+
+                    // this is unfortunate: the formatting expression breaks the per-sample output into separate lines,
+                    // therefore including a sample name in the header makes no sense anymore
+                    convert->header_samples = 0;
+                    str->l = l_ori;
+                    return convert_header(convert, str);
                 }
-                if ( has_fmt_newline ) break;
             }
             i = j-1;
             continue;
@@ -1614,7 +1642,9 @@ int convert_header(convert_t *convert, kstring_t *str)
             if ( convert->fmt[i].key ) kputs(convert->fmt[i].key, str);
             continue;
         }
-        ksprintf(str, "[%d]%s", ++icol, convert->fmt[i].key);
+        icol++;
+        if ( !convert->no_hdr_indices ) ksprintf(str,"[%d]",icol);
+        ksprintf(str,"%s", convert->fmt[i].key);
     }
     if ( has_fmt_newline ) kputc('\n',str);
     return str->l - l_ori;
@@ -1653,7 +1683,17 @@ int convert_line(convert_t *convert, bcf1_t *line, kstring_t *str)
             {
                 // Skip samples when filtering was requested
                 int ks = convert->samples[js];
-                if ( convert->subset_samples && *convert->subset_samples && !(*convert->subset_samples)[ks] ) continue;
+                if ( convert->subset_samples && *convert->subset_samples && !(*convert->subset_samples)[ks] )
+                {
+                    if ( !convert->print_filtered ) continue;
+
+                    for (k=i; k<j; k++)
+                        if ( convert->fmt[k].type==T_SEP )
+                            convert->fmt[k].handler(convert, line, &convert->fmt[k], ks, str);
+                        else
+                            kputs(convert->print_filtered, str);
+                    continue;
+                }
 
                 // Here comes a hack designed for TBCSQ. When running on large files,
                 // such as 1000GP, there are too many empty fields in the output and
@@ -1709,29 +1749,18 @@ static void force_newline_(convert_t *convert)
     }
     if ( has_newline ) return;
 
-    // A newline is not present, force it. But where to add it?
-    // Consider
-    //      -f'%CHROM[ %SAMPLE]\n'
-    // vs
-    //      -f'[%CHROM %SAMPLE\n]'
-    for (i=0; i<convert->nfmt; i++)
-        if ( !convert->fmt[i].is_gt_field && convert->fmt[i].key ) break;
-
-    if ( i < convert->nfmt )
-        register_tag(convert, "\n", 0, T_SEP);  // the first case
-    else
-    {
-        // the second case
-        i = convert->nfmt - 1;
-        if ( !convert->fmt[i].key )
-        {
-            convert->fmt[i].key = strdup("\n");
-            convert->fmt[i].is_gt_field = 1;
-            register_tag(convert, NULL, 0, T_SEP);
-        }
-        else
-            register_tag(convert, "\n", 1, T_SEP);
-    }
+    // A newline is not present, force it. But where to add it? Always at the end.
+    //
+    // Briefly, in 1.18, we considered the following automatic behavior, which for
+    // per-site output it would add it at the end of the expression and for per-sample
+    // output it would add it inside the square brackets:
+    //           -f'%CHROM[ %SAMPLE]\n'
+    //           -f'[%CHROM %SAMPLE\n]'
+    //
+    // However, this is an annoyance for users, as it is not entirely clear what
+    // will happen unless one understands the internals well (#1969)
+
+    register_tag(convert, "\n", 0, T_SEP);
 }
 
 int convert_set_option(convert_t *convert, enum convert_option opt, ...)
@@ -1748,10 +1777,19 @@ int convert_set_option(convert_t *convert, enum convert_option opt, ...)
         case subset_samples:
             convert->subset_samples = va_arg(args, uint8_t**);
             break;
+        case header_samples:
+            convert->header_samples = va_arg(args, int);
+            break;
+        case print_filtered:
+            convert->print_filtered = strdup(va_arg(args, char*));
+            break;
         case force_newline:
             convert->force_newline = va_arg(args, int);
             if ( convert->force_newline ) force_newline_(convert);
             break;
+        case no_hdr_indices:
+            convert->no_hdr_indices = va_arg(args, int);
+            break;
         default:
             ret = -1;
     }
diff --git a/bcftools/convert.c.pysam.c b/bcftools/convert.c.pysam.c
index 09a7648cc..e7d2905c2 100644
--- a/bcftools/convert.c.pysam.c
+++ b/bcftools/convert.c.pysam.c
@@ -2,7 +2,7 @@
 
 /*  convert.c -- functions for converting between VCF/BCF and related formats.
 
-    Copyright (C) 2013-2023 Genome Research Ltd.
+    Copyright (C) 2013-2024 Genome Research Ltd.
 
     Author: Petr Danecek <pd3@sanger.ac.uk>
 
@@ -33,6 +33,7 @@ THE SOFTWARE.  */
 #include <errno.h>
 #include <sys/stat.h>
 #include <sys/types.h>
+#include <stdint.h>
 #define __STDC_FORMAT_MACROS
 #include <inttypes.h>
 #include <math.h>
@@ -41,6 +42,7 @@ THE SOFTWARE.  */
 #include <htslib/vcfutils.h>
 #include <htslib/kfunc.h>
 #include <htslib/khash_str2int.h>
+#include <htslib/hts_endian.h>
 #include "bcftools.h"
 #include "variantkey.h"
 #include "convert.h"
@@ -106,9 +108,12 @@ struct _convert_t
     char *undef_info_tag;
     void *used_tags_hash;
     char **used_tags_list;
+    char *print_filtered;
     int nused_tags;
     int allow_undef_tags;
     int force_newline;
+    int header_samples;
+    int no_hdr_indices;
     uint8_t **subset_samples;
 };
 
@@ -174,23 +179,23 @@ static void process_filter(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isa
     }
     else kputc('.', str);
 }
-static inline int32_t bcf_array_ivalue(void *bcf_array, int type, int idx)
+static inline int32_t bcf_array_ivalue(uint8_t *bcf_array, int type, int idx)
 {
     if ( type==BCF_BT_INT8 )
     {
-        int8_t val = ((int8_t*)bcf_array)[idx];
+        int8_t val = le_to_i8(&bcf_array[idx * sizeof(val)]);
         if ( val==bcf_int8_missing ) return bcf_int32_missing;
         if ( val==bcf_int8_vector_end ) return bcf_int32_vector_end;
         return val;
     }
     if ( type==BCF_BT_INT16 )
     {
-        int16_t val = ((int16_t*)bcf_array)[idx];
+        int16_t val = le_to_i16(&bcf_array[idx * sizeof(val)]);
         if ( val==bcf_int16_missing ) return bcf_int32_missing;
         if ( val==bcf_int16_vector_end ) return bcf_int32_vector_end;
         return val;
     }
-    return ((int32_t*)bcf_array)[idx];
+    return le_to_i32(&bcf_array[idx * sizeof(int32_t)]);
 }
 static inline void _copy_field(char *src, uint32_t len, int idx, kstring_t *str)
 {
@@ -288,17 +293,17 @@ static void process_info(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isamp
             kputc('.', str);
             return;
         }
-        #define BRANCH(type_t, is_missing, is_vector_end, kprint) { \
-            type_t val = ((type_t *) info->vptr)[fmt->subscript]; \
+        #define BRANCH(type_t, convert, is_missing, is_vector_end, kprint) { \
+            type_t val = convert(&info->vptr[fmt->subscript * sizeof(type_t)]); \
             if ( is_missing || is_vector_end ) kputc('.',str); \
             else kprint; \
         }
         switch (info->type)
         {
-            case BCF_BT_INT8:  BRANCH(int8_t,  val==bcf_int8_missing,  val==bcf_int8_vector_end,  kputw(val, str)); break;
-            case BCF_BT_INT16: BRANCH(int16_t, val==bcf_int16_missing, val==bcf_int16_vector_end, kputw(val, str)); break;
-            case BCF_BT_INT32: BRANCH(int32_t, val==bcf_int32_missing, val==bcf_int32_vector_end, kputw(val, str)); break;
-            case BCF_BT_FLOAT: BRANCH(float,   bcf_float_is_missing(val), bcf_float_is_vector_end(val), kputd(val, str)); break;
+            case BCF_BT_INT8:  BRANCH(int8_t,  le_to_i8,  val==bcf_int8_missing,  val==bcf_int8_vector_end,  kputw(val, str)); break;
+            case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, val==bcf_int16_missing, val==bcf_int16_vector_end, kputw(val, str)); break;
+            case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, val==bcf_int32_missing, val==bcf_int32_vector_end, kputw(val, str)); break;
+            case BCF_BT_FLOAT: BRANCH(float,   le_to_float, bcf_float_is_missing(val), bcf_float_is_vector_end(val), kputd(val, str)); break;
             case BCF_BT_CHAR:  _copy_field((char*)info->vptr, info->vptr_len, fmt->subscript, str); break;
             default: fprintf(bcftools_stderr,"todo: type %d\n", info->type); bcftools_exit(1); break;
         }
@@ -386,11 +391,12 @@ static void process_format(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isa
         }
         if ( fmt->fmt->type == BCF_BT_FLOAT )
         {
-            float *ptr = (float*)(fmt->fmt->p + isample*fmt->fmt->size);
-            if ( bcf_float_is_missing(ptr[fmt->subscript]) || bcf_float_is_vector_end(ptr[fmt->subscript]) )
+            uint8_t *ptr = fmt->fmt->p + isample*fmt->fmt->size;
+            float val = le_to_float(&ptr[fmt->subscript * sizeof(float)]);
+            if ( bcf_float_is_missing(val) || bcf_float_is_vector_end(val) )
                 kputc('.', str);
             else
-                kputd(ptr[fmt->subscript], str);
+                kputd(val, str);
         }
         else if ( fmt->fmt->type != BCF_BT_CHAR )
         {
@@ -503,14 +509,14 @@ static void process_tbcsq(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isam
 
     int mask = fmt->subscript==0 ? 3 : 1;   // merge both haplotypes if subscript==0
 
-    #define BRANCH(type_t, nbits) { \
-        type_t *x = (type_t*)(fmt->fmt->p + isample*fmt->fmt->size); \
+    #define BRANCH(type_t, convert, nbits) { \
+        uint8_t *x = fmt->fmt->p + isample*fmt->fmt->size; \
         int i,j; \
         if ( fmt->subscript<=0 || fmt->subscript==1 ) \
         { \
             for (j=0; j < fmt->fmt->n; j++) \
             { \
-                type_t val = x[j]; \
+                type_t val = convert(&x[j * sizeof(type_t)]); \
                 if ( !val ) continue; \
                 for (i=0; i<nbits; i+=2) \
                     if ( val & (mask<<i) ) { kputs(csq->str[(j*30+i)/2], &csq->hap1); kputc_(',', &csq->hap1); } \
@@ -520,7 +526,7 @@ static void process_tbcsq(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isam
         { \
             for (j=0; j < fmt->fmt->n; j++) \
             { \
-                type_t val = x[j]; \
+                type_t val = convert(&x[j * sizeof(type_t)]); \
                 if ( !val ) continue; \
                 for (i=1; i<nbits; i+=2) \
                     if ( val & (1<<i) ) { kputs(csq->str[(j*30+i)/2], &csq->hap2); kputc_(',', &csq->hap2); } \
@@ -529,9 +535,9 @@ static void process_tbcsq(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isam
     }
     switch (fmt->fmt->type)
     {
-        case BCF_BT_INT8:  BRANCH(uint8_t, 8); break;
-        case BCF_BT_INT16: BRANCH(uint16_t,16); break;
-        case BCF_BT_INT32: BRANCH(uint32_t,30); break;  // 2 bytes unused to account for the reserved BCF values
+        case BCF_BT_INT8:  BRANCH(uint8_t,  le_to_u8,   8); break;
+        case BCF_BT_INT16: BRANCH(uint16_t, le_to_u16, 16); break;
+        case BCF_BT_INT32: BRANCH(uint32_t, le_to_u32, 30); break;  // 2 bits unused to account for the reserved BCF values
         default: error("Unexpected type: %d\n", fmt->fmt->type); bcftools_exit(1); break;
     }
     #undef BRANCH
@@ -1187,16 +1193,16 @@ static void process_pbinom(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isa
         int al = bcf_gt_allele(gt[i]);
         if ( al > line->n_allele || al >= fmt->fmt->n ) goto invalid;
 
-        #define BRANCH(type_t, missing, vector_end) { \
-            type_t val = ((type_t *) fmt->fmt->p)[al + isample*fmt->fmt->n]; \
+        #define BRANCH(type_t, convert, missing, vector_end) { \
+            type_t val = convert(&fmt->fmt->p[(al + isample*fmt->fmt->n)*sizeof(type_t)]); \
             if ( val==missing || val==vector_end ) goto invalid; \
             else n[i] = val; \
         }
         switch (fmt->fmt->type)
         {
-            case BCF_BT_INT8:  BRANCH(int8_t,  bcf_int8_missing,  bcf_int8_vector_end); break;
-            case BCF_BT_INT16: BRANCH(int16_t, bcf_int16_missing, bcf_int16_vector_end); break;
-            case BCF_BT_INT32: BRANCH(int32_t, bcf_int32_missing, bcf_int32_vector_end); break;
+            case BCF_BT_INT8:  BRANCH(int8_t,  le_to_i8,  bcf_int8_missing,  bcf_int8_vector_end); break;
+            case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, bcf_int16_missing, bcf_int16_vector_end); break;
+            case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, bcf_int32_missing, bcf_int32_vector_end); break;
             default: goto invalid; break;
         }
         #undef BRANCH
@@ -1205,11 +1211,11 @@ static void process_pbinom(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isa
     if ( n[0]==n[1] ) kputc(n[0]==0 ? '.':'0', str);
     else
     {
-        double pval = n[0] < n[1] ? kf_betai(n[1], n[0] + 1, 0.5) : kf_betai(n[0], n[1] + 1, 0.5);
-        pval *= 2;
-        if ( pval>=1 ) pval = 0;     // this can happen, machine precision error, eg. kf_betai(1,0,0.5)
-        else
-            pval = -4.34294481903*log(pval);
+        double pval = calc_binom_two_sided(n[0],n[1],0.5);
+
+        // convrt to phred
+        if ( pval>=1 ) pval = 0;
+        else pval = -4.34294481903*log(pval);
         kputd(pval, str);
     }
     return;
@@ -1552,6 +1558,7 @@ void convert_destroy(convert_t *convert)
         free(convert->used_tags_list);
     }
     khash_str2int_destroy(convert->used_tags_hash);
+    free(convert->print_filtered);
     free(convert->fmt);
     free(convert->undef_info_tag);
     free(convert->dat);
@@ -1564,8 +1571,9 @@ void convert_destroy(convert_t *convert)
 int convert_header(convert_t *convert, kstring_t *str)
 {
     int i, icol = 0, l_ori = str->l;
+    bcf_hdr_t *hdr = convert->header;
 
-    // Supress the header output if LINE is present
+    // Suppress the header output if LINE is present
     for (i=0; i<convert->nfmt; i++)
         if ( convert->fmt[i].type == T_LINE ) break;
     if ( i!=convert->nfmt )
@@ -1587,6 +1595,7 @@ int convert_header(convert_t *convert, kstring_t *str)
             while ( convert->fmt[j].is_gt_field ) j++;
             for (js=0; js<convert->nsamples; js++)
             {
+                int ks = convert->samples[js];
                 for (k=i; k<j; k++)
                 {
                     if ( convert->fmt[k].type == T_SEP )
@@ -1602,10 +1611,29 @@ int convert_header(convert_t *convert, kstring_t *str)
                             }
                         }
                     }
+                    else if ( convert->header_samples )
+                    {
+                        icol++;
+                        if ( !convert->no_hdr_indices ) ksprintf(str,"[%d]",icol);
+                        ksprintf(str,"%s:%s", hdr->samples[ks], convert->fmt[k].key);
+                    }
                     else
-                        ksprintf(str, "[%d]%s", ++icol, convert->fmt[k].key);
+                    {
+                        icol++;
+                        if ( !convert->no_hdr_indices ) ksprintf(str,"[%d]",icol);
+                        ksprintf(str,"%s", convert->fmt[k].key);
+                    }
+                }
+                if ( has_fmt_newline )
+                {
+                    if ( !convert->header_samples ) break;
+
+                    // this is unfortunate: the formatting expression breaks the per-sample output into separate lines,
+                    // therefore including a sample name in the header makes no sense anymore
+                    convert->header_samples = 0;
+                    str->l = l_ori;
+                    return convert_header(convert, str);
                 }
-                if ( has_fmt_newline ) break;
             }
             i = j-1;
             continue;
@@ -1616,7 +1644,9 @@ int convert_header(convert_t *convert, kstring_t *str)
             if ( convert->fmt[i].key ) kputs(convert->fmt[i].key, str);
             continue;
         }
-        ksprintf(str, "[%d]%s", ++icol, convert->fmt[i].key);
+        icol++;
+        if ( !convert->no_hdr_indices ) ksprintf(str,"[%d]",icol);
+        ksprintf(str,"%s", convert->fmt[i].key);
     }
     if ( has_fmt_newline ) kputc('\n',str);
     return str->l - l_ori;
@@ -1655,7 +1685,17 @@ int convert_line(convert_t *convert, bcf1_t *line, kstring_t *str)
             {
                 // Skip samples when filtering was requested
                 int ks = convert->samples[js];
-                if ( convert->subset_samples && *convert->subset_samples && !(*convert->subset_samples)[ks] ) continue;
+                if ( convert->subset_samples && *convert->subset_samples && !(*convert->subset_samples)[ks] )
+                {
+                    if ( !convert->print_filtered ) continue;
+
+                    for (k=i; k<j; k++)
+                        if ( convert->fmt[k].type==T_SEP )
+                            convert->fmt[k].handler(convert, line, &convert->fmt[k], ks, str);
+                        else
+                            kputs(convert->print_filtered, str);
+                    continue;
+                }
 
                 // Here comes a hack designed for TBCSQ. When running on large files,
                 // such as 1000GP, there are too many empty fields in the output and
@@ -1711,29 +1751,18 @@ static void force_newline_(convert_t *convert)
     }
     if ( has_newline ) return;
 
-    // A newline is not present, force it. But where to add it?
-    // Consider
-    //      -f'%CHROM[ %SAMPLE]\n'
-    // vs
-    //      -f'[%CHROM %SAMPLE\n]'
-    for (i=0; i<convert->nfmt; i++)
-        if ( !convert->fmt[i].is_gt_field && convert->fmt[i].key ) break;
-
-    if ( i < convert->nfmt )
-        register_tag(convert, "\n", 0, T_SEP);  // the first case
-    else
-    {
-        // the second case
-        i = convert->nfmt - 1;
-        if ( !convert->fmt[i].key )
-        {
-            convert->fmt[i].key = strdup("\n");
-            convert->fmt[i].is_gt_field = 1;
-            register_tag(convert, NULL, 0, T_SEP);
-        }
-        else
-            register_tag(convert, "\n", 1, T_SEP);
-    }
+    // A newline is not present, force it. But where to add it? Always at the end.
+    //
+    // Briefly, in 1.18, we considered the following automatic behavior, which for
+    // per-site output it would add it at the end of the expression and for per-sample
+    // output it would add it inside the square brackets:
+    //           -f'%CHROM[ %SAMPLE]\n'
+    //           -f'[%CHROM %SAMPLE\n]'
+    //
+    // However, this is an annoyance for users, as it is not entirely clear what
+    // will happen unless one understands the internals well (#1969)
+
+    register_tag(convert, "\n", 0, T_SEP);
 }
 
 int convert_set_option(convert_t *convert, enum convert_option opt, ...)
@@ -1750,10 +1779,19 @@ int convert_set_option(convert_t *convert, enum convert_option opt, ...)
         case subset_samples:
             convert->subset_samples = va_arg(args, uint8_t**);
             break;
+        case header_samples:
+            convert->header_samples = va_arg(args, int);
+            break;
+        case print_filtered:
+            convert->print_filtered = strdup(va_arg(args, char*));
+            break;
         case force_newline:
             convert->force_newline = va_arg(args, int);
             if ( convert->force_newline ) force_newline_(convert);
             break;
+        case no_hdr_indices:
+            convert->no_hdr_indices = va_arg(args, int);
+            break;
         default:
             ret = -1;
     }
diff --git a/bcftools/convert.h b/bcftools/convert.h
index 062607093..150751481 100644
--- a/bcftools/convert.h
+++ b/bcftools/convert.h
@@ -1,6 +1,6 @@
 /*  convert.h -- functions for converting between VCF/BCF and related formats.
 
-    Copyright (C) 2014-2023 Genome Research Ltd.
+    Copyright (C) 2014-2024 Genome Research Ltd.
 
     Author: Petr Danecek <pd3@sanger.ac.uk>
 
@@ -30,9 +30,12 @@ THE SOFTWARE.  */
 typedef struct _convert_t convert_t;
 enum convert_option
 {
-    allow_undef_tags,
-    subset_samples,
-    force_newline,
+    allow_undef_tags,       // see `bcftools query --allow-undef-tags`, throws an error if tag is not defined otherwise
+    subset_samples,         // in bracketed expressions (e.g. [ %GT]) consider only marked samples
+    header_samples,         // include sample name in bracketed tags (e.g. SAMPLE1:GT SAMPLE2:GT for [ %GT])
+    force_newline,          // automatically insert a newline when not part of the formatting expression
+    print_filtered,         // print the provided string instead of discarding samples not included in subset_samples
+    no_hdr_indices,         // drop column indices when printing header, i.e. "#CHROM", not "#[1]CHROM"
 };
 
 convert_t *convert_init(bcf_hdr_t *hdr, int *samples, int nsamples, const char *str);
diff --git a/bcftools/csq.c b/bcftools/csq.c
index f619e061a..b38eba107 100644
--- a/bcftools/csq.c
+++ b/bcftools/csq.c
@@ -1,6 +1,6 @@
 /* The MIT License
 
-   Copyright (c) 2016-2023 Genome Research Ltd.
+   Copyright (c) 2016-2024 Genome Research Ltd.
 
    Author: Petr Danecek <pd3@sanger.ac.uk>
 
@@ -34,7 +34,7 @@
 
     Read about transcript types here
         http://vega.sanger.ac.uk/info/about/gene_and_transcript_types.html
-        http://www.ensembl.org/info/genome/variation/predicted_data.html
+        https://www.ensembl.org/info/genome/variation/prediction/predicted_data.html
         https://www.gencodegenes.org/pages/biotypes.html
 
     List of supported biotypes
@@ -149,6 +149,7 @@
 #include <errno.h>
 #include <unistd.h>
 #include <ctype.h>
+#include <strings.h>
 #include "bcftools.h"
 #include "filter.h"
 #include "regidx.h"
@@ -554,7 +555,9 @@ void init_data(args_t *args)
         if ( args->hdr_nsmpl )
             bcf_hdr_printf(args->hdr,"##FORMAT=<ID=%s,Number=.,Type=Integer,Description=\"Bitmask of indexes to INFO/BCSQ, with interleaved first/second haplotype. Use \\\"bcftools query -f'[%%CHROM\\t%%POS\\t%%SAMPLE\\t%%TBCSQ\\n]'\\\" to translate.\">",args->bcsq_tag);
         if ( bcf_hdr_write(args->out_fh, args->hdr)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->output_fname?args->output_fname:"standard output");
-        if ( args->write_index && init_index(args->out_fh,args->hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname);
+        if ( init_index2(args->out_fh,args->hdr,args->output_fname,
+                         &args->index_fn, args->write_index) < 0 )
+            error("Error: failed to initialise index for %s\n",args->output_fname);
     }
     if ( args->verbosity > 0 ) fprintf(stderr,"Calling...\n");
 }
@@ -623,7 +626,7 @@ void destroy_data(args_t *args)
 }
 
 /*
-    The splice_* functions are for consquences around splice sites: start,stop,splice_*
+    The splice_* functions are for consequences around splice sites: start,stop,splice_*
  */
 #define SPLICE_VAR_REF 0   // ref: ACGT>ACGT, csq not applicable, skip completely
 #define SPLICE_OUTSIDE 1   // splice acceptor or similar; csq set and is done, does not overlap the region
@@ -791,7 +794,7 @@ static inline int csq_stage_utr(args_t *args, regitr_t *itr, bcf1_t *rec, uint32
 static inline void csq_stage_splice(args_t *args, bcf1_t *rec, gf_tscript_t *tr, uint32_t type, int ial)
 {
 #if XDBG
-fprintf(stderr,"csq_stage_splice %d: type=%d\n",rec->pos+1,type);
+fprintf(stderr,"csq_stage_splice %d: type=%d\n",(int)rec->pos+1,type);
 #endif
     if ( !type ) return;
     csq_t csq;
@@ -963,7 +966,7 @@ int shifted_del_synonymous(args_t *args, splice_t *splice, uint32_t ex_beg, uint
     if ( tr->strand==STRAND_FWD && splice->vcf.pos >= ex_beg + 3 ) return 0;
 
 #if XDBG
-    fprintf(stderr,"shifted_del_synonymous: %d-%d  %s\n",ex_beg,ex_end, tr->strand==STRAND_FWD?"fwd":"rev");
+    fprintf(stderr,"shifted_del_synonymous: %d-%d  %s\n",ex_beg,ex_end, tr->strand==STRAND_FWD?"fwd":(tr->strand==STRAND_REV?"rev":"unk"));
     fprintf(stderr,"   %d  ..  %s > %s\n",splice->vcf.pos+1,splice->vcf.ref,splice->vcf.alt);
 #endif
 
@@ -996,7 +999,7 @@ int shifted_del_synonymous(args_t *args, splice_t *splice, uint32_t ex_beg, uint
         while ( ptr_vcf[i] && ptr_vcf[i]==ptr_ref[i] ) i++;
         if ( ptr_vcf[i] ) return 0;       // the deleted sequence cannot be replaced
     }
-    else
+    else if ( tr->strand==STRAND_FWD )
     {
         // STRAND_FWD
         int32_t vcf_block_beg = splice->vcf.pos + ref_len - 2*ndel;        // the position of the first base of the ref block that could potentially replace the deletion
@@ -1179,7 +1182,9 @@ fprintf(stderr,"splice_csq_del: %s>%s .. ex=%d,%d  beg,end=%d,%d  tbeg,tend=%d,%
         splice->kalt.l = 0; kputsn(splice->vcf.alt + splice->tbeg, splice->vcf.alen, &splice->kalt);
         if ( (splice->ref_beg+1 < ex_beg && splice->ref_end >= ex_beg) || (splice->ref_beg+1 < ex_end && splice->ref_end >= ex_end) ) // ouch, ugly ENST00000409523/long-overlapping-del.vcf
         {
-            splice->csq |= (splice->ref_end - splice->ref_beg)%3 ? CSQ_FRAMESHIFT_VARIANT : CSQ_INFRAME_DELETION;
+            int ref_beg = splice->ref_beg + splice->kalt.l - 1;     // 0 for AAA>A, 1 for AAA>AC
+            if ( ref_beg < splice->ref_end )
+                splice->csq |= (splice->ref_end - ref_beg)%3 ? CSQ_FRAMESHIFT_VARIANT : CSQ_INFRAME_DELETION;
             return SPLICE_OVERLAP;
         }
     }
@@ -1269,13 +1274,13 @@ fprintf(stderr,"mnp: %s>%s .. ex=%d,%d  beg,end=%d,%d  tbeg,tend=%d,%d  check_ut
     {
         if ( splice->check_region_beg ) splice->csq |= CSQ_SPLICE_REGION;
         if ( splice->tr->strand==STRAND_FWD ) { if ( splice->check_start ) splice->csq |= CSQ_START_LOST; }
-        else { if ( splice->check_stop ) splice->csq |= CSQ_STOP_LOST; }
+        else if ( splice->tr->strand==STRAND_REV ) { if ( splice->check_stop ) splice->csq |= CSQ_STOP_LOST; }
     }
     if ( splice->ref_end > ex_end - 3 )
     {
         if ( splice->check_region_end ) splice->csq |= CSQ_SPLICE_REGION;
         if ( splice->tr->strand==STRAND_REV ) { if ( splice->check_start ) splice->csq |= CSQ_START_LOST; }
-        else { if ( splice->check_stop ) splice->csq |= CSQ_STOP_LOST; }
+        else if ( splice->tr->strand==STRAND_FWD ) { if ( splice->check_stop ) splice->csq |= CSQ_STOP_LOST; }
     }
     if ( splice->set_refalt )
     {
@@ -1336,17 +1341,17 @@ int hap_init(args_t *args, hap_node_t *parent, hap_node_t *child, gf_cds_t *cds,
     if ( !(tr->trim & TRIM_5PRIME) )
     {
         if ( tr->strand==STRAND_FWD ) { if ( child->icds==0 ) splice.check_start = 1; }
-        else { if ( child->icds==tr->ncds-1 ) splice.check_start = 1; }
+        else if ( tr->strand==STRAND_REV ) { if ( child->icds==tr->ncds-1 ) splice.check_start = 1; }
     }
     if ( !(tr->trim & TRIM_3PRIME) )
     {
         if ( tr->strand==STRAND_FWD ) { if ( child->icds==tr->ncds-1 ) splice.check_stop = 1; }
-        else { if ( child->icds==0 ) splice.check_stop = 1; }
+        else if ( tr->strand==STRAND_REV ) { if ( child->icds==0 ) splice.check_stop = 1; }
     }
     if ( splice.check_start )   // do not check starts in incomplete CDS, defined as not starting with M
     {
         if ( tr->strand==STRAND_FWD ) { if ( dna2aa(TSCRIPT_AUX(tr)->ref+N_REF_PAD+cds->beg-tr->beg) != 'M' ) splice.check_start = 0; }
-        else { if ( cdna2aa(TSCRIPT_AUX(tr)->ref+N_REF_PAD+cds->beg-tr->beg+cds->len-3) != 'M' ) splice.check_start = 0; }
+        else if ( tr->strand==STRAND_REV ) { if ( cdna2aa(TSCRIPT_AUX(tr)->ref+N_REF_PAD+cds->beg-tr->beg+cds->len-3) != 'M' ) splice.check_start = 0; }
     }
     if ( child->icds!=0 ) splice.check_region_beg = 1;
     if ( child->icds!=tr->ncds-1 ) splice.check_region_end = 1;
@@ -1584,7 +1589,7 @@ fprintf(stderr,"\ntranslate: %d %d %d  fill=%d  seq.l=%d\n",sbeg,rbeg,rend,fill,
             }
         }
     }
-    else    // STRAND_REV
+    else if ( strand==STRAND_REV )
     {
         // right padding - number of bases to take from ref
         npad = (seq.m - (sbeg + seq.l)) % 3;
@@ -1671,6 +1676,7 @@ fprintf(stderr,"\ntranslate: %d %d %d  fill=%d  seq.l=%d\n",sbeg,rbeg,rend,fill,
             }
         }
     }
+    else error("Should not happen: %d\n", strand);
     kputc_(0,tseq); tseq->l--;
 #if DBG
  fprintf(stderr,"    tseq: %s\n", tseq->s);
@@ -1705,7 +1711,7 @@ void tscript_splice_ref(gf_tscript_t *tr)
 int csq_push(args_t *args, csq_t *csq, bcf1_t *rec)
 {
 #if XDBG
-fprintf(stderr,"csq_push: %d .. %d\n",rec->pos+1,csq->type.type);
+fprintf(stderr,"csq_push: %d .. %d\n",(int)rec->pos+1,csq->type.type);
 #endif
     khint_t k = kh_get(pos2vbuf, args->pos2vbuf, (int)csq->pos);
     vbuf_t *vbuf = (k == kh_end(args->pos2vbuf)) ? NULL : kh_val(args->pos2vbuf, k);
@@ -1856,7 +1862,7 @@ void kput_vcsq(args_t *args, vcsq_t *csq, kstring_t *str)
     kputs(gf_type2gff_string(csq->biotype), str);
 
     if ( CSQ_PRN_STRAND(csq->type) || csq->vstr.l )
-        kputs(csq->strand==STRAND_FWD ? "|+" : "|-", str);
+        kputs(csq->strand==STRAND_FWD ? "|+" : (csq->strand==STRAND_REV ? "|-" : "|."), str);
 
     if ( csq->vstr.l )
         kputs(csq->vstr.s, str);
@@ -1880,6 +1886,7 @@ void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg,
 {
     int i;
     gf_tscript_t *tr = hap->tr;
+    assert( tr->strand==STRAND_FWD || tr->strand==STRAND_REV );
     int ref_node = tr->strand==STRAND_FWD ? ibeg : iend;
     int icsq = node->ncsq_list++;
     hts_expand0(csq_t,node->ncsq_list,node->mcsq_list,node->csq_list);
@@ -2175,7 +2182,7 @@ void hap_finalize(args_t *args, hap_t *hap)
                 indel = 0;
             }
         }
-        else
+        else if ( tr->strand==STRAND_REV )
         {
             i = istack + 1, ibeg = -1;
             while ( --i > 0 )
@@ -3328,14 +3335,14 @@ static const char *usage(void)
         "       --targets-overlap 0|1|2       Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n"
         "       --threads INT                 Use multithreading with <int> worker threads [0]\n"
         "   -v, --verbose INT                 Verbosity level 0-2 [1]\n"
-        "       --write-index                 Automatically index the output files [off]\n"
+        "   -W, --write-index[=FMT]           Automatically index the output files [off]\n"
         "\n"
         "Example:\n"
-        "   bcftools csq -f hs37d5.fa -g Homo_sapiens.GRCh37.82.gff3.gz in.vcf\n"
+        "   bcftools csq -f hs37d5.fa -g Homo_sapiens.GRCh37.87.gff3.gz in.vcf\n"
         "\n"
         "   # GFF3 annotation files can be downloaded from Ensembl. e.g. for human:\n"
-        "   ftp://ftp.ensembl.org/pub/current_gff3/homo_sapiens/\n"
-        "   ftp://ftp.ensembl.org/pub/grch37/release-84/gff3/homo_sapiens/\n"
+        "   http://ftp.ensembl.org/pub/current_gff3/homo_sapiens/\n"
+        "   http://ftp.ensembl.org/pub/grch37/current/gff3/homo_sapiens/\n"
         "\n";
 }
 
@@ -3379,7 +3386,7 @@ int main_csq(int argc, char *argv[])
         {"targets-file",1,0,'T'},
         {"targets-overlap",required_argument,NULL,5},
         {"no-version",no_argument,NULL,3},
-        {"write-index",no_argument,NULL,6},
+        {"write-index",optional_argument,NULL,'W'},
         {"dump-gff",required_argument,NULL,7},
         {"unify-chr-names",required_argument,NULL,8},
         {0,0,0,0}
@@ -3388,7 +3395,7 @@ int main_csq(int argc, char *argv[])
     int regions_overlap = 1;
     int targets_overlap = 0;
     char *targets_list = NULL, *regions_list = NULL, *tmp;
-    while ((c = getopt_long(argc, argv, "?hr:R:t:T:i:e:f:o:O:g:s:S:p:qc:ln:bB:v:",loptions,NULL)) >= 0)
+    while ((c = getopt_long(argc, argv, "?hr:R:t:T:i:e:f:o:O:g:s:S:p:qc:ln:bB:v:W::",loptions,NULL)) >= 0)
     {
         switch (c)
         {
@@ -3470,7 +3477,10 @@ int main_csq(int argc, char *argv[])
                 targets_overlap = parse_overlap_option(optarg);
                 if ( targets_overlap < 0 ) error("Could not parse: --targets-overlap %s\n",optarg);
                 break;
-            case  6 : args->write_index = 1; break;
+            case 'W':
+                if (!(args->write_index = write_index_parse(optarg)))
+                    error("Unsupported index format '%s'\n", optarg);
+                break;
             case  7 : args->dump_gff = optarg; break;
             case  8 :
                 if ( !strcmp(optarg,"0") ) args->unify_chr_names = 0;
@@ -3490,7 +3500,7 @@ int main_csq(int argc, char *argv[])
     }
     else fname = argv[optind];
     if ( argc - optind>1 ) error("%s", usage());
-    if ( !args->fa_fname ) error("Missing the --fa-ref option\n");
+    if ( !args->fa_fname ) error("Missing the --fasta-ref option\n");
     if ( !args->gff_fname ) error("Missing the --gff option\n");
     args->sr = bcf_sr_init();
     if ( targets_list )
diff --git a/bcftools/csq.c.pysam.c b/bcftools/csq.c.pysam.c
index 5f590d16f..3f482fdf7 100644
--- a/bcftools/csq.c.pysam.c
+++ b/bcftools/csq.c.pysam.c
@@ -2,7 +2,7 @@
 
 /* The MIT License
 
-   Copyright (c) 2016-2023 Genome Research Ltd.
+   Copyright (c) 2016-2024 Genome Research Ltd.
 
    Author: Petr Danecek <pd3@sanger.ac.uk>
 
@@ -36,7 +36,7 @@
 
     Read about transcript types here
         http://vega.sanger.ac.uk/info/about/gene_and_transcript_types.html
-        http://www.ensembl.org/info/genome/variation/predicted_data.html
+        https://www.ensembl.org/info/genome/variation/prediction/predicted_data.html
         https://www.gencodegenes.org/pages/biotypes.html
 
     List of supported biotypes
@@ -151,6 +151,7 @@
 #include <errno.h>
 #include <unistd.h>
 #include <ctype.h>
+#include <strings.h>
 #include "bcftools.h"
 #include "filter.h"
 #include "regidx.h"
@@ -556,7 +557,9 @@ void init_data(args_t *args)
         if ( args->hdr_nsmpl )
             bcf_hdr_printf(args->hdr,"##FORMAT=<ID=%s,Number=.,Type=Integer,Description=\"Bitmask of indexes to INFO/BCSQ, with interleaved first/second haplotype. Use \\\"bcftools query -f'[%%CHROM\\t%%POS\\t%%SAMPLE\\t%%TBCSQ\\n]'\\\" to translate.\">",args->bcsq_tag);
         if ( bcf_hdr_write(args->out_fh, args->hdr)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->output_fname?args->output_fname:"standard output");
-        if ( args->write_index && init_index(args->out_fh,args->hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname);
+        if ( init_index2(args->out_fh,args->hdr,args->output_fname,
+                         &args->index_fn, args->write_index) < 0 )
+            error("Error: failed to initialise index for %s\n",args->output_fname);
     }
     if ( args->verbosity > 0 ) fprintf(bcftools_stderr,"Calling...\n");
 }
@@ -625,7 +628,7 @@ void destroy_data(args_t *args)
 }
 
 /*
-    The splice_* functions are for consquences around splice sites: start,stop,splice_*
+    The splice_* functions are for consequences around splice sites: start,stop,splice_*
  */
 #define SPLICE_VAR_REF 0   // ref: ACGT>ACGT, csq not applicable, skip completely
 #define SPLICE_OUTSIDE 1   // splice acceptor or similar; csq set and is done, does not overlap the region
@@ -793,7 +796,7 @@ static inline int csq_stage_utr(args_t *args, regitr_t *itr, bcf1_t *rec, uint32
 static inline void csq_stage_splice(args_t *args, bcf1_t *rec, gf_tscript_t *tr, uint32_t type, int ial)
 {
 #if XDBG
-fprintf(bcftools_stderr,"csq_stage_splice %d: type=%d\n",rec->pos+1,type);
+fprintf(bcftools_stderr,"csq_stage_splice %d: type=%d\n",(int)rec->pos+1,type);
 #endif
     if ( !type ) return;
     csq_t csq;
@@ -965,7 +968,7 @@ int shifted_del_synonymous(args_t *args, splice_t *splice, uint32_t ex_beg, uint
     if ( tr->strand==STRAND_FWD && splice->vcf.pos >= ex_beg + 3 ) return 0;
 
 #if XDBG
-    fprintf(bcftools_stderr,"shifted_del_synonymous: %d-%d  %s\n",ex_beg,ex_end, tr->strand==STRAND_FWD?"fwd":"rev");
+    fprintf(bcftools_stderr,"shifted_del_synonymous: %d-%d  %s\n",ex_beg,ex_end, tr->strand==STRAND_FWD?"fwd":(tr->strand==STRAND_REV?"rev":"unk"));
     fprintf(bcftools_stderr,"   %d  ..  %s > %s\n",splice->vcf.pos+1,splice->vcf.ref,splice->vcf.alt);
 #endif
 
@@ -998,7 +1001,7 @@ int shifted_del_synonymous(args_t *args, splice_t *splice, uint32_t ex_beg, uint
         while ( ptr_vcf[i] && ptr_vcf[i]==ptr_ref[i] ) i++;
         if ( ptr_vcf[i] ) return 0;       // the deleted sequence cannot be replaced
     }
-    else
+    else if ( tr->strand==STRAND_FWD )
     {
         // STRAND_FWD
         int32_t vcf_block_beg = splice->vcf.pos + ref_len - 2*ndel;        // the position of the first base of the ref block that could potentially replace the deletion
@@ -1181,7 +1184,9 @@ fprintf(bcftools_stderr,"splice_csq_del: %s>%s .. ex=%d,%d  beg,end=%d,%d  tbeg,
         splice->kalt.l = 0; kputsn(splice->vcf.alt + splice->tbeg, splice->vcf.alen, &splice->kalt);
         if ( (splice->ref_beg+1 < ex_beg && splice->ref_end >= ex_beg) || (splice->ref_beg+1 < ex_end && splice->ref_end >= ex_end) ) // ouch, ugly ENST00000409523/long-overlapping-del.vcf
         {
-            splice->csq |= (splice->ref_end - splice->ref_beg)%3 ? CSQ_FRAMESHIFT_VARIANT : CSQ_INFRAME_DELETION;
+            int ref_beg = splice->ref_beg + splice->kalt.l - 1;     // 0 for AAA>A, 1 for AAA>AC
+            if ( ref_beg < splice->ref_end )
+                splice->csq |= (splice->ref_end - ref_beg)%3 ? CSQ_FRAMESHIFT_VARIANT : CSQ_INFRAME_DELETION;
             return SPLICE_OVERLAP;
         }
     }
@@ -1271,13 +1276,13 @@ fprintf(bcftools_stderr,"mnp: %s>%s .. ex=%d,%d  beg,end=%d,%d  tbeg,tend=%d,%d
     {
         if ( splice->check_region_beg ) splice->csq |= CSQ_SPLICE_REGION;
         if ( splice->tr->strand==STRAND_FWD ) { if ( splice->check_start ) splice->csq |= CSQ_START_LOST; }
-        else { if ( splice->check_stop ) splice->csq |= CSQ_STOP_LOST; }
+        else if ( splice->tr->strand==STRAND_REV ) { if ( splice->check_stop ) splice->csq |= CSQ_STOP_LOST; }
     }
     if ( splice->ref_end > ex_end - 3 )
     {
         if ( splice->check_region_end ) splice->csq |= CSQ_SPLICE_REGION;
         if ( splice->tr->strand==STRAND_REV ) { if ( splice->check_start ) splice->csq |= CSQ_START_LOST; }
-        else { if ( splice->check_stop ) splice->csq |= CSQ_STOP_LOST; }
+        else if ( splice->tr->strand==STRAND_FWD ) { if ( splice->check_stop ) splice->csq |= CSQ_STOP_LOST; }
     }
     if ( splice->set_refalt )
     {
@@ -1338,17 +1343,17 @@ int hap_init(args_t *args, hap_node_t *parent, hap_node_t *child, gf_cds_t *cds,
     if ( !(tr->trim & TRIM_5PRIME) )
     {
         if ( tr->strand==STRAND_FWD ) { if ( child->icds==0 ) splice.check_start = 1; }
-        else { if ( child->icds==tr->ncds-1 ) splice.check_start = 1; }
+        else if ( tr->strand==STRAND_REV ) { if ( child->icds==tr->ncds-1 ) splice.check_start = 1; }
     }
     if ( !(tr->trim & TRIM_3PRIME) )
     {
         if ( tr->strand==STRAND_FWD ) { if ( child->icds==tr->ncds-1 ) splice.check_stop = 1; }
-        else { if ( child->icds==0 ) splice.check_stop = 1; }
+        else if ( tr->strand==STRAND_REV ) { if ( child->icds==0 ) splice.check_stop = 1; }
     }
     if ( splice.check_start )   // do not check starts in incomplete CDS, defined as not starting with M
     {
         if ( tr->strand==STRAND_FWD ) { if ( dna2aa(TSCRIPT_AUX(tr)->ref+N_REF_PAD+cds->beg-tr->beg) != 'M' ) splice.check_start = 0; }
-        else { if ( cdna2aa(TSCRIPT_AUX(tr)->ref+N_REF_PAD+cds->beg-tr->beg+cds->len-3) != 'M' ) splice.check_start = 0; }
+        else if ( tr->strand==STRAND_REV ) { if ( cdna2aa(TSCRIPT_AUX(tr)->ref+N_REF_PAD+cds->beg-tr->beg+cds->len-3) != 'M' ) splice.check_start = 0; }
     }
     if ( child->icds!=0 ) splice.check_region_beg = 1;
     if ( child->icds!=tr->ncds-1 ) splice.check_region_end = 1;
@@ -1586,7 +1591,7 @@ fprintf(bcftools_stderr,"\ntranslate: %d %d %d  fill=%d  seq.l=%d\n",sbeg,rbeg,r
             }
         }
     }
-    else    // STRAND_REV
+    else if ( strand==STRAND_REV )
     {
         // right padding - number of bases to take from ref
         npad = (seq.m - (sbeg + seq.l)) % 3;
@@ -1673,6 +1678,7 @@ fprintf(bcftools_stderr,"\ntranslate: %d %d %d  fill=%d  seq.l=%d\n",sbeg,rbeg,r
             }
         }
     }
+    else error("Should not happen: %d\n", strand);
     kputc_(0,tseq); tseq->l--;
 #if DBG
  fprintf(bcftools_stderr,"    tseq: %s\n", tseq->s);
@@ -1707,7 +1713,7 @@ void tscript_splice_ref(gf_tscript_t *tr)
 int csq_push(args_t *args, csq_t *csq, bcf1_t *rec)
 {
 #if XDBG
-fprintf(bcftools_stderr,"csq_push: %d .. %d\n",rec->pos+1,csq->type.type);
+fprintf(bcftools_stderr,"csq_push: %d .. %d\n",(int)rec->pos+1,csq->type.type);
 #endif
     khint_t k = kh_get(pos2vbuf, args->pos2vbuf, (int)csq->pos);
     vbuf_t *vbuf = (k == kh_end(args->pos2vbuf)) ? NULL : kh_val(args->pos2vbuf, k);
@@ -1858,7 +1864,7 @@ void kput_vcsq(args_t *args, vcsq_t *csq, kstring_t *str)
     kputs(gf_type2gff_string(csq->biotype), str);
 
     if ( CSQ_PRN_STRAND(csq->type) || csq->vstr.l )
-        kputs(csq->strand==STRAND_FWD ? "|+" : "|-", str);
+        kputs(csq->strand==STRAND_FWD ? "|+" : (csq->strand==STRAND_REV ? "|-" : "|."), str);
 
     if ( csq->vstr.l )
         kputs(csq->vstr.s, str);
@@ -1882,6 +1888,7 @@ void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg,
 {
     int i;
     gf_tscript_t *tr = hap->tr;
+    assert( tr->strand==STRAND_FWD || tr->strand==STRAND_REV );
     int ref_node = tr->strand==STRAND_FWD ? ibeg : iend;
     int icsq = node->ncsq_list++;
     hts_expand0(csq_t,node->ncsq_list,node->mcsq_list,node->csq_list);
@@ -2177,7 +2184,7 @@ void hap_finalize(args_t *args, hap_t *hap)
                 indel = 0;
             }
         }
-        else
+        else if ( tr->strand==STRAND_REV )
         {
             i = istack + 1, ibeg = -1;
             while ( --i > 0 )
@@ -3330,14 +3337,14 @@ static const char *usage(void)
         "       --targets-overlap 0|1|2       Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n"
         "       --threads INT                 Use multithreading with <int> worker threads [0]\n"
         "   -v, --verbose INT                 Verbosity level 0-2 [1]\n"
-        "       --write-index                 Automatically index the output files [off]\n"
+        "   -W, --write-index[=FMT]           Automatically index the output files [off]\n"
         "\n"
         "Example:\n"
-        "   bcftools csq -f hs37d5.fa -g Homo_sapiens.GRCh37.82.gff3.gz in.vcf\n"
+        "   bcftools csq -f hs37d5.fa -g Homo_sapiens.GRCh37.87.gff3.gz in.vcf\n"
         "\n"
         "   # GFF3 annotation files can be downloaded from Ensembl. e.g. for human:\n"
-        "   ftp://ftp.ensembl.org/pub/current_gff3/homo_sapiens/\n"
-        "   ftp://ftp.ensembl.org/pub/grch37/release-84/gff3/homo_sapiens/\n"
+        "   http://ftp.ensembl.org/pub/current_gff3/homo_sapiens/\n"
+        "   http://ftp.ensembl.org/pub/grch37/current/gff3/homo_sapiens/\n"
         "\n";
 }
 
@@ -3381,7 +3388,7 @@ int main_csq(int argc, char *argv[])
         {"targets-file",1,0,'T'},
         {"targets-overlap",required_argument,NULL,5},
         {"no-version",no_argument,NULL,3},
-        {"write-index",no_argument,NULL,6},
+        {"write-index",optional_argument,NULL,'W'},
         {"dump-gff",required_argument,NULL,7},
         {"unify-chr-names",required_argument,NULL,8},
         {0,0,0,0}
@@ -3390,7 +3397,7 @@ int main_csq(int argc, char *argv[])
     int regions_overlap = 1;
     int targets_overlap = 0;
     char *targets_list = NULL, *regions_list = NULL, *tmp;
-    while ((c = getopt_long(argc, argv, "?hr:R:t:T:i:e:f:o:O:g:s:S:p:qc:ln:bB:v:",loptions,NULL)) >= 0)
+    while ((c = getopt_long(argc, argv, "?hr:R:t:T:i:e:f:o:O:g:s:S:p:qc:ln:bB:v:W::",loptions,NULL)) >= 0)
     {
         switch (c)
         {
@@ -3472,7 +3479,10 @@ int main_csq(int argc, char *argv[])
                 targets_overlap = parse_overlap_option(optarg);
                 if ( targets_overlap < 0 ) error("Could not parse: --targets-overlap %s\n",optarg);
                 break;
-            case  6 : args->write_index = 1; break;
+            case 'W':
+                if (!(args->write_index = write_index_parse(optarg)))
+                    error("Unsupported index format '%s'\n", optarg);
+                break;
             case  7 : args->dump_gff = optarg; break;
             case  8 :
                 if ( !strcmp(optarg,"0") ) args->unify_chr_names = 0;
@@ -3492,7 +3502,7 @@ int main_csq(int argc, char *argv[])
     }
     else fname = argv[optind];
     if ( argc - optind>1 ) error("%s", usage());
-    if ( !args->fa_fname ) error("Missing the --fa-ref option\n");
+    if ( !args->fa_fname ) error("Missing the --fasta-ref option\n");
     if ( !args->gff_fname ) error("Missing the --gff option\n");
     args->sr = bcf_sr_init();
     if ( targets_list )
diff --git a/bcftools/edlib.c b/bcftools/edlib.c
new file mode 100644
index 000000000..5421fee48
--- /dev/null
+++ b/bcftools/edlib.c
@@ -0,0 +1,662 @@
+/*
+ * A cut down C translated of the C++ edlib.cpp file.
+ * Taken from edlib v0.1.0-166-g931be2b
+ */
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdbool.h>
+
+#include "edlib.h"
+
+typedef uint64_t Word;
+static const int WORD_SIZE = 64; // Size of Word in bits
+static const Word WORD_1 = (Word)1;
+static const Word HIGH_BIT_MASK = 1LL << 63;  // 100..00
+//#define MAX_UCHAR 255
+#define MAX_UCHAR 7 // better cache usage for our data
+
+#ifndef MAX
+#define MAX(a,b) ((a)>(b)?(a):(b))
+#endif
+
+typedef struct Block {
+    Word P;  // Pvin
+    Word M;  // Mvin
+    int score; // score of last cell in block;
+} Block;
+
+
+/**
+ * Defines equality relation on alphabet characters.
+ * By default each character is always equal only to itself, but you can also provide additional equalities.
+ */
+typedef struct EqualityDefinition {
+    bool matrix[MAX_UCHAR + 1][MAX_UCHAR + 1];
+} EqualityDefinition;
+
+static EqualityDefinition *
+CreateEqualityDefinition(const char *alphabet, int alphabet_size,
+			 const EdlibEqualityPair* additionalEqualities,
+			 const int additionalEqualitiesLength) {
+    EqualityDefinition *ed = malloc(sizeof(*ed));
+
+    for (size_t i = 0; i < alphabet_size; i++) {
+	for (size_t j = 0; j < alphabet_size; j++) {
+	    ed->matrix[i][j] = (i == j);
+	}
+    }
+    if (additionalEqualities != NULL) {
+	for (int i = 0; i < additionalEqualitiesLength; i++) {
+	    const char *firstTransformed = strchr(alphabet, additionalEqualities[i].first);
+	    const char *secondTransformed = strchr(alphabet, additionalEqualities[i].second);
+	    if (firstTransformed && alphabet_size) {
+		ed->matrix[firstTransformed - alphabet][secondTransformed - alphabet] =
+		ed->matrix[secondTransformed - alphabet][firstTransformed - alphabet]
+		    = true;
+	    }
+	}
+    }
+
+    return ed;
+}
+
+/**
+ * @param a  Element from transformed sequence.
+ * @param b  Element from transformed sequence.
+ * @return True if a and b are defined as equal, false otherwise.
+ */
+static inline const /* attribute pure or const? */
+bool equalityDefinition_areEqual(const EqualityDefinition *ed, unsigned char a, unsigned char b) {
+    return ed->matrix[a][b];
+}
+
+static int myersCalcEditDistanceSemiGlobal(const Word* Peq, int W, int maxNumBlocks,
+                                           int queryLength,
+                                           const unsigned char* target, int targetLength,
+                                           int k, EdlibAlignMode mode,
+                                           int* bestScore_, int** positions_, int* numPositions_);
+
+static char *transformSequences(const char* queryOriginal, int queryLength,
+				const char* targetOriginal, int targetLength,
+				unsigned char** queryTransformed,
+				unsigned char** targetTransformed,
+                                int *alphabet_size);
+
+static inline int ceilDiv(int x, int y);
+
+static inline unsigned char* createReverseCopy(const unsigned char* seq, int length);
+
+static inline Word* buildPeq(const int alphabetLength,
+                             const unsigned char* query,
+                             const int queryLength,
+                             const EqualityDefinition* equalityDefinition);
+
+
+/**
+ * Main edlib method.
+ */
+EdlibAlignResult edlibAlign(const char* const queryOriginal, const int queryLength,
+			    const char* const targetOriginal, const int targetLength,
+			    const EdlibAlignConfig config) {
+    EdlibAlignResult result;
+    result.status = EDLIB_STATUS_OK;
+    result.editDistance = -1;
+    result.endLocations = result.startLocations = NULL;
+    result.numLocations = 0;
+    result.alignment = NULL;
+    result.alignmentLength = 0;
+    result.alphabetLength = 0;
+
+    /*------------ TRANSFORM SEQUENCES AND RECOGNIZE ALPHABET -----------*/
+    unsigned char* query, * target;
+    int alphabet_size;
+    char *alphabet = transformSequences(queryOriginal, queryLength, targetOriginal, targetLength,
+                                         &query, &target, &alphabet_size);
+    result.alphabetLength = alphabet_size;
+    /*-------------------------------------------------------*/
+
+    // Handle special situation when at least one of the sequences has length 0.
+    if (queryLength == 0 || targetLength == 0) {
+        if (config.mode == EDLIB_MODE_NW) {
+            result.editDistance = MAX(queryLength, targetLength);
+            result.endLocations = malloc(sizeof(int) * 1);
+            result.endLocations[0] = targetLength - 1;
+            result.numLocations = 1;
+        } else if (config.mode == EDLIB_MODE_SHW || config.mode == EDLIB_MODE_HW) {
+            result.editDistance = queryLength;
+            result.endLocations = malloc(sizeof(int) * 1);
+            result.endLocations[0] = -1;
+            result.numLocations = 1;
+        } else {
+            result.status = EDLIB_STATUS_ERROR;
+        }
+
+        free(query);
+        free(target);
+        free(alphabet);
+        return result;
+    }
+
+    /*--------------------- INITIALIZATION ------------------*/
+    int maxNumBlocks = ceilDiv(queryLength, WORD_SIZE); // bmax in Myers
+    int W = maxNumBlocks * WORD_SIZE - queryLength; // number of redundant cells in last level blocks
+    EqualityDefinition *equalityDefinition =
+	CreateEqualityDefinition(alphabet, alphabet_size, config.additionalEqualities, config.additionalEqualitiesLength);
+    Word* Peq = buildPeq(alphabet_size, query, queryLength, equalityDefinition);
+    /*-------------------------------------------------------*/
+
+    /*------------------ MAIN CALCULATION -------------------*/
+    // TODO: Store alignment data only after k is determined? That could make things faster.
+//    int positionNW; // Used only when mode is NW.
+//    AlignmentData* alignData = NULL;
+    bool dynamicK = false;
+    int k = config.k;
+    if (k < 0) { // If valid k is not given, auto-adjust k until solution is found.
+        dynamicK = true;
+        k = WORD_SIZE; // Gives better results than smaller k.
+    }
+
+    do {
+        if (config.mode == EDLIB_MODE_HW || config.mode == EDLIB_MODE_SHW) {
+            myersCalcEditDistanceSemiGlobal(Peq, W, maxNumBlocks,
+                                            queryLength, target, targetLength,
+                                            k, config.mode, &(result.editDistance),
+                                            &(result.endLocations), &(result.numLocations));
+        } else {  // mode == EDLIB_MODE_NW
+//            myersCalcEditDistanceNW(Peq, W, maxNumBlocks,
+//                                    queryLength, target, targetLength,
+//                                    k, &(result.editDistance), &positionNW,
+//                                    false, &alignData, -1);
+        }
+        k *= 2;
+    } while(dynamicK && result.editDistance == -1);
+
+    if (result.editDistance >= 0) {  // If there is solution.
+        // If NW mode, set end location explicitly.
+        if (config.mode == EDLIB_MODE_NW) {
+            result.endLocations = malloc(sizeof(int) * 1);
+            result.endLocations[0] = targetLength - 1;
+            result.numLocations = 1;
+        }
+
+        // Find starting locations.
+        if (config.task == EDLIB_TASK_LOC || config.task == EDLIB_TASK_PATH) {
+            result.startLocations = malloc(result.numLocations * sizeof(int));
+            if (config.mode == EDLIB_MODE_HW) {  // If HW, I need to calculate start locations.
+                const unsigned char* rTarget = createReverseCopy(target, targetLength);
+                const unsigned char* rQuery  = createReverseCopy(query, queryLength);
+                // Peq for reversed query.
+                Word* rPeq = buildPeq(alphabet_size, rQuery, queryLength, equalityDefinition);
+                for (int i = 0; i < result.numLocations; i++) {
+                    int endLocation = result.endLocations[i];
+                    if (endLocation == -1) {
+                        // NOTE: Sometimes one of optimal solutions is that query starts before target, like this:
+                        //                       AAGG <- target
+                        //                   CCTT     <- query
+                        //   It will never be only optimal solution and it does not happen often, however it is
+                        //   possible and in that case end location will be -1. What should we do with that?
+                        //   Should we just skip reporting such end location, although it is a solution?
+                        //   If we do report it, what is the start location? -4? -1? Nothing?
+                        // TODO: Figure this out. This has to do in general with how we think about start
+                        //   and end locations.
+                        //   Also, we have alignment later relying on this locations to limit the space of it's
+                        //   search -> how can it do it right if these locations are negative or incorrect?
+                        result.startLocations[i] = 0;  // I put 0 for now, but it does not make much sense.
+                    } else {
+                        int bestScoreSHW, numPositionsSHW;
+                        int* positionsSHW;
+                        myersCalcEditDistanceSemiGlobal(
+                                rPeq, W, maxNumBlocks,
+                                queryLength, rTarget + targetLength - endLocation - 1, endLocation + 1,
+                                result.editDistance, EDLIB_MODE_SHW,
+                                &bestScoreSHW, &positionsSHW, &numPositionsSHW);
+                        // Taking last location as start ensures that alignment will not start with insertions
+                        // if it can start with mismatches instead.
+                        result.startLocations[i] = endLocation - positionsSHW[numPositionsSHW - 1];
+                        free(positionsSHW);
+                    }
+                }
+                free((void *)rTarget);
+                free((void *)rQuery);
+                free(rPeq);
+            } else {  // If mode is SHW or NW
+                for (int i = 0; i < result.numLocations; i++) {
+                    result.startLocations[i] = 0;
+                }
+            }
+        }
+    }
+    /*-------------------------------------------------------*/
+
+    //--- Free memory ---//
+    free(Peq);
+    free(query);
+    free(target);
+    free(alphabet);
+    free(equalityDefinition);
+//    DestroyAlignmentData(alignData);
+    //-------------------//
+
+    return result;
+}
+
+/**
+ * Build Peq table for given query and alphabet.
+ * Peq is table of dimensions alphabetLength+1 x maxNumBlocks.
+ * Bit i of Peq[s * maxNumBlocks + b] is 1 if i-th symbol from block b of query equals symbol s, otherwise it is 0.
+ * NOTICE: free returned array with free()!
+ */
+static inline Word* buildPeq(const int alphabetLength,
+                             const unsigned char* const query,
+                             const int queryLength,
+                             const EqualityDefinition* equalityDefinition) {
+    int maxNumBlocks = ceilDiv(queryLength, WORD_SIZE);
+    // table of dimensions alphabetLength+1 x maxNumBlocks. Last symbol is wildcard.
+    Word* Peq = malloc((alphabetLength + 1) * maxNumBlocks * sizeof(*Peq));
+
+    // Build Peq (1 is match, 0 is mismatch). NOTE: last column is wildcard(symbol that matches anything) with just 1s
+    // Optimised Peq building avoiding branching.
+    for (int symbol = 0; symbol < alphabetLength; symbol++) {
+        for (int b = 0; b < maxNumBlocks; b++) {
+            Word PeqW = 0;
+            for (int r = (b+1) * WORD_SIZE - 1; r >= b * WORD_SIZE; r--) {
+                PeqW = (PeqW<<1)
+                     + (r >= queryLength
+                        || equalityDefinition_areEqual(equalityDefinition,
+                                                       query[r], symbol)); 
+            }
+            Peq[symbol * maxNumBlocks + b] = PeqW;
+        }
+    }
+    {
+        int symbol = alphabetLength;
+        for (int b = 0; b < maxNumBlocks; b++) {
+            // Last symbol is wildcard, so it is all 1s
+            Peq[symbol * maxNumBlocks + b] = (Word)-1;
+        }
+    }
+
+    return Peq;
+}
+
+
+/**
+ * Returns new sequence that is reverse of given sequence.
+ * Free returned array with free()
+ */
+static inline unsigned char* createReverseCopy(const unsigned char* const seq, const int length) {
+    unsigned char* rSeq = malloc(length);
+    for (int i = 0; i < length; i++) {
+        rSeq[i] = seq[length - i - 1];
+    }
+    return rSeq;
+}
+
+/**
+ * Corresponds to Advance_Block function from Myers.
+ * Calculates one word(block), which is part of a column.
+ * Highest bit of word (one most to the left) is most bottom cell of block from column.
+ * Pv[i] and Mv[i] define vin of cell[i]: vin = cell[i] - cell[i-1].
+ * @param [in] Pv  Bitset, Pv[i] == 1 if vin is +1, otherwise Pv[i] == 0.
+ * @param [in] Mv  Bitset, Mv[i] == 1 if vin is -1, otherwise Mv[i] == 0.
+ * @param [in] Eq  Bitset, Eq[i] == 1 if match, 0 if mismatch.
+ * @param [in] hin  Will be +1, 0 or -1.
+ * @param [out] PvOut  Bitset, PvOut[i] == 1 if vout is +1, otherwise PvOut[i] == 0.
+ * @param [out] MvOut  Bitset, MvOut[i] == 1 if vout is -1, otherwise MvOut[i] == 0.
+ * @param [out] hout  Will be +1, 0 or -1.
+ */
+static inline int calculateBlock(Word Pv, Word Mv, Word Eq, const int hin,
+                                 Word *PvOut, Word *MvOut) {
+    // hin can be 1, -1 or 0.
+    // 1  -> 00...01
+    // 0  -> 00...00
+    // -1 -> 11...11 (2-complement)
+
+    Word hinIsNeg = (Word)(hin >> 2) & WORD_1; // 00...001 if hin is -1, 00...000 if 0 or 1
+
+    Word Xv = Eq | Mv;
+    // This is instruction below written using 'if': if (hin < 0) Eq |= (Word)1;
+    Eq |= hinIsNeg;
+    Word Xh = (((Eq & Pv) + Pv) ^ Pv) | Eq;
+
+    Word Ph = Mv | ~(Xh | Pv);
+    Word Mh = Pv & Xh;
+
+    int hout = 0;
+    // This is instruction below written using 'if': if (Ph & HIGH_BIT_MASK) hout = 1;
+    hout = (Ph & HIGH_BIT_MASK) >> (WORD_SIZE - 1);
+    // This is instruction below written using 'if': if (Mh & HIGH_BIT_MASK) hout = -1;
+    hout -= (Mh & HIGH_BIT_MASK) >> (WORD_SIZE - 1);
+
+    Ph <<= 1;
+    Mh <<= 1;
+
+    // This is instruction below written using 'if': if (hin < 0) Mh |= (Word)1;
+    Mh |= hinIsNeg;
+    // This is instruction below written using 'if': if (hin > 0) Ph |= (Word)1;
+    Ph |= (Word)((hin + 1) >> 1);
+
+    *PvOut = Mh | ~(Xv | Ph);
+    *MvOut = Ph & Xv;
+
+    return hout;
+}
+
+/**
+ * Does ceiling division x / y.
+ * Note: x and y must be non-negative and x + y must not overflow.
+ */
+static inline int ceilDiv(const int x, const int y) {
+    return x % y ? x / y + 1 : x / y;
+}
+
+static inline int min(const int x, const int y) {
+    return x < y ? x : y;
+}
+
+
+/**
+ * @param [in] block
+ * @return Values of cells in block, starting with bottom cell in block.
+ */
+static inline int *getBlockCellValues(const Block block) {
+    int *scores = malloc(WORD_SIZE * sizeof(*scores));
+    int score = block.score;
+    Word mask = HIGH_BIT_MASK;
+    for (int i = 0; i < WORD_SIZE - 1; i++) {
+        scores[i] = score;
+        if (block.P & mask) score--;
+        if (block.M & mask) score++;
+        mask >>= 1;
+    }
+    scores[WORD_SIZE - 1] = score;
+    return scores;
+}
+
+/**
+ * @param [in] block
+ * @param [in] k
+ * @return True if all cells in block have value larger than k, otherwise false.
+ */
+static inline bool allBlockCellsLarger(const Block block, const int k) {
+    int *scores = getBlockCellValues(block);
+    for (int i = 0; i < WORD_SIZE; i++) {
+        if (scores[i] <= k) {
+            free(scores);
+            return false;
+        }
+    }
+
+    free(scores);
+    return true;
+}
+
+
+/**
+ * Uses Myers' bit-vector algorithm to find edit distance for one of semi-global alignment methods.
+ * @param [in] Peq  Query profile.
+ * @param [in] W  Size of padding in last block.
+ *                TODO: Calculate this directly from query, instead of passing it.
+ * @param [in] maxNumBlocks  Number of blocks needed to cover the whole query.
+ *                           TODO: Calculate this directly from query, instead of passing it.
+ * @param [in] queryLength
+ * @param [in] target
+ * @param [in] targetLength
+ * @param [in] k
+ * @param [in] mode  EDLIB_MODE_HW or EDLIB_MODE_SHW
+ * @param [out] bestScore_  Edit distance.
+ * @param [out] positions_  Array of 0-indexed positions in target at which best score was found.
+                            Make sure to free this array with free().
+ * @param [out] numPositions_  Number of positions in the positions_ array.
+ * @return Status.
+ */
+static int myersCalcEditDistanceSemiGlobal(
+        const Word* const Peq, const int W, const int maxNumBlocks,
+        const int queryLength,
+        const unsigned char* const target, const int targetLength,
+        int k, const EdlibAlignMode mode,
+        int* const bestScore_, int** const positions_, int* const numPositions_) {
+    *positions_ = NULL;
+    *numPositions_ = 0;
+
+    // firstBlock is 0-based index of first block in Ukkonen band.
+    // lastBlock is 0-based index of last block in Ukkonen band.
+    int firstBlock = 0;
+    int lastBlock = min(ceilDiv(k + 1, WORD_SIZE), maxNumBlocks) - 1; // y in Myers
+    Block *bl; // Current block
+
+    Block* blocks = malloc(maxNumBlocks * sizeof(*blocks));
+
+    // For HW, solution will never be larger then queryLength.
+    if (mode == EDLIB_MODE_HW) {
+        k = min(queryLength, k);
+    }
+
+    // Each STRONG_REDUCE_NUM column is reduced in more expensive way.
+    // This gives speed up of about 2 times for small k.
+    const int STRONG_REDUCE_NUM = 2048;
+
+    // Initialize P, M and score
+    bl = blocks;
+    for (int b = 0; b <= lastBlock; b++) {
+        bl->score = (b + 1) * WORD_SIZE;
+        bl->P = (Word)(-1); // All 1s
+        bl->M = (Word)(0);
+        bl++;
+    }
+
+    int bestScore = -1;
+#define MAX_POS 100  // maximum number of positions returned.
+    int positions[MAX_POS];
+    int npositions = 0;
+    const int startHout = mode == EDLIB_MODE_HW ? 0 : 1; // If 0 then gap before query is not penalized;
+    const unsigned char* targetChar = target;
+    for (int c = 0; c < targetLength; c++) { // for each column
+        const Word* Peq_c = Peq + (*targetChar) * maxNumBlocks;
+
+        //----------------------- Calculate column -------------------------//
+        int hout = startHout;
+        bl = blocks + firstBlock;
+        Peq_c += firstBlock;
+        for (int b = firstBlock; b <= lastBlock; b++) {
+            hout = calculateBlock(bl->P, bl->M, *Peq_c, hout, &bl->P, &bl->M);
+            bl->score += hout;
+            bl++; Peq_c++;
+        }
+        bl--; Peq_c--;
+        //------------------------------------------------------------------//
+
+        //---------- Adjust number of blocks according to Ukkonen ----------//
+        if ((lastBlock < maxNumBlocks - 1) && (bl->score - hout <= k) // bl is pointing to last block
+            && ((*(Peq_c + 1) & WORD_1) || hout < 0)) { // Peq_c is pointing to last block
+            // If score of left block is not too big, calculate one more block
+            lastBlock++; bl++; Peq_c++;
+            bl->P = (Word)(-1); // All 1s
+            bl->M = (Word)(0);
+            bl->score = (bl - 1)->score - hout + WORD_SIZE + calculateBlock(bl->P, bl->M, *Peq_c, hout, &bl->P, &bl->M);
+        } else {
+            while (lastBlock >= firstBlock && bl->score >= k + WORD_SIZE) {
+                lastBlock--; bl--; Peq_c--;
+            }
+        }
+
+        // Every some columns, do some expensive but also more efficient block reducing.
+        // This is important!
+        //
+        // Reduce the band by decreasing last block if possible.
+        if (c % STRONG_REDUCE_NUM == 0) {
+            while (lastBlock >= 0 && lastBlock >= firstBlock && allBlockCellsLarger(*bl, k)) {
+                lastBlock--; bl--; Peq_c--;
+            }
+        }
+        // For HW, even if all cells are > k, there still may be solution in next
+        // column because starting conditions at upper boundary are 0.
+        // That means that first block is always candidate for solution,
+        // and we can never end calculation before last column.
+        if (mode == EDLIB_MODE_HW && lastBlock == -1) {
+            lastBlock++; bl++; Peq_c++;
+        }
+
+        // Reduce band by increasing first block if possible. Not applicable to HW.
+        if (mode != EDLIB_MODE_HW) {
+            while (firstBlock <= lastBlock && blocks[firstBlock].score >= k + WORD_SIZE) {
+                firstBlock++;
+            }
+            if (c % STRONG_REDUCE_NUM == 0) { // Do strong reduction every some blocks
+                while (firstBlock <= lastBlock && allBlockCellsLarger(blocks[firstBlock], k)) {
+                    firstBlock++;
+                }
+            }
+        }
+
+        // If band stops to exist finish
+        if (lastBlock < firstBlock) {
+            *bestScore_ = bestScore;
+            if (bestScore != -1) {
+                *positions_ = malloc(npositions * sizeof(int));
+                *numPositions_ = npositions;
+                memcpy(*positions_, positions, npositions * sizeof(int));
+            }
+            free(blocks);
+            return EDLIB_STATUS_OK;
+        }
+        //------------------------------------------------------------------//
+
+        //------------------------- Update best score ----------------------//
+        if (lastBlock == maxNumBlocks - 1) {
+            int colScore = bl->score;
+            if (colScore <= k) { // Scores > k dont have correct values (so we cannot use them), but are certainly > k.
+                // NOTE: Score that I find in column c is actually score from column c-W
+                if (bestScore == -1 || colScore <= bestScore) {
+                    if (colScore != bestScore) {
+			npositions = 0;
+                        bestScore = colScore;
+                        // Change k so we will look only for equal or better
+                        // scores then the best found so far.
+                        k = bestScore;
+                    }
+		    if (npositions < MAX_POS)
+			positions[npositions++] = c - W;
+                }
+            }
+        }
+        //------------------------------------------------------------------//
+
+        targetChar++;
+    }
+
+
+    // Obtain results for last W columns from last column.
+    if (lastBlock == maxNumBlocks - 1) {
+	int *blockScores = getBlockCellValues(*bl);
+        for (int i = 0; i < W; i++) {
+            int colScore = blockScores[i + 1];
+            if (colScore <= k && (bestScore == -1 || colScore <= bestScore)) {
+                if (colScore != bestScore) {
+                    npositions = 0;
+                    k = bestScore = colScore;
+                }
+		if (npositions < MAX_POS)
+		    positions[npositions++] = targetLength - W + i;
+            }
+        }
+        free(blockScores);
+    }
+
+    *bestScore_ = bestScore;
+    if (bestScore != -1) {
+        *positions_ = malloc(npositions * sizeof(int));
+        *numPositions_ = npositions;
+        memcpy(*positions_, positions, npositions * sizeof(int));
+    }
+
+    free(blocks);
+    return EDLIB_STATUS_OK;
+}
+
+
+/**
+ * Takes char query and char target, recognizes alphabet and transforms them into unsigned char sequences
+ * where elements in sequences are not any more letters of alphabet, but their index in alphabet.
+ * Most of internal edlib functions expect such transformed sequences.
+ * This function will allocate queryTransformed and targetTransformed, so make sure to free them when done.
+ * Example:
+ *   Original sequences: "ACT" and "CGT".
+ *   Alphabet would be recognized as "ACTG". Alphabet length = 4.
+ *   Transformed sequences: [0, 1, 2] and [1, 3, 2].
+ * @param [in] queryOriginal
+ * @param [in] queryLength
+ * @param [in] targetOriginal
+ * @param [in] targetLength
+ * @param [out] queryTransformed  It will contain values in range [0, alphabet length - 1].
+ * @param [out] targetTransformed  It will contain values in range [0, alphabet length - 1].
+ * @return  Alphabet as a string of unique characters, where index of each character is its value in transformed
+ *          sequences.
+ */
+static char *transformSequences(const char* const queryOriginal, const int queryLength,
+				const char* const targetOriginal, const int targetLength,
+				unsigned char** const queryTransformed,
+				unsigned char** const targetTransformed,
+                                int *alphabet_size) {
+    // Alphabet is constructed from letters that are present in sequences.
+    // Each letter is assigned an ordinal number, starting from 0 up to alphabetLength - 1,
+    // and new query and target are created in which letters are replaced with their ordinal numbers.
+    // This query and target are used in all the calculations later.
+    *queryTransformed  = malloc(sizeof(unsigned char) * queryLength);
+    *targetTransformed = malloc(sizeof(unsigned char) * targetLength);
+
+    char *alphabet = malloc(MAX_UCHAR+1), *alphabet_cp = alphabet;
+
+    // Alphabet information, it is constructed on fly while transforming sequences.
+    // letterIdx[c] is index of letter c in alphabet.
+    unsigned char letterIdx[MAX_UCHAR + 1];
+    bool inAlphabet[MAX_UCHAR + 1]; // inAlphabet[c] is true if c is in alphabet
+    for (int i = 0; i < MAX_UCHAR + 1; i++) inAlphabet[i] = false;
+
+    for (int i = 0; i < queryLength; i++) {
+        unsigned char c = queryOriginal[i];
+        if (!inAlphabet[c]) {
+            inAlphabet[c] = true;
+            letterIdx[c] = alphabet_cp - alphabet;
+            *alphabet_cp++ = queryOriginal[i];
+        }
+        (*queryTransformed)[i] = letterIdx[c];
+    }
+    for (int i = 0; i < targetLength; i++) {
+        unsigned char c = targetOriginal[i];
+        if (!inAlphabet[c]) {
+            inAlphabet[c] = true;
+            letterIdx[c] = alphabet_cp - alphabet;
+            *alphabet_cp++ = targetOriginal[i];
+        }
+        (*targetTransformed)[i] = letterIdx[c];
+    }
+
+    *alphabet_size = alphabet_cp - alphabet;
+    return alphabet;
+}
+
+
+EdlibAlignConfig edlibNewAlignConfig(int k, EdlibAlignMode mode, EdlibAlignTask task,
+				     const EdlibEqualityPair* additionalEqualities,
+				     int additionalEqualitiesLength) {
+    EdlibAlignConfig config;
+    config.k = k;
+    config.mode = mode;
+    config.task = task;
+    config.additionalEqualities = additionalEqualities;
+    config.additionalEqualitiesLength = additionalEqualitiesLength;
+    return config;
+}
+
+EdlibAlignConfig edlibDefaultAlignConfig(void) {
+    return edlibNewAlignConfig(-1, EDLIB_MODE_NW, EDLIB_TASK_DISTANCE, NULL, 0);
+}
+
+void edlibFreeAlignResult(EdlibAlignResult result) {
+    if (result.endLocations) free(result.endLocations);
+    if (result.startLocations) free(result.startLocations);
+    if (result.alignment) free(result.alignment);
+}
diff --git a/bcftools/edlib.c.pysam.c b/bcftools/edlib.c.pysam.c
new file mode 100644
index 000000000..e5b69984d
--- /dev/null
+++ b/bcftools/edlib.c.pysam.c
@@ -0,0 +1,664 @@
+#include "bcftools.pysam.h"
+
+/*
+ * A cut down C translated of the C++ edlib.cpp file.
+ * Taken from edlib v0.1.0-166-g931be2b
+ */
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdbool.h>
+
+#include "edlib.h"
+
+typedef uint64_t Word;
+static const int WORD_SIZE = 64; // Size of Word in bits
+static const Word WORD_1 = (Word)1;
+static const Word HIGH_BIT_MASK = 1LL << 63;  // 100..00
+//#define MAX_UCHAR 255
+#define MAX_UCHAR 7 // better cache usage for our data
+
+#ifndef MAX
+#define MAX(a,b) ((a)>(b)?(a):(b))
+#endif
+
+typedef struct Block {
+    Word P;  // Pvin
+    Word M;  // Mvin
+    int score; // score of last cell in block;
+} Block;
+
+
+/**
+ * Defines equality relation on alphabet characters.
+ * By default each character is always equal only to itself, but you can also provide additional equalities.
+ */
+typedef struct EqualityDefinition {
+    bool matrix[MAX_UCHAR + 1][MAX_UCHAR + 1];
+} EqualityDefinition;
+
+static EqualityDefinition *
+CreateEqualityDefinition(const char *alphabet, int alphabet_size,
+			 const EdlibEqualityPair* additionalEqualities,
+			 const int additionalEqualitiesLength) {
+    EqualityDefinition *ed = malloc(sizeof(*ed));
+
+    for (size_t i = 0; i < alphabet_size; i++) {
+	for (size_t j = 0; j < alphabet_size; j++) {
+	    ed->matrix[i][j] = (i == j);
+	}
+    }
+    if (additionalEqualities != NULL) {
+	for (int i = 0; i < additionalEqualitiesLength; i++) {
+	    const char *firstTransformed = strchr(alphabet, additionalEqualities[i].first);
+	    const char *secondTransformed = strchr(alphabet, additionalEqualities[i].second);
+	    if (firstTransformed && alphabet_size) {
+		ed->matrix[firstTransformed - alphabet][secondTransformed - alphabet] =
+		ed->matrix[secondTransformed - alphabet][firstTransformed - alphabet]
+		    = true;
+	    }
+	}
+    }
+
+    return ed;
+}
+
+/**
+ * @param a  Element from transformed sequence.
+ * @param b  Element from transformed sequence.
+ * @return True if a and b are defined as equal, false otherwise.
+ */
+static inline const /* attribute pure or const? */
+bool equalityDefinition_areEqual(const EqualityDefinition *ed, unsigned char a, unsigned char b) {
+    return ed->matrix[a][b];
+}
+
+static int myersCalcEditDistanceSemiGlobal(const Word* Peq, int W, int maxNumBlocks,
+                                           int queryLength,
+                                           const unsigned char* target, int targetLength,
+                                           int k, EdlibAlignMode mode,
+                                           int* bestScore_, int** positions_, int* numPositions_);
+
+static char *transformSequences(const char* queryOriginal, int queryLength,
+				const char* targetOriginal, int targetLength,
+				unsigned char** queryTransformed,
+				unsigned char** targetTransformed,
+                                int *alphabet_size);
+
+static inline int ceilDiv(int x, int y);
+
+static inline unsigned char* createReverseCopy(const unsigned char* seq, int length);
+
+static inline Word* buildPeq(const int alphabetLength,
+                             const unsigned char* query,
+                             const int queryLength,
+                             const EqualityDefinition* equalityDefinition);
+
+
+/**
+ * Main edlib method.
+ */
+EdlibAlignResult edlibAlign(const char* const queryOriginal, const int queryLength,
+			    const char* const targetOriginal, const int targetLength,
+			    const EdlibAlignConfig config) {
+    EdlibAlignResult result;
+    result.status = EDLIB_STATUS_OK;
+    result.editDistance = -1;
+    result.endLocations = result.startLocations = NULL;
+    result.numLocations = 0;
+    result.alignment = NULL;
+    result.alignmentLength = 0;
+    result.alphabetLength = 0;
+
+    /*------------ TRANSFORM SEQUENCES AND RECOGNIZE ALPHABET -----------*/
+    unsigned char* query, * target;
+    int alphabet_size;
+    char *alphabet = transformSequences(queryOriginal, queryLength, targetOriginal, targetLength,
+                                         &query, &target, &alphabet_size);
+    result.alphabetLength = alphabet_size;
+    /*-------------------------------------------------------*/
+
+    // Handle special situation when at least one of the sequences has length 0.
+    if (queryLength == 0 || targetLength == 0) {
+        if (config.mode == EDLIB_MODE_NW) {
+            result.editDistance = MAX(queryLength, targetLength);
+            result.endLocations = malloc(sizeof(int) * 1);
+            result.endLocations[0] = targetLength - 1;
+            result.numLocations = 1;
+        } else if (config.mode == EDLIB_MODE_SHW || config.mode == EDLIB_MODE_HW) {
+            result.editDistance = queryLength;
+            result.endLocations = malloc(sizeof(int) * 1);
+            result.endLocations[0] = -1;
+            result.numLocations = 1;
+        } else {
+            result.status = EDLIB_STATUS_ERROR;
+        }
+
+        free(query);
+        free(target);
+        free(alphabet);
+        return result;
+    }
+
+    /*--------------------- INITIALIZATION ------------------*/
+    int maxNumBlocks = ceilDiv(queryLength, WORD_SIZE); // bmax in Myers
+    int W = maxNumBlocks * WORD_SIZE - queryLength; // number of redundant cells in last level blocks
+    EqualityDefinition *equalityDefinition =
+	CreateEqualityDefinition(alphabet, alphabet_size, config.additionalEqualities, config.additionalEqualitiesLength);
+    Word* Peq = buildPeq(alphabet_size, query, queryLength, equalityDefinition);
+    /*-------------------------------------------------------*/
+
+    /*------------------ MAIN CALCULATION -------------------*/
+    // TODO: Store alignment data only after k is determined? That could make things faster.
+//    int positionNW; // Used only when mode is NW.
+//    AlignmentData* alignData = NULL;
+    bool dynamicK = false;
+    int k = config.k;
+    if (k < 0) { // If valid k is not given, auto-adjust k until solution is found.
+        dynamicK = true;
+        k = WORD_SIZE; // Gives better results than smaller k.
+    }
+
+    do {
+        if (config.mode == EDLIB_MODE_HW || config.mode == EDLIB_MODE_SHW) {
+            myersCalcEditDistanceSemiGlobal(Peq, W, maxNumBlocks,
+                                            queryLength, target, targetLength,
+                                            k, config.mode, &(result.editDistance),
+                                            &(result.endLocations), &(result.numLocations));
+        } else {  // mode == EDLIB_MODE_NW
+//            myersCalcEditDistanceNW(Peq, W, maxNumBlocks,
+//                                    queryLength, target, targetLength,
+//                                    k, &(result.editDistance), &positionNW,
+//                                    false, &alignData, -1);
+        }
+        k *= 2;
+    } while(dynamicK && result.editDistance == -1);
+
+    if (result.editDistance >= 0) {  // If there is solution.
+        // If NW mode, set end location explicitly.
+        if (config.mode == EDLIB_MODE_NW) {
+            result.endLocations = malloc(sizeof(int) * 1);
+            result.endLocations[0] = targetLength - 1;
+            result.numLocations = 1;
+        }
+
+        // Find starting locations.
+        if (config.task == EDLIB_TASK_LOC || config.task == EDLIB_TASK_PATH) {
+            result.startLocations = malloc(result.numLocations * sizeof(int));
+            if (config.mode == EDLIB_MODE_HW) {  // If HW, I need to calculate start locations.
+                const unsigned char* rTarget = createReverseCopy(target, targetLength);
+                const unsigned char* rQuery  = createReverseCopy(query, queryLength);
+                // Peq for reversed query.
+                Word* rPeq = buildPeq(alphabet_size, rQuery, queryLength, equalityDefinition);
+                for (int i = 0; i < result.numLocations; i++) {
+                    int endLocation = result.endLocations[i];
+                    if (endLocation == -1) {
+                        // NOTE: Sometimes one of optimal solutions is that query starts before target, like this:
+                        //                       AAGG <- target
+                        //                   CCTT     <- query
+                        //   It will never be only optimal solution and it does not happen often, however it is
+                        //   possible and in that case end location will be -1. What should we do with that?
+                        //   Should we just skip reporting such end location, although it is a solution?
+                        //   If we do report it, what is the start location? -4? -1? Nothing?
+                        // TODO: Figure this out. This has to do in general with how we think about start
+                        //   and end locations.
+                        //   Also, we have alignment later relying on this locations to limit the space of it's
+                        //   search -> how can it do it right if these locations are negative or incorrect?
+                        result.startLocations[i] = 0;  // I put 0 for now, but it does not make much sense.
+                    } else {
+                        int bestScoreSHW, numPositionsSHW;
+                        int* positionsSHW;
+                        myersCalcEditDistanceSemiGlobal(
+                                rPeq, W, maxNumBlocks,
+                                queryLength, rTarget + targetLength - endLocation - 1, endLocation + 1,
+                                result.editDistance, EDLIB_MODE_SHW,
+                                &bestScoreSHW, &positionsSHW, &numPositionsSHW);
+                        // Taking last location as start ensures that alignment will not start with insertions
+                        // if it can start with mismatches instead.
+                        result.startLocations[i] = endLocation - positionsSHW[numPositionsSHW - 1];
+                        free(positionsSHW);
+                    }
+                }
+                free((void *)rTarget);
+                free((void *)rQuery);
+                free(rPeq);
+            } else {  // If mode is SHW or NW
+                for (int i = 0; i < result.numLocations; i++) {
+                    result.startLocations[i] = 0;
+                }
+            }
+        }
+    }
+    /*-------------------------------------------------------*/
+
+    //--- Free memory ---//
+    free(Peq);
+    free(query);
+    free(target);
+    free(alphabet);
+    free(equalityDefinition);
+//    DestroyAlignmentData(alignData);
+    //-------------------//
+
+    return result;
+}
+
+/**
+ * Build Peq table for given query and alphabet.
+ * Peq is table of dimensions alphabetLength+1 x maxNumBlocks.
+ * Bit i of Peq[s * maxNumBlocks + b] is 1 if i-th symbol from block b of query equals symbol s, otherwise it is 0.
+ * NOTICE: free returned array with free()!
+ */
+static inline Word* buildPeq(const int alphabetLength,
+                             const unsigned char* const query,
+                             const int queryLength,
+                             const EqualityDefinition* equalityDefinition) {
+    int maxNumBlocks = ceilDiv(queryLength, WORD_SIZE);
+    // table of dimensions alphabetLength+1 x maxNumBlocks. Last symbol is wildcard.
+    Word* Peq = malloc((alphabetLength + 1) * maxNumBlocks * sizeof(*Peq));
+
+    // Build Peq (1 is match, 0 is mismatch). NOTE: last column is wildcard(symbol that matches anything) with just 1s
+    // Optimised Peq building avoiding branching.
+    for (int symbol = 0; symbol < alphabetLength; symbol++) {
+        for (int b = 0; b < maxNumBlocks; b++) {
+            Word PeqW = 0;
+            for (int r = (b+1) * WORD_SIZE - 1; r >= b * WORD_SIZE; r--) {
+                PeqW = (PeqW<<1)
+                     + (r >= queryLength
+                        || equalityDefinition_areEqual(equalityDefinition,
+                                                       query[r], symbol)); 
+            }
+            Peq[symbol * maxNumBlocks + b] = PeqW;
+        }
+    }
+    {
+        int symbol = alphabetLength;
+        for (int b = 0; b < maxNumBlocks; b++) {
+            // Last symbol is wildcard, so it is all 1s
+            Peq[symbol * maxNumBlocks + b] = (Word)-1;
+        }
+    }
+
+    return Peq;
+}
+
+
+/**
+ * Returns new sequence that is reverse of given sequence.
+ * Free returned array with free()
+ */
+static inline unsigned char* createReverseCopy(const unsigned char* const seq, const int length) {
+    unsigned char* rSeq = malloc(length);
+    for (int i = 0; i < length; i++) {
+        rSeq[i] = seq[length - i - 1];
+    }
+    return rSeq;
+}
+
+/**
+ * Corresponds to Advance_Block function from Myers.
+ * Calculates one word(block), which is part of a column.
+ * Highest bit of word (one most to the left) is most bottom cell of block from column.
+ * Pv[i] and Mv[i] define vin of cell[i]: vin = cell[i] - cell[i-1].
+ * @param [in] Pv  Bitset, Pv[i] == 1 if vin is +1, otherwise Pv[i] == 0.
+ * @param [in] Mv  Bitset, Mv[i] == 1 if vin is -1, otherwise Mv[i] == 0.
+ * @param [in] Eq  Bitset, Eq[i] == 1 if match, 0 if mismatch.
+ * @param [in] hin  Will be +1, 0 or -1.
+ * @param [out] PvOut  Bitset, PvOut[i] == 1 if vout is +1, otherwise PvOut[i] == 0.
+ * @param [out] MvOut  Bitset, MvOut[i] == 1 if vout is -1, otherwise MvOut[i] == 0.
+ * @param [out] hout  Will be +1, 0 or -1.
+ */
+static inline int calculateBlock(Word Pv, Word Mv, Word Eq, const int hin,
+                                 Word *PvOut, Word *MvOut) {
+    // hin can be 1, -1 or 0.
+    // 1  -> 00...01
+    // 0  -> 00...00
+    // -1 -> 11...11 (2-complement)
+
+    Word hinIsNeg = (Word)(hin >> 2) & WORD_1; // 00...001 if hin is -1, 00...000 if 0 or 1
+
+    Word Xv = Eq | Mv;
+    // This is instruction below written using 'if': if (hin < 0) Eq |= (Word)1;
+    Eq |= hinIsNeg;
+    Word Xh = (((Eq & Pv) + Pv) ^ Pv) | Eq;
+
+    Word Ph = Mv | ~(Xh | Pv);
+    Word Mh = Pv & Xh;
+
+    int hout = 0;
+    // This is instruction below written using 'if': if (Ph & HIGH_BIT_MASK) hout = 1;
+    hout = (Ph & HIGH_BIT_MASK) >> (WORD_SIZE - 1);
+    // This is instruction below written using 'if': if (Mh & HIGH_BIT_MASK) hout = -1;
+    hout -= (Mh & HIGH_BIT_MASK) >> (WORD_SIZE - 1);
+
+    Ph <<= 1;
+    Mh <<= 1;
+
+    // This is instruction below written using 'if': if (hin < 0) Mh |= (Word)1;
+    Mh |= hinIsNeg;
+    // This is instruction below written using 'if': if (hin > 0) Ph |= (Word)1;
+    Ph |= (Word)((hin + 1) >> 1);
+
+    *PvOut = Mh | ~(Xv | Ph);
+    *MvOut = Ph & Xv;
+
+    return hout;
+}
+
+/**
+ * Does ceiling division x / y.
+ * Note: x and y must be non-negative and x + y must not overflow.
+ */
+static inline int ceilDiv(const int x, const int y) {
+    return x % y ? x / y + 1 : x / y;
+}
+
+static inline int min(const int x, const int y) {
+    return x < y ? x : y;
+}
+
+
+/**
+ * @param [in] block
+ * @return Values of cells in block, starting with bottom cell in block.
+ */
+static inline int *getBlockCellValues(const Block block) {
+    int *scores = malloc(WORD_SIZE * sizeof(*scores));
+    int score = block.score;
+    Word mask = HIGH_BIT_MASK;
+    for (int i = 0; i < WORD_SIZE - 1; i++) {
+        scores[i] = score;
+        if (block.P & mask) score--;
+        if (block.M & mask) score++;
+        mask >>= 1;
+    }
+    scores[WORD_SIZE - 1] = score;
+    return scores;
+}
+
+/**
+ * @param [in] block
+ * @param [in] k
+ * @return True if all cells in block have value larger than k, otherwise false.
+ */
+static inline bool allBlockCellsLarger(const Block block, const int k) {
+    int *scores = getBlockCellValues(block);
+    for (int i = 0; i < WORD_SIZE; i++) {
+        if (scores[i] <= k) {
+            free(scores);
+            return false;
+        }
+    }
+
+    free(scores);
+    return true;
+}
+
+
+/**
+ * Uses Myers' bit-vector algorithm to find edit distance for one of semi-global alignment methods.
+ * @param [in] Peq  Query profile.
+ * @param [in] W  Size of padding in last block.
+ *                TODO: Calculate this directly from query, instead of passing it.
+ * @param [in] maxNumBlocks  Number of blocks needed to cover the whole query.
+ *                           TODO: Calculate this directly from query, instead of passing it.
+ * @param [in] queryLength
+ * @param [in] target
+ * @param [in] targetLength
+ * @param [in] k
+ * @param [in] mode  EDLIB_MODE_HW or EDLIB_MODE_SHW
+ * @param [out] bestScore_  Edit distance.
+ * @param [out] positions_  Array of 0-indexed positions in target at which best score was found.
+                            Make sure to free this array with free().
+ * @param [out] numPositions_  Number of positions in the positions_ array.
+ * @return Status.
+ */
+static int myersCalcEditDistanceSemiGlobal(
+        const Word* const Peq, const int W, const int maxNumBlocks,
+        const int queryLength,
+        const unsigned char* const target, const int targetLength,
+        int k, const EdlibAlignMode mode,
+        int* const bestScore_, int** const positions_, int* const numPositions_) {
+    *positions_ = NULL;
+    *numPositions_ = 0;
+
+    // firstBlock is 0-based index of first block in Ukkonen band.
+    // lastBlock is 0-based index of last block in Ukkonen band.
+    int firstBlock = 0;
+    int lastBlock = min(ceilDiv(k + 1, WORD_SIZE), maxNumBlocks) - 1; // y in Myers
+    Block *bl; // Current block
+
+    Block* blocks = malloc(maxNumBlocks * sizeof(*blocks));
+
+    // For HW, solution will never be larger then queryLength.
+    if (mode == EDLIB_MODE_HW) {
+        k = min(queryLength, k);
+    }
+
+    // Each STRONG_REDUCE_NUM column is reduced in more expensive way.
+    // This gives speed up of about 2 times for small k.
+    const int STRONG_REDUCE_NUM = 2048;
+
+    // Initialize P, M and score
+    bl = blocks;
+    for (int b = 0; b <= lastBlock; b++) {
+        bl->score = (b + 1) * WORD_SIZE;
+        bl->P = (Word)(-1); // All 1s
+        bl->M = (Word)(0);
+        bl++;
+    }
+
+    int bestScore = -1;
+#define MAX_POS 100  // maximum number of positions returned.
+    int positions[MAX_POS];
+    int npositions = 0;
+    const int startHout = mode == EDLIB_MODE_HW ? 0 : 1; // If 0 then gap before query is not penalized;
+    const unsigned char* targetChar = target;
+    for (int c = 0; c < targetLength; c++) { // for each column
+        const Word* Peq_c = Peq + (*targetChar) * maxNumBlocks;
+
+        //----------------------- Calculate column -------------------------//
+        int hout = startHout;
+        bl = blocks + firstBlock;
+        Peq_c += firstBlock;
+        for (int b = firstBlock; b <= lastBlock; b++) {
+            hout = calculateBlock(bl->P, bl->M, *Peq_c, hout, &bl->P, &bl->M);
+            bl->score += hout;
+            bl++; Peq_c++;
+        }
+        bl--; Peq_c--;
+        //------------------------------------------------------------------//
+
+        //---------- Adjust number of blocks according to Ukkonen ----------//
+        if ((lastBlock < maxNumBlocks - 1) && (bl->score - hout <= k) // bl is pointing to last block
+            && ((*(Peq_c + 1) & WORD_1) || hout < 0)) { // Peq_c is pointing to last block
+            // If score of left block is not too big, calculate one more block
+            lastBlock++; bl++; Peq_c++;
+            bl->P = (Word)(-1); // All 1s
+            bl->M = (Word)(0);
+            bl->score = (bl - 1)->score - hout + WORD_SIZE + calculateBlock(bl->P, bl->M, *Peq_c, hout, &bl->P, &bl->M);
+        } else {
+            while (lastBlock >= firstBlock && bl->score >= k + WORD_SIZE) {
+                lastBlock--; bl--; Peq_c--;
+            }
+        }
+
+        // Every some columns, do some expensive but also more efficient block reducing.
+        // This is important!
+        //
+        // Reduce the band by decreasing last block if possible.
+        if (c % STRONG_REDUCE_NUM == 0) {
+            while (lastBlock >= 0 && lastBlock >= firstBlock && allBlockCellsLarger(*bl, k)) {
+                lastBlock--; bl--; Peq_c--;
+            }
+        }
+        // For HW, even if all cells are > k, there still may be solution in next
+        // column because starting conditions at upper boundary are 0.
+        // That means that first block is always candidate for solution,
+        // and we can never end calculation before last column.
+        if (mode == EDLIB_MODE_HW && lastBlock == -1) {
+            lastBlock++; bl++; Peq_c++;
+        }
+
+        // Reduce band by increasing first block if possible. Not applicable to HW.
+        if (mode != EDLIB_MODE_HW) {
+            while (firstBlock <= lastBlock && blocks[firstBlock].score >= k + WORD_SIZE) {
+                firstBlock++;
+            }
+            if (c % STRONG_REDUCE_NUM == 0) { // Do strong reduction every some blocks
+                while (firstBlock <= lastBlock && allBlockCellsLarger(blocks[firstBlock], k)) {
+                    firstBlock++;
+                }
+            }
+        }
+
+        // If band stops to exist finish
+        if (lastBlock < firstBlock) {
+            *bestScore_ = bestScore;
+            if (bestScore != -1) {
+                *positions_ = malloc(npositions * sizeof(int));
+                *numPositions_ = npositions;
+                memcpy(*positions_, positions, npositions * sizeof(int));
+            }
+            free(blocks);
+            return EDLIB_STATUS_OK;
+        }
+        //------------------------------------------------------------------//
+
+        //------------------------- Update best score ----------------------//
+        if (lastBlock == maxNumBlocks - 1) {
+            int colScore = bl->score;
+            if (colScore <= k) { // Scores > k dont have correct values (so we cannot use them), but are certainly > k.
+                // NOTE: Score that I find in column c is actually score from column c-W
+                if (bestScore == -1 || colScore <= bestScore) {
+                    if (colScore != bestScore) {
+			npositions = 0;
+                        bestScore = colScore;
+                        // Change k so we will look only for equal or better
+                        // scores then the best found so far.
+                        k = bestScore;
+                    }
+		    if (npositions < MAX_POS)
+			positions[npositions++] = c - W;
+                }
+            }
+        }
+        //------------------------------------------------------------------//
+
+        targetChar++;
+    }
+
+
+    // Obtain results for last W columns from last column.
+    if (lastBlock == maxNumBlocks - 1) {
+	int *blockScores = getBlockCellValues(*bl);
+        for (int i = 0; i < W; i++) {
+            int colScore = blockScores[i + 1];
+            if (colScore <= k && (bestScore == -1 || colScore <= bestScore)) {
+                if (colScore != bestScore) {
+                    npositions = 0;
+                    k = bestScore = colScore;
+                }
+		if (npositions < MAX_POS)
+		    positions[npositions++] = targetLength - W + i;
+            }
+        }
+        free(blockScores);
+    }
+
+    *bestScore_ = bestScore;
+    if (bestScore != -1) {
+        *positions_ = malloc(npositions * sizeof(int));
+        *numPositions_ = npositions;
+        memcpy(*positions_, positions, npositions * sizeof(int));
+    }
+
+    free(blocks);
+    return EDLIB_STATUS_OK;
+}
+
+
+/**
+ * Takes char query and char target, recognizes alphabet and transforms them into unsigned char sequences
+ * where elements in sequences are not any more letters of alphabet, but their index in alphabet.
+ * Most of internal edlib functions expect such transformed sequences.
+ * This function will allocate queryTransformed and targetTransformed, so make sure to free them when done.
+ * Example:
+ *   Original sequences: "ACT" and "CGT".
+ *   Alphabet would be recognized as "ACTG". Alphabet length = 4.
+ *   Transformed sequences: [0, 1, 2] and [1, 3, 2].
+ * @param [in] queryOriginal
+ * @param [in] queryLength
+ * @param [in] targetOriginal
+ * @param [in] targetLength
+ * @param [out] queryTransformed  It will contain values in range [0, alphabet length - 1].
+ * @param [out] targetTransformed  It will contain values in range [0, alphabet length - 1].
+ * @return  Alphabet as a string of unique characters, where index of each character is its value in transformed
+ *          sequences.
+ */
+static char *transformSequences(const char* const queryOriginal, const int queryLength,
+				const char* const targetOriginal, const int targetLength,
+				unsigned char** const queryTransformed,
+				unsigned char** const targetTransformed,
+                                int *alphabet_size) {
+    // Alphabet is constructed from letters that are present in sequences.
+    // Each letter is assigned an ordinal number, starting from 0 up to alphabetLength - 1,
+    // and new query and target are created in which letters are replaced with their ordinal numbers.
+    // This query and target are used in all the calculations later.
+    *queryTransformed  = malloc(sizeof(unsigned char) * queryLength);
+    *targetTransformed = malloc(sizeof(unsigned char) * targetLength);
+
+    char *alphabet = malloc(MAX_UCHAR+1), *alphabet_cp = alphabet;
+
+    // Alphabet information, it is constructed on fly while transforming sequences.
+    // letterIdx[c] is index of letter c in alphabet.
+    unsigned char letterIdx[MAX_UCHAR + 1];
+    bool inAlphabet[MAX_UCHAR + 1]; // inAlphabet[c] is true if c is in alphabet
+    for (int i = 0; i < MAX_UCHAR + 1; i++) inAlphabet[i] = false;
+
+    for (int i = 0; i < queryLength; i++) {
+        unsigned char c = queryOriginal[i];
+        if (!inAlphabet[c]) {
+            inAlphabet[c] = true;
+            letterIdx[c] = alphabet_cp - alphabet;
+            *alphabet_cp++ = queryOriginal[i];
+        }
+        (*queryTransformed)[i] = letterIdx[c];
+    }
+    for (int i = 0; i < targetLength; i++) {
+        unsigned char c = targetOriginal[i];
+        if (!inAlphabet[c]) {
+            inAlphabet[c] = true;
+            letterIdx[c] = alphabet_cp - alphabet;
+            *alphabet_cp++ = targetOriginal[i];
+        }
+        (*targetTransformed)[i] = letterIdx[c];
+    }
+
+    *alphabet_size = alphabet_cp - alphabet;
+    return alphabet;
+}
+
+
+EdlibAlignConfig edlibNewAlignConfig(int k, EdlibAlignMode mode, EdlibAlignTask task,
+				     const EdlibEqualityPair* additionalEqualities,
+				     int additionalEqualitiesLength) {
+    EdlibAlignConfig config;
+    config.k = k;
+    config.mode = mode;
+    config.task = task;
+    config.additionalEqualities = additionalEqualities;
+    config.additionalEqualitiesLength = additionalEqualitiesLength;
+    return config;
+}
+
+EdlibAlignConfig edlibDefaultAlignConfig(void) {
+    return edlibNewAlignConfig(-1, EDLIB_MODE_NW, EDLIB_TASK_DISTANCE, NULL, 0);
+}
+
+void edlibFreeAlignResult(EdlibAlignResult result) {
+    if (result.endLocations) free(result.endLocations);
+    if (result.startLocations) free(result.startLocations);
+    if (result.alignment) free(result.alignment);
+}
diff --git a/bcftools/edlib.h b/bcftools/edlib.h
new file mode 100644
index 000000000..1f5eca192
--- /dev/null
+++ b/bcftools/edlib.h
@@ -0,0 +1,277 @@
+#ifndef EDLIB_H
+#define EDLIB_H
+
+/**
+ * @file
+ * @author Martin Sosic
+ * @brief Main header file, containing all public functions and structures.
+ */
+
+// Define EDLIB_API macro to properly export symbols
+#ifdef EDLIB_SHARED
+#    ifdef _WIN32
+#        ifdef EDLIB_BUILD
+#            define EDLIB_API __declspec(dllexport)
+#        else
+#            define EDLIB_API __declspec(dllimport)
+#        endif
+#    else
+#        define EDLIB_API __attribute__ ((visibility ("default")))
+#    endif
+#else
+#    define EDLIB_API
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Status codes
+#define EDLIB_STATUS_OK 0
+#define EDLIB_STATUS_ERROR 1
+
+    /**
+     * Alignment methods - how should Edlib treat gaps before and after query?
+     */
+    typedef enum {
+        /**
+         * Global method. This is the standard method.
+         * Useful when you want to find out how similar is first sequence to second sequence.
+         */
+        EDLIB_MODE_NW,
+        /**
+         * Prefix method. Similar to global method, but with a small twist - gap at query end is not penalized.
+         * What that means is that deleting elements from the end of second sequence is "free"!
+         * For example, if we had "AACT" and "AACTGGC", edit distance would be 0, because removing "GGC" from the end
+         * of second sequence is "free" and does not count into total edit distance. This method is appropriate
+         * when you want to find out how well first sequence fits at the beginning of second sequence.
+         */
+        EDLIB_MODE_SHW,
+        /**
+         * Infix method. Similar as prefix method, but with one more twist - gaps at query end and start are
+         * not penalized. What that means is that deleting elements from the start and end of second sequence is "free"!
+         * For example, if we had ACT and CGACTGAC, edit distance would be 0, because removing CG from the start
+         * and GAC from the end of second sequence is "free" and does not count into total edit distance.
+         * This method is appropriate when you want to find out how well first sequence fits at any part of
+         * second sequence.
+         * For example, if your second sequence was a long text and your first sequence was a sentence from that text,
+         * but slightly scrambled, you could use this method to discover how scrambled it is and where it fits in
+         * that text. In bioinformatics, this method is appropriate for aligning read to a sequence.
+         */
+        EDLIB_MODE_HW
+    } EdlibAlignMode;
+
+    /**
+     * Alignment tasks - what do you want Edlib to do?
+     */
+    typedef enum {
+        EDLIB_TASK_DISTANCE,  //!< Find edit distance and end locations.
+        EDLIB_TASK_LOC,       //!< Find edit distance, end locations and start locations.
+        EDLIB_TASK_PATH       //!< Find edit distance, end locations and start locations and alignment path.
+    } EdlibAlignTask;
+
+    /**
+     * Describes cigar format.
+     * @see http://samtools.github.io/hts-specs/SAMv1.pdf
+     * @see http://drive5.com/usearch/manual/cigar.html
+     */
+    typedef enum {
+        EDLIB_CIGAR_STANDARD,  //!< Match: 'M', Insertion: 'I', Deletion: 'D', Mismatch: 'M'.
+        EDLIB_CIGAR_EXTENDED   //!< Match: '=', Insertion: 'I', Deletion: 'D', Mismatch: 'X'.
+    } EdlibCigarFormat;
+
+// Edit operations.
+#define EDLIB_EDOP_MATCH 0    //!< Match.
+#define EDLIB_EDOP_INSERT 1   //!< Insertion to target = deletion from query.
+#define EDLIB_EDOP_DELETE 2   //!< Deletion from target = insertion to query.
+#define EDLIB_EDOP_MISMATCH 3 //!< Mismatch.
+
+    /**
+     * @brief Defines two given characters as equal.
+     */
+    typedef struct {
+        char first;
+        char second;
+    } EdlibEqualityPair;
+
+    /**
+     * @brief Configuration object for edlibAlign() function.
+     */
+    typedef struct {
+        /**
+         * Set k to non-negative value to tell edlib that edit distance is not larger than k.
+         * Smaller k can significantly improve speed of computation.
+         * If edit distance is larger than k, edlib will set edit distance to -1.
+         * Set k to negative value and edlib will internally auto-adjust k until score is found.
+         */
+        int k;
+
+        /**
+         * Alignment method.
+         * EDLIB_MODE_NW: global (Needleman-Wunsch)
+         * EDLIB_MODE_SHW: prefix. Gap after query is not penalized.
+         * EDLIB_MODE_HW: infix. Gaps before and after query are not penalized.
+         */
+        EdlibAlignMode mode;
+
+        /**
+         * Alignment task - tells Edlib what to calculate. Less to calculate, faster it is.
+         * EDLIB_TASK_DISTANCE - find edit distance and end locations of optimal alignment paths in target.
+         * EDLIB_TASK_LOC - find edit distance and start and end locations of optimal alignment paths in target.
+         * EDLIB_TASK_PATH - find edit distance, alignment path (and start and end locations of it in target).
+         */
+        EdlibAlignTask task;
+
+        /**
+         * List of pairs of characters, where each pair defines two characters as equal.
+         * This way you can extend edlib's definition of equality (which is that each character is equal only
+         * to itself).
+         * This can be useful if you have some wildcard characters that should match multiple other characters,
+         * or e.g. if you want edlib to be case insensitive.
+         * Can be set to NULL if there are none.
+         */
+        const EdlibEqualityPair* additionalEqualities;
+
+        /**
+         * Number of additional equalities, which is non-negative number.
+         * 0 if there are none.
+         */
+        int additionalEqualitiesLength;
+    } EdlibAlignConfig;
+
+    /**
+     * Helper method for easy construction of configuration object.
+     * @return Configuration object filled with given parameters.
+     */
+    EDLIB_API EdlibAlignConfig edlibNewAlignConfig(
+        int k, EdlibAlignMode mode, EdlibAlignTask task,
+        const EdlibEqualityPair* additionalEqualities,
+        int additionalEqualitiesLength
+    );
+
+    /**
+     * @return Default configuration object, with following defaults:
+     *         k = -1, mode = EDLIB_MODE_NW, task = EDLIB_TASK_DISTANCE, no additional equalities.
+     */
+    EDLIB_API EdlibAlignConfig edlibDefaultAlignConfig(void);
+
+
+    /**
+     * Container for results of alignment done by edlibAlign() function.
+     */
+    typedef struct {
+        /**
+         * EDLIB_STATUS_OK or EDLIB_STATUS_ERROR. If error, all other fields will have undefined values.
+         */
+        int status;
+
+        /**
+         * -1 if k is non-negative and edit distance is larger than k.
+         */
+        int editDistance;
+
+        /**
+         * Array of zero-based positions in target where optimal alignment paths end.
+         * If gap after query is penalized, gap counts as part of query (NW), otherwise not.
+         * Set to NULL if edit distance is larger than k.
+         * If you do not free whole result object using edlibFreeAlignResult(), do not forget to use free().
+         */
+        int* endLocations;
+
+        /**
+         * Array of zero-based positions in target where optimal alignment paths start,
+         * they correspond to endLocations.
+         * If gap before query is penalized, gap counts as part of query (NW), otherwise not.
+         * Set to NULL if not calculated or if edit distance is larger than k.
+         * If you do not free whole result object using edlibFreeAlignResult(), do not forget to use free().
+         */
+        int* startLocations;
+
+        /**
+         * Number of end (and start) locations.
+         */
+        int numLocations;
+
+        /**
+         * Alignment is found for first pair of start and end locations.
+         * Set to NULL if not calculated.
+         * Alignment is sequence of numbers: 0, 1, 2, 3.
+         * 0 stands for match.
+         * 1 stands for insertion to target.
+         * 2 stands for insertion to query.
+         * 3 stands for mismatch.
+         * Alignment aligns query to target from beginning of query till end of query.
+         * If gaps are not penalized, they are not in alignment.
+         * If you do not free whole result object using edlibFreeAlignResult(), do not forget to use free().
+         */
+        unsigned char* alignment;
+
+        /**
+         * Length of alignment.
+         */
+        int alignmentLength;
+
+        /**
+         * Number of different characters in query and target together.
+         */
+        int alphabetLength;
+    } EdlibAlignResult;
+
+    /**
+     * Frees memory in EdlibAlignResult that was allocated by edlib.
+     * If you do not use it, make sure to free needed members manually using free().
+     */
+    EDLIB_API void edlibFreeAlignResult(EdlibAlignResult result);
+
+
+    /**
+     * Aligns two sequences (query and target) using edit distance (levenshtein distance).
+     * Through config parameter, this function supports different alignment methods (global, prefix, infix),
+     * as well as different modes of search (tasks).
+     * It always returns edit distance and end locations of optimal alignment in target.
+     * It optionally returns start locations of optimal alignment in target and alignment path,
+     * if you choose appropriate tasks.
+     * @param [in] query  First sequence.
+     * @param [in] queryLength  Number of characters in first sequence.
+     * @param [in] target  Second sequence.
+     * @param [in] targetLength  Number of characters in second sequence.
+     * @param [in] config  Additional alignment parameters, like alignment method and wanted results.
+     * @return  Result of alignment, which can contain edit distance, start and end locations and alignment path.
+     *          Make sure to clean up the object using edlibFreeAlignResult() or by manually freeing needed members.
+     */
+    EDLIB_API EdlibAlignResult edlibAlign(
+        const char* query, int queryLength,
+        const char* target, int targetLength,
+        const EdlibAlignConfig config
+    );
+
+
+    /**
+     * Builds cigar string from given alignment sequence.
+     * @param [in] alignment  Alignment sequence.
+     *     0 stands for match.
+     *     1 stands for insertion to target.
+     *     2 stands for insertion to query.
+     *     3 stands for mismatch.
+     * @param [in] alignmentLength
+     * @param [in] cigarFormat  Cigar will be returned in specified format.
+     * @return Cigar string.
+     *     I stands for insertion.
+     *     D stands for deletion.
+     *     X stands for mismatch. (used only in extended format)
+     *     = stands for match. (used only in extended format)
+     *     M stands for (mis)match. (used only in standard format)
+     *     String is null terminated.
+     *     Needed memory is allocated and given pointer is set to it.
+     *     Do not forget to free it later using free()!
+     */
+    EDLIB_API char* edlibAlignmentToCigar(
+        const unsigned char* alignment, int alignmentLength,
+        EdlibCigarFormat cigarFormat
+    );
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // EDLIB_H
diff --git a/bcftools/filter.c b/bcftools/filter.c
index b6547f81f..c9dcd023b 100644
--- a/bcftools/filter.c
+++ b/bcftools/filter.c
@@ -1,6 +1,6 @@
 /*  filter.c -- filter expressions.
 
-    Copyright (C) 2013-2023 Genome Research Ltd.
+    Copyright (C) 2013-2024 Genome Research Ltd.
 
     Author: Petr Danecek <pd3@sanger.ac.uk>
 
@@ -29,6 +29,7 @@ THE SOFTWARE.  */
 #include <errno.h>
 #include <math.h>
 #include <sys/types.h>
+#include <stdint.h>
 #include <inttypes.h>
 #ifndef _WIN32
 #include <pwd.h>
@@ -38,11 +39,17 @@ THE SOFTWARE.  */
 #include <htslib/hts_defs.h>
 #include <htslib/vcfutils.h>
 #include <htslib/kfunc.h>
+#include <htslib/hts_endian.h>
 #include "config.h"
 #include "filter.h"
 #include "bcftools.h"
 
 #if ENABLE_PERL_FILTERS
+// Work around clang warning problems
+#  if defined(__clang__)
+#    define PERL_GCC_BRACE_GROUPS_FORBIDDEN
+#  endif
+
 #  define filter_t perl_filter_t
 #  include <EXTERN.h>
 #  include <perl.h>
@@ -66,7 +73,7 @@ typedef struct _token_t
     char *tag;          // for debugging and printout only, VCF tag name
     double threshold;   // filtering threshold
     int is_constant;    // the threshold is set
-    int hdr_id, tag_type;   // BCF header lookup ID and one of BCF_HL_* types
+    int hdr_id, hl_type, ht_type;   // BCF header lookup ID and one of BCF_HL_* types and BCF_HT_* types
     int idx;            // 0-based index to VCF vectors,
                         //  -2: list (e.g. [0,1,2] or [1..3] or [1..] or any field[*], which is equivalent to [0..])
                         //  -3: select indices on the fly based on values in GT
@@ -79,11 +86,12 @@ typedef struct _token_t
     void (*comparator)(struct _token_t *, struct _token_t *, struct _token_t *rtok, bcf1_t *);
     void *hash;         // test presence of str value in the hash via comparator
     regex_t *regex;     // precompiled regex for string comparison
+    int iext;           // for the use with filter_test_ext(), 1-based index to external values, 0=don't use
 
     // modified on filter evaluation at each VCF line
     double *values;
     kstring_t str_value;
-    int is_str, is_missing; // is_missing is set only for constants, variables are controled via nvalues
+    int is_str, is_missing; // is_missing is set only for constants, variables are controlled via nvalues
     int pass_site;          // -1 not applicable, 0 fails, >0 pass
     uint8_t *pass_samples;  // status of individual samples
     int nvalues, mvalues;   // number of used values: n=0 for missing values, n=1 for scalars, for strings n=str_value.l
@@ -112,6 +120,8 @@ struct _filter_t
     char **undef_tag, **used_tag;
     int nundef_tag, nused_tag;
     int status, exit_on_error;
+    int n_ext;      // number of external values to fill via filter_test_ext()
+    int *ext;       // types of external values to fill via filter_test_ext()
 };
 
 
@@ -156,13 +166,16 @@ struct _filter_t
 #define TOK_IN      38      // contains, e.g. FILTER~"A"
 #define TOK_NOT_IN  39      // does not contain, e.g. FILTER!~"A"
 #define TOK_MODULO  40      // %
+#define TOK_EXT     41      // external values set before each filter_test_ext() call, can be one of {},{str},{int},{float}
 
-//                      0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
+//                      0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41
 //                        ( ) [ < = > ] ! | &  +  -  *  /  M  m  a  A  O  ~  ^  S  .  l  f  c  p  b  P  i  s                          %
-static int op_prec[] = {0,1,1,5,5,5,5,5,5,2,3, 6, 6, 7, 7, 8, 8, 8, 3, 2, 5, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7 };
+static int op_prec[] = {0,1,1,5,5,5,5,5,5,2,3, 6, 6, 7, 7, 8, 8, 8, 3, 2, 5, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 0};
 #define TOKEN_STRING "x()[<=>]!|&+-*/MmaAO~^S.lfcpis"       // this is only for debugging, not maintained diligently
 
 static void cmp_vector_strings(token_t *atok, token_t *btok, token_t *rtok);
+inline static void tok_init_samples(token_t *atok, token_t *btok, token_t *rtok);
+
 
 // Return negative values if it is a function with variable number of arguments
 static int filters_next_token(char **str, int *len)
@@ -221,6 +234,10 @@ static int filters_next_token(char **str, int *len)
     if ( !strncasecmp(tmp,"N_PASS(",7) ) { *len = 6; (*str) += 6; return -TOK_FUNC; }
     if ( !strncasecmp(tmp,"F_PASS(",7) ) { *len = 6; (*str) += 6; return -TOK_FUNC; }
     if ( !strncasecmp(tmp,"%ILEN",5) ) { *len = 5; return TOK_VAL; }    // to be able to distinguish between INFO/ILEN and on-the-fly ILEN
+    if ( !strncasecmp(tmp,"{}",2) ) { *len = 2; return TOK_EXT; }
+    if ( !strncasecmp(tmp,"{STR}",5) ) { *len = 5; return TOK_EXT; }
+    if ( !strncasecmp(tmp,"{INT}",5) ) { *len = 5; return TOK_EXT; }
+    if ( !strncasecmp(tmp,"{FLOAT}",7) ) { *len = 7; return TOK_EXT; }
 
     if ( tmp[0]=='@' )  // file name
     {
@@ -596,40 +613,146 @@ static void filters_cmp_filter(token_t *atok, token_t *btok, token_t *rtok, bcf1
         error("Only ==, !=, ~, and !~ operators are supported for FILTER\n");
     return;
 }
-static void filters_cmp_id(token_t *atok, token_t *btok, token_t *rtok, bcf1_t *line)
+static void filters_cmp_string_hash(token_t *atok, token_t *btok, token_t *rtok, bcf1_t *line)
 {
     if ( btok->hash )
     {
         token_t *tmp = atok; atok = btok; btok = tmp;
     }
-    if ( atok->hash )
+    if ( rtok->tok_type!=TOK_EQ && rtok->tok_type!=TOK_NE )
+        error("Only == and != operators are supported for strings read from a file\n");
+
+    // INFO
+    if ( !btok->nsamples )
     {
-        if ( rtok->tok_type!=TOK_EQ && rtok->tok_type!=TOK_NE )
-            error("Only == and != operators are supported for strings read from a file\n");
+        // there is only one string value, e.g. STR[1]=@list.txt
+        if ( btok->idx >= 0 )
+        {
+            int ret = btok->str_value.s ? khash_str2int_has_key(atok->hash, btok->str_value.s) : 0;
+            if ( rtok->tok_type==TOK_NE ) ret = ret ? 0 : 1;
+            rtok->pass_site = ret;
+            return;
+        }
 
-        int ret = khash_str2int_has_key(atok->hash, line->d.id);
+        // there can be multiple comma-separated string values, e.g. STR=@list.txt or STR[*]=@list.txt
+        int ret = 0;
+        char *ptr = btok->str_value.s;
+        while ( *ptr )
+        {
+            char *eptr = ptr + 1;
+            while ( *eptr && *eptr!=',' ) eptr++;
+            char keep = *eptr;
+            *eptr = 0;
+            ret |= khash_str2int_has_key(atok->hash, ptr);
+            *eptr = keep;
+            if ( !keep ) break;
+            ptr = eptr + 1;
+        }
         if ( rtok->tok_type==TOK_NE ) ret = ret ? 0 : 1;
         rtok->pass_site = ret;
         return;
     }
 
-    if ( !btok->str_value.l ) error("Error occurred while evaluating the expression\n");
 
-    if ( rtok->tok_type==TOK_EQ )
-        rtok->pass_site = strcmp(btok->str_value.s,line->d.id) ? 0 : 1;
-    else if ( rtok->tok_type==TOK_NE )
-        rtok->pass_site = strcmp(btok->str_value.s,line->d.id) ? 1 : 0;
-    else
+    // FORMAT
+    tok_init_samples(atok, btok, rtok);
+    rtok->pass_site = 0;
+    int i;
+
+    // there is only one string value, e.g. FMT/STR[*:1]=@list.txt
+    if ( btok->idx >= 0 )
+    {
+        for (i=0; i<btok->nsamples; i++)
+        {
+            if ( !rtok->usmpl[i] ) continue;
+            char *str = btok->str_value.s + i*btok->nval1;
+            char keep = str[btok->nval1];
+            str[btok->nval1] = 0;
+            int ret = khash_str2int_has_key(atok->hash, str);
+            str[btok->nval1] = keep;
+            if ( rtok->tok_type==TOK_NE ) ret = ret ? 0 : 1;
+            rtok->pass_samples[i] = ret;
+            rtok->pass_site |= ret;
+        }
+        return;
+    }
+
+    // there can be multiple comma-separated string values, e.g. FMT/STR=@list.txt
+    for (i=0; i<btok->nsamples; i++)
+    {
+        if ( !rtok->usmpl[i] ) continue;
+        char *str = btok->str_value.s + i*btok->nval1;
+        char keep = str[btok->nval1];
+        str[btok->nval1] = 0;
+
+        // now str contains the block of per-sample comma-separated strings to loop over
+        int ret = 0;
+        char *ptr = str;
+        while ( *ptr )
+        {
+            char *eptr = ptr + 1;
+            while ( *eptr && *eptr!=',' ) eptr++;
+            char keep0 = *eptr;
+            *eptr = 0;
+            ret |= khash_str2int_has_key(atok->hash, ptr);
+            *eptr = keep0;
+            if ( !keep0 ) break;
+            ptr = eptr + 1;
+        }
+        str[btok->nval1] = keep;
+        if ( rtok->tok_type==TOK_NE ) ret = ret ? 0 : 1;
+        rtok->pass_samples[i] = ret;
+        rtok->pass_site |= ret;
+    }
+}
+static void filters_cmp_id(token_t *atok, token_t *btok, token_t *rtok, bcf1_t *line)
+{
+    if ( btok->hash )
     {
-        if ( rtok->tok_type!=TOK_LIKE && rtok->tok_type!=TOK_NLIKE )
-            error("Only the following operators are supported for querying ID: ==, !=, ~, !~; the operator type %d is not supported (%p %p)\n",
-                rtok->tok_type,atok->regex,btok->regex);
+        token_t *tmp = atok; atok = btok; btok = tmp;
+    }
+
+    char *id = line->d.id;
+    int pass = 0;
+
+    while ( id )
+    {
+        char *ep = strchr(id,';');
+        if ( ep ) *ep = 0;
+
+        if ( atok->hash )
+        {
+            if ( rtok->tok_type!=TOK_EQ && rtok->tok_type!=TOK_NE )
+                error("Only == and != operators are supported for strings read from a file\n");
 
-        regex_t *regex = atok->regex ? atok->regex : (btok->regex ? btok->regex : NULL);
-        if ( !regex ) error("fixme: regex initialization failed\n");
-        rtok->pass_site = regexec(regex,line->d.id, 0,NULL,0) ? 0 : 1;
-        if ( rtok->tok_type==TOK_NLIKE ) rtok->pass_site = rtok->pass_site ? 0 : 1;
+            pass = khash_str2int_has_key(atok->hash, id);
+        }
+        else
+        {
+            if ( !btok->str_value.l ) error("Error occurred while evaluating the expression\n");
+
+            if ( rtok->tok_type==TOK_EQ || rtok->tok_type==TOK_NE )
+                pass = strcmp(btok->str_value.s,id) ? 0 : 1;
+            else
+            {
+                if ( rtok->tok_type!=TOK_LIKE && rtok->tok_type!=TOK_NLIKE )
+                    error("Only the following operators are supported for querying ID: ==, !=, ~, !~; the operator type %d is not supported (%p %p)\n",
+                            rtok->tok_type,atok->regex,btok->regex);
+
+                regex_t *regex = atok->regex ? atok->regex : (btok->regex ? btok->regex : NULL);
+                if ( !regex ) error("fixme: regex initialization failed\n");
+                pass = regexec(regex,id, 0,NULL,0) ? 0 : 1;
+            }
+        }
+        if ( ep )
+        {
+            *ep = ';';
+            id = ep + 1;
+        }
+        if ( pass || !ep ) break;
     }
+    if ( rtok->tok_type==TOK_NE || rtok->tok_type==TOK_NE) pass = pass ? 0 : 1;
+    rtok->pass_site = pass;
 }
 
 /**
@@ -637,7 +760,7 @@ static void filters_cmp_id(token_t *atok, token_t *btok, token_t *rtok, bcf1_t *
  *  @line:      BCF line
  *  @info_id:   tag ID, as returned by bcf_hdr_id2int
  *  @ivec:      0-based index to retrieve, -1 when single value is expected
- *  @vptr:      pointer to memory location of sufficient size to accomodate
+ *  @vptr:      pointer to memory location of sufficient size to accommodate
  *              info_id's type
  *
  *  The returned value is -1 if tag is not present, 0 if present but
@@ -659,22 +782,25 @@ static int bcf_get_info_value(bcf1_t *line, int info_id, int ivec, void *value)
     }
 
     if ( ivec<0 ) ivec = 0;
+    if ( ivec>=info->len) return 0;
 
-    #define BRANCH(type_t, is_missing, is_vector_end, out_type_t) { \
-        type_t *p = (type_t *) info->vptr; \
-        for (j=0; j<ivec && j<info->len; j++) \
+    #define BRANCH(type_t, convert, is_missing, is_vector_end, out_type_t) { \
+        uint8_t *p = info->vptr; \
+        for (j=0; j<ivec; j++) \
         { \
+            type_t val = convert(&p[j * sizeof(type_t)]); \
             if ( is_vector_end ) return 0; \
         } \
+        type_t val = convert(&p[ivec * sizeof(type_t)]); \
         if ( is_missing ) return 0; \
-        *((out_type_t*)value) = p[j]; \
+        *((out_type_t*)value) = val; \
         return 1; \
     }
     switch (info->type) {
-        case BCF_BT_INT8:  BRANCH(int8_t,  p[j]==bcf_int8_missing,  p[j]==bcf_int8_vector_end,  int64_t); break;
-        case BCF_BT_INT16: BRANCH(int16_t, p[j]==bcf_int16_missing, p[j]==bcf_int16_vector_end, int64_t); break;
-        case BCF_BT_INT32: BRANCH(int32_t, p[j]==bcf_int32_missing, p[j]==bcf_int32_vector_end, int64_t); break;
-        case BCF_BT_FLOAT: BRANCH(float,   bcf_float_is_missing(p[j]), bcf_float_is_vector_end(p[j]), double); break;
+        case BCF_BT_INT8:  BRANCH(int8_t,  le_to_i8,  val==bcf_int8_missing,  val==bcf_int8_vector_end,  int64_t); break;
+        case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, val==bcf_int16_missing, val==bcf_int16_vector_end, int64_t); break;
+        case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, val==bcf_int32_missing, val==bcf_int32_vector_end, int64_t); break;
+        case BCF_BT_FLOAT: BRANCH(float,   le_to_float, bcf_float_is_missing(val), bcf_float_is_vector_end(val), double); break;
         default: fprintf(stderr,"todo: type %d\n", info->type); exit(1); break;
     }
     #undef BRANCH
@@ -875,6 +1001,7 @@ static void filters_set_format_int(filter_t *flt, bcf1_t *line, token_t *tok)
                 if ( !(flt->cached_GT.mask[i] & (1<<k)) ) continue;
                 dst[j++] = src[k];
             }
+            if ( !j ) { bcf_double_set_missing(dst[j]); j++; }
             for (; j<tok->nval1; j++) bcf_double_set_vector_end(dst[j]);
         }
     }
@@ -965,6 +1092,7 @@ static void filters_set_format_float(filter_t *flt, bcf1_t *line, token_t *tok)
                     dst[j] = src[k];
                 j++;
             }
+            if ( !j ) { bcf_double_set_missing(dst[j]); j++; }
             for (; j<tok->nval1; j++) bcf_double_set_vector_end(dst[j]);
         }
     }
@@ -1008,35 +1136,45 @@ static void filters_set_format_string(filter_t *flt, bcf1_t *line, token_t *tok)
 
     int i, ndim = tok->str_value.m;
     int nstr = bcf_get_format_char(flt->hdr, line, tok->tag, &tok->str_value.s, &ndim);
-    tok->str_value.m = ndim;
+    tok->str_value.m = tok->str_value.l = ndim;
+    kputc(0,&tok->str_value); // append the nul byte
     tok->str_value.l = tok->nvalues = 0;
 
     if ( nstr<0 ) return;
 
+    if ( tok->idx==-3 && filters_cache_genotypes(flt,line)!=0 ) return;
+
     tok->nvalues = tok->str_value.l = nstr;
     tok->nval1   = nstr / tok->nsamples;
     for (i=0; i<tok->nsamples; i++)
     {
         if ( !tok->usmpl[i] ) continue;
         char *src = tok->str_value.s + i*tok->nval1, *dst = src;
-        int ibeg = 0, idx = 0;
+        int ibeg = 0;
+        int idx = 0;        // idx-th field
         while ( ibeg < tok->nval1 )
         {
             int iend = ibeg;
             while ( iend < tok->nval1 && src[iend] && src[iend]!=',' ) iend++;
 
             int keep = 0;
-            if ( tok->idx >= 0 )
+            if ( tok->idx >= 0 )    // single index, given explicitly, e.g. AD[:1]
             {
                 if ( tok->idx==idx ) keep = 1;
             }
-            else if ( idx < tok->nidxs )
+            else if ( tok->idx == -3 )  // given by GT index, e.g. AD[:GT]
             {
-                if ( tok->idxs[idx] != 0 ) keep = 1;
+                if ( flt->cached_GT.mask[i] & (1<<idx) ) keep = 1;
+            }
+            else    // given as a list, e.g. AD[:0,3]
+            {
+                if ( idx < tok->nidxs )
+                {
+                    if ( tok->idxs[idx] != 0 ) keep = 1;
+                }
+                else if ( tok->idxs[tok->nidxs-1] < 0 )
+                    keep = 1;
             }
-            else if ( tok->idxs[tok->nidxs-1] < 0 )
-                keep = 1;
-
             if ( keep )
             {
                 if ( ibeg!=0 ) memmove(dst, src+ibeg, iend-ibeg+1);
@@ -1067,23 +1205,23 @@ static void _filters_set_genotype(filter_t *flt, bcf1_t *line, token_t *tok, int
         tok->str_value.s = (char*)realloc(tok->str_value.s, tok->str_value.m);
     }
 
-#define BRANCH_INT(type_t,vector_end) \
+#define BRANCH_INT(type_t, convert, vector_end) \
     { \
         for (i=0; i<line->n_sample; i++) \
         { \
-            type_t *ptr = (type_t*) (fmt->p + i*fmt->size); \
-            int is_het = 0, has_ref = 0, missing = 0; \
+            uint8_t *ptr = fmt->p + i*fmt->size; \
+            int is_het = 0, has_ref = 0, missing = 0, jal = 0; \
             for (j=0; j<fmt->n; j++) \
             { \
-                if ( ptr[j]==vector_end ) break; /* smaller ploidy */ \
-                if ( bcf_gt_is_missing(ptr[j]) ) { missing=1; break; } /* missing allele */ \
-                int ial = ptr[j]; \
+                type_t ial = convert(&ptr[j * sizeof(type_t)]); \
+                if ( ial==vector_end ) break; /* smaller ploidy */ \
+                if ( bcf_gt_is_missing(ial) ) { missing=1; break; } /* missing allele */ \
                 if ( bcf_gt_allele(ial)==0 ) has_ref = 1; \
                 if ( j>0 ) \
                 { \
-                    int jal = ptr[j-1]; \
                     if ( bcf_gt_allele(ial)!=bcf_gt_allele(jal) ) is_het = 1; \
                 } \
+                jal = ial; \
             } \
             char *dst = &tok->str_value.s[nvals1*i]; \
             if ( type==4 ) \
@@ -1121,9 +1259,9 @@ static void _filters_set_genotype(filter_t *flt, bcf1_t *line, token_t *tok, int
         } \
     }
     switch (fmt->type) {
-        case BCF_BT_INT8:  BRANCH_INT(int8_t,  bcf_int8_vector_end); break;
-        case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_vector_end); break;
-        case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_vector_end); break;
+        case BCF_BT_INT8:  BRANCH_INT(int8_t,  le_to_i8,  bcf_int8_vector_end); break;
+        case BCF_BT_INT16: BRANCH_INT(int16_t, le_to_i16, bcf_int16_vector_end); break;
+        case BCF_BT_INT32: BRANCH_INT(int32_t, le_to_i32, bcf_int32_vector_end); break;
         default: error("The GT type is not lineognised: %d at %s:%"PRId64"\n",fmt->type, bcf_seqname(flt->hdr,line),(int64_t) line->pos+1); break;
     }
 #undef BRANCH_INT
@@ -1180,9 +1318,13 @@ static void filters_set_ilen(filter_t *flt, bcf1_t *line, token_t *tok)
     int i, rlen = strlen(line->d.allele[0]);
     for (i=1; i<line->n_allele; i++)
     {
+        if ( line->d.allele[i][0]=='<' )
+        {
+            bcf_double_set_missing(tok->values[i-1]);
+            continue;
+        }
         int alen = strlen(line->d.allele[i]);
-        if ( rlen==alen ) bcf_double_set_missing(tok->values[i-1]);
-        else tok->values[i-1] = alen - rlen;
+        tok->values[i-1] = alen - rlen;
     }
 }
 static void filters_set_ref_string(filter_t *flt, bcf1_t *line, token_t *tok)
@@ -1247,21 +1389,22 @@ static void filters_set_nmissing(filter_t *flt, bcf1_t *line, token_t *tok)
     }
 
     int j,nmissing = 0;
-    #define BRANCH(type_t, is_vector_end) { \
+    #define BRANCH(type_t, convert, is_vector_end) { \
         for (i=0; i<line->n_sample; i++) \
         { \
-            type_t *ptr = (type_t *) (fmt->p + i*fmt->size); \
+            uint8_t *ptr = fmt->p + i*fmt->size; \
             for (j=0; j<fmt->n; j++) \
             { \
-                if ( ptr[j]==is_vector_end ) break; \
-                if ( ptr[j]==bcf_gt_missing ) { nmissing++; break; } \
+                type_t val = convert(&ptr[j * sizeof(type_t)]); \
+                if ( val==is_vector_end ) break; \
+                if ( val==bcf_gt_missing ) { nmissing++; break; } \
             } \
         } \
     }
     switch (fmt->type) {
-        case BCF_BT_INT8:  BRANCH(int8_t,  bcf_int8_vector_end); break;
-        case BCF_BT_INT16: BRANCH(int16_t, bcf_int16_vector_end); break;
-        case BCF_BT_INT32: BRANCH(int32_t, bcf_int32_vector_end); break;
+        case BCF_BT_INT8:  BRANCH(int8_t,  le_to_i8,  bcf_int8_vector_end); break;
+        case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, bcf_int16_vector_end); break;
+        case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, bcf_int32_vector_end); break;
         default: fprintf(stderr,"todo: type %d\n", fmt->type); exit(1); break;
     }
     #undef BRANCH
@@ -1893,19 +2036,6 @@ static int func_strlen(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **sta
     }
     return 1;
 }
-static inline double calc_binom(int na, int nb)
-{
-    if ( na==0 && nb==0 ) return -1;
-    if ( na==nb ) return 1;
-
-    // kfunc.h implements kf_betai, which is the regularized beta function  P(X<=k/N;p) = I_{1-p}(N-k,k+1)
-
-    double pval = na < nb ? kf_betai(nb, na + 1, 0.5) : kf_betai(na, nb + 1, 0.5);
-    pval *= 2;
-    if ( pval>1 ) pval = 1;     // this can happen, machine precision error, eg. kf_betai(1,0,0.5)
-
-    return pval;
-}
 static int func_binom(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
 {
     int i, istack = nstack - rtok->nargs;
@@ -1973,7 +2103,7 @@ static int func_binom(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stac
                     bcf_double_set_missing(rtok->values[i]);
                     continue;
                 }
-                rtok->values[i] = calc_binom(vals[idx1],vals[idx2]);
+                rtok->values[i] = calc_binom_two_sided(vals[idx1],vals[idx2],0.5);
                 if ( rtok->values[i] < 0 )
                 {
                     bcf_double_set_missing(rtok->values[i]);
@@ -1997,7 +2127,7 @@ static int func_binom(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stac
                     bcf_double_set_missing(rtok->values[i]);
                     continue;
                 }
-                rtok->values[i] = calc_binom(ptr1[0],ptr2[0]);
+                rtok->values[i] = calc_binom_two_sided(ptr1[0],ptr2[0],0.5);
                 if ( rtok->values[i] < 0 )
                 {
                     bcf_double_set_missing(rtok->values[i]);
@@ -2036,7 +2166,7 @@ static int func_binom(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stac
             bcf_double_set_missing(rtok->values[0]);
         else
         {
-            rtok->values[0] = calc_binom(ptr1[0],ptr2[0]);
+            rtok->values[0] = calc_binom_two_sided(ptr1[0],ptr2[0],0.5);
             if ( rtok->values[0] < 0 )
                 bcf_double_set_missing(rtok->values[0]);
         }
@@ -2306,6 +2436,18 @@ static int vector_logic_and(filter_t *filter, bcf1_t *line, token_t *rtok, token
     return 2;
 }
 
+// A note about comparisons:
+// When setting value by determining index from the genotype, we face the problem
+// of how to interpret truncating arrays. Say we have TAG defined as Number=. and
+//      GT:TAG   1/1:0,1,2  0/0:0
+// Then when querying we expect the following expression to evaluate for the second
+// sample as
+//      -i 'TAG[1:1]="."'  .. true
+//      -i 'TAG[1:GT]="."' .. false
+// The problem is that the implementation truncates the number of fields, filling
+// usually fewer than the original number of per-sample values. This is fixed by
+// adding an exception that makes the code aware of this: the GT indexing can be
+// recognised by having tok->idx==-3
 #define CMP_VECTORS(atok,btok,_rtok,CMP_OP,missing_logic) \
 { \
     token_t *rtok = _rtok; \
@@ -2393,6 +2535,8 @@ static int vector_logic_and(filter_t *filter, bcf1_t *line, token_t *rtok, token
                 double *bptr = btok->values + i*btok->nval1; \
                 for (j=0; j<atok->nval1; j++) \
                 { \
+                    if ( atok->idx==-3 && bcf_double_is_vector_end(aptr[j]) ) break; /* explained above */ \
+                    if ( btok->idx==-3 && bcf_double_is_vector_end(bptr[j]) ) break; /* explained above */ \
                     int nmiss = bcf_double_is_missing_or_vector_end(aptr[j]) ? 1 : 0; \
                     if ( nmiss && !missing_logic[0] ) continue; /* any is missing => result is false */ \
                     nmiss += (bcf_double_is_missing_or_vector_end(bptr[j]) ? 1 : 0); \
@@ -2414,9 +2558,10 @@ static int vector_logic_and(filter_t *filter, bcf1_t *line, token_t *rtok, token
             { \
                 if ( !rtok->usmpl[i] ) continue; \
                 double *aptr = atok->values + i*atok->nval1; \
-                double *bptr = btok->values + i*btok->nval1; \
+                double *bptr = btok->values; \
                 for (j=0; j<atok->nval1; j++) \
                 { \
+                    if ( atok->idx==-3 && bcf_double_is_vector_end(aptr[j]) ) break; /* explained above */ \
                     int miss = bcf_double_is_missing_or_vector_end(aptr[j]) ? 1 : 0; \
                     if ( miss && !missing_logic[0] ) continue; /* any is missing => result is false */ \
                     for (k=0; k<btok->nvalues; k++) \
@@ -2440,10 +2585,11 @@ static int vector_logic_and(filter_t *filter, bcf1_t *line, token_t *rtok, token
             for (i=0; i<btok->nsamples; i++) \
             { \
                 if ( !rtok->usmpl[i] ) continue; \
-                double *aptr = atok->values + i*atok->nval1; \
+                double *aptr = atok->values; \
                 double *bptr = btok->values + i*btok->nval1; \
                 for (j=0; j<btok->nval1; j++) \
                 { \
+                    if ( atok->idx==-3 && bcf_double_is_vector_end(bptr[j]) ) break; /* explained above */ \
                     int miss = bcf_double_is_missing_or_vector_end(bptr[j]) ? 1 : 0; \
                     if ( miss && !missing_logic[0] ) continue; /* any is missing => result is false */ \
                     for (k=0; k<atok->nvalues; k++) \
@@ -2807,9 +2953,26 @@ static int max_ac_an_unpack(bcf_hdr_t *hdr)
 
     return BCF_UN_INFO;
 }
+static int filters_init1_ext(filter_t *filter, char *str, int len, token_t *tok)
+{
+    tok->hl_type  = -1;
+    tok->ht_type  = -1;
+    tok->tok_type  = TOK_VAL;
+    tok->hdr_id    = -1;
+    tok->pass_site = -1;
+    tok->idx       = 0;
+    tok->iext = ++filter->n_ext;
+    filter->ext = realloc(filter->ext,sizeof(*filter->ext)*filter->n_ext);
+    if ( !strncasecmp(str,"{str}",len) ) { tok->ht_type = BCF_HT_STR; tok->is_str = 1; }
+    else if ( !strncasecmp(str,"{int}",len) ) tok->ht_type = BCF_HT_INT;
+    else if ( !strncasecmp(str,"{float}",len) ) tok->ht_type = BCF_HT_REAL;
+    filter->ext[filter->n_ext-1] = tok->ht_type;
+    return 0;
+}
 static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
 {
-    tok->tag_type  = -1;
+    tok->ht_type  = -1;
+    tok->hl_type  = -1;
     tok->tok_type  = TOK_VAL;
     tok->hdr_id    = -1;
     tok->pass_site = -1;
@@ -2826,6 +2989,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
         memcpy(tok->key,str+1,len-2);
         tok->key[len-2] = 0;
         tok->is_str = 1;
+        tok->ht_type = BCF_HT_STR;
         tok->nvalues = len-2;
         if ( !strcmp(".",tok->key) ) tok->is_missing = 1;
         return 0;
@@ -2867,6 +3031,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
         {
             tok->setter = filters_set_qual;
             tok->tag = strdup("QUAL");
+            tok->ht_type = BCF_HT_REAL;
             filter_add_used_tag(filter,NULL,tok->tag);
             return 0;
         }
@@ -2874,6 +3039,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
         {
             tok->setter = filters_set_type;
             tok->tag = strdup("TYPE");
+            tok->ht_type = BCF_HT_STR;
             return 0;
         }
         else if ( !strncasecmp(str,"FILTER",len) || !strncmp(str,"%FILTER",len) /* for backward compatibility */ )
@@ -2881,7 +3047,8 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
             tok->comparator = filters_cmp_filter;
             tok->tag = strdup("FILTER");
             filter->max_unpack |= BCF_UN_FLT;
-            tok->tag_type = BCF_HL_FLT;
+            tok->hl_type = BCF_HL_FLT;
+            tok->ht_type = BCF_HT_STR;
             filter_add_used_tag(filter,NULL,tok->tag);
             return 0;
         }
@@ -2889,6 +3056,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
         {
             tok->comparator = filters_cmp_id;
             tok->tag = strdup("ID");
+            tok->ht_type = BCF_HT_STR;
             filter_add_used_tag(filter,NULL,tok->tag);
             return 0;
         }
@@ -2896,6 +3064,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
         {
             tok->setter = &filters_set_chrom;
             tok->tag = strdup("CHROM");
+            tok->ht_type = BCF_HT_STR;
             filter_add_used_tag(filter,NULL,tok->tag);
             return 0;
         }
@@ -2903,6 +3072,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
         {
             tok->setter = &filters_set_pos;
             tok->tag = strdup("POS");
+            tok->ht_type = BCF_HT_INT;
             filter_add_used_tag(filter,NULL,tok->tag);
             return 0;
         }
@@ -2911,6 +3081,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
             tok->setter = &filters_set_ref_string;
             tok->is_str = 1;
             tok->tag = strdup("REF");
+            tok->ht_type = BCF_HT_STR;
             filter_add_used_tag(filter,NULL,tok->tag);
             return 0;
         }
@@ -2919,6 +3090,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
             tok->setter = &filters_set_alt_string;
             tok->is_str = 1;
             tok->tag = strdup("ALT");
+            tok->ht_type = BCF_HT_STR;
             tok->idxs = (int*) malloc(sizeof(int));
             tok->idxs[0] = -1;
             tok->nidxs   = 1;
@@ -2930,6 +3102,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
         {
             tok->setter = &filters_set_nalt;
             tok->tag = strdup("N_ALT");
+            tok->ht_type = BCF_HT_INT;
             return 0;
         }
         else if ( !strncasecmp(str,"N_SAMPLES",len) )
@@ -2937,6 +3110,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
             tok->tok_type = TOK_VAL;
             tok->threshold = bcf_hdr_nsamples(filter->hdr);
             tok->is_constant = 1;
+            tok->ht_type = BCF_HT_INT;
             return 0;
         }
         else if ( !strncasecmp(str,"N_MISSING",len) )
@@ -2944,6 +3118,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
             filter->max_unpack |= BCF_UN_FMT;
             tok->setter = &filters_set_nmissing;
             tok->tag = strdup("N_MISSING");
+            tok->ht_type = BCF_HT_INT;
             return 0;
         }
         else if ( !strncasecmp(str,"F_MISSING",len) )
@@ -2951,6 +3126,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
             filter->max_unpack |= BCF_UN_FMT;
             tok->setter = &filters_set_nmissing;
             tok->tag = strdup("F_MISSING");
+            tok->ht_type = BCF_HT_REAL;
             return 0;
         }
     }
@@ -2991,13 +3167,14 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
         for (i=0; i<tok->nsamples; i++) tok->usmpl[i] = 1;
     }
 
-    tok->tag_type = is_fmt ? BCF_HL_FMT : BCF_HL_INFO;
+    tok->hl_type = is_fmt ? BCF_HL_FMT : BCF_HL_INFO;
     if ( is_fmt ) filter->max_unpack |= BCF_UN_FMT;
     if ( tok->hdr_id>=0 )
     {
         if ( is_fmt && !strcmp("GT",tmp.s) )
         {
             tok->setter = &filters_set_genotype_string; tok->is_str = 1;
+            tok->ht_type = BCF_HT_STR;
         }
         else if ( is_fmt )
         {
@@ -3012,9 +3189,9 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
             }
             switch ( bcf_hdr_id2type(filter->hdr,BCF_HL_FMT,tok->hdr_id) )
             {
-                case BCF_HT_INT:  tok->setter = &filters_set_format_int; break;
-                case BCF_HT_REAL: tok->setter = &filters_set_format_float; break;
-                case BCF_HT_STR:  tok->setter = &filters_set_format_string; tok->is_str = 1; break;
+                case BCF_HT_INT:  tok->setter = &filters_set_format_int; tok->ht_type = BCF_HT_INT; break;
+                case BCF_HT_REAL: tok->setter = &filters_set_format_float; tok->ht_type = BCF_HT_REAL; break;
+                case BCF_HT_STR:  tok->setter = &filters_set_format_string; tok->ht_type = BCF_HT_STR; tok->is_str = 1; break;
                 default: error("[%s:%d %s] FIXME\n", __FILE__,__LINE__,__FUNCTION__);
             }
         }
@@ -3023,10 +3200,14 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
         else
         {
             if ( bcf_hdr_id2type(filter->hdr,BCF_HL_INFO,tok->hdr_id) == BCF_HT_FLAG )
+            {
                 tok->setter = filters_set_info_flag;
+                tok->ht_type = BCF_HT_INT;
+            }
             else
             {
-                if ( bcf_hdr_id2type(filter->hdr,BCF_HL_INFO,tok->hdr_id) == BCF_HT_STR ) tok->is_str = 1;
+                tok->ht_type = bcf_hdr_id2type(filter->hdr,BCF_HL_INFO,tok->hdr_id);
+                if ( tok->ht_type == BCF_HT_STR ) tok->is_str = 1;
                 if ( bcf_hdr_id2number(filter->hdr,BCF_HL_INFO,tok->hdr_id)==1 )
                     tok->setter = filters_set_info;
                 else
@@ -3058,6 +3239,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
     {
         tok->setter = &filters_set_alt_string;
         tok->is_str = 1;
+        tok->ht_type = BCF_HT_STR;
         tok->tag = strdup(tmp.s);
         free(tmp.s);
         filter_add_used_tag(filter,NULL,tok->tag);
@@ -3068,6 +3250,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
         filter->max_unpack |= BCF_UN_FMT;
         tok->setter = &filters_set_an;
         tok->tag = strdup("AN");
+        tok->ht_type = BCF_HT_INT;
         free(tmp.s);
         return 0;
     }
@@ -3076,6 +3259,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
         filter->max_unpack |= BCF_UN_FMT;
         tok->setter = &filters_set_ac;
         tok->tag = strdup("AC");
+        tok->ht_type = BCF_HT_INT;
         free(tmp.s);
         return 0;
     }
@@ -3084,6 +3268,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
         filter->max_unpack |= max_ac_an_unpack(filter->hdr);
         tok->setter = &filters_set_mac;
         tok->tag = strdup("MAC");
+        tok->ht_type = BCF_HT_INT;
         free(tmp.s);
         return 0;
     }
@@ -3092,6 +3277,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
         filter->max_unpack |= max_ac_an_unpack(filter->hdr);
         tok->setter = &filters_set_af;
         tok->tag = strdup("AF");
+        tok->ht_type = BCF_HT_REAL;
         free(tmp.s);
         return 0;
     }
@@ -3100,6 +3286,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
         filter->max_unpack |= max_ac_an_unpack(filter->hdr);
         tok->setter = &filters_set_maf;
         tok->tag = strdup("MAF");
+        tok->ht_type = BCF_HT_REAL;
         free(tmp.s);
         return 0;
     }
@@ -3108,6 +3295,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
         filter->max_unpack |= BCF_UN_STR;
         tok->setter = &filters_set_ilen;
         tok->tag = strdup("ILEN");
+        tok->ht_type = BCF_HT_INT;
         free(tmp.s);
         return 0;
     }
@@ -3130,7 +3318,10 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
             filter->status |= FILTER_ERR_UNKN_TAGS;
             filter_add_undef_tag(filter,tmp.s);
         }
+        tok->ht_type = BCF_HT_REAL;
     }
+    else
+        tok->ht_type = BCF_HT_INT;
     tok->is_constant = 1;
 
     if ( tmp.s ) free(tmp.s);
@@ -3284,6 +3475,33 @@ static void perl_destroy(filter_t *filter)
 #endif
 }
 
+// A very rudimentary heuristics to determine type, e.g. STR_TAG={} implies {str}.
+// Throws an error on anything more complex and asks for an explicit type.
+static void determine_ext_types(filter_t *filter, int ntok, token_t *tok)
+{
+    int i;
+    for (i=0; i<ntok; i++)
+    {
+        if ( !tok[i].iext || tok[i].ht_type!=-1 ) continue;
+        if ( !i || i+1==ntok ) break;       // first or last in the RPN
+        if ( tok[i-1].ht_type==-1 ) break;  // previous type not set
+        // todo: check if the next is an operator
+        // todo: case when the order is reversed, {}=TAG
+        tok[i].ht_type = tok[i-1].ht_type;
+    }
+    if ( i!=ntok )
+        error("[%s:%d %s] Error: unable to determine the type, use explicit notation: %s\n",__FILE__,__LINE__,__FUNCTION__,filter->str);
+    for (i=0; i<ntok; i++)
+    {
+        int j = tok[i].iext - 1;
+        if ( j<0 ) continue;
+        if ( filter->ext[j]!=-1 && filter->ext[j]!=tok[i].ht_type  )
+            error("[%s:%d %s] FIXME: this should not happen %d vs %d, iext=%d\n",__FILE__,__LINE__,__FUNCTION__,filter->ext[j],tok[i].ht_type,j);
+        filter->ext[j] = tok[i].ht_type;
+        if ( tok[i].ht_type==BCF_HT_STR ) tok[i].is_str = 1;
+    }
+}
+
 
 // Parse filter expression and convert to reverse polish notation. Dijkstra's shunting-yard algorithm
 static filter_t *filter_init_(bcf_hdr_t *hdr, const char *str, int exit_on_error)
@@ -3331,6 +3549,13 @@ static filter_t *filter_init_(bcf_hdr_t *hdr, const char *str, int exit_on_error
             memset(&ops[nops-1],0,sizeof(token_t));
             nops--;
         }
+        else if ( ret==TOK_EXT )    // external value
+        {
+            nout++;
+            hts_expand0(token_t, nout, mout, out);
+            filters_init1_ext(filter, tmp, len, &out[nout-1]);
+            tmp += len;
+        }
         else if ( ret!=TOK_VAL )    // one of the operators
         {
             // detect unary minus: replace -value with -1*(value)
@@ -3441,12 +3666,12 @@ static filter_t *filter_init_(bcf_hdr_t *hdr, const char *str, int exit_on_error
             hts_expand0(token_t, nops, mops, ops);
             ops[nops-1].tok_type = ret;
         }
-        else if ( !len )
+        else if ( !len )    // all tokes read or an error
         {
             if ( *tmp && !isspace(*tmp) ) error("Could not parse the expression: [%s]\n", str);
             break;     // all tokens read
         }
-        else           // annotation name or filtering value
+        else           // TOK_VAL: annotation name or value
         {
             nout++;
             hts_expand0(token_t, nout, mout, out);
@@ -3468,10 +3693,21 @@ static filter_t *filter_init_(bcf_hdr_t *hdr, const char *str, int exit_on_error
         nops--;
     }
 
+    if ( filter->status != FILTER_OK )
+    {
+        if ( mops ) free(ops);
+        filter->filters   = out;
+        filter->nfilters  = nout;
+        return filter;
+    }
+
+    // Determine types of external variables from the context
+    determine_ext_types(filter,nout,out);
+
     // In the special cases of TYPE and FILTER the BCF header IDs are yet unknown. Walk through the
     // list of operators and convert the strings (e.g. "PASS") to BCF ids. The string value token must be
     // just before or after the FILTER token and they must be followed with a comparison operator.
-    // At this point we also initialize regex expressions which, in RPN, must preceed the LIKE/NLIKE operator.
+    // At this point we also initialize regex expressions which, in RPN, must precede the LIKE/NLIKE operator.
     // Additionally, treat "." as missing value rather than a string in numeric equalities; that
     // @file is only used with ID; etc.
     // This code is fragile: improve me.
@@ -3486,7 +3722,10 @@ static filter_t *filter_init_(bcf_hdr_t *hdr, const char *str, int exit_on_error
         {
             int j = out[i+1].tok_type==TOK_VAL ? i+1 : i-1;
             if ( out[j].comparator!=filters_cmp_id )
-                error("Error: could not parse the expression. Note that the \"@file_name\" syntax can be currently used with ID column only.\n");
+            {
+                if ( out[j].comparator ) error("Error: could not parse the expression with \"@file_name\" syntax (possible todo)\n");
+                out[j].comparator = filters_cmp_string_hash;
+            }
         }
         if ( out[i].tok_type==TOK_OR || out[i].tok_type==TOK_OR_VEC )
             out[i].func = vector_logic_or;
@@ -3502,7 +3741,7 @@ static filter_t *filter_init_(bcf_hdr_t *hdr, const char *str, int exit_on_error
                 int set_missing = 0;
                 if ( out[k].hdr_id>0 )
                 {
-                    int type = bcf_hdr_id2type(filter->hdr,out[k].tag_type,out[k].hdr_id);
+                    int type = bcf_hdr_id2type(filter->hdr,out[k].hl_type,out[k].hdr_id);
                     if ( type==BCF_HT_INT ) set_missing = 1;
                     else if ( type==BCF_HT_REAL ) set_missing = 1;
                 }
@@ -3543,7 +3782,8 @@ static filter_t *filter_init_(bcf_hdr_t *hdr, const char *str, int exit_on_error
         if ( !out[i].tag ) continue;
         if ( out[i].setter==filters_set_type )
         {
-            if ( i+1==nout ) error("Could not parse the expression: %s\n", filter->str);
+            if ( i+1==nout || !out[i+1].key )
+                error("Could not parse the expression: %s\n", filter->str);
             int itok, ival;
             if ( out[i+1].tok_type==TOK_EQ || out[i+1].tok_type==TOK_NE ) ival = i - 1, itok = i + 1;
             else if ( out[i+1].tok_type==TOK_LIKE || out[i+1].tok_type==TOK_NLIKE ) ival = i - 1, itok = i + 1;
@@ -3591,7 +3831,7 @@ static filter_t *filter_init_(bcf_hdr_t *hdr, const char *str, int exit_on_error
             else if ( !strcasecmp(out[ival].key,"r") ) { out[i].setter = filters_set_genotype2; out[ival].key[0]='r'; out[ival].key[1]=0; }  // r
             continue;
         }
-        if ( out[i].tag_type==BCF_HL_FLT )
+        if ( out[i].hl_type==BCF_HL_FLT )
         {
             if ( i+1==nout ) error("Could not parse the expression: %s\n", filter->str);
             int itok = i, ival;
@@ -3705,6 +3945,7 @@ void filter_destroy(filter_t *filter)
     }
     for (i=0; i<filter->nundef_tag; i++) free(filter->undef_tag[i]);
     for (i=0; i<filter->nused_tag; i++) free(filter->used_tag[i]);
+    free(filter->ext);
     free(filter->undef_tag);
     free(filter->used_tag);
     free(filter->cached_GT.buf);
@@ -3718,6 +3959,37 @@ void filter_destroy(filter_t *filter)
     free(filter);
 }
 
+int filter_test_ext(filter_t *filter, bcf1_t *rec, const uint8_t **samples, const void **ext)
+{
+    if ( !filter->n_ext )
+        return filter_test(filter,rec,samples);
+
+    int i;
+    for (i=0; i<filter->nfilters; i++)
+    {
+        token_t *tok = &filter->filters[i];
+        if ( !tok->iext ) continue;
+        if ( !ext[tok->iext-1] )
+        {
+            tok->is_missing = 1;
+            tok->nvalues = 0;
+            if ( filter->ext[tok->iext-1]==BCF_HT_STR ) tok->str_value.l = 0;
+            continue;
+        }
+        tok->is_missing = 0;
+        tok->nvalues = 1;
+        if ( filter->ext[tok->iext-1]==BCF_HT_STR )
+        {
+            tok->str_value.l = 0;
+            kputs((const char*)ext[tok->iext-1],&tok->str_value);
+            tok->nvalues = tok->str_value.l;
+        }
+        else if ( filter->ext[tok->iext-1]==BCF_HT_INT ) tok->values[0] = *((const int*)ext[tok->iext-1]);
+        else if ( filter->ext[tok->iext-1]==BCF_HT_REAL ) tok->values[0] = *((const float*)ext[tok->iext-1]);
+    }
+    return filter_test(filter,rec,samples);
+}
+
 int filter_test(filter_t *filter, bcf1_t *line, const uint8_t **samples)
 {
     if ( filter->status != FILTER_OK ) error("Error: the caller did not check the filter status\n");
@@ -3854,7 +4126,11 @@ int filter_max_unpack(filter_t *flt)
 {
     return flt->max_unpack;
 }
-
+const int *filter_ext_types(filter_t *filter, int *n_ext)
+{
+    *n_ext = filter->n_ext;
+    return filter->ext;
+}
 const double *filter_get_doubles(filter_t *filter, int *nval, int *nval1)
 {
     token_t *tok = filter->flt_stack[0];
diff --git a/bcftools/filter.c.pysam.c b/bcftools/filter.c.pysam.c
index d0e26258c..2db56801b 100644
--- a/bcftools/filter.c.pysam.c
+++ b/bcftools/filter.c.pysam.c
@@ -2,7 +2,7 @@
 
 /*  filter.c -- filter expressions.
 
-    Copyright (C) 2013-2023 Genome Research Ltd.
+    Copyright (C) 2013-2024 Genome Research Ltd.
 
     Author: Petr Danecek <pd3@sanger.ac.uk>
 
@@ -31,6 +31,7 @@ THE SOFTWARE.  */
 #include <errno.h>
 #include <math.h>
 #include <sys/types.h>
+#include <stdint.h>
 #include <inttypes.h>
 #ifndef _WIN32
 #include <pwd.h>
@@ -40,11 +41,17 @@ THE SOFTWARE.  */
 #include <htslib/hts_defs.h>
 #include <htslib/vcfutils.h>
 #include <htslib/kfunc.h>
+#include <htslib/hts_endian.h>
 #include "config.h"
 #include "filter.h"
 #include "bcftools.h"
 
 #if ENABLE_PERL_FILTERS
+// Work around clang warning problems
+#  if defined(__clang__)
+#    define PERL_GCC_BRACE_GROUPS_FORBIDDEN
+#  endif
+
 #  define filter_t perl_filter_t
 #  include <EXTERN.h>
 #  include <perl.h>
@@ -68,7 +75,7 @@ typedef struct _token_t
     char *tag;          // for debugging and printout only, VCF tag name
     double threshold;   // filtering threshold
     int is_constant;    // the threshold is set
-    int hdr_id, tag_type;   // BCF header lookup ID and one of BCF_HL_* types
+    int hdr_id, hl_type, ht_type;   // BCF header lookup ID and one of BCF_HL_* types and BCF_HT_* types
     int idx;            // 0-based index to VCF vectors,
                         //  -2: list (e.g. [0,1,2] or [1..3] or [1..] or any field[*], which is equivalent to [0..])
                         //  -3: select indices on the fly based on values in GT
@@ -81,11 +88,12 @@ typedef struct _token_t
     void (*comparator)(struct _token_t *, struct _token_t *, struct _token_t *rtok, bcf1_t *);
     void *hash;         // test presence of str value in the hash via comparator
     regex_t *regex;     // precompiled regex for string comparison
+    int iext;           // for the use with filter_test_ext(), 1-based index to external values, 0=don't use
 
     // modified on filter evaluation at each VCF line
     double *values;
     kstring_t str_value;
-    int is_str, is_missing; // is_missing is set only for constants, variables are controled via nvalues
+    int is_str, is_missing; // is_missing is set only for constants, variables are controlled via nvalues
     int pass_site;          // -1 not applicable, 0 fails, >0 pass
     uint8_t *pass_samples;  // status of individual samples
     int nvalues, mvalues;   // number of used values: n=0 for missing values, n=1 for scalars, for strings n=str_value.l
@@ -114,6 +122,8 @@ struct _filter_t
     char **undef_tag, **used_tag;
     int nundef_tag, nused_tag;
     int status, exit_on_error;
+    int n_ext;      // number of external values to fill via filter_test_ext()
+    int *ext;       // types of external values to fill via filter_test_ext()
 };
 
 
@@ -158,13 +168,16 @@ struct _filter_t
 #define TOK_IN      38      // contains, e.g. FILTER~"A"
 #define TOK_NOT_IN  39      // does not contain, e.g. FILTER!~"A"
 #define TOK_MODULO  40      // %
+#define TOK_EXT     41      // external values set before each filter_test_ext() call, can be one of {},{str},{int},{float}
 
-//                      0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
+//                      0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41
 //                        ( ) [ < = > ] ! | &  +  -  *  /  M  m  a  A  O  ~  ^  S  .  l  f  c  p  b  P  i  s                          %
-static int op_prec[] = {0,1,1,5,5,5,5,5,5,2,3, 6, 6, 7, 7, 8, 8, 8, 3, 2, 5, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7 };
+static int op_prec[] = {0,1,1,5,5,5,5,5,5,2,3, 6, 6, 7, 7, 8, 8, 8, 3, 2, 5, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 0};
 #define TOKEN_STRING "x()[<=>]!|&+-*/MmaAO~^S.lfcpis"       // this is only for debugging, not maintained diligently
 
 static void cmp_vector_strings(token_t *atok, token_t *btok, token_t *rtok);
+inline static void tok_init_samples(token_t *atok, token_t *btok, token_t *rtok);
+
 
 // Return negative values if it is a function with variable number of arguments
 static int filters_next_token(char **str, int *len)
@@ -223,6 +236,10 @@ static int filters_next_token(char **str, int *len)
     if ( !strncasecmp(tmp,"N_PASS(",7) ) { *len = 6; (*str) += 6; return -TOK_FUNC; }
     if ( !strncasecmp(tmp,"F_PASS(",7) ) { *len = 6; (*str) += 6; return -TOK_FUNC; }
     if ( !strncasecmp(tmp,"%ILEN",5) ) { *len = 5; return TOK_VAL; }    // to be able to distinguish between INFO/ILEN and on-the-fly ILEN
+    if ( !strncasecmp(tmp,"{}",2) ) { *len = 2; return TOK_EXT; }
+    if ( !strncasecmp(tmp,"{STR}",5) ) { *len = 5; return TOK_EXT; }
+    if ( !strncasecmp(tmp,"{INT}",5) ) { *len = 5; return TOK_EXT; }
+    if ( !strncasecmp(tmp,"{FLOAT}",7) ) { *len = 7; return TOK_EXT; }
 
     if ( tmp[0]=='@' )  // file name
     {
@@ -598,40 +615,146 @@ static void filters_cmp_filter(token_t *atok, token_t *btok, token_t *rtok, bcf1
         error("Only ==, !=, ~, and !~ operators are supported for FILTER\n");
     return;
 }
-static void filters_cmp_id(token_t *atok, token_t *btok, token_t *rtok, bcf1_t *line)
+static void filters_cmp_string_hash(token_t *atok, token_t *btok, token_t *rtok, bcf1_t *line)
 {
     if ( btok->hash )
     {
         token_t *tmp = atok; atok = btok; btok = tmp;
     }
-    if ( atok->hash )
+    if ( rtok->tok_type!=TOK_EQ && rtok->tok_type!=TOK_NE )
+        error("Only == and != operators are supported for strings read from a file\n");
+
+    // INFO
+    if ( !btok->nsamples )
     {
-        if ( rtok->tok_type!=TOK_EQ && rtok->tok_type!=TOK_NE )
-            error("Only == and != operators are supported for strings read from a file\n");
+        // there is only one string value, e.g. STR[1]=@list.txt
+        if ( btok->idx >= 0 )
+        {
+            int ret = btok->str_value.s ? khash_str2int_has_key(atok->hash, btok->str_value.s) : 0;
+            if ( rtok->tok_type==TOK_NE ) ret = ret ? 0 : 1;
+            rtok->pass_site = ret;
+            return;
+        }
 
-        int ret = khash_str2int_has_key(atok->hash, line->d.id);
+        // there can be multiple comma-separated string values, e.g. STR=@list.txt or STR[*]=@list.txt
+        int ret = 0;
+        char *ptr = btok->str_value.s;
+        while ( *ptr )
+        {
+            char *eptr = ptr + 1;
+            while ( *eptr && *eptr!=',' ) eptr++;
+            char keep = *eptr;
+            *eptr = 0;
+            ret |= khash_str2int_has_key(atok->hash, ptr);
+            *eptr = keep;
+            if ( !keep ) break;
+            ptr = eptr + 1;
+        }
         if ( rtok->tok_type==TOK_NE ) ret = ret ? 0 : 1;
         rtok->pass_site = ret;
         return;
     }
 
-    if ( !btok->str_value.l ) error("Error occurred while evaluating the expression\n");
 
-    if ( rtok->tok_type==TOK_EQ )
-        rtok->pass_site = strcmp(btok->str_value.s,line->d.id) ? 0 : 1;
-    else if ( rtok->tok_type==TOK_NE )
-        rtok->pass_site = strcmp(btok->str_value.s,line->d.id) ? 1 : 0;
-    else
+    // FORMAT
+    tok_init_samples(atok, btok, rtok);
+    rtok->pass_site = 0;
+    int i;
+
+    // there is only one string value, e.g. FMT/STR[*:1]=@list.txt
+    if ( btok->idx >= 0 )
+    {
+        for (i=0; i<btok->nsamples; i++)
+        {
+            if ( !rtok->usmpl[i] ) continue;
+            char *str = btok->str_value.s + i*btok->nval1;
+            char keep = str[btok->nval1];
+            str[btok->nval1] = 0;
+            int ret = khash_str2int_has_key(atok->hash, str);
+            str[btok->nval1] = keep;
+            if ( rtok->tok_type==TOK_NE ) ret = ret ? 0 : 1;
+            rtok->pass_samples[i] = ret;
+            rtok->pass_site |= ret;
+        }
+        return;
+    }
+
+    // there can be multiple comma-separated string values, e.g. FMT/STR=@list.txt
+    for (i=0; i<btok->nsamples; i++)
+    {
+        if ( !rtok->usmpl[i] ) continue;
+        char *str = btok->str_value.s + i*btok->nval1;
+        char keep = str[btok->nval1];
+        str[btok->nval1] = 0;
+
+        // now str contains the block of per-sample comma-separated strings to loop over
+        int ret = 0;
+        char *ptr = str;
+        while ( *ptr )
+        {
+            char *eptr = ptr + 1;
+            while ( *eptr && *eptr!=',' ) eptr++;
+            char keep0 = *eptr;
+            *eptr = 0;
+            ret |= khash_str2int_has_key(atok->hash, ptr);
+            *eptr = keep0;
+            if ( !keep0 ) break;
+            ptr = eptr + 1;
+        }
+        str[btok->nval1] = keep;
+        if ( rtok->tok_type==TOK_NE ) ret = ret ? 0 : 1;
+        rtok->pass_samples[i] = ret;
+        rtok->pass_site |= ret;
+    }
+}
+static void filters_cmp_id(token_t *atok, token_t *btok, token_t *rtok, bcf1_t *line)
+{
+    if ( btok->hash )
     {
-        if ( rtok->tok_type!=TOK_LIKE && rtok->tok_type!=TOK_NLIKE )
-            error("Only the following operators are supported for querying ID: ==, !=, ~, !~; the operator type %d is not supported (%p %p)\n",
-                rtok->tok_type,atok->regex,btok->regex);
+        token_t *tmp = atok; atok = btok; btok = tmp;
+    }
+
+    char *id = line->d.id;
+    int pass = 0;
+
+    while ( id )
+    {
+        char *ep = strchr(id,';');
+        if ( ep ) *ep = 0;
+
+        if ( atok->hash )
+        {
+            if ( rtok->tok_type!=TOK_EQ && rtok->tok_type!=TOK_NE )
+                error("Only == and != operators are supported for strings read from a file\n");
 
-        regex_t *regex = atok->regex ? atok->regex : (btok->regex ? btok->regex : NULL);
-        if ( !regex ) error("fixme: regex initialization failed\n");
-        rtok->pass_site = regexec(regex,line->d.id, 0,NULL,0) ? 0 : 1;
-        if ( rtok->tok_type==TOK_NLIKE ) rtok->pass_site = rtok->pass_site ? 0 : 1;
+            pass = khash_str2int_has_key(atok->hash, id);
+        }
+        else
+        {
+            if ( !btok->str_value.l ) error("Error occurred while evaluating the expression\n");
+
+            if ( rtok->tok_type==TOK_EQ || rtok->tok_type==TOK_NE )
+                pass = strcmp(btok->str_value.s,id) ? 0 : 1;
+            else
+            {
+                if ( rtok->tok_type!=TOK_LIKE && rtok->tok_type!=TOK_NLIKE )
+                    error("Only the following operators are supported for querying ID: ==, !=, ~, !~; the operator type %d is not supported (%p %p)\n",
+                            rtok->tok_type,atok->regex,btok->regex);
+
+                regex_t *regex = atok->regex ? atok->regex : (btok->regex ? btok->regex : NULL);
+                if ( !regex ) error("fixme: regex initialization failed\n");
+                pass = regexec(regex,id, 0,NULL,0) ? 0 : 1;
+            }
+        }
+        if ( ep )
+        {
+            *ep = ';';
+            id = ep + 1;
+        }
+        if ( pass || !ep ) break;
     }
+    if ( rtok->tok_type==TOK_NE || rtok->tok_type==TOK_NE) pass = pass ? 0 : 1;
+    rtok->pass_site = pass;
 }
 
 /**
@@ -639,7 +762,7 @@ static void filters_cmp_id(token_t *atok, token_t *btok, token_t *rtok, bcf1_t *
  *  @line:      BCF line
  *  @info_id:   tag ID, as returned by bcf_hdr_id2int
  *  @ivec:      0-based index to retrieve, -1 when single value is expected
- *  @vptr:      pointer to memory location of sufficient size to accomodate
+ *  @vptr:      pointer to memory location of sufficient size to accommodate
  *              info_id's type
  *
  *  The returned value is -1 if tag is not present, 0 if present but
@@ -661,22 +784,25 @@ static int bcf_get_info_value(bcf1_t *line, int info_id, int ivec, void *value)
     }
 
     if ( ivec<0 ) ivec = 0;
+    if ( ivec>=info->len) return 0;
 
-    #define BRANCH(type_t, is_missing, is_vector_end, out_type_t) { \
-        type_t *p = (type_t *) info->vptr; \
-        for (j=0; j<ivec && j<info->len; j++) \
+    #define BRANCH(type_t, convert, is_missing, is_vector_end, out_type_t) { \
+        uint8_t *p = info->vptr; \
+        for (j=0; j<ivec; j++) \
         { \
+            type_t val = convert(&p[j * sizeof(type_t)]); \
             if ( is_vector_end ) return 0; \
         } \
+        type_t val = convert(&p[ivec * sizeof(type_t)]); \
         if ( is_missing ) return 0; \
-        *((out_type_t*)value) = p[j]; \
+        *((out_type_t*)value) = val; \
         return 1; \
     }
     switch (info->type) {
-        case BCF_BT_INT8:  BRANCH(int8_t,  p[j]==bcf_int8_missing,  p[j]==bcf_int8_vector_end,  int64_t); break;
-        case BCF_BT_INT16: BRANCH(int16_t, p[j]==bcf_int16_missing, p[j]==bcf_int16_vector_end, int64_t); break;
-        case BCF_BT_INT32: BRANCH(int32_t, p[j]==bcf_int32_missing, p[j]==bcf_int32_vector_end, int64_t); break;
-        case BCF_BT_FLOAT: BRANCH(float,   bcf_float_is_missing(p[j]), bcf_float_is_vector_end(p[j]), double); break;
+        case BCF_BT_INT8:  BRANCH(int8_t,  le_to_i8,  val==bcf_int8_missing,  val==bcf_int8_vector_end,  int64_t); break;
+        case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, val==bcf_int16_missing, val==bcf_int16_vector_end, int64_t); break;
+        case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, val==bcf_int32_missing, val==bcf_int32_vector_end, int64_t); break;
+        case BCF_BT_FLOAT: BRANCH(float,   le_to_float, bcf_float_is_missing(val), bcf_float_is_vector_end(val), double); break;
         default: fprintf(bcftools_stderr,"todo: type %d\n", info->type); bcftools_exit(1); break;
     }
     #undef BRANCH
@@ -877,6 +1003,7 @@ static void filters_set_format_int(filter_t *flt, bcf1_t *line, token_t *tok)
                 if ( !(flt->cached_GT.mask[i] & (1<<k)) ) continue;
                 dst[j++] = src[k];
             }
+            if ( !j ) { bcf_double_set_missing(dst[j]); j++; }
             for (; j<tok->nval1; j++) bcf_double_set_vector_end(dst[j]);
         }
     }
@@ -967,6 +1094,7 @@ static void filters_set_format_float(filter_t *flt, bcf1_t *line, token_t *tok)
                     dst[j] = src[k];
                 j++;
             }
+            if ( !j ) { bcf_double_set_missing(dst[j]); j++; }
             for (; j<tok->nval1; j++) bcf_double_set_vector_end(dst[j]);
         }
     }
@@ -1010,35 +1138,45 @@ static void filters_set_format_string(filter_t *flt, bcf1_t *line, token_t *tok)
 
     int i, ndim = tok->str_value.m;
     int nstr = bcf_get_format_char(flt->hdr, line, tok->tag, &tok->str_value.s, &ndim);
-    tok->str_value.m = ndim;
+    tok->str_value.m = tok->str_value.l = ndim;
+    kputc(0,&tok->str_value); // append the nul byte
     tok->str_value.l = tok->nvalues = 0;
 
     if ( nstr<0 ) return;
 
+    if ( tok->idx==-3 && filters_cache_genotypes(flt,line)!=0 ) return;
+
     tok->nvalues = tok->str_value.l = nstr;
     tok->nval1   = nstr / tok->nsamples;
     for (i=0; i<tok->nsamples; i++)
     {
         if ( !tok->usmpl[i] ) continue;
         char *src = tok->str_value.s + i*tok->nval1, *dst = src;
-        int ibeg = 0, idx = 0;
+        int ibeg = 0;
+        int idx = 0;        // idx-th field
         while ( ibeg < tok->nval1 )
         {
             int iend = ibeg;
             while ( iend < tok->nval1 && src[iend] && src[iend]!=',' ) iend++;
 
             int keep = 0;
-            if ( tok->idx >= 0 )
+            if ( tok->idx >= 0 )    // single index, given explicitly, e.g. AD[:1]
             {
                 if ( tok->idx==idx ) keep = 1;
             }
-            else if ( idx < tok->nidxs )
+            else if ( tok->idx == -3 )  // given by GT index, e.g. AD[:GT]
             {
-                if ( tok->idxs[idx] != 0 ) keep = 1;
+                if ( flt->cached_GT.mask[i] & (1<<idx) ) keep = 1;
+            }
+            else    // given as a list, e.g. AD[:0,3]
+            {
+                if ( idx < tok->nidxs )
+                {
+                    if ( tok->idxs[idx] != 0 ) keep = 1;
+                }
+                else if ( tok->idxs[tok->nidxs-1] < 0 )
+                    keep = 1;
             }
-            else if ( tok->idxs[tok->nidxs-1] < 0 )
-                keep = 1;
-
             if ( keep )
             {
                 if ( ibeg!=0 ) memmove(dst, src+ibeg, iend-ibeg+1);
@@ -1069,23 +1207,23 @@ static void _filters_set_genotype(filter_t *flt, bcf1_t *line, token_t *tok, int
         tok->str_value.s = (char*)realloc(tok->str_value.s, tok->str_value.m);
     }
 
-#define BRANCH_INT(type_t,vector_end) \
+#define BRANCH_INT(type_t, convert, vector_end) \
     { \
         for (i=0; i<line->n_sample; i++) \
         { \
-            type_t *ptr = (type_t*) (fmt->p + i*fmt->size); \
-            int is_het = 0, has_ref = 0, missing = 0; \
+            uint8_t *ptr = fmt->p + i*fmt->size; \
+            int is_het = 0, has_ref = 0, missing = 0, jal = 0; \
             for (j=0; j<fmt->n; j++) \
             { \
-                if ( ptr[j]==vector_end ) break; /* smaller ploidy */ \
-                if ( bcf_gt_is_missing(ptr[j]) ) { missing=1; break; } /* missing allele */ \
-                int ial = ptr[j]; \
+                type_t ial = convert(&ptr[j * sizeof(type_t)]); \
+                if ( ial==vector_end ) break; /* smaller ploidy */ \
+                if ( bcf_gt_is_missing(ial) ) { missing=1; break; } /* missing allele */ \
                 if ( bcf_gt_allele(ial)==0 ) has_ref = 1; \
                 if ( j>0 ) \
                 { \
-                    int jal = ptr[j-1]; \
                     if ( bcf_gt_allele(ial)!=bcf_gt_allele(jal) ) is_het = 1; \
                 } \
+                jal = ial; \
             } \
             char *dst = &tok->str_value.s[nvals1*i]; \
             if ( type==4 ) \
@@ -1123,9 +1261,9 @@ static void _filters_set_genotype(filter_t *flt, bcf1_t *line, token_t *tok, int
         } \
     }
     switch (fmt->type) {
-        case BCF_BT_INT8:  BRANCH_INT(int8_t,  bcf_int8_vector_end); break;
-        case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_vector_end); break;
-        case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_vector_end); break;
+        case BCF_BT_INT8:  BRANCH_INT(int8_t,  le_to_i8,  bcf_int8_vector_end); break;
+        case BCF_BT_INT16: BRANCH_INT(int16_t, le_to_i16, bcf_int16_vector_end); break;
+        case BCF_BT_INT32: BRANCH_INT(int32_t, le_to_i32, bcf_int32_vector_end); break;
         default: error("The GT type is not lineognised: %d at %s:%"PRId64"\n",fmt->type, bcf_seqname(flt->hdr,line),(int64_t) line->pos+1); break;
     }
 #undef BRANCH_INT
@@ -1182,9 +1320,13 @@ static void filters_set_ilen(filter_t *flt, bcf1_t *line, token_t *tok)
     int i, rlen = strlen(line->d.allele[0]);
     for (i=1; i<line->n_allele; i++)
     {
+        if ( line->d.allele[i][0]=='<' )
+        {
+            bcf_double_set_missing(tok->values[i-1]);
+            continue;
+        }
         int alen = strlen(line->d.allele[i]);
-        if ( rlen==alen ) bcf_double_set_missing(tok->values[i-1]);
-        else tok->values[i-1] = alen - rlen;
+        tok->values[i-1] = alen - rlen;
     }
 }
 static void filters_set_ref_string(filter_t *flt, bcf1_t *line, token_t *tok)
@@ -1249,21 +1391,22 @@ static void filters_set_nmissing(filter_t *flt, bcf1_t *line, token_t *tok)
     }
 
     int j,nmissing = 0;
-    #define BRANCH(type_t, is_vector_end) { \
+    #define BRANCH(type_t, convert, is_vector_end) { \
         for (i=0; i<line->n_sample; i++) \
         { \
-            type_t *ptr = (type_t *) (fmt->p + i*fmt->size); \
+            uint8_t *ptr = fmt->p + i*fmt->size; \
             for (j=0; j<fmt->n; j++) \
             { \
-                if ( ptr[j]==is_vector_end ) break; \
-                if ( ptr[j]==bcf_gt_missing ) { nmissing++; break; } \
+                type_t val = convert(&ptr[j * sizeof(type_t)]); \
+                if ( val==is_vector_end ) break; \
+                if ( val==bcf_gt_missing ) { nmissing++; break; } \
             } \
         } \
     }
     switch (fmt->type) {
-        case BCF_BT_INT8:  BRANCH(int8_t,  bcf_int8_vector_end); break;
-        case BCF_BT_INT16: BRANCH(int16_t, bcf_int16_vector_end); break;
-        case BCF_BT_INT32: BRANCH(int32_t, bcf_int32_vector_end); break;
+        case BCF_BT_INT8:  BRANCH(int8_t,  le_to_i8,  bcf_int8_vector_end); break;
+        case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, bcf_int16_vector_end); break;
+        case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, bcf_int32_vector_end); break;
         default: fprintf(bcftools_stderr,"todo: type %d\n", fmt->type); bcftools_exit(1); break;
     }
     #undef BRANCH
@@ -1895,19 +2038,6 @@ static int func_strlen(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **sta
     }
     return 1;
 }
-static inline double calc_binom(int na, int nb)
-{
-    if ( na==0 && nb==0 ) return -1;
-    if ( na==nb ) return 1;
-
-    // kfunc.h implements kf_betai, which is the regularized beta function  P(X<=k/N;p) = I_{1-p}(N-k,k+1)
-
-    double pval = na < nb ? kf_betai(nb, na + 1, 0.5) : kf_betai(na, nb + 1, 0.5);
-    pval *= 2;
-    if ( pval>1 ) pval = 1;     // this can happen, machine precision error, eg. kf_betai(1,0,0.5)
-
-    return pval;
-}
 static int func_binom(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
 {
     int i, istack = nstack - rtok->nargs;
@@ -1975,7 +2105,7 @@ static int func_binom(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stac
                     bcf_double_set_missing(rtok->values[i]);
                     continue;
                 }
-                rtok->values[i] = calc_binom(vals[idx1],vals[idx2]);
+                rtok->values[i] = calc_binom_two_sided(vals[idx1],vals[idx2],0.5);
                 if ( rtok->values[i] < 0 )
                 {
                     bcf_double_set_missing(rtok->values[i]);
@@ -1999,7 +2129,7 @@ static int func_binom(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stac
                     bcf_double_set_missing(rtok->values[i]);
                     continue;
                 }
-                rtok->values[i] = calc_binom(ptr1[0],ptr2[0]);
+                rtok->values[i] = calc_binom_two_sided(ptr1[0],ptr2[0],0.5);
                 if ( rtok->values[i] < 0 )
                 {
                     bcf_double_set_missing(rtok->values[i]);
@@ -2038,7 +2168,7 @@ static int func_binom(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stac
             bcf_double_set_missing(rtok->values[0]);
         else
         {
-            rtok->values[0] = calc_binom(ptr1[0],ptr2[0]);
+            rtok->values[0] = calc_binom_two_sided(ptr1[0],ptr2[0],0.5);
             if ( rtok->values[0] < 0 )
                 bcf_double_set_missing(rtok->values[0]);
         }
@@ -2308,6 +2438,18 @@ static int vector_logic_and(filter_t *filter, bcf1_t *line, token_t *rtok, token
     return 2;
 }
 
+// A note about comparisons:
+// When setting value by determining index from the genotype, we face the problem
+// of how to interpret truncating arrays. Say we have TAG defined as Number=. and
+//      GT:TAG   1/1:0,1,2  0/0:0
+// Then when querying we expect the following expression to evaluate for the second
+// sample as
+//      -i 'TAG[1:1]="."'  .. true
+//      -i 'TAG[1:GT]="."' .. false
+// The problem is that the implementation truncates the number of fields, filling
+// usually fewer than the original number of per-sample values. This is fixed by
+// adding an exception that makes the code aware of this: the GT indexing can be
+// recognised by having tok->idx==-3
 #define CMP_VECTORS(atok,btok,_rtok,CMP_OP,missing_logic) \
 { \
     token_t *rtok = _rtok; \
@@ -2395,6 +2537,8 @@ static int vector_logic_and(filter_t *filter, bcf1_t *line, token_t *rtok, token
                 double *bptr = btok->values + i*btok->nval1; \
                 for (j=0; j<atok->nval1; j++) \
                 { \
+                    if ( atok->idx==-3 && bcf_double_is_vector_end(aptr[j]) ) break; /* explained above */ \
+                    if ( btok->idx==-3 && bcf_double_is_vector_end(bptr[j]) ) break; /* explained above */ \
                     int nmiss = bcf_double_is_missing_or_vector_end(aptr[j]) ? 1 : 0; \
                     if ( nmiss && !missing_logic[0] ) continue; /* any is missing => result is false */ \
                     nmiss += (bcf_double_is_missing_or_vector_end(bptr[j]) ? 1 : 0); \
@@ -2416,9 +2560,10 @@ static int vector_logic_and(filter_t *filter, bcf1_t *line, token_t *rtok, token
             { \
                 if ( !rtok->usmpl[i] ) continue; \
                 double *aptr = atok->values + i*atok->nval1; \
-                double *bptr = btok->values + i*btok->nval1; \
+                double *bptr = btok->values; \
                 for (j=0; j<atok->nval1; j++) \
                 { \
+                    if ( atok->idx==-3 && bcf_double_is_vector_end(aptr[j]) ) break; /* explained above */ \
                     int miss = bcf_double_is_missing_or_vector_end(aptr[j]) ? 1 : 0; \
                     if ( miss && !missing_logic[0] ) continue; /* any is missing => result is false */ \
                     for (k=0; k<btok->nvalues; k++) \
@@ -2442,10 +2587,11 @@ static int vector_logic_and(filter_t *filter, bcf1_t *line, token_t *rtok, token
             for (i=0; i<btok->nsamples; i++) \
             { \
                 if ( !rtok->usmpl[i] ) continue; \
-                double *aptr = atok->values + i*atok->nval1; \
+                double *aptr = atok->values; \
                 double *bptr = btok->values + i*btok->nval1; \
                 for (j=0; j<btok->nval1; j++) \
                 { \
+                    if ( atok->idx==-3 && bcf_double_is_vector_end(bptr[j]) ) break; /* explained above */ \
                     int miss = bcf_double_is_missing_or_vector_end(bptr[j]) ? 1 : 0; \
                     if ( miss && !missing_logic[0] ) continue; /* any is missing => result is false */ \
                     for (k=0; k<atok->nvalues; k++) \
@@ -2809,9 +2955,26 @@ static int max_ac_an_unpack(bcf_hdr_t *hdr)
 
     return BCF_UN_INFO;
 }
+static int filters_init1_ext(filter_t *filter, char *str, int len, token_t *tok)
+{
+    tok->hl_type  = -1;
+    tok->ht_type  = -1;
+    tok->tok_type  = TOK_VAL;
+    tok->hdr_id    = -1;
+    tok->pass_site = -1;
+    tok->idx       = 0;
+    tok->iext = ++filter->n_ext;
+    filter->ext = realloc(filter->ext,sizeof(*filter->ext)*filter->n_ext);
+    if ( !strncasecmp(str,"{str}",len) ) { tok->ht_type = BCF_HT_STR; tok->is_str = 1; }
+    else if ( !strncasecmp(str,"{int}",len) ) tok->ht_type = BCF_HT_INT;
+    else if ( !strncasecmp(str,"{float}",len) ) tok->ht_type = BCF_HT_REAL;
+    filter->ext[filter->n_ext-1] = tok->ht_type;
+    return 0;
+}
 static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
 {
-    tok->tag_type  = -1;
+    tok->ht_type  = -1;
+    tok->hl_type  = -1;
     tok->tok_type  = TOK_VAL;
     tok->hdr_id    = -1;
     tok->pass_site = -1;
@@ -2828,6 +2991,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
         memcpy(tok->key,str+1,len-2);
         tok->key[len-2] = 0;
         tok->is_str = 1;
+        tok->ht_type = BCF_HT_STR;
         tok->nvalues = len-2;
         if ( !strcmp(".",tok->key) ) tok->is_missing = 1;
         return 0;
@@ -2869,6 +3033,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
         {
             tok->setter = filters_set_qual;
             tok->tag = strdup("QUAL");
+            tok->ht_type = BCF_HT_REAL;
             filter_add_used_tag(filter,NULL,tok->tag);
             return 0;
         }
@@ -2876,6 +3041,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
         {
             tok->setter = filters_set_type;
             tok->tag = strdup("TYPE");
+            tok->ht_type = BCF_HT_STR;
             return 0;
         }
         else if ( !strncasecmp(str,"FILTER",len) || !strncmp(str,"%FILTER",len) /* for backward compatibility */ )
@@ -2883,7 +3049,8 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
             tok->comparator = filters_cmp_filter;
             tok->tag = strdup("FILTER");
             filter->max_unpack |= BCF_UN_FLT;
-            tok->tag_type = BCF_HL_FLT;
+            tok->hl_type = BCF_HL_FLT;
+            tok->ht_type = BCF_HT_STR;
             filter_add_used_tag(filter,NULL,tok->tag);
             return 0;
         }
@@ -2891,6 +3058,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
         {
             tok->comparator = filters_cmp_id;
             tok->tag = strdup("ID");
+            tok->ht_type = BCF_HT_STR;
             filter_add_used_tag(filter,NULL,tok->tag);
             return 0;
         }
@@ -2898,6 +3066,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
         {
             tok->setter = &filters_set_chrom;
             tok->tag = strdup("CHROM");
+            tok->ht_type = BCF_HT_STR;
             filter_add_used_tag(filter,NULL,tok->tag);
             return 0;
         }
@@ -2905,6 +3074,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
         {
             tok->setter = &filters_set_pos;
             tok->tag = strdup("POS");
+            tok->ht_type = BCF_HT_INT;
             filter_add_used_tag(filter,NULL,tok->tag);
             return 0;
         }
@@ -2913,6 +3083,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
             tok->setter = &filters_set_ref_string;
             tok->is_str = 1;
             tok->tag = strdup("REF");
+            tok->ht_type = BCF_HT_STR;
             filter_add_used_tag(filter,NULL,tok->tag);
             return 0;
         }
@@ -2921,6 +3092,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
             tok->setter = &filters_set_alt_string;
             tok->is_str = 1;
             tok->tag = strdup("ALT");
+            tok->ht_type = BCF_HT_STR;
             tok->idxs = (int*) malloc(sizeof(int));
             tok->idxs[0] = -1;
             tok->nidxs   = 1;
@@ -2932,6 +3104,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
         {
             tok->setter = &filters_set_nalt;
             tok->tag = strdup("N_ALT");
+            tok->ht_type = BCF_HT_INT;
             return 0;
         }
         else if ( !strncasecmp(str,"N_SAMPLES",len) )
@@ -2939,6 +3112,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
             tok->tok_type = TOK_VAL;
             tok->threshold = bcf_hdr_nsamples(filter->hdr);
             tok->is_constant = 1;
+            tok->ht_type = BCF_HT_INT;
             return 0;
         }
         else if ( !strncasecmp(str,"N_MISSING",len) )
@@ -2946,6 +3120,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
             filter->max_unpack |= BCF_UN_FMT;
             tok->setter = &filters_set_nmissing;
             tok->tag = strdup("N_MISSING");
+            tok->ht_type = BCF_HT_INT;
             return 0;
         }
         else if ( !strncasecmp(str,"F_MISSING",len) )
@@ -2953,6 +3128,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
             filter->max_unpack |= BCF_UN_FMT;
             tok->setter = &filters_set_nmissing;
             tok->tag = strdup("F_MISSING");
+            tok->ht_type = BCF_HT_REAL;
             return 0;
         }
     }
@@ -2993,13 +3169,14 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
         for (i=0; i<tok->nsamples; i++) tok->usmpl[i] = 1;
     }
 
-    tok->tag_type = is_fmt ? BCF_HL_FMT : BCF_HL_INFO;
+    tok->hl_type = is_fmt ? BCF_HL_FMT : BCF_HL_INFO;
     if ( is_fmt ) filter->max_unpack |= BCF_UN_FMT;
     if ( tok->hdr_id>=0 )
     {
         if ( is_fmt && !strcmp("GT",tmp.s) )
         {
             tok->setter = &filters_set_genotype_string; tok->is_str = 1;
+            tok->ht_type = BCF_HT_STR;
         }
         else if ( is_fmt )
         {
@@ -3014,9 +3191,9 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
             }
             switch ( bcf_hdr_id2type(filter->hdr,BCF_HL_FMT,tok->hdr_id) )
             {
-                case BCF_HT_INT:  tok->setter = &filters_set_format_int; break;
-                case BCF_HT_REAL: tok->setter = &filters_set_format_float; break;
-                case BCF_HT_STR:  tok->setter = &filters_set_format_string; tok->is_str = 1; break;
+                case BCF_HT_INT:  tok->setter = &filters_set_format_int; tok->ht_type = BCF_HT_INT; break;
+                case BCF_HT_REAL: tok->setter = &filters_set_format_float; tok->ht_type = BCF_HT_REAL; break;
+                case BCF_HT_STR:  tok->setter = &filters_set_format_string; tok->ht_type = BCF_HT_STR; tok->is_str = 1; break;
                 default: error("[%s:%d %s] FIXME\n", __FILE__,__LINE__,__FUNCTION__);
             }
         }
@@ -3025,10 +3202,14 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
         else
         {
             if ( bcf_hdr_id2type(filter->hdr,BCF_HL_INFO,tok->hdr_id) == BCF_HT_FLAG )
+            {
                 tok->setter = filters_set_info_flag;
+                tok->ht_type = BCF_HT_INT;
+            }
             else
             {
-                if ( bcf_hdr_id2type(filter->hdr,BCF_HL_INFO,tok->hdr_id) == BCF_HT_STR ) tok->is_str = 1;
+                tok->ht_type = bcf_hdr_id2type(filter->hdr,BCF_HL_INFO,tok->hdr_id);
+                if ( tok->ht_type == BCF_HT_STR ) tok->is_str = 1;
                 if ( bcf_hdr_id2number(filter->hdr,BCF_HL_INFO,tok->hdr_id)==1 )
                     tok->setter = filters_set_info;
                 else
@@ -3060,6 +3241,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
     {
         tok->setter = &filters_set_alt_string;
         tok->is_str = 1;
+        tok->ht_type = BCF_HT_STR;
         tok->tag = strdup(tmp.s);
         free(tmp.s);
         filter_add_used_tag(filter,NULL,tok->tag);
@@ -3070,6 +3252,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
         filter->max_unpack |= BCF_UN_FMT;
         tok->setter = &filters_set_an;
         tok->tag = strdup("AN");
+        tok->ht_type = BCF_HT_INT;
         free(tmp.s);
         return 0;
     }
@@ -3078,6 +3261,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
         filter->max_unpack |= BCF_UN_FMT;
         tok->setter = &filters_set_ac;
         tok->tag = strdup("AC");
+        tok->ht_type = BCF_HT_INT;
         free(tmp.s);
         return 0;
     }
@@ -3086,6 +3270,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
         filter->max_unpack |= max_ac_an_unpack(filter->hdr);
         tok->setter = &filters_set_mac;
         tok->tag = strdup("MAC");
+        tok->ht_type = BCF_HT_INT;
         free(tmp.s);
         return 0;
     }
@@ -3094,6 +3279,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
         filter->max_unpack |= max_ac_an_unpack(filter->hdr);
         tok->setter = &filters_set_af;
         tok->tag = strdup("AF");
+        tok->ht_type = BCF_HT_REAL;
         free(tmp.s);
         return 0;
     }
@@ -3102,6 +3288,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
         filter->max_unpack |= max_ac_an_unpack(filter->hdr);
         tok->setter = &filters_set_maf;
         tok->tag = strdup("MAF");
+        tok->ht_type = BCF_HT_REAL;
         free(tmp.s);
         return 0;
     }
@@ -3110,6 +3297,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
         filter->max_unpack |= BCF_UN_STR;
         tok->setter = &filters_set_ilen;
         tok->tag = strdup("ILEN");
+        tok->ht_type = BCF_HT_INT;
         free(tmp.s);
         return 0;
     }
@@ -3132,7 +3320,10 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
             filter->status |= FILTER_ERR_UNKN_TAGS;
             filter_add_undef_tag(filter,tmp.s);
         }
+        tok->ht_type = BCF_HT_REAL;
     }
+    else
+        tok->ht_type = BCF_HT_INT;
     tok->is_constant = 1;
 
     if ( tmp.s ) free(tmp.s);
@@ -3286,6 +3477,33 @@ static void perl_destroy(filter_t *filter)
 #endif
 }
 
+// A very rudimentary heuristics to determine type, e.g. STR_TAG={} implies {str}.
+// Throws an error on anything more complex and asks for an explicit type.
+static void determine_ext_types(filter_t *filter, int ntok, token_t *tok)
+{
+    int i;
+    for (i=0; i<ntok; i++)
+    {
+        if ( !tok[i].iext || tok[i].ht_type!=-1 ) continue;
+        if ( !i || i+1==ntok ) break;       // first or last in the RPN
+        if ( tok[i-1].ht_type==-1 ) break;  // previous type not set
+        // todo: check if the next is an operator
+        // todo: case when the order is reversed, {}=TAG
+        tok[i].ht_type = tok[i-1].ht_type;
+    }
+    if ( i!=ntok )
+        error("[%s:%d %s] Error: unable to determine the type, use explicit notation: %s\n",__FILE__,__LINE__,__FUNCTION__,filter->str);
+    for (i=0; i<ntok; i++)
+    {
+        int j = tok[i].iext - 1;
+        if ( j<0 ) continue;
+        if ( filter->ext[j]!=-1 && filter->ext[j]!=tok[i].ht_type  )
+            error("[%s:%d %s] FIXME: this should not happen %d vs %d, iext=%d\n",__FILE__,__LINE__,__FUNCTION__,filter->ext[j],tok[i].ht_type,j);
+        filter->ext[j] = tok[i].ht_type;
+        if ( tok[i].ht_type==BCF_HT_STR ) tok[i].is_str = 1;
+    }
+}
+
 
 // Parse filter expression and convert to reverse polish notation. Dijkstra's shunting-yard algorithm
 static filter_t *filter_init_(bcf_hdr_t *hdr, const char *str, int exit_on_error)
@@ -3333,6 +3551,13 @@ static filter_t *filter_init_(bcf_hdr_t *hdr, const char *str, int exit_on_error
             memset(&ops[nops-1],0,sizeof(token_t));
             nops--;
         }
+        else if ( ret==TOK_EXT )    // external value
+        {
+            nout++;
+            hts_expand0(token_t, nout, mout, out);
+            filters_init1_ext(filter, tmp, len, &out[nout-1]);
+            tmp += len;
+        }
         else if ( ret!=TOK_VAL )    // one of the operators
         {
             // detect unary minus: replace -value with -1*(value)
@@ -3443,12 +3668,12 @@ static filter_t *filter_init_(bcf_hdr_t *hdr, const char *str, int exit_on_error
             hts_expand0(token_t, nops, mops, ops);
             ops[nops-1].tok_type = ret;
         }
-        else if ( !len )
+        else if ( !len )    // all tokes read or an error
         {
             if ( *tmp && !isspace(*tmp) ) error("Could not parse the expression: [%s]\n", str);
             break;     // all tokens read
         }
-        else           // annotation name or filtering value
+        else           // TOK_VAL: annotation name or value
         {
             nout++;
             hts_expand0(token_t, nout, mout, out);
@@ -3470,10 +3695,21 @@ static filter_t *filter_init_(bcf_hdr_t *hdr, const char *str, int exit_on_error
         nops--;
     }
 
+    if ( filter->status != FILTER_OK )
+    {
+        if ( mops ) free(ops);
+        filter->filters   = out;
+        filter->nfilters  = nout;
+        return filter;
+    }
+
+    // Determine types of external variables from the context
+    determine_ext_types(filter,nout,out);
+
     // In the special cases of TYPE and FILTER the BCF header IDs are yet unknown. Walk through the
     // list of operators and convert the strings (e.g. "PASS") to BCF ids. The string value token must be
     // just before or after the FILTER token and they must be followed with a comparison operator.
-    // At this point we also initialize regex expressions which, in RPN, must preceed the LIKE/NLIKE operator.
+    // At this point we also initialize regex expressions which, in RPN, must precede the LIKE/NLIKE operator.
     // Additionally, treat "." as missing value rather than a string in numeric equalities; that
     // @file is only used with ID; etc.
     // This code is fragile: improve me.
@@ -3488,7 +3724,10 @@ static filter_t *filter_init_(bcf_hdr_t *hdr, const char *str, int exit_on_error
         {
             int j = out[i+1].tok_type==TOK_VAL ? i+1 : i-1;
             if ( out[j].comparator!=filters_cmp_id )
-                error("Error: could not parse the expression. Note that the \"@file_name\" syntax can be currently used with ID column only.\n");
+            {
+                if ( out[j].comparator ) error("Error: could not parse the expression with \"@file_name\" syntax (possible todo)\n");
+                out[j].comparator = filters_cmp_string_hash;
+            }
         }
         if ( out[i].tok_type==TOK_OR || out[i].tok_type==TOK_OR_VEC )
             out[i].func = vector_logic_or;
@@ -3504,7 +3743,7 @@ static filter_t *filter_init_(bcf_hdr_t *hdr, const char *str, int exit_on_error
                 int set_missing = 0;
                 if ( out[k].hdr_id>0 )
                 {
-                    int type = bcf_hdr_id2type(filter->hdr,out[k].tag_type,out[k].hdr_id);
+                    int type = bcf_hdr_id2type(filter->hdr,out[k].hl_type,out[k].hdr_id);
                     if ( type==BCF_HT_INT ) set_missing = 1;
                     else if ( type==BCF_HT_REAL ) set_missing = 1;
                 }
@@ -3545,7 +3784,8 @@ static filter_t *filter_init_(bcf_hdr_t *hdr, const char *str, int exit_on_error
         if ( !out[i].tag ) continue;
         if ( out[i].setter==filters_set_type )
         {
-            if ( i+1==nout ) error("Could not parse the expression: %s\n", filter->str);
+            if ( i+1==nout || !out[i+1].key )
+                error("Could not parse the expression: %s\n", filter->str);
             int itok, ival;
             if ( out[i+1].tok_type==TOK_EQ || out[i+1].tok_type==TOK_NE ) ival = i - 1, itok = i + 1;
             else if ( out[i+1].tok_type==TOK_LIKE || out[i+1].tok_type==TOK_NLIKE ) ival = i - 1, itok = i + 1;
@@ -3593,7 +3833,7 @@ static filter_t *filter_init_(bcf_hdr_t *hdr, const char *str, int exit_on_error
             else if ( !strcasecmp(out[ival].key,"r") ) { out[i].setter = filters_set_genotype2; out[ival].key[0]='r'; out[ival].key[1]=0; }  // r
             continue;
         }
-        if ( out[i].tag_type==BCF_HL_FLT )
+        if ( out[i].hl_type==BCF_HL_FLT )
         {
             if ( i+1==nout ) error("Could not parse the expression: %s\n", filter->str);
             int itok = i, ival;
@@ -3707,6 +3947,7 @@ void filter_destroy(filter_t *filter)
     }
     for (i=0; i<filter->nundef_tag; i++) free(filter->undef_tag[i]);
     for (i=0; i<filter->nused_tag; i++) free(filter->used_tag[i]);
+    free(filter->ext);
     free(filter->undef_tag);
     free(filter->used_tag);
     free(filter->cached_GT.buf);
@@ -3720,6 +3961,37 @@ void filter_destroy(filter_t *filter)
     free(filter);
 }
 
+int filter_test_ext(filter_t *filter, bcf1_t *rec, const uint8_t **samples, const void **ext)
+{
+    if ( !filter->n_ext )
+        return filter_test(filter,rec,samples);
+
+    int i;
+    for (i=0; i<filter->nfilters; i++)
+    {
+        token_t *tok = &filter->filters[i];
+        if ( !tok->iext ) continue;
+        if ( !ext[tok->iext-1] )
+        {
+            tok->is_missing = 1;
+            tok->nvalues = 0;
+            if ( filter->ext[tok->iext-1]==BCF_HT_STR ) tok->str_value.l = 0;
+            continue;
+        }
+        tok->is_missing = 0;
+        tok->nvalues = 1;
+        if ( filter->ext[tok->iext-1]==BCF_HT_STR )
+        {
+            tok->str_value.l = 0;
+            kputs((const char*)ext[tok->iext-1],&tok->str_value);
+            tok->nvalues = tok->str_value.l;
+        }
+        else if ( filter->ext[tok->iext-1]==BCF_HT_INT ) tok->values[0] = *((const int*)ext[tok->iext-1]);
+        else if ( filter->ext[tok->iext-1]==BCF_HT_REAL ) tok->values[0] = *((const float*)ext[tok->iext-1]);
+    }
+    return filter_test(filter,rec,samples);
+}
+
 int filter_test(filter_t *filter, bcf1_t *line, const uint8_t **samples)
 {
     if ( filter->status != FILTER_OK ) error("Error: the caller did not check the filter status\n");
@@ -3856,7 +4128,11 @@ int filter_max_unpack(filter_t *flt)
 {
     return flt->max_unpack;
 }
-
+const int *filter_ext_types(filter_t *filter, int *n_ext)
+{
+    *n_ext = filter->n_ext;
+    return filter->ext;
+}
 const double *filter_get_doubles(filter_t *filter, int *nval, int *nval1)
 {
     token_t *tok = filter->flt_stack[0];
diff --git a/bcftools/filter.h b/bcftools/filter.h
index cc60d6b96..d6a8e0893 100644
--- a/bcftools/filter.h
+++ b/bcftools/filter.h
@@ -1,6 +1,6 @@
 /*  filter.h -- filter expressions.
 
-    Copyright (C) 2013-2023 Genome Research Ltd.
+    Copyright (C) 2013-2024 Genome Research Ltd.
 
     Author: Petr Danecek <pd3@sanger.ac.uk>
 
@@ -31,8 +31,9 @@ typedef struct _filter_t filter_t;
 
 /**
   *  @hdr:  BCF header file
-  *  @str:  see the bcftools filter command help for description
-  *
+  *  @str:  see the bcftools filter command help for description.
+  *         See also the extended usage described in filter_test_ext(),
+  *         intended for programmatic access
   *  Same as filter_parse() but exits on errors
   */
 filter_t *filter_init(bcf_hdr_t *hdr, const char *str);
@@ -48,6 +49,22 @@ void filter_destroy(filter_t *filter);
   */
 int filter_test(filter_t *filter, bcf1_t *rec, const uint8_t **samples);
 
+/**
+  *  filter_test_ext() - same as filter_test(), but sets some of the terms
+  *        on the fly. An expression initialized with, say,
+  *        "STR_TAG={} | INT_TAG={} | FLT_TAG={}" takes three
+  *        additional pointer arguments which are expected to point to memory
+  *        area occupied by the appropriate type, see also filter_ext_types().
+  *        The type determination is not fool-proof, in such case the type can
+  *        be given explicitly as eg "TAG={str}".
+  *  @ext: array of size 'n_ext' occupied with pointers to the data types
+  *        inferred from the expression given at the time of initialization.
+  *        The pointers set to NULL will be treated as if missing value "."
+  *        was given.
+  *  @n_ext: the size of 'ext' array
+  */
+int filter_test_ext(filter_t *filter, bcf1_t *rec, const uint8_t **samples, const void **ext);
+
 /**
   *  filter_set_samples() - restrict filtering expression to samples.
   *             Call after filter_init().
@@ -60,9 +77,14 @@ void filter_set_samples(filter_t *filter, const uint8_t *samples);
   */
 const double *filter_get_doubles(filter_t *filter, int *nval, int *nval1);
 
-void filter_expression_info(FILE *fp);
 int filter_max_unpack(filter_t *filter);
 
+/**
+  *  filter_ext_types() - returns the number and BCF_HT_* types of external values
+  *         found in the filtering expression
+  */
+const int *filter_ext_types(filter_t *filter, int *n_ext);
+
 /**
   *  Same as filter_init() but may not exit on some type of errors. The caller
   *  must check if the returned value is not NULL and if the consequent call
diff --git a/bcftools/gff.c b/bcftools/gff.c
index 90da84ba9..283ced331 100644
--- a/bcftools/gff.c
+++ b/bcftools/gff.c
@@ -23,6 +23,21 @@
    THE SOFTWARE.
 */
 
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <inttypes.h>
+#include <string.h>
+#include <strings.h>
+#include <htslib/hts.h>
+#include <htslib/khash.h>
+#include <htslib/khash_str2int.h>
+#include <htslib/kseq.h>
+#include <htslib/bgzf.h>
+#include <errno.h>
+#include "bcftools.h"
+#include "regidx.h"
 #include "gff.h"
 
 /*
@@ -39,9 +54,9 @@ typedef struct
     uint32_t beg;
     uint32_t end;
     uint32_t trid;
-    uint32_t strand:1;  // STRAND_REV,STRAND_FWD
+    uint32_t strand:2;  // STRAND_{REV,FWD,UNK}
     uint32_t phase:2;   // 0, 1, 2, or 3 for unknown
-    uint32_t iseq:29;
+    uint32_t iseq:28;
 }
 ftr_t;
 
@@ -460,13 +475,13 @@ static void gff_parse_exon(gff_t *gff, const char *line, ftr_t *ftr)
     // associate with transcript id
     gff_id_register(&gff->tscript_ids, aux->parent, aux->parent_end, &ftr->trid);
 
-    if ( ftr->strand==-1 && gff->verbosity > 0 )
+    if ( ftr->strand==STRAND_UNK && gff->verbosity > 0 )
     {
         if ( !gff->warned.unknown_strand || gff->verbosity > 1 )
             fprintf(stderr,"Warning: Ignoring GFF feature with unknown strand .. %s\n",line);
         gff->warned.unknown_strand++;
     }
-    if ( ftr->phase==-1 && gff->verbosity > 0 )
+    if ( ftr->phase==CDS_PHASE_UNKN && gff->verbosity > 0 )
     {
         if ( !gff->warned.unknown_phase|| gff->verbosity > 1 )
             fprintf(stderr,"Warning: Ignoring GFF feature with unknown phase .. %s\n",line);
@@ -507,8 +522,8 @@ static void gff_parse_gene(gff_t *gff, const char *line, ftr_t *ftr)
         gene->name = strdup(aux->gene_ids.str[gene_id]); // Name=<GeneName> field is not present, use the gene ID instead
 }
 
-// Returns 0 for exons,CDS,UTRs to indiciate these need to be pruned later and regidx built on them,
-// or -1 to indiciate the structure needs not be saved (either because of an error or because saved
+// Returns 0 for exons,CDS,UTRs to indicate these need to be pruned later and regidx built on them,
+// or -1 to indicate the structure needs not be saved (either because of an error or because saved
 // as transcript or gene.)
 static int gff_parse_line(gff_t *gff, char *line, ftr_t *ftr)
 {
@@ -554,10 +569,11 @@ static int gff_parse_line(gff_t *gff, char *line, ftr_t *ftr)
     ftr->strand = -1;
     if ( *ss == '+' ) ftr->strand = STRAND_FWD;
     else if ( *ss == '-' ) ftr->strand = STRAND_REV;
+    else ftr->strand = STRAND_UNK;
     ss += 2;
 
     // 8th column: phase (codon offset)
-    ftr->phase = -1;
+    ftr->phase = CDS_PHASE_UNKN;
     if ( *ss == '0' ) ftr->phase = 0;
     else if ( *ss == '1' ) ftr->phase = 1;
     else if ( *ss == '2' ) ftr->phase = 2;
@@ -727,12 +743,12 @@ static void tscript_init_cds(gff_t *gff)
                 if ( phase!=len%3 )
                 {
                     if ( !gff->force )
-                        error("Error: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d). Use the --force option to proceed anyway (at your own risk).\n",
+                        error("Error: GFF3 assumption failed for transcript %s, CDS=%"PRIu32": phase!=len%%3 (phase=%d, len=%d). Use the --force option to proceed anyway (at your own risk).\n",
                                 gff->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
                     if ( gff->verbosity > 0 )
                     {
                         if ( !gff->warned.wrong_phase || gff->verbosity > 1 )
-                            fprintf(stderr,"Warning: The GFF has inconsistent phase column in transcript %s, skipping. CDS pos=%d: phase!=len%%3 (phase=%d, len=%d)\n",
+                            fprintf(stderr,"Warning: The GFF has inconsistent phase column in transcript %s, skipping. CDS pos=%"PRIu32": phase!=len%%3 (phase=%d, len=%d)\n",
                                     gff->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
                         gff->warned.wrong_phase++;
                     }
@@ -743,7 +759,7 @@ static void tscript_init_cds(gff_t *gff)
             }
             if ( !tscript_ok ) continue;    // skip this transcript
         }
-        else
+        else if ( tr->strand==STRAND_REV )
         {
             if ( tr->cds[tr->ncds-1]->phase != CDS_PHASE_UNKN )
             {
@@ -790,12 +806,12 @@ static void tscript_init_cds(gff_t *gff)
                 if ( phase!=len%3 )
                 {
                     if ( !gff->force )
-                        error("Error: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d). Use the --force option to proceed anyway (at your own risk).\n",
+                        error("Error: GFF3 assumption failed for transcript %s, CDS=%"PRIu32": phase!=len%%3 (phase=%d, len=%d). Use the --force option to proceed anyway (at your own risk).\n",
                                 gff->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
                     if ( gff->verbosity > 0 )
                     {
                         if ( !gff->warned.wrong_phase || gff->verbosity > 1 )
-                            fprintf(stderr,"Warning: The GFF has inconsistent phase column in transcript %s, skipping. CDS pos=%d: phase!=len%%3 (phase=%d, len=%d)\n",
+                            fprintf(stderr,"Warning: The GFF has inconsistent phase column in transcript %s, skipping. CDS pos=%"PRIu32": phase!=len%%3 (phase=%d, len=%d)\n",
                                     gff->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
                         gff->warned.wrong_phase++;
                     }
@@ -806,6 +822,8 @@ static void tscript_init_cds(gff_t *gff)
             }
             if ( !tscript_ok ) continue;    // skip this transcript
         }
+        else
+            continue;   // unknown strand
 
         // set len. At the same check that CDS within a transcript do not overlap
         len = 0;
@@ -854,7 +872,7 @@ static void tscript_init_cds(gff_t *gff)
                     i--;
                 }
             }
-            else
+            else if ( tr->strand==STRAND_REV )
             {
                 i = 0;
                 while ( i<tr->ncds && len%3 )
@@ -896,7 +914,7 @@ static int gff_dump(gff_t *gff, const char *fname)
         gf_gene_t *gene = (gf_gene_t*) kh_val(gff->init.gid2gene, k);
         char *gene_id = gff->init.gene_ids.str[gene->id];
         str.l = 0;
-        ksprintf(&str,"%s\t.\tgene\t%d\t%d\t.\t%c\t.\tID=%s;Name=%s;used=%d\n",gff->init.seq[gene->iseq],gene->beg+1,gene->end+1,gene->strand==STRAND_FWD?'+':'-',gene_id,gene->name,gene->used);
+        ksprintf(&str,"%s\t.\tgene\t%"PRIu32"\t%"PRIu32"\t.\t%c\t.\tID=%s;Name=%s;used=%d\n",gff->init.seq[gene->iseq],gene->beg+1,gene->end+1,gene->strand==STRAND_FWD?'+':(gene->strand==STRAND_REV?'-':'.'),gene_id,gene->name,gene->used);
         if ( bgzf_write(out, str.s, str.l) != str.l ) error("Error writing %s: %s\n", fname, strerror(errno));
     }
 
@@ -907,7 +925,7 @@ static int gff_dump(gff_t *gff, const char *fname)
         char *gene_id =  gff->init.gene_ids.str[tr->gene->id];
         const char *type = tr->type==GF_PROTEIN_CODING ? "mRNA" : gf_type2gff_string(tr->type);
         str.l = 0;
-        ksprintf(&str,"%s\t.\t%s\t%d\t%d\t.\t%c\t.\tID=%s;Parent=%s;biotype=%s;used=%d\n",itr->seq,type,itr->beg+1,itr->end+1,tr->strand==STRAND_FWD?'+':'-',gff->tscript_ids.str[tr->id],gene_id,gf_type2gff_string(tr->type),tr->used);
+        ksprintf(&str,"%s\t.\t%s\t%"PRIu32"\t%"PRIu32"\t.\t%c\t.\tID=%s;Parent=%s;biotype=%s;used=%d\n",itr->seq,type,itr->beg+1,itr->end+1,tr->strand==STRAND_FWD?'+':(tr->strand==STRAND_REV?'-':'.'),gff->tscript_ids.str[tr->id],gene_id,gf_type2gff_string(tr->type),tr->used);
         if ( bgzf_write(out, str.s, str.l) != str.l ) error("Error writing %s: %s\n", fname, strerror(errno));
     }
     regitr_destroy(itr);
@@ -918,7 +936,7 @@ static int gff_dump(gff_t *gff, const char *fname)
         gf_cds_t *cds = regitr_payload(itr,gf_cds_t*);
         gf_tscript_t *tr = cds->tr;
         str.l = 0;
-        ksprintf(&str,"%s\t.\tCDS\t%d\t%d\t.\t%c\t%c\tParent=%s\n",itr->seq,cds->beg+1,cds->beg+cds->len,tr->strand==STRAND_FWD?'+':'-',cds->phase==3?'.':cds->phase+(int)'0',gff->tscript_ids.str[tr->id]);
+        ksprintf(&str,"%s\t.\tCDS\t%"PRIu32"\t%"PRIu32"\t.\t%c\t%c\tParent=%s\n",itr->seq,cds->beg+1,cds->beg+cds->len,tr->strand==STRAND_FWD?'+':(tr->strand==STRAND_REV?'-':'.'),cds->phase==3?'.':cds->phase+(int)'0',gff->tscript_ids.str[tr->id]);
         if ( bgzf_write(out, str.s, str.l) != str.l ) error("Error writing %s: %s\n", fname, strerror(errno));
     }
     regitr_destroy(itr);
@@ -929,7 +947,7 @@ static int gff_dump(gff_t *gff, const char *fname)
         gf_utr_t *utr = regitr_payload(itr,gf_utr_t*);
         gf_tscript_t *tr = utr->tr;
         str.l = 0;
-        ksprintf(&str,"%s\t.\t%s_prime_UTR\t%d\t%d\t.\t%c\t.\tParent=%s\n",itr->seq,utr->which==prime3?"three":"five",utr->beg+1,utr->end+1,tr->strand==STRAND_FWD?'+':'-',gff->tscript_ids.str[tr->id]);
+        ksprintf(&str,"%s\t.\t%s_prime_UTR\t%"PRIu32"\t%"PRIu32"\t.\t%c\t.\tParent=%s\n",itr->seq,utr->which==prime3?"three":"five",utr->beg+1,utr->end+1,tr->strand==STRAND_FWD?'+':(tr->strand==STRAND_REV?'-':'.'),gff->tscript_ids.str[tr->id]);
         if ( bgzf_write(out, str.s, str.l) != str.l ) error("Error writing %s: %s\n", fname, strerror(errno));
     }
     regitr_destroy(itr);
@@ -940,7 +958,7 @@ static int gff_dump(gff_t *gff, const char *fname)
         gf_exon_t *exon = regitr_payload(itr,gf_exon_t*);
         gf_tscript_t *tr = exon->tr;
         str.l = 0;
-        ksprintf(&str,"%s\t.\texon\t%d\t%d\t.\t%c\t.\tParent=%s\n",itr->seq,exon->beg+1,exon->end+1,tr->strand==STRAND_FWD?'+':'-',gff->tscript_ids.str[tr->id]);
+        ksprintf(&str,"%s\t.\texon\t%"PRIu32"\t%"PRIu32"\t.\t%c\t.\tParent=%s\n",itr->seq,exon->beg+1,exon->end+1,tr->strand==STRAND_FWD?'+':(tr->strand==STRAND_REV?'-':'.'),gff->tscript_ids.str[tr->id]);
         if ( bgzf_write(out, str.s, str.l) != str.l ) error("Error writing %s: %s\n", fname, strerror(errno));
     }
     regitr_destroy(itr);
@@ -1004,7 +1022,7 @@ int gff_parse(gff_t *gff)
         else if ( ftr->type==GF_UTR5 ) register_utr(gff, ftr);
         else if ( ftr->type==GF_UTR3 ) register_utr(gff, ftr);
         else
-            error("something: %s\t%d\t%d\t%s\t%s\n", aux->seq[ftr->iseq],ftr->beg+1,ftr->end+1,gff->tscript_ids.str[ftr->trid],gf_type2gff_string(ftr->type));
+            error("something: %s\t%"PRIu32"\t%"PRIu32"\t%s\t%s\n", aux->seq[ftr->iseq],ftr->beg+1,ftr->end+1,gff->tscript_ids.str[ftr->trid],gf_type2gff_string(ftr->type));
     }
     tscript_init_cds(gff);
 
@@ -1046,7 +1064,7 @@ int gff_parse(gff_t *gff)
         INC_NWARN(wrong_phase);
         INC_NWARN(overlapping_cds);
         if ( nwarn > 0 )
-            fprintf(stderr,"Warning: %d warnings were supressed, run with `--verbose 2` to see them all\n",nwarn);
+            fprintf(stderr,"Warning: %d warnings were suppressed, increase verbosity to see them all\n",nwarn);
     }
 
     if ( gff->dump_fname ) gff_dump(gff, gff->dump_fname);
diff --git a/bcftools/gff.c.pysam.c b/bcftools/gff.c.pysam.c
index f5c817d73..3722f606c 100644
--- a/bcftools/gff.c.pysam.c
+++ b/bcftools/gff.c.pysam.c
@@ -25,6 +25,21 @@
    THE SOFTWARE.
 */
 
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <inttypes.h>
+#include <string.h>
+#include <strings.h>
+#include <htslib/hts.h>
+#include <htslib/khash.h>
+#include <htslib/khash_str2int.h>
+#include <htslib/kseq.h>
+#include <htslib/bgzf.h>
+#include <errno.h>
+#include "bcftools.h"
+#include "regidx.h"
 #include "gff.h"
 
 /*
@@ -41,9 +56,9 @@ typedef struct
     uint32_t beg;
     uint32_t end;
     uint32_t trid;
-    uint32_t strand:1;  // STRAND_REV,STRAND_FWD
+    uint32_t strand:2;  // STRAND_{REV,FWD,UNK}
     uint32_t phase:2;   // 0, 1, 2, or 3 for unknown
-    uint32_t iseq:29;
+    uint32_t iseq:28;
 }
 ftr_t;
 
@@ -462,13 +477,13 @@ static void gff_parse_exon(gff_t *gff, const char *line, ftr_t *ftr)
     // associate with transcript id
     gff_id_register(&gff->tscript_ids, aux->parent, aux->parent_end, &ftr->trid);
 
-    if ( ftr->strand==-1 && gff->verbosity > 0 )
+    if ( ftr->strand==STRAND_UNK && gff->verbosity > 0 )
     {
         if ( !gff->warned.unknown_strand || gff->verbosity > 1 )
             fprintf(bcftools_stderr,"Warning: Ignoring GFF feature with unknown strand .. %s\n",line);
         gff->warned.unknown_strand++;
     }
-    if ( ftr->phase==-1 && gff->verbosity > 0 )
+    if ( ftr->phase==CDS_PHASE_UNKN && gff->verbosity > 0 )
     {
         if ( !gff->warned.unknown_phase|| gff->verbosity > 1 )
             fprintf(bcftools_stderr,"Warning: Ignoring GFF feature with unknown phase .. %s\n",line);
@@ -509,8 +524,8 @@ static void gff_parse_gene(gff_t *gff, const char *line, ftr_t *ftr)
         gene->name = strdup(aux->gene_ids.str[gene_id]); // Name=<GeneName> field is not present, use the gene ID instead
 }
 
-// Returns 0 for exons,CDS,UTRs to indiciate these need to be pruned later and regidx built on them,
-// or -1 to indiciate the structure needs not be saved (either because of an error or because saved
+// Returns 0 for exons,CDS,UTRs to indicate these need to be pruned later and regidx built on them,
+// or -1 to indicate the structure needs not be saved (either because of an error or because saved
 // as transcript or gene.)
 static int gff_parse_line(gff_t *gff, char *line, ftr_t *ftr)
 {
@@ -556,10 +571,11 @@ static int gff_parse_line(gff_t *gff, char *line, ftr_t *ftr)
     ftr->strand = -1;
     if ( *ss == '+' ) ftr->strand = STRAND_FWD;
     else if ( *ss == '-' ) ftr->strand = STRAND_REV;
+    else ftr->strand = STRAND_UNK;
     ss += 2;
 
     // 8th column: phase (codon offset)
-    ftr->phase = -1;
+    ftr->phase = CDS_PHASE_UNKN;
     if ( *ss == '0' ) ftr->phase = 0;
     else if ( *ss == '1' ) ftr->phase = 1;
     else if ( *ss == '2' ) ftr->phase = 2;
@@ -729,12 +745,12 @@ static void tscript_init_cds(gff_t *gff)
                 if ( phase!=len%3 )
                 {
                     if ( !gff->force )
-                        error("Error: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d). Use the --force option to proceed anyway (at your own risk).\n",
+                        error("Error: GFF3 assumption failed for transcript %s, CDS=%"PRIu32": phase!=len%%3 (phase=%d, len=%d). Use the --force option to proceed anyway (at your own risk).\n",
                                 gff->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
                     if ( gff->verbosity > 0 )
                     {
                         if ( !gff->warned.wrong_phase || gff->verbosity > 1 )
-                            fprintf(bcftools_stderr,"Warning: The GFF has inconsistent phase column in transcript %s, skipping. CDS pos=%d: phase!=len%%3 (phase=%d, len=%d)\n",
+                            fprintf(bcftools_stderr,"Warning: The GFF has inconsistent phase column in transcript %s, skipping. CDS pos=%"PRIu32": phase!=len%%3 (phase=%d, len=%d)\n",
                                     gff->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
                         gff->warned.wrong_phase++;
                     }
@@ -745,7 +761,7 @@ static void tscript_init_cds(gff_t *gff)
             }
             if ( !tscript_ok ) continue;    // skip this transcript
         }
-        else
+        else if ( tr->strand==STRAND_REV )
         {
             if ( tr->cds[tr->ncds-1]->phase != CDS_PHASE_UNKN )
             {
@@ -792,12 +808,12 @@ static void tscript_init_cds(gff_t *gff)
                 if ( phase!=len%3 )
                 {
                     if ( !gff->force )
-                        error("Error: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d). Use the --force option to proceed anyway (at your own risk).\n",
+                        error("Error: GFF3 assumption failed for transcript %s, CDS=%"PRIu32": phase!=len%%3 (phase=%d, len=%d). Use the --force option to proceed anyway (at your own risk).\n",
                                 gff->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
                     if ( gff->verbosity > 0 )
                     {
                         if ( !gff->warned.wrong_phase || gff->verbosity > 1 )
-                            fprintf(bcftools_stderr,"Warning: The GFF has inconsistent phase column in transcript %s, skipping. CDS pos=%d: phase!=len%%3 (phase=%d, len=%d)\n",
+                            fprintf(bcftools_stderr,"Warning: The GFF has inconsistent phase column in transcript %s, skipping. CDS pos=%"PRIu32": phase!=len%%3 (phase=%d, len=%d)\n",
                                     gff->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
                         gff->warned.wrong_phase++;
                     }
@@ -808,6 +824,8 @@ static void tscript_init_cds(gff_t *gff)
             }
             if ( !tscript_ok ) continue;    // skip this transcript
         }
+        else
+            continue;   // unknown strand
 
         // set len. At the same check that CDS within a transcript do not overlap
         len = 0;
@@ -856,7 +874,7 @@ static void tscript_init_cds(gff_t *gff)
                     i--;
                 }
             }
-            else
+            else if ( tr->strand==STRAND_REV )
             {
                 i = 0;
                 while ( i<tr->ncds && len%3 )
@@ -898,7 +916,7 @@ static int gff_dump(gff_t *gff, const char *fname)
         gf_gene_t *gene = (gf_gene_t*) kh_val(gff->init.gid2gene, k);
         char *gene_id = gff->init.gene_ids.str[gene->id];
         str.l = 0;
-        ksprintf(&str,"%s\t.\tgene\t%d\t%d\t.\t%c\t.\tID=%s;Name=%s;used=%d\n",gff->init.seq[gene->iseq],gene->beg+1,gene->end+1,gene->strand==STRAND_FWD?'+':'-',gene_id,gene->name,gene->used);
+        ksprintf(&str,"%s\t.\tgene\t%"PRIu32"\t%"PRIu32"\t.\t%c\t.\tID=%s;Name=%s;used=%d\n",gff->init.seq[gene->iseq],gene->beg+1,gene->end+1,gene->strand==STRAND_FWD?'+':(gene->strand==STRAND_REV?'-':'.'),gene_id,gene->name,gene->used);
         if ( bgzf_write(out, str.s, str.l) != str.l ) error("Error writing %s: %s\n", fname, strerror(errno));
     }
 
@@ -909,7 +927,7 @@ static int gff_dump(gff_t *gff, const char *fname)
         char *gene_id =  gff->init.gene_ids.str[tr->gene->id];
         const char *type = tr->type==GF_PROTEIN_CODING ? "mRNA" : gf_type2gff_string(tr->type);
         str.l = 0;
-        ksprintf(&str,"%s\t.\t%s\t%d\t%d\t.\t%c\t.\tID=%s;Parent=%s;biotype=%s;used=%d\n",itr->seq,type,itr->beg+1,itr->end+1,tr->strand==STRAND_FWD?'+':'-',gff->tscript_ids.str[tr->id],gene_id,gf_type2gff_string(tr->type),tr->used);
+        ksprintf(&str,"%s\t.\t%s\t%"PRIu32"\t%"PRIu32"\t.\t%c\t.\tID=%s;Parent=%s;biotype=%s;used=%d\n",itr->seq,type,itr->beg+1,itr->end+1,tr->strand==STRAND_FWD?'+':(tr->strand==STRAND_REV?'-':'.'),gff->tscript_ids.str[tr->id],gene_id,gf_type2gff_string(tr->type),tr->used);
         if ( bgzf_write(out, str.s, str.l) != str.l ) error("Error writing %s: %s\n", fname, strerror(errno));
     }
     regitr_destroy(itr);
@@ -920,7 +938,7 @@ static int gff_dump(gff_t *gff, const char *fname)
         gf_cds_t *cds = regitr_payload(itr,gf_cds_t*);
         gf_tscript_t *tr = cds->tr;
         str.l = 0;
-        ksprintf(&str,"%s\t.\tCDS\t%d\t%d\t.\t%c\t%c\tParent=%s\n",itr->seq,cds->beg+1,cds->beg+cds->len,tr->strand==STRAND_FWD?'+':'-',cds->phase==3?'.':cds->phase+(int)'0',gff->tscript_ids.str[tr->id]);
+        ksprintf(&str,"%s\t.\tCDS\t%"PRIu32"\t%"PRIu32"\t.\t%c\t%c\tParent=%s\n",itr->seq,cds->beg+1,cds->beg+cds->len,tr->strand==STRAND_FWD?'+':(tr->strand==STRAND_REV?'-':'.'),cds->phase==3?'.':cds->phase+(int)'0',gff->tscript_ids.str[tr->id]);
         if ( bgzf_write(out, str.s, str.l) != str.l ) error("Error writing %s: %s\n", fname, strerror(errno));
     }
     regitr_destroy(itr);
@@ -931,7 +949,7 @@ static int gff_dump(gff_t *gff, const char *fname)
         gf_utr_t *utr = regitr_payload(itr,gf_utr_t*);
         gf_tscript_t *tr = utr->tr;
         str.l = 0;
-        ksprintf(&str,"%s\t.\t%s_prime_UTR\t%d\t%d\t.\t%c\t.\tParent=%s\n",itr->seq,utr->which==prime3?"three":"five",utr->beg+1,utr->end+1,tr->strand==STRAND_FWD?'+':'-',gff->tscript_ids.str[tr->id]);
+        ksprintf(&str,"%s\t.\t%s_prime_UTR\t%"PRIu32"\t%"PRIu32"\t.\t%c\t.\tParent=%s\n",itr->seq,utr->which==prime3?"three":"five",utr->beg+1,utr->end+1,tr->strand==STRAND_FWD?'+':(tr->strand==STRAND_REV?'-':'.'),gff->tscript_ids.str[tr->id]);
         if ( bgzf_write(out, str.s, str.l) != str.l ) error("Error writing %s: %s\n", fname, strerror(errno));
     }
     regitr_destroy(itr);
@@ -942,7 +960,7 @@ static int gff_dump(gff_t *gff, const char *fname)
         gf_exon_t *exon = regitr_payload(itr,gf_exon_t*);
         gf_tscript_t *tr = exon->tr;
         str.l = 0;
-        ksprintf(&str,"%s\t.\texon\t%d\t%d\t.\t%c\t.\tParent=%s\n",itr->seq,exon->beg+1,exon->end+1,tr->strand==STRAND_FWD?'+':'-',gff->tscript_ids.str[tr->id]);
+        ksprintf(&str,"%s\t.\texon\t%"PRIu32"\t%"PRIu32"\t.\t%c\t.\tParent=%s\n",itr->seq,exon->beg+1,exon->end+1,tr->strand==STRAND_FWD?'+':(tr->strand==STRAND_REV?'-':'.'),gff->tscript_ids.str[tr->id]);
         if ( bgzf_write(out, str.s, str.l) != str.l ) error("Error writing %s: %s\n", fname, strerror(errno));
     }
     regitr_destroy(itr);
@@ -1006,7 +1024,7 @@ int gff_parse(gff_t *gff)
         else if ( ftr->type==GF_UTR5 ) register_utr(gff, ftr);
         else if ( ftr->type==GF_UTR3 ) register_utr(gff, ftr);
         else
-            error("something: %s\t%d\t%d\t%s\t%s\n", aux->seq[ftr->iseq],ftr->beg+1,ftr->end+1,gff->tscript_ids.str[ftr->trid],gf_type2gff_string(ftr->type));
+            error("something: %s\t%"PRIu32"\t%"PRIu32"\t%s\t%s\n", aux->seq[ftr->iseq],ftr->beg+1,ftr->end+1,gff->tscript_ids.str[ftr->trid],gf_type2gff_string(ftr->type));
     }
     tscript_init_cds(gff);
 
@@ -1048,7 +1066,7 @@ int gff_parse(gff_t *gff)
         INC_NWARN(wrong_phase);
         INC_NWARN(overlapping_cds);
         if ( nwarn > 0 )
-            fprintf(bcftools_stderr,"Warning: %d warnings were supressed, run with `--verbose 2` to see them all\n",nwarn);
+            fprintf(bcftools_stderr,"Warning: %d warnings were suppressed, increase verbosity to see them all\n",nwarn);
     }
 
     if ( gff->dump_fname ) gff_dump(gff, gff->dump_fname);
diff --git a/bcftools/gff.h b/bcftools/gff.h
index ebb64634a..afa945e81 100644
--- a/bcftools/gff.h
+++ b/bcftools/gff.h
@@ -1,6 +1,6 @@
 /* The MIT License
 
-   Copyright (c) 2023 Genome Research Ltd.
+   Copyright (c) 2023-2024 Genome Research Ltd.
 
    Author: Petr Danecek <pd3@sanger.ac.uk>
 
@@ -36,7 +36,7 @@
 
     Read about transcript types here
         http://vega.sanger.ac.uk/info/about/gene_and_transcript_types.html
-        http://www.ensembl.org/info/genome/variation/predicted_data.html
+        https://www.ensembl.org/info/genome/variation/prediction/predicted_data.html
         https://www.gencodegenes.org/pages/biotypes.html
 
     List of supported biotypes
@@ -137,23 +137,7 @@
 #ifndef GFF_H__
 #define GFF_H__
 
-#include <stdio.h>
-#include <stdlib.h>
-#include <assert.h>
-#include <getopt.h>
-#include <math.h>
-#include <inttypes.h>
-#include <htslib/hts.h>
-#include <htslib/khash.h>
-#include <htslib/khash_str2int.h>
-#include <htslib/kseq.h>
-#include <htslib/faidx.h>
-#include <htslib/bgzf.h>
-#include <errno.h>
-#include <unistd.h>
-#include <ctype.h>
-#include "bcftools.h"
-#include "regidx.h"
+#include <stdint.h>
 
 #ifndef __FUNCTION__
 #  define __FUNCTION__ __func__
@@ -166,6 +150,7 @@
 
 #define STRAND_REV 0
 #define STRAND_FWD 1
+#define STRAND_UNK 2
 
 #define TRIM_NONE   0
 #define TRIM_5PRIME 1
@@ -289,9 +274,9 @@ struct gf_tscript_t_
 {
     uint32_t id;        // transcript id
     uint32_t beg,end;   // transcript's beg and end coordinate (ref strand, 0-based, inclusive)
-    uint32_t strand:1,  // STRAND_REV or STRAND_FWD
+    uint32_t strand:2,  // STRAND_REV,FWD,UNK
              used:1,    // does it have any exons, UTRs, CDS?
-             ncds:30,   // number of exons
+             ncds:29,   // number of exons
              mcds;
     gf_cds_t **cds;     // ordered list of exons
     uint32_t trim:2,    // complete, 5' or 3' trimmed, see TRIM_* types
diff --git a/bcftools/gvcf.c b/bcftools/gvcf.c
index c7b2e77d1..137194a44 100644
--- a/bcftools/gvcf.c
+++ b/bcftools/gvcf.c
@@ -40,7 +40,7 @@ struct _gvcf_t
 void gvcf_update_header(gvcf_t *gvcf, bcf_hdr_t *hdr)
 {
     bcf_hdr_append(hdr,"##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the variant described in this record\">");
-    bcf_hdr_append(hdr,"##INFO=<ID=MinDP,Number=1,Type=Integer,Description=\"Minimum per-sample depth in this gVCF block\">");
+    bcf_hdr_append(hdr,"##INFO=<ID=MIN_DP,Number=1,Type=Integer,Description=\"Minimum per-sample depth in this gVCF block\">");
 }
 
 gvcf_t *gvcf_init(const char *dp_ranges)
@@ -98,7 +98,6 @@ bcf1_t *gvcf_write(gvcf_t *gvcf, htsFile *fh, bcf_hdr_t *hdr, bcf1_t *rec, int i
     // encountered, or other conditions not met (block broken by a non-ref or DP too low).
     int needs_flush = can_collapse ? 0 : 1;
 
-
     // Can the record be included in a gVCF block? That is, is this a ref-only site?
     if ( rec && can_collapse )
     {
@@ -148,7 +147,7 @@ bcf1_t *gvcf_write(gvcf_t *gvcf, htsFile *fh, bcf_hdr_t *hdr, bcf1_t *rec, int i
         bcf_update_alleles_str(hdr, gvcf->line, gvcf->als.s);
         if ( gvcf->start+1 < gvcf->end )    // create gVCF record only if it spans at least two sites
             bcf_update_info_int32(hdr, gvcf->line, "END", &gvcf->end, 1);
-        bcf_update_info_int32(hdr, gvcf->line, "MinDP", &gvcf->min_dp, 1);
+        bcf_update_info_int32(hdr, gvcf->line, "MIN_DP", &gvcf->min_dp, 1);
         if ( gvcf->nqsum>0 )
             bcf_update_info_float(hdr, gvcf->line, "QS", gvcf->qsum, gvcf->nqsum);
         if ( gvcf->ngts )
@@ -220,7 +219,7 @@ bcf1_t *gvcf_write(gvcf_t *gvcf, htsFile *fh, bcf_hdr_t *hdr, bcf1_t *rec, int i
     }
 
     if ( is_ref && min_dp )
-        bcf_update_info_int32(hdr, rec, "MinDP", &min_dp, 1);
+        bcf_update_info_int32(hdr, rec, "MIN_DP", &min_dp, 1);
 
     return rec;
 }
diff --git a/bcftools/gvcf.c.pysam.c b/bcftools/gvcf.c.pysam.c
index bd881f4b3..9db3af99d 100644
--- a/bcftools/gvcf.c.pysam.c
+++ b/bcftools/gvcf.c.pysam.c
@@ -42,7 +42,7 @@ struct _gvcf_t
 void gvcf_update_header(gvcf_t *gvcf, bcf_hdr_t *hdr)
 {
     bcf_hdr_append(hdr,"##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the variant described in this record\">");
-    bcf_hdr_append(hdr,"##INFO=<ID=MinDP,Number=1,Type=Integer,Description=\"Minimum per-sample depth in this gVCF block\">");
+    bcf_hdr_append(hdr,"##INFO=<ID=MIN_DP,Number=1,Type=Integer,Description=\"Minimum per-sample depth in this gVCF block\">");
 }
 
 gvcf_t *gvcf_init(const char *dp_ranges)
@@ -100,7 +100,6 @@ bcf1_t *gvcf_write(gvcf_t *gvcf, htsFile *fh, bcf_hdr_t *hdr, bcf1_t *rec, int i
     // encountered, or other conditions not met (block broken by a non-ref or DP too low).
     int needs_flush = can_collapse ? 0 : 1;
 
-
     // Can the record be included in a gVCF block? That is, is this a ref-only site?
     if ( rec && can_collapse )
     {
@@ -150,7 +149,7 @@ bcf1_t *gvcf_write(gvcf_t *gvcf, htsFile *fh, bcf_hdr_t *hdr, bcf1_t *rec, int i
         bcf_update_alleles_str(hdr, gvcf->line, gvcf->als.s);
         if ( gvcf->start+1 < gvcf->end )    // create gVCF record only if it spans at least two sites
             bcf_update_info_int32(hdr, gvcf->line, "END", &gvcf->end, 1);
-        bcf_update_info_int32(hdr, gvcf->line, "MinDP", &gvcf->min_dp, 1);
+        bcf_update_info_int32(hdr, gvcf->line, "MIN_DP", &gvcf->min_dp, 1);
         if ( gvcf->nqsum>0 )
             bcf_update_info_float(hdr, gvcf->line, "QS", gvcf->qsum, gvcf->nqsum);
         if ( gvcf->ngts )
@@ -222,7 +221,7 @@ bcf1_t *gvcf_write(gvcf_t *gvcf, htsFile *fh, bcf_hdr_t *hdr, bcf1_t *rec, int i
     }
 
     if ( is_ref && min_dp )
-        bcf_update_info_int32(hdr, rec, "MinDP", &min_dp, 1);
+        bcf_update_info_int32(hdr, rec, "MIN_DP", &min_dp, 1);
 
     return rec;
 }
diff --git a/bcftools/main.c b/bcftools/main.c
index a0213589f..14357373e 100644
--- a/bcftools/main.c
+++ b/bcftools/main.c
@@ -265,7 +265,7 @@ int main(int argc, char *argv[])
     if (argc < 2) { usage(stderr); return 1; }
 
     if (strcmp(argv[1], "version") == 0 || strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-v") == 0) {
-        printf("bcftools %s\nUsing htslib %s\nCopyright (C) 2023 Genome Research Ltd.\n", bcftools_version(), hts_version());
+        printf("bcftools %s\nUsing htslib %s\nCopyright (C) 2024 Genome Research Ltd.\n", bcftools_version(), hts_version());
 #if USE_GPL
         printf("License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>\n");
 #else
diff --git a/bcftools/main.c.pysam.c b/bcftools/main.c.pysam.c
index 7608adc8b..56174fa2d 100644
--- a/bcftools/main.c.pysam.c
+++ b/bcftools/main.c.pysam.c
@@ -267,7 +267,7 @@ int bcftools_main(int argc, char *argv[])
     if (argc < 2) { usage(bcftools_stderr); return 1; }
 
     if (strcmp(argv[1], "version") == 0 || strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-v") == 0) {
-        fprintf(bcftools_stdout, "bcftools %s\nUsing htslib %s\nCopyright (C) 2023 Genome Research Ltd.\n", bcftools_version(), hts_version());
+        fprintf(bcftools_stdout, "bcftools %s\nUsing htslib %s\nCopyright (C) 2024 Genome Research Ltd.\n", bcftools_version(), hts_version());
 #if USE_GPL
         fprintf(bcftools_stdout, "License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>\n");
 #else
diff --git a/bcftools/mcall.c b/bcftools/mcall.c
index 804ff0131..13383787e 100644
--- a/bcftools/mcall.c
+++ b/bcftools/mcall.c
@@ -444,7 +444,7 @@ void mcall_destroy(call_t *call)
 // qual calculation is not affected.
 // Missing values are replaced by generic likelihoods when X (unseen allele) is
 // present.
-// NB: While the -m callig model uses the pdgs in canonical order,
+// NB: While the -m calling model uses the pdgs in canonical order,
 // the original samtools -c calling code uses pdgs in reverse order (AA comes
 // first, RR last).
 // NB: Ploidy is not taken into account here, which is incorrect.
@@ -1495,7 +1495,7 @@ int mcall(call_t *call, bcf1_t *rec)
     // If available, take into account reference panel AFs
     if ( call->prior_AN && bcf_get_info_int32(call->hdr, rec, call->prior_AN ,&call->ac, &call->nac)==1 )
     {
-        int an = call->ac[0];   // number of alleles total, procede only if not zero; reuse call->ac
+        int an = call->ac[0];   // number of alleles total, proceed only if not zero; reuse call->ac
         if ( an > 0 && bcf_get_info_int32(call->hdr, rec, call->prior_AC ,&call->ac, &call->nac)==nals_ori-1 )    // number of ALT alleles
         {
             int ac0 = an;       // this will become the number of REFs
@@ -1558,6 +1558,11 @@ int mcall(call_t *call, bcf1_t *rec)
     call->nals_new = 0;
     for (i=0; i<nals_ori; i++)
     {
+        if ( (call->flag&CALL_KEEP_UNSEEN) && i==unseen && call->nals_new==1 )
+        {
+            call->nals_new++;
+            call->als_new |= 1<<i;
+        }
         if ( i>0 && i==unseen ) continue;
         if ( call->flag & CALL_KEEPALT ) call->als_new |= 1<<i;
         if ( call->als_new & (1<<i) ) call->nals_new++;
@@ -1669,6 +1674,6 @@ int mcall(call_t *call, bcf1_t *rec)
 
     bcf_update_info_int32(call->hdr, rec, "I16", NULL, 0);     // remove I16 tag
 
-    return call->nals_new;
+    return is_variant ? call->nals_new : 1;
 }
 
diff --git a/bcftools/mcall.c.pysam.c b/bcftools/mcall.c.pysam.c
index bf3806f5f..345d11037 100644
--- a/bcftools/mcall.c.pysam.c
+++ b/bcftools/mcall.c.pysam.c
@@ -446,7 +446,7 @@ void mcall_destroy(call_t *call)
 // qual calculation is not affected.
 // Missing values are replaced by generic likelihoods when X (unseen allele) is
 // present.
-// NB: While the -m callig model uses the pdgs in canonical order,
+// NB: While the -m calling model uses the pdgs in canonical order,
 // the original samtools -c calling code uses pdgs in reverse order (AA comes
 // first, RR last).
 // NB: Ploidy is not taken into account here, which is incorrect.
@@ -1497,7 +1497,7 @@ int mcall(call_t *call, bcf1_t *rec)
     // If available, take into account reference panel AFs
     if ( call->prior_AN && bcf_get_info_int32(call->hdr, rec, call->prior_AN ,&call->ac, &call->nac)==1 )
     {
-        int an = call->ac[0];   // number of alleles total, procede only if not zero; reuse call->ac
+        int an = call->ac[0];   // number of alleles total, proceed only if not zero; reuse call->ac
         if ( an > 0 && bcf_get_info_int32(call->hdr, rec, call->prior_AC ,&call->ac, &call->nac)==nals_ori-1 )    // number of ALT alleles
         {
             int ac0 = an;       // this will become the number of REFs
@@ -1560,6 +1560,11 @@ int mcall(call_t *call, bcf1_t *rec)
     call->nals_new = 0;
     for (i=0; i<nals_ori; i++)
     {
+        if ( (call->flag&CALL_KEEP_UNSEEN) && i==unseen && call->nals_new==1 )
+        {
+            call->nals_new++;
+            call->als_new |= 1<<i;
+        }
         if ( i>0 && i==unseen ) continue;
         if ( call->flag & CALL_KEEPALT ) call->als_new |= 1<<i;
         if ( call->als_new & (1<<i) ) call->nals_new++;
@@ -1671,6 +1676,6 @@ int mcall(call_t *call, bcf1_t *rec)
 
     bcf_update_info_int32(call->hdr, rec, "I16", NULL, 0);     // remove I16 tag
 
-    return call->nals_new;
+    return is_variant ? call->nals_new : 1;
 }
 
diff --git a/bcftools/mpileup.c b/bcftools/mpileup.c
index d42a6a360..943e0f6f6 100644
--- a/bcftools/mpileup.c
+++ b/bcftools/mpileup.c
@@ -1,6 +1,6 @@
 /*  mpileup.c -- mpileup subcommand. Previously bam_plcmd.c from samtools
 
-    Copyright (C) 2008-2023 Genome Research Ltd.
+    Copyright (C) 2008-2024 Genome Research Ltd.
     Portions copyright (C) 2009-2012 Broad Institute.
 
     Author: Heng Li <lh3@sanger.ac.uk>
@@ -72,8 +72,11 @@ typedef struct {
     uint32_t fmt_flag;
     int rflag_skip_any_unset, rflag_skip_all_unset, rflag_skip_any_set, rflag_skip_all_set, output_type;
     int openQ, extQ, tandemQ, min_support, indel_win_size; // for indels
+    int seqQ_offset;
     double min_frac; // for indels
-    double indel_bias;
+    double indel_bias, poly_mqual;
+    double del_bias; // compensate for diff deletion vs insertion error rates
+    double vs_ref;
     char *reg_fname, *pl_list, *fai_fname, *output_fname;
     int reg_is_file, record_cmd_line, n_threads, clevel;
     faidx_t *fai;
@@ -99,6 +102,7 @@ typedef struct {
     htsFile *bcf_fp;
     bcf_hdr_t *bcf_hdr;
     int indels_v20;
+    int edlib;
     int argc;
     char **argv;
     int write_index;
@@ -584,12 +588,14 @@ static int mpileup_reg(mplp_conf_t *conf, uint32_t beg, uint32_t end)
 
         // call indels; todo: subsampling with total_depth>max_indel_depth instead of ignoring?
         // check me: rghash in bcf_call_gap_prep() should have no effect, reads mplp_func already excludes them
-        if ( !(conf->flag&MPLP_NO_INDEL) && total_depth < conf->max_indel_depth )
+        if (!(conf->flag&MPLP_NO_INDEL) && total_depth < conf->max_indel_depth)
         {
             bcf_callaux_clean(conf->bca, &conf->bc);
             conf->bca->chr = tid>=0 ? hdr->target_name[tid] : NULL;
             int iret;
-            if ( conf->indels_v20 )
+            if (conf->edlib)
+                iret = bcf_edlib_gap_prep(conf->gplp->n, conf->gplp->n_plp, conf->gplp->plp, pos, conf->bca, ref, ref_len);
+            else if ( conf->indels_v20 )
                 iret = bcf_iaux_gap_prep(conf->gplp->n, conf->gplp->n_plp, conf->gplp->plp, pos, conf->bca, ref);
             else
                 iret = bcf_call_gap_prep(conf->gplp->n, conf->gplp->n_plp, conf->gplp->plp, pos, conf->bca, ref);
@@ -606,7 +612,7 @@ static int mpileup_reg(mplp_conf_t *conf, uint32_t beg, uint32_t end)
             }
         }
     }
-    return 0;
+    return ret;
 }
 
 static int mpileup(mplp_conf_t *conf)
@@ -646,8 +652,12 @@ static int mpileup(mplp_conf_t *conf)
             }
         }
         nregs = regidx_nregs(conf->reg);
-        conf->reg_itr = regitr_init(conf->reg);
-        regitr_loop(conf->reg_itr);   // region iterator now positioned at the first region
+        if ( nregs )
+        {
+            // the regions list can be empty, see #2250
+            conf->reg_itr = regitr_init(conf->reg);
+            regitr_loop(conf->reg_itr);   // region iterator now positioned at the first region
+        }
     }
 
     // read the header of each file in the list and initialize data
@@ -693,7 +703,7 @@ static int mpileup(mplp_conf_t *conf)
             i--;
             continue;
         }
-        if (conf->reg) {
+        if (conf->reg && nregs) {
             hts_idx_t *idx = sam_index_load(conf->mplp_data[i]->fp, conf->files[i]);
             if (idx == NULL) {
                 fprintf(stderr, "[%s] fail to load index for %s\n", __func__, conf->files[i]);
@@ -857,13 +867,16 @@ static int mpileup(mplp_conf_t *conf)
     for (i=0; i<nsmpl; i++)
         bcf_hdr_add_sample(conf->bcf_hdr, smpl[i]);
     if ( bcf_hdr_write(conf->bcf_fp, conf->bcf_hdr)!=0 ) error("[%s] Error: failed to write the header to %s\n",__func__,conf->output_fname?conf->output_fname:"standard output");
-    if ( conf->write_index && init_index(conf->bcf_fp,conf->bcf_hdr,conf->output_fname,&conf->index_fn)<0 ) error("Error: failed to initialise index for %s\n",conf->output_fname);
+    if ( init_index2(conf->bcf_fp,conf->bcf_hdr,conf->output_fname,
+                     &conf->index_fn, conf->write_index) < 0 )
+        error("Error: failed to initialise index for %s\n",conf->output_fname);
 
     conf->bca = bcf_call_init(-1., conf->min_baseQ, conf->max_baseQ,
                               conf->delta_baseQ);
     conf->bcr = (bcf_callret1_t*) calloc(nsmpl, sizeof(bcf_callret1_t));
     conf->bca->openQ = conf->openQ, conf->bca->extQ = conf->extQ, conf->bca->tandemQ = conf->tandemQ;
     conf->bca->indel_bias = conf->indel_bias;
+    conf->bca->del_bias = conf->del_bias;
     conf->bca->min_frac = conf->min_frac;
     conf->bca->min_support = conf->min_support;
     conf->bca->per_sample_flt = conf->flag & MPLP_PER_SAMPLE;
@@ -871,6 +884,10 @@ static int mpileup(mplp_conf_t *conf)
     conf->bca->ambig_reads = conf->ambig_reads;
     conf->bca->indel_win_size = conf->indel_win_size;
     conf->bca->indels_v20 = conf->indels_v20;
+    conf->bca->edlib = conf->edlib;
+    conf->bca->seqQ_offset = conf->seqQ_offset;
+    conf->bca->poly_mqual = conf->poly_mqual;
+    conf->bca->vs_ref = conf->vs_ref;
 
     conf->bc.bcf_hdr = conf->bcf_hdr;
     conf->bc.n  = nsmpl;
@@ -925,6 +942,7 @@ static int mpileup(mplp_conf_t *conf)
 
 
     // Run mpileup for multiple regions
+    int ret = 0;
     if ( nregs )
     {
         int ireg = 0;
@@ -953,12 +971,18 @@ static int mpileup(mplp_conf_t *conf)
                     bam_mplp_reset(conf->iter);
                 }
             }
-            mpileup_reg(conf,conf->reg_itr->beg,conf->reg_itr->end);
+            ret = mpileup_reg(conf,conf->reg_itr->beg,conf->reg_itr->end);
+            if ( ret<0 ) break;
         }
         while ( regitr_loop(conf->reg_itr) );
     }
-    else
-        mpileup_reg(conf,0,UINT32_MAX);
+    else if ( !conf->reg )
+        ret = mpileup_reg(conf,0,UINT32_MAX);
+    if ( ret<0 )
+    {
+        fprintf(stderr, "[%s] failed to read from input file\n", __func__);
+        exit(EXIT_FAILURE);
+    }
 
     flush_bcf_records(conf, conf->bcf_fp, conf->bcf_hdr, NULL);
 
@@ -1150,7 +1174,7 @@ static void list_annotations(FILE *fp)
         "\n"
         "FORMAT annotation tags available (\"FORMAT/\" prefix is optional):\n"
         "\n"
-        "  FORMAT/AD   .. Allelic depth (Number=R,Type=Integer)\n"
+        "* FORMAT/AD   .. Allelic depth (Number=R,Type=Integer)\n"
         "  FORMAT/ADF  .. Allelic depths on the forward strand (Number=R,Type=Integer)\n"
         "  FORMAT/ADR  .. Allelic depths on the reverse strand (Number=R,Type=Integer)\n"
         "  FORMAT/DP   .. Number of high-quality bases (Number=1,Type=Integer)\n"
@@ -1200,7 +1224,7 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp)
         "\n"
         "Input options:\n"
         "  -6, --illumina1.3+      Quality is in the Illumina-1.3+ encoding\n"
-        "  -A, --count-orphans     Do not discard anomalous read pairs\n"
+        "  -A, --count-orphans     Include anomalous read pairs, with flag PAIRED but not PROPER_PAIR set\n"
         "  -b, --bam-list FILE     List of input BAM filenames, one per line\n"
         "  -B, --no-BAQ            Disable BAQ (per-Base Alignment Quality)\n"
         "  -C, --adjust-MQ INT     Adjust mapping quality [0]\n"
@@ -1245,10 +1269,10 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp)
         "  -O, --output-type TYPE  'b' compressed BCF; 'u' uncompressed BCF;\n"
         "                          'z' compressed VCF; 'v' uncompressed VCF; 0-9 compression level [v]\n"
         "      --threads INT       Use multithreading with INT worker threads [0]\n"
-        "      --write-index       Automatically index the output files [off]\n"
+        "  -W, --write-index[=FMT] Automatically index the output files [off]\n"
         "\n"
         "SNP/INDEL genotype likelihoods options:\n"
-        "  -X, --config STR        Specify platform specific profiles (see below)\n"
+        "  -X, --config STR        Specify platform profile (use \"-X list\" for details)\n"
         "  -e, --ext-prob INT      Phred-scaled gap extension seq error probability [%d]\n", mplp->extQ);
     fprintf(fp,
         "  -F, --gap-frac FLOAT    Minimum fraction of gapped reads [%g]\n", mplp->min_frac);
@@ -1269,24 +1293,26 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp)
         "  --ar, --ambig-reads STR   What to do with ambiguous indel reads: drop,incAD,incAD0 [drop]\n");
     fprintf(fp,
         "      --indel-bias FLOAT  Raise to favour recall over precision [%.2f]\n", mplp->indel_bias);
+    fprintf(fp,
+        "      --del-bias FLOAT    Relative likelihood of insertion to deletion [%.2f]\n", mplp->del_bias);
+    fprintf(fp,
+        "      --score-vs-ref FLOAT\n"
+        "                          Ratio of score vs ref (1) or 2nd-best allele (0) [%.2f]\n", mplp->vs_ref);
     fprintf(fp,
         "      --indel-size INT    Approximate maximum indel size considered [%d]\n", mplp->indel_win_size);
     fprintf(fp,
-        "      --indels-2.0        New EXPERIMENTAL indel calling model (diploid reference consensus)\n");
+        "      --indels-2.0        New EXPERIMENTAL indel calling model (diploid reference consensus)\n"
+        "      --indels-cns        New EXPERIMENTAL indel calling model with edlib\n"
+        "      --seqq-offset       Indel-cns tuning for indel seq-qual scores [120]\n"
+        "      --no-indels-cns     Disable CNS mode, to use after a -X profile\n"
+        "      --poly-mqual        (Edlib mode) Use minimum quality within homopolymers\n");
     fprintf(fp,"\n");
     fprintf(fp,
-        "Configuration profiles activated with -X, --config:\n"
-        "    1.12:        -Q13 -h100 -m1 -F0.002\n"
-        "    illumina:    [ default values ]\n"
-        "    ont:         -B -Q5 --max-BQ 30 -I [also try eg |bcftools call -P0.01]\n"
-        "    pacbio-ccs:  -D -Q5 --max-BQ 50 -F0.1 -o25 -e1 --delta-BQ 10 -M99999\n"
-        "\n"
-        "Notes: Assuming diploid individuals.\n"
-        "\n"
-        "Example:\n"
-        "   # See also http://samtools.github.io/bcftools/howtos/variant-calling.html\n"
-        "   bcftools mpileup -Ou -f reference.fa alignments.bam | bcftools call -mv -Ob -o calls.bcf\n"
-        "\n");
+            "Notes: Assuming diploid individuals.\n\n"
+            "Example:\n"
+            "   # See also http://samtools.github.io/bcftools/howtos/variant-calling.html\n"
+            "   bcftools mpileup -Ou -f reference.fa alignments.bam | bcftools call -mv -Ob -o calls.bcf\n"
+            "\n");
 
     free(tmp_skip_all_set);
     free(tmp_skip_any_unset);
@@ -1294,9 +1320,41 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp)
     free(tmp_skip_any_set);
 }
 
+static void print_profiles(void) {
+    printf(
+"Configuration profiles activated with -X, --config:\n\n"
+"1.12\n"
+"    -Q13 -h100 -m1 -F0.002\n\n"
+"bgi, bgi-1.20\n"
+"    --indels-cns -B --indel-size 80 -F0.1 --indel-bias 0.9 --seqq-offset 120\n\n"
+"illumina-1.18\n"
+"    --indel-size 110\n\n"
+"illumina\n"
+"illumina-1.20\n"
+"    --indels-cns --indel-size 110\n\n"
+"ont\n"
+"    -B -Q5 --max-BQ 30 -I\n\n"
+"ont-sup, ont-sup-1.20\n"
+"    --indels-cns -B -Q1 --max-BQ 35 -F0.2 -o15 -e1 -h110 --delta-BQ 99\\\n"
+"    --del-bias 0.4 --indel-bias 0.7 --poly-mqual --seqq-offset 130\\\n"
+"    --indel-size 80\n\n"
+"pacbio-ccs-1.18\n"
+"    -D -Q5 --max-BQ 50 -F0.1 -o25 -e1 --delta-BQ 10 \\\n"
+"    -M99999 --indel-size 110\n\n"
+"pacbio-ccs, pacbio-ccs-1.20\n"
+"    --indels-cns -B -Q5 --max-BQ 50 -F0.1 -o25 -e1 -h300 --delta-BQ 10 \\\n"
+"    --del-bias 0.4 --poly-mqual --indel-bias 0.9 --seqq-offset 118\\\n"
+"    --indel-size 80 --score-vs-ref 0.7\n\n"
+"ultima, ultima-1.20\n"
+"    --indels-cns -B -Q1 --max-BQ 30 -F0.15 -o20 -e10 -h250 --delta-BQ 10 \\\n"
+"    --del-bias 0.3 --indel-bias 0.7 --poly-mqual --seqq-offset 140 \\\n"
+"    --indel-size 80 --score-vs-ref 0.3\n\n"
+"\n");
+}
+
 int main_mpileup(int argc, char *argv[])
 {
-    int c;
+    int c, i, ret = 1;
     const char *file_list = NULL;
     char **fn = NULL;
     int nfiles = 0, use_orphan = 0, noref = 0;
@@ -1309,6 +1367,7 @@ int main_mpileup(int argc, char *argv[])
     mplp.max_depth = 250; mplp.max_indel_depth = 250;
     mplp.openQ = 40; mplp.extQ = 20; mplp.tandemQ = 500;
     mplp.min_frac = 0.05; mplp.indel_bias = 1.0; mplp.min_support = 2;
+    mplp.vs_ref = 0;
     mplp.flag = MPLP_NO_ORPHAN | MPLP_REALN | MPLP_REALN_PARTIAL
               | MPLP_SMART_OVERLAPS;
     mplp.argc = argc; mplp.argv = argv;
@@ -1319,11 +1378,14 @@ int main_mpileup(int argc, char *argv[])
     mplp.n_threads = 0;
     mplp.bsmpl = bam_smpl_init();
     // the default to be changed in future, see also parse_format_flag()
-    mplp.fmt_flag = B2B_INFO_BQBZ|B2B_INFO_IDV|B2B_INFO_IMF|B2B_INFO_MQ0F|B2B_INFO_MQBZ|B2B_INFO_MQSBZ|B2B_INFO_RPBZ|B2B_INFO_SCBZ|B2B_INFO_SGB|B2B_INFO_VDB;
+    mplp.fmt_flag = B2B_INFO_BQBZ|B2B_INFO_IDV|B2B_INFO_IMF|B2B_INFO_MQ0F|B2B_INFO_MQBZ|B2B_INFO_MQSBZ|B2B_INFO_RPBZ|B2B_INFO_SCBZ|B2B_INFO_SGB|B2B_INFO_VDB|B2B_FMT_AD;
     mplp.max_read_len = 500;
     mplp.ambig_reads = B2B_DROP;
     mplp.indel_win_size = 110;
+    mplp.poly_mqual = 0;
+    mplp.seqQ_offset = 120;
     mplp.clevel = -1;
+    mplp.del_bias = 0; // even insertion and deletion likelhoods.
     hts_srand48(0);
 
     static const struct option lopts[] =
@@ -1382,6 +1444,8 @@ int main_mpileup(int argc, char *argv[])
         {"indel-bias", required_argument, NULL, 10},
         {"indel-size", required_argument, NULL, 15},
         {"indels-2.0", no_argument, NULL, 20},
+        {"indels-cns", no_argument, NULL, 22},
+        {"no-indels-cns", no_argument, NULL, 25},
         {"tandem-qual", required_argument, NULL, 'h'},
         {"skip-indels", no_argument, NULL, 'I'},
         {"max-idepth", required_argument, NULL, 'L'},
@@ -1394,27 +1458,44 @@ int main_mpileup(int argc, char *argv[])
         {"seed", required_argument, NULL, 13},
         {"ambig-reads", required_argument, NULL, 14},
         {"ar", required_argument, NULL, 14},
-        {"write-index",no_argument,NULL,21},
+        {"write-index",optional_argument,NULL,'W'},
+        {"del-bias", required_argument, NULL, 23},
+        {"poly-mqual", no_argument, NULL, 24},
+        {"no-poly-mqual", no_argument, NULL, 26},
+        {"score-vs-ref",required_argument, NULL, 27},
+        {"seqq-offset", required_argument, NULL, 28},
         {NULL, 0, NULL, 0}
     };
-    while ((c = getopt_long(argc, argv, "Ag:f:r:R:q:Q:C:BDd:L:b:P:po:e:h:Im:F:EG:6O:xa:s:S:t:T:M:X:U",lopts,NULL)) >= 0) {
+    while ((c = getopt_long(argc, argv, "Ag:f:r:R:q:Q:C:BDd:L:b:P:po:e:h:Im:F:EG:6O:xa:s:S:t:T:M:X:UW::",lopts,NULL)) >= 0) {
         switch (c) {
         case 'x': mplp.flag &= ~MPLP_SMART_OVERLAPS; break;
         case  16 :
             mplp.rflag_skip_any_unset = bam_str2flag(optarg);
-            if ( mplp.rflag_skip_any_unset <0 ) { fprintf(stderr,"Could not parse --nf %s\n", optarg); return 1; }
+            if ( mplp.rflag_skip_any_unset <0 ) {
+                fprintf(stderr,"Could not parse --nf %s\n", optarg);
+                goto err;
+            }
             break;
         case  17 :
             mplp.rflag_skip_all_unset = bam_str2flag(optarg);
-            if ( mplp.rflag_skip_all_unset<0 ) { fprintf(stderr,"Could not parse --if %s\n", optarg); return 1; }
+            if ( mplp.rflag_skip_all_unset<0 ) {
+                fprintf(stderr,"Could not parse --if %s\n", optarg);
+                goto err;
+            }
             break;
         case  18 :
             mplp.rflag_skip_any_set = bam_str2flag(optarg);
-            if ( mplp.rflag_skip_any_set <0 ) { fprintf(stderr,"Could not parse --ef %s\n", optarg); return 1; }
+            if ( mplp.rflag_skip_any_set <0 ) {
+                fprintf(stderr,"Could not parse --ef %s\n", optarg);
+                goto err;
+            }
             break;
         case  19 :
             mplp.rflag_skip_all_set = bam_str2flag(optarg);
-            if ( mplp.rflag_skip_all_set <0 ) { fprintf(stderr,"Could not parse --df %s\n", optarg); return 1; }
+            if ( mplp.rflag_skip_all_set <0 ) {
+                fprintf(stderr,"Could not parse --df %s\n", optarg);
+                goto err;
+            }
             break;
         case  3 : mplp.output_fname = optarg; break;
         case  4 : mplp.openQ = atoi(optarg); break;
@@ -1425,7 +1506,8 @@ int main_mpileup(int argc, char *argv[])
             break;
         case 'f':
             mplp.fai = fai_load(optarg);
-            if (mplp.fai == NULL) return 1;
+            if (mplp.fai == NULL)
+                goto err;
             mplp.fai_fname = optarg;
             break;
         case  7 : noref = 1; break;
@@ -1452,7 +1534,10 @@ int main_mpileup(int argc, char *argv[])
                   if ( optarg[0]=='^' ) optarg++;
                   else mplp.bed_logic = 1;
                   mplp.bed = regidx_init(optarg,NULL,NULL,0,NULL);
-                  if (!mplp.bed) { fprintf(stderr, "bcftools mpileup: Could not read file \"%s\"", optarg); return 1; }
+                  if (!mplp.bed) {
+                      fprintf(stderr, "bcftools mpileup: Could not read file \"%s\"", optarg);
+                      goto err;
+                  }
                   break;
         case 'P': mplp.pl_list = strdup(optarg); break;
         case 'p': mplp.flag |= MPLP_PER_SAMPLE; break;
@@ -1505,19 +1590,39 @@ int main_mpileup(int argc, char *argv[])
             else
                 mplp.indel_bias = 1/atof(optarg);
             break;
+        case 27:
+            mplp.vs_ref = atof(optarg);
+            //if (mplp.vs_ref < 0) mplp.vs_ref = 0;
+            if (mplp.vs_ref > 1) mplp.vs_ref = 1;
+            break;
         case  15: {
                 char *tmp;
                 mplp.indel_win_size = strtol(optarg,&tmp,10);
                 if ( *tmp ) error("Could not parse argument: --indel-size %s\n", optarg);
-                if ( mplp.indel_win_size < 110 )
+                if ( mplp.indel_win_size < 20 )
                 {
-                    mplp.indel_win_size = 110;
+                    mplp.indel_win_size = 20;
                     fprintf(stderr,"Warning: running with --indel-size %d, the requested value is too small\n",mplp.indel_win_size);
                 }
             }
             break;
-        case  20: mplp.indels_v20 = 1; break;
-        case  21: mplp.write_index = 1; break;
+        case  20: mplp.indels_v20 = 1; mplp.edlib = 0; break;
+        case 'W':
+            if (!(mplp.write_index = write_index_parse(optarg)))
+                error("Unsupported index format '%s'\n", optarg);
+            break;
+        case  22: mplp.edlib = 1; mplp.indels_v20 = 0; break;
+        case  25: mplp.edlib = 0; break;
+        case  28:
+            mplp.seqQ_offset = atoi(optarg);
+            if (mplp.seqQ_offset < 100)
+                mplp.seqQ_offset = 100;
+            if (mplp.seqQ_offset > 200)
+                mplp.seqQ_offset = 200;
+            break;
+        case  23: mplp.del_bias = atof(optarg); break;
+        case  24: mplp.poly_mqual = 1; break;
+        case  26: mplp.poly_mqual = 0; break;
         case 'A': use_orphan = 1; break;
         case 'F': mplp.min_frac = atof(optarg); break;
         case 'm': mplp.min_support = atoi(optarg); break;
@@ -1526,13 +1631,13 @@ int main_mpileup(int argc, char *argv[])
         case 'a':
             if (optarg[0]=='?') {
                 list_annotations(stderr);
-                return 1;
+                goto err;
             }
             parse_format_flag(&mplp.fmt_flag,optarg);
         break;
         case 'M': mplp.max_read_len = atoi(optarg); break;
         case 'X':
-            if (strcasecmp(optarg, "pacbio-ccs") == 0) {
+            if (strcasecmp(optarg, "pacbio-ccs-1.18") == 0) {
                 mplp.min_frac = 0.1;
                 mplp.min_baseQ = 5;
                 mplp.max_baseQ = 50;
@@ -1541,13 +1646,70 @@ int main_mpileup(int argc, char *argv[])
                 mplp.extQ = 1;
                 mplp.flag |= MPLP_REALN_PARTIAL;
                 mplp.max_read_len = 99999;
+
+            } else if (strcasecmp(optarg, "pacbio-ccs") == 0 ||
+                strcasecmp(optarg, "pacbio-ccs-1.20") == 0) {
+                mplp.min_frac = 0.1;
+                mplp.min_baseQ = 5;
+                mplp.max_baseQ = 50;
+                mplp.delta_baseQ = 10;
+                mplp.tandemQ = 300;
+                mplp.openQ = 25;
+                mplp.extQ = 1;
+                mplp.flag &= ~MPLP_REALN;
+                mplp.del_bias = 0.4;
+                mplp.indel_bias = 1/.9;
+                mplp.seqQ_offset = 118;
+                mplp.poly_mqual = 1;
+                mplp.edlib = 1;
+                mplp.vs_ref = 0.7;
+                mplp.indel_win_size = 80;
+
             } else if (strcasecmp(optarg, "ont") == 0) {
-                fprintf(stderr, "For ONT it may be beneficial to also run bcftools call with "
+                fprintf(stderr, "With old ONT data may be beneficial to also run bcftools call with "
                         "a higher -P, eg -P0.01 or -P 0.1\n");
                 mplp.min_baseQ = 5;
                 mplp.max_baseQ = 30;
                 mplp.flag &= ~MPLP_REALN;
                 mplp.flag |= MPLP_NO_INDEL;
+
+            } else if (strcasecmp(optarg, "ont-sup") == 0 ||
+                       strcasecmp(optarg, "ont-sup-1.20") == 0) {
+                mplp.min_frac = 0.2;
+                mplp.min_baseQ = 1;
+                mplp.max_baseQ = 35;
+                mplp.delta_baseQ = 99;
+                mplp.openQ = 15;
+                mplp.extQ = 1;
+                mplp.flag &= ~MPLP_REALN;
+                mplp.max_read_len = 9999999;
+                mplp.del_bias = 0.4;
+                mplp.poly_mqual = 1;
+                mplp.edlib = 1;
+                // If we increase -h then we can increase bias denominator too
+                mplp.tandemQ = 110;
+                mplp.indel_bias = 1/0.7;
+                mplp.seqQ_offset = 130;
+                mplp.indel_win_size = 80;
+
+            } else if (strcasecmp(optarg, "ultima") == 0 ||
+                       strcasecmp(optarg, "ultima-1.20") == 0) {
+                mplp.min_frac = 0.15;
+                mplp.min_baseQ = 1;
+                mplp.max_baseQ = 30;
+                mplp.delta_baseQ = 10;
+                mplp.openQ = 20;
+                mplp.extQ = 10;
+                mplp.tandemQ = 250;
+                mplp.flag &= ~MPLP_REALN;
+                mplp.del_bias = 0.3;
+                mplp.poly_mqual = 1;
+                mplp.edlib = 1;
+                mplp.indel_bias = 1/0.7;
+                mplp.seqQ_offset = 140;
+                mplp.vs_ref = 0.3;
+                mplp.indel_win_size = 80;
+
             } else if (strcasecmp(optarg, "1.12") == 0) {
                 // 1.12 and earlier
                 mplp.min_frac = 0.002;
@@ -1556,13 +1718,38 @@ int main_mpileup(int argc, char *argv[])
                 mplp.tandemQ = 100;
                 mplp.flag &= ~MPLP_REALN_PARTIAL;
                 mplp.flag |= MPLP_REALN;
-            } else if (strcasecmp(optarg, "illumina") == 0) {
+
+            } else if (strcasecmp(optarg, "illumina-1.18") == 0) {
+                mplp.indel_win_size = 110;
+                mplp.flag |= MPLP_REALN_PARTIAL;
+
+            } else if (strcasecmp(optarg, "illumina") == 0 ||
+                       strcasecmp(optarg, "illumina-1.20") == 0) {
+                mplp.edlib = 1;
+                mplp.indel_win_size = 110;
                 mplp.flag |= MPLP_REALN_PARTIAL;
+                mplp.indel_bias = 1;
+                mplp.seqQ_offset = 125;
+                //mplp.indel_win_size = 80; TEST?
+
+            } else if (strcasecmp(optarg, "bgi") == 0 ||
+                       strcasecmp(optarg, "bgi-1.20") == 0) {
+                mplp.min_frac = 0.1;
+                mplp.edlib = 1;
+                mplp.indel_bias = 1;
+                mplp.seqQ_offset = 120;
+                mplp.flag |= MPLP_REALN_PARTIAL;
+                mplp.indel_win_size = 80;
+
+            } else if (strcasecmp(optarg, "list") == 0 ||
+                       strcasecmp(optarg, "help") == 0) {
+                print_profiles();
+                goto err;
             } else {
                 fprintf(stderr, "Unknown configuration name '%s'\n"
-                        "Please choose from 1.12, illumina, pacbio-ccs or ont\n",
+                        "Please use '-X list' to show available choices.\n",
                         optarg);
-                return 1;
+                goto err;
             }
             break;
         case 13: hts_srand48(atoi(optarg)); break;
@@ -1574,7 +1761,7 @@ int main_mpileup(int argc, char *argv[])
             break;
         default:
             fprintf(stderr,"Invalid option: '%c'\n", c);
-            return 1;
+            goto err;
         }
     }
 
@@ -1599,22 +1786,23 @@ int main_mpileup(int argc, char *argv[])
     if ( !(mplp.flag&MPLP_REALN) && mplp.flag&MPLP_REDO_BAQ )
     {
         fprintf(stderr,"Error: The -B option cannot be combined with -E\n");
-        return 1;
+        goto err;
     }
     if (use_orphan) mplp.flag &= ~MPLP_NO_ORPHAN;
     if (argc == 1)
     {
         print_usage(stderr, &mplp);
-        return 1;
+        goto err;
     }
     if (!mplp.fai && !noref) {
         fprintf(stderr,"Error: mpileup requires the --fasta-ref option by default; use --no-reference to run without a fasta reference\n");
-        return 1;
+        goto err;
     }
-    int ret,i;
+
     if (file_list)
     {
-        if ( read_file_list(file_list,&nfiles,&fn) ) return 1;
+        if ( read_file_list(file_list,&nfiles,&fn) )
+            goto err;
         mplp.files  = fn;
         mplp.nfiles = nfiles;
     }
@@ -1633,6 +1821,8 @@ int main_mpileup(int argc, char *argv[])
     if (mplp.bed) regidx_destroy(mplp.bed);
     if (mplp.bed_itr) regitr_destroy(mplp.bed_itr);
     if (mplp.reg) regidx_destroy(mplp.reg);
+
+ err:
     bam_smpl_destroy(mplp.bsmpl);
 
     return ret;
diff --git a/bcftools/mpileup.c.pysam.c b/bcftools/mpileup.c.pysam.c
index 81c5849c5..4458b60f3 100644
--- a/bcftools/mpileup.c.pysam.c
+++ b/bcftools/mpileup.c.pysam.c
@@ -2,7 +2,7 @@
 
 /*  mpileup.c -- mpileup subcommand. Previously bam_plcmd.c from samtools
 
-    Copyright (C) 2008-2023 Genome Research Ltd.
+    Copyright (C) 2008-2024 Genome Research Ltd.
     Portions copyright (C) 2009-2012 Broad Institute.
 
     Author: Heng Li <lh3@sanger.ac.uk>
@@ -74,8 +74,11 @@ typedef struct {
     uint32_t fmt_flag;
     int rflag_skip_any_unset, rflag_skip_all_unset, rflag_skip_any_set, rflag_skip_all_set, output_type;
     int openQ, extQ, tandemQ, min_support, indel_win_size; // for indels
+    int seqQ_offset;
     double min_frac; // for indels
-    double indel_bias;
+    double indel_bias, poly_mqual;
+    double del_bias; // compensate for diff deletion vs insertion error rates
+    double vs_ref;
     char *reg_fname, *pl_list, *fai_fname, *output_fname;
     int reg_is_file, record_cmd_line, n_threads, clevel;
     faidx_t *fai;
@@ -101,6 +104,7 @@ typedef struct {
     htsFile *bcf_fp;
     bcf_hdr_t *bcf_hdr;
     int indels_v20;
+    int edlib;
     int argc;
     char **argv;
     int write_index;
@@ -586,12 +590,14 @@ static int mpileup_reg(mplp_conf_t *conf, uint32_t beg, uint32_t end)
 
         // call indels; todo: subsampling with total_depth>max_indel_depth instead of ignoring?
         // check me: rghash in bcf_call_gap_prep() should have no effect, reads mplp_func already excludes them
-        if ( !(conf->flag&MPLP_NO_INDEL) && total_depth < conf->max_indel_depth )
+        if (!(conf->flag&MPLP_NO_INDEL) && total_depth < conf->max_indel_depth)
         {
             bcf_callaux_clean(conf->bca, &conf->bc);
             conf->bca->chr = tid>=0 ? hdr->target_name[tid] : NULL;
             int iret;
-            if ( conf->indels_v20 )
+            if (conf->edlib)
+                iret = bcf_edlib_gap_prep(conf->gplp->n, conf->gplp->n_plp, conf->gplp->plp, pos, conf->bca, ref, ref_len);
+            else if ( conf->indels_v20 )
                 iret = bcf_iaux_gap_prep(conf->gplp->n, conf->gplp->n_plp, conf->gplp->plp, pos, conf->bca, ref);
             else
                 iret = bcf_call_gap_prep(conf->gplp->n, conf->gplp->n_plp, conf->gplp->plp, pos, conf->bca, ref);
@@ -608,7 +614,7 @@ static int mpileup_reg(mplp_conf_t *conf, uint32_t beg, uint32_t end)
             }
         }
     }
-    return 0;
+    return ret;
 }
 
 static int mpileup(mplp_conf_t *conf)
@@ -648,8 +654,12 @@ static int mpileup(mplp_conf_t *conf)
             }
         }
         nregs = regidx_nregs(conf->reg);
-        conf->reg_itr = regitr_init(conf->reg);
-        regitr_loop(conf->reg_itr);   // region iterator now positioned at the first region
+        if ( nregs )
+        {
+            // the regions list can be empty, see #2250
+            conf->reg_itr = regitr_init(conf->reg);
+            regitr_loop(conf->reg_itr);   // region iterator now positioned at the first region
+        }
     }
 
     // read the header of each file in the list and initialize data
@@ -695,7 +705,7 @@ static int mpileup(mplp_conf_t *conf)
             i--;
             continue;
         }
-        if (conf->reg) {
+        if (conf->reg && nregs) {
             hts_idx_t *idx = sam_index_load(conf->mplp_data[i]->fp, conf->files[i]);
             if (idx == NULL) {
                 fprintf(bcftools_stderr, "[%s] fail to load index for %s\n", __func__, conf->files[i]);
@@ -859,13 +869,16 @@ static int mpileup(mplp_conf_t *conf)
     for (i=0; i<nsmpl; i++)
         bcf_hdr_add_sample(conf->bcf_hdr, smpl[i]);
     if ( bcf_hdr_write(conf->bcf_fp, conf->bcf_hdr)!=0 ) error("[%s] Error: failed to write the header to %s\n",__func__,conf->output_fname?conf->output_fname:"standard output");
-    if ( conf->write_index && init_index(conf->bcf_fp,conf->bcf_hdr,conf->output_fname,&conf->index_fn)<0 ) error("Error: failed to initialise index for %s\n",conf->output_fname);
+    if ( init_index2(conf->bcf_fp,conf->bcf_hdr,conf->output_fname,
+                     &conf->index_fn, conf->write_index) < 0 )
+        error("Error: failed to initialise index for %s\n",conf->output_fname);
 
     conf->bca = bcf_call_init(-1., conf->min_baseQ, conf->max_baseQ,
                               conf->delta_baseQ);
     conf->bcr = (bcf_callret1_t*) calloc(nsmpl, sizeof(bcf_callret1_t));
     conf->bca->openQ = conf->openQ, conf->bca->extQ = conf->extQ, conf->bca->tandemQ = conf->tandemQ;
     conf->bca->indel_bias = conf->indel_bias;
+    conf->bca->del_bias = conf->del_bias;
     conf->bca->min_frac = conf->min_frac;
     conf->bca->min_support = conf->min_support;
     conf->bca->per_sample_flt = conf->flag & MPLP_PER_SAMPLE;
@@ -873,6 +886,10 @@ static int mpileup(mplp_conf_t *conf)
     conf->bca->ambig_reads = conf->ambig_reads;
     conf->bca->indel_win_size = conf->indel_win_size;
     conf->bca->indels_v20 = conf->indels_v20;
+    conf->bca->edlib = conf->edlib;
+    conf->bca->seqQ_offset = conf->seqQ_offset;
+    conf->bca->poly_mqual = conf->poly_mqual;
+    conf->bca->vs_ref = conf->vs_ref;
 
     conf->bc.bcf_hdr = conf->bcf_hdr;
     conf->bc.n  = nsmpl;
@@ -927,6 +944,7 @@ static int mpileup(mplp_conf_t *conf)
 
 
     // Run mpileup for multiple regions
+    int ret = 0;
     if ( nregs )
     {
         int ireg = 0;
@@ -955,12 +973,18 @@ static int mpileup(mplp_conf_t *conf)
                     bam_mplp_reset(conf->iter);
                 }
             }
-            mpileup_reg(conf,conf->reg_itr->beg,conf->reg_itr->end);
+            ret = mpileup_reg(conf,conf->reg_itr->beg,conf->reg_itr->end);
+            if ( ret<0 ) break;
         }
         while ( regitr_loop(conf->reg_itr) );
     }
-    else
-        mpileup_reg(conf,0,UINT32_MAX);
+    else if ( !conf->reg )
+        ret = mpileup_reg(conf,0,UINT32_MAX);
+    if ( ret<0 )
+    {
+        fprintf(bcftools_stderr, "[%s] failed to read from input file\n", __func__);
+        bcftools_exit(EXIT_FAILURE);
+    }
 
     flush_bcf_records(conf, conf->bcf_fp, conf->bcf_hdr, NULL);
 
@@ -1152,7 +1176,7 @@ static void list_annotations(FILE *fp)
         "\n"
         "FORMAT annotation tags available (\"FORMAT/\" prefix is optional):\n"
         "\n"
-        "  FORMAT/AD   .. Allelic depth (Number=R,Type=Integer)\n"
+        "* FORMAT/AD   .. Allelic depth (Number=R,Type=Integer)\n"
         "  FORMAT/ADF  .. Allelic depths on the forward strand (Number=R,Type=Integer)\n"
         "  FORMAT/ADR  .. Allelic depths on the reverse strand (Number=R,Type=Integer)\n"
         "  FORMAT/DP   .. Number of high-quality bases (Number=1,Type=Integer)\n"
@@ -1202,7 +1226,7 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp)
         "\n"
         "Input options:\n"
         "  -6, --illumina1.3+      Quality is in the Illumina-1.3+ encoding\n"
-        "  -A, --count-orphans     Do not discard anomalous read pairs\n"
+        "  -A, --count-orphans     Include anomalous read pairs, with flag PAIRED but not PROPER_PAIR set\n"
         "  -b, --bam-list FILE     List of input BAM filenames, one per line\n"
         "  -B, --no-BAQ            Disable BAQ (per-Base Alignment Quality)\n"
         "  -C, --adjust-MQ INT     Adjust mapping quality [0]\n"
@@ -1247,10 +1271,10 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp)
         "  -O, --output-type TYPE  'b' compressed BCF; 'u' uncompressed BCF;\n"
         "                          'z' compressed VCF; 'v' uncompressed VCF; 0-9 compression level [v]\n"
         "      --threads INT       Use multithreading with INT worker threads [0]\n"
-        "      --write-index       Automatically index the output files [off]\n"
+        "  -W, --write-index[=FMT] Automatically index the output files [off]\n"
         "\n"
         "SNP/INDEL genotype likelihoods options:\n"
-        "  -X, --config STR        Specify platform specific profiles (see below)\n"
+        "  -X, --config STR        Specify platform profile (use \"-X list\" for details)\n"
         "  -e, --ext-prob INT      Phred-scaled gap extension seq error probability [%d]\n", mplp->extQ);
     fprintf(fp,
         "  -F, --gap-frac FLOAT    Minimum fraction of gapped reads [%g]\n", mplp->min_frac);
@@ -1271,24 +1295,26 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp)
         "  --ar, --ambig-reads STR   What to do with ambiguous indel reads: drop,incAD,incAD0 [drop]\n");
     fprintf(fp,
         "      --indel-bias FLOAT  Raise to favour recall over precision [%.2f]\n", mplp->indel_bias);
+    fprintf(fp,
+        "      --del-bias FLOAT    Relative likelihood of insertion to deletion [%.2f]\n", mplp->del_bias);
+    fprintf(fp,
+        "      --score-vs-ref FLOAT\n"
+        "                          Ratio of score vs ref (1) or 2nd-best allele (0) [%.2f]\n", mplp->vs_ref);
     fprintf(fp,
         "      --indel-size INT    Approximate maximum indel size considered [%d]\n", mplp->indel_win_size);
     fprintf(fp,
-        "      --indels-2.0        New EXPERIMENTAL indel calling model (diploid reference consensus)\n");
+        "      --indels-2.0        New EXPERIMENTAL indel calling model (diploid reference consensus)\n"
+        "      --indels-cns        New EXPERIMENTAL indel calling model with edlib\n"
+        "      --seqq-offset       Indel-cns tuning for indel seq-qual scores [120]\n"
+        "      --no-indels-cns     Disable CNS mode, to use after a -X profile\n"
+        "      --poly-mqual        (Edlib mode) Use minimum quality within homopolymers\n");
     fprintf(fp,"\n");
     fprintf(fp,
-        "Configuration profiles activated with -X, --config:\n"
-        "    1.12:        -Q13 -h100 -m1 -F0.002\n"
-        "    illumina:    [ default values ]\n"
-        "    ont:         -B -Q5 --max-BQ 30 -I [also try eg |bcftools call -P0.01]\n"
-        "    pacbio-ccs:  -D -Q5 --max-BQ 50 -F0.1 -o25 -e1 --delta-BQ 10 -M99999\n"
-        "\n"
-        "Notes: Assuming diploid individuals.\n"
-        "\n"
-        "Example:\n"
-        "   # See also http://samtools.github.io/bcftools/howtos/variant-calling.html\n"
-        "   bcftools mpileup -Ou -f reference.fa alignments.bam | bcftools call -mv -Ob -o calls.bcf\n"
-        "\n");
+            "Notes: Assuming diploid individuals.\n\n"
+            "Example:\n"
+            "   # See also http://samtools.github.io/bcftools/howtos/variant-calling.html\n"
+            "   bcftools mpileup -Ou -f reference.fa alignments.bam | bcftools call -mv -Ob -o calls.bcf\n"
+            "\n");
 
     free(tmp_skip_all_set);
     free(tmp_skip_any_unset);
@@ -1296,9 +1322,41 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp)
     free(tmp_skip_any_set);
 }
 
+static void print_profiles(void) {
+    fprintf(bcftools_stdout, 
+"Configuration profiles activated with -X, --config:\n\n"
+"1.12\n"
+"    -Q13 -h100 -m1 -F0.002\n\n"
+"bgi, bgi-1.20\n"
+"    --indels-cns -B --indel-size 80 -F0.1 --indel-bias 0.9 --seqq-offset 120\n\n"
+"illumina-1.18\n"
+"    --indel-size 110\n\n"
+"illumina\n"
+"illumina-1.20\n"
+"    --indels-cns --indel-size 110\n\n"
+"ont\n"
+"    -B -Q5 --max-BQ 30 -I\n\n"
+"ont-sup, ont-sup-1.20\n"
+"    --indels-cns -B -Q1 --max-BQ 35 -F0.2 -o15 -e1 -h110 --delta-BQ 99\\\n"
+"    --del-bias 0.4 --indel-bias 0.7 --poly-mqual --seqq-offset 130\\\n"
+"    --indel-size 80\n\n"
+"pacbio-ccs-1.18\n"
+"    -D -Q5 --max-BQ 50 -F0.1 -o25 -e1 --delta-BQ 10 \\\n"
+"    -M99999 --indel-size 110\n\n"
+"pacbio-ccs, pacbio-ccs-1.20\n"
+"    --indels-cns -B -Q5 --max-BQ 50 -F0.1 -o25 -e1 -h300 --delta-BQ 10 \\\n"
+"    --del-bias 0.4 --poly-mqual --indel-bias 0.9 --seqq-offset 118\\\n"
+"    --indel-size 80 --score-vs-ref 0.7\n\n"
+"ultima, ultima-1.20\n"
+"    --indels-cns -B -Q1 --max-BQ 30 -F0.15 -o20 -e10 -h250 --delta-BQ 10 \\\n"
+"    --del-bias 0.3 --indel-bias 0.7 --poly-mqual --seqq-offset 140 \\\n"
+"    --indel-size 80 --score-vs-ref 0.3\n\n"
+"\n");
+}
+
 int main_mpileup(int argc, char *argv[])
 {
-    int c;
+    int c, i, ret = 1;
     const char *file_list = NULL;
     char **fn = NULL;
     int nfiles = 0, use_orphan = 0, noref = 0;
@@ -1311,6 +1369,7 @@ int main_mpileup(int argc, char *argv[])
     mplp.max_depth = 250; mplp.max_indel_depth = 250;
     mplp.openQ = 40; mplp.extQ = 20; mplp.tandemQ = 500;
     mplp.min_frac = 0.05; mplp.indel_bias = 1.0; mplp.min_support = 2;
+    mplp.vs_ref = 0;
     mplp.flag = MPLP_NO_ORPHAN | MPLP_REALN | MPLP_REALN_PARTIAL
               | MPLP_SMART_OVERLAPS;
     mplp.argc = argc; mplp.argv = argv;
@@ -1321,11 +1380,14 @@ int main_mpileup(int argc, char *argv[])
     mplp.n_threads = 0;
     mplp.bsmpl = bam_smpl_init();
     // the default to be changed in future, see also parse_format_flag()
-    mplp.fmt_flag = B2B_INFO_BQBZ|B2B_INFO_IDV|B2B_INFO_IMF|B2B_INFO_MQ0F|B2B_INFO_MQBZ|B2B_INFO_MQSBZ|B2B_INFO_RPBZ|B2B_INFO_SCBZ|B2B_INFO_SGB|B2B_INFO_VDB;
+    mplp.fmt_flag = B2B_INFO_BQBZ|B2B_INFO_IDV|B2B_INFO_IMF|B2B_INFO_MQ0F|B2B_INFO_MQBZ|B2B_INFO_MQSBZ|B2B_INFO_RPBZ|B2B_INFO_SCBZ|B2B_INFO_SGB|B2B_INFO_VDB|B2B_FMT_AD;
     mplp.max_read_len = 500;
     mplp.ambig_reads = B2B_DROP;
     mplp.indel_win_size = 110;
+    mplp.poly_mqual = 0;
+    mplp.seqQ_offset = 120;
     mplp.clevel = -1;
+    mplp.del_bias = 0; // even insertion and deletion likelhoods.
     hts_srand48(0);
 
     static const struct option lopts[] =
@@ -1384,6 +1446,8 @@ int main_mpileup(int argc, char *argv[])
         {"indel-bias", required_argument, NULL, 10},
         {"indel-size", required_argument, NULL, 15},
         {"indels-2.0", no_argument, NULL, 20},
+        {"indels-cns", no_argument, NULL, 22},
+        {"no-indels-cns", no_argument, NULL, 25},
         {"tandem-qual", required_argument, NULL, 'h'},
         {"skip-indels", no_argument, NULL, 'I'},
         {"max-idepth", required_argument, NULL, 'L'},
@@ -1396,27 +1460,44 @@ int main_mpileup(int argc, char *argv[])
         {"seed", required_argument, NULL, 13},
         {"ambig-reads", required_argument, NULL, 14},
         {"ar", required_argument, NULL, 14},
-        {"write-index",no_argument,NULL,21},
+        {"write-index",optional_argument,NULL,'W'},
+        {"del-bias", required_argument, NULL, 23},
+        {"poly-mqual", no_argument, NULL, 24},
+        {"no-poly-mqual", no_argument, NULL, 26},
+        {"score-vs-ref",required_argument, NULL, 27},
+        {"seqq-offset", required_argument, NULL, 28},
         {NULL, 0, NULL, 0}
     };
-    while ((c = getopt_long(argc, argv, "Ag:f:r:R:q:Q:C:BDd:L:b:P:po:e:h:Im:F:EG:6O:xa:s:S:t:T:M:X:U",lopts,NULL)) >= 0) {
+    while ((c = getopt_long(argc, argv, "Ag:f:r:R:q:Q:C:BDd:L:b:P:po:e:h:Im:F:EG:6O:xa:s:S:t:T:M:X:UW::",lopts,NULL)) >= 0) {
         switch (c) {
         case 'x': mplp.flag &= ~MPLP_SMART_OVERLAPS; break;
         case  16 :
             mplp.rflag_skip_any_unset = bam_str2flag(optarg);
-            if ( mplp.rflag_skip_any_unset <0 ) { fprintf(bcftools_stderr,"Could not parse --nf %s\n", optarg); return 1; }
+            if ( mplp.rflag_skip_any_unset <0 ) {
+                fprintf(bcftools_stderr,"Could not parse --nf %s\n", optarg);
+                goto err;
+            }
             break;
         case  17 :
             mplp.rflag_skip_all_unset = bam_str2flag(optarg);
-            if ( mplp.rflag_skip_all_unset<0 ) { fprintf(bcftools_stderr,"Could not parse --if %s\n", optarg); return 1; }
+            if ( mplp.rflag_skip_all_unset<0 ) {
+                fprintf(bcftools_stderr,"Could not parse --if %s\n", optarg);
+                goto err;
+            }
             break;
         case  18 :
             mplp.rflag_skip_any_set = bam_str2flag(optarg);
-            if ( mplp.rflag_skip_any_set <0 ) { fprintf(bcftools_stderr,"Could not parse --ef %s\n", optarg); return 1; }
+            if ( mplp.rflag_skip_any_set <0 ) {
+                fprintf(bcftools_stderr,"Could not parse --ef %s\n", optarg);
+                goto err;
+            }
             break;
         case  19 :
             mplp.rflag_skip_all_set = bam_str2flag(optarg);
-            if ( mplp.rflag_skip_all_set <0 ) { fprintf(bcftools_stderr,"Could not parse --df %s\n", optarg); return 1; }
+            if ( mplp.rflag_skip_all_set <0 ) {
+                fprintf(bcftools_stderr,"Could not parse --df %s\n", optarg);
+                goto err;
+            }
             break;
         case  3 : mplp.output_fname = optarg; break;
         case  4 : mplp.openQ = atoi(optarg); break;
@@ -1427,7 +1508,8 @@ int main_mpileup(int argc, char *argv[])
             break;
         case 'f':
             mplp.fai = fai_load(optarg);
-            if (mplp.fai == NULL) return 1;
+            if (mplp.fai == NULL)
+                goto err;
             mplp.fai_fname = optarg;
             break;
         case  7 : noref = 1; break;
@@ -1454,7 +1536,10 @@ int main_mpileup(int argc, char *argv[])
                   if ( optarg[0]=='^' ) optarg++;
                   else mplp.bed_logic = 1;
                   mplp.bed = regidx_init(optarg,NULL,NULL,0,NULL);
-                  if (!mplp.bed) { fprintf(bcftools_stderr, "bcftools mpileup: Could not read file \"%s\"", optarg); return 1; }
+                  if (!mplp.bed) {
+                      fprintf(bcftools_stderr, "bcftools mpileup: Could not read file \"%s\"", optarg);
+                      goto err;
+                  }
                   break;
         case 'P': mplp.pl_list = strdup(optarg); break;
         case 'p': mplp.flag |= MPLP_PER_SAMPLE; break;
@@ -1507,19 +1592,39 @@ int main_mpileup(int argc, char *argv[])
             else
                 mplp.indel_bias = 1/atof(optarg);
             break;
+        case 27:
+            mplp.vs_ref = atof(optarg);
+            //if (mplp.vs_ref < 0) mplp.vs_ref = 0;
+            if (mplp.vs_ref > 1) mplp.vs_ref = 1;
+            break;
         case  15: {
                 char *tmp;
                 mplp.indel_win_size = strtol(optarg,&tmp,10);
                 if ( *tmp ) error("Could not parse argument: --indel-size %s\n", optarg);
-                if ( mplp.indel_win_size < 110 )
+                if ( mplp.indel_win_size < 20 )
                 {
-                    mplp.indel_win_size = 110;
+                    mplp.indel_win_size = 20;
                     fprintf(bcftools_stderr,"Warning: running with --indel-size %d, the requested value is too small\n",mplp.indel_win_size);
                 }
             }
             break;
-        case  20: mplp.indels_v20 = 1; break;
-        case  21: mplp.write_index = 1; break;
+        case  20: mplp.indels_v20 = 1; mplp.edlib = 0; break;
+        case 'W':
+            if (!(mplp.write_index = write_index_parse(optarg)))
+                error("Unsupported index format '%s'\n", optarg);
+            break;
+        case  22: mplp.edlib = 1; mplp.indels_v20 = 0; break;
+        case  25: mplp.edlib = 0; break;
+        case  28:
+            mplp.seqQ_offset = atoi(optarg);
+            if (mplp.seqQ_offset < 100)
+                mplp.seqQ_offset = 100;
+            if (mplp.seqQ_offset > 200)
+                mplp.seqQ_offset = 200;
+            break;
+        case  23: mplp.del_bias = atof(optarg); break;
+        case  24: mplp.poly_mqual = 1; break;
+        case  26: mplp.poly_mqual = 0; break;
         case 'A': use_orphan = 1; break;
         case 'F': mplp.min_frac = atof(optarg); break;
         case 'm': mplp.min_support = atoi(optarg); break;
@@ -1528,13 +1633,13 @@ int main_mpileup(int argc, char *argv[])
         case 'a':
             if (optarg[0]=='?') {
                 list_annotations(bcftools_stderr);
-                return 1;
+                goto err;
             }
             parse_format_flag(&mplp.fmt_flag,optarg);
         break;
         case 'M': mplp.max_read_len = atoi(optarg); break;
         case 'X':
-            if (strcasecmp(optarg, "pacbio-ccs") == 0) {
+            if (strcasecmp(optarg, "pacbio-ccs-1.18") == 0) {
                 mplp.min_frac = 0.1;
                 mplp.min_baseQ = 5;
                 mplp.max_baseQ = 50;
@@ -1543,13 +1648,70 @@ int main_mpileup(int argc, char *argv[])
                 mplp.extQ = 1;
                 mplp.flag |= MPLP_REALN_PARTIAL;
                 mplp.max_read_len = 99999;
+
+            } else if (strcasecmp(optarg, "pacbio-ccs") == 0 ||
+                strcasecmp(optarg, "pacbio-ccs-1.20") == 0) {
+                mplp.min_frac = 0.1;
+                mplp.min_baseQ = 5;
+                mplp.max_baseQ = 50;
+                mplp.delta_baseQ = 10;
+                mplp.tandemQ = 300;
+                mplp.openQ = 25;
+                mplp.extQ = 1;
+                mplp.flag &= ~MPLP_REALN;
+                mplp.del_bias = 0.4;
+                mplp.indel_bias = 1/.9;
+                mplp.seqQ_offset = 118;
+                mplp.poly_mqual = 1;
+                mplp.edlib = 1;
+                mplp.vs_ref = 0.7;
+                mplp.indel_win_size = 80;
+
             } else if (strcasecmp(optarg, "ont") == 0) {
-                fprintf(bcftools_stderr, "For ONT it may be beneficial to also run bcftools call with "
+                fprintf(bcftools_stderr, "With old ONT data may be beneficial to also run bcftools call with "
                         "a higher -P, eg -P0.01 or -P 0.1\n");
                 mplp.min_baseQ = 5;
                 mplp.max_baseQ = 30;
                 mplp.flag &= ~MPLP_REALN;
                 mplp.flag |= MPLP_NO_INDEL;
+
+            } else if (strcasecmp(optarg, "ont-sup") == 0 ||
+                       strcasecmp(optarg, "ont-sup-1.20") == 0) {
+                mplp.min_frac = 0.2;
+                mplp.min_baseQ = 1;
+                mplp.max_baseQ = 35;
+                mplp.delta_baseQ = 99;
+                mplp.openQ = 15;
+                mplp.extQ = 1;
+                mplp.flag &= ~MPLP_REALN;
+                mplp.max_read_len = 9999999;
+                mplp.del_bias = 0.4;
+                mplp.poly_mqual = 1;
+                mplp.edlib = 1;
+                // If we increase -h then we can increase bias denominator too
+                mplp.tandemQ = 110;
+                mplp.indel_bias = 1/0.7;
+                mplp.seqQ_offset = 130;
+                mplp.indel_win_size = 80;
+
+            } else if (strcasecmp(optarg, "ultima") == 0 ||
+                       strcasecmp(optarg, "ultima-1.20") == 0) {
+                mplp.min_frac = 0.15;
+                mplp.min_baseQ = 1;
+                mplp.max_baseQ = 30;
+                mplp.delta_baseQ = 10;
+                mplp.openQ = 20;
+                mplp.extQ = 10;
+                mplp.tandemQ = 250;
+                mplp.flag &= ~MPLP_REALN;
+                mplp.del_bias = 0.3;
+                mplp.poly_mqual = 1;
+                mplp.edlib = 1;
+                mplp.indel_bias = 1/0.7;
+                mplp.seqQ_offset = 140;
+                mplp.vs_ref = 0.3;
+                mplp.indel_win_size = 80;
+
             } else if (strcasecmp(optarg, "1.12") == 0) {
                 // 1.12 and earlier
                 mplp.min_frac = 0.002;
@@ -1558,13 +1720,38 @@ int main_mpileup(int argc, char *argv[])
                 mplp.tandemQ = 100;
                 mplp.flag &= ~MPLP_REALN_PARTIAL;
                 mplp.flag |= MPLP_REALN;
-            } else if (strcasecmp(optarg, "illumina") == 0) {
+
+            } else if (strcasecmp(optarg, "illumina-1.18") == 0) {
+                mplp.indel_win_size = 110;
+                mplp.flag |= MPLP_REALN_PARTIAL;
+
+            } else if (strcasecmp(optarg, "illumina") == 0 ||
+                       strcasecmp(optarg, "illumina-1.20") == 0) {
+                mplp.edlib = 1;
+                mplp.indel_win_size = 110;
                 mplp.flag |= MPLP_REALN_PARTIAL;
+                mplp.indel_bias = 1;
+                mplp.seqQ_offset = 125;
+                //mplp.indel_win_size = 80; TEST?
+
+            } else if (strcasecmp(optarg, "bgi") == 0 ||
+                       strcasecmp(optarg, "bgi-1.20") == 0) {
+                mplp.min_frac = 0.1;
+                mplp.edlib = 1;
+                mplp.indel_bias = 1;
+                mplp.seqQ_offset = 120;
+                mplp.flag |= MPLP_REALN_PARTIAL;
+                mplp.indel_win_size = 80;
+
+            } else if (strcasecmp(optarg, "list") == 0 ||
+                       strcasecmp(optarg, "help") == 0) {
+                print_profiles();
+                goto err;
             } else {
                 fprintf(bcftools_stderr, "Unknown configuration name '%s'\n"
-                        "Please choose from 1.12, illumina, pacbio-ccs or ont\n",
+                        "Please use '-X list' to show available choices.\n",
                         optarg);
-                return 1;
+                goto err;
             }
             break;
         case 13: hts_srand48(atoi(optarg)); break;
@@ -1576,7 +1763,7 @@ int main_mpileup(int argc, char *argv[])
             break;
         default:
             fprintf(bcftools_stderr,"Invalid option: '%c'\n", c);
-            return 1;
+            goto err;
         }
     }
 
@@ -1601,22 +1788,23 @@ int main_mpileup(int argc, char *argv[])
     if ( !(mplp.flag&MPLP_REALN) && mplp.flag&MPLP_REDO_BAQ )
     {
         fprintf(bcftools_stderr,"Error: The -B option cannot be combined with -E\n");
-        return 1;
+        goto err;
     }
     if (use_orphan) mplp.flag &= ~MPLP_NO_ORPHAN;
     if (argc == 1)
     {
         print_usage(bcftools_stderr, &mplp);
-        return 1;
+        goto err;
     }
     if (!mplp.fai && !noref) {
         fprintf(bcftools_stderr,"Error: mpileup requires the --fasta-ref option by default; use --no-reference to run without a fasta reference\n");
-        return 1;
+        goto err;
     }
-    int ret,i;
+
     if (file_list)
     {
-        if ( read_file_list(file_list,&nfiles,&fn) ) return 1;
+        if ( read_file_list(file_list,&nfiles,&fn) )
+            goto err;
         mplp.files  = fn;
         mplp.nfiles = nfiles;
     }
@@ -1635,6 +1823,8 @@ int main_mpileup(int argc, char *argv[])
     if (mplp.bed) regidx_destroy(mplp.bed);
     if (mplp.bed_itr) regitr_destroy(mplp.bed_itr);
     if (mplp.reg) regidx_destroy(mplp.reg);
+
+ err:
     bam_smpl_destroy(mplp.bsmpl);
 
     return ret;
diff --git a/bcftools/read_consensus.c b/bcftools/read_consensus.c
index 5c8133f28..593b19b5f 100644
--- a/bcftools/read_consensus.c
+++ b/bcftools/read_consensus.c
@@ -275,7 +275,7 @@ int rcns_set_reads(read_cns_t *rcns, bam_pileup1_t *plp, int nplp)
                 }
                 y += len;
             }
-            else if ( op==BAM_CDEL )
+            else if ( op==BAM_CDEL || op==BAM_CREF_SKIP )   /* note: unsure about BAM_CREF_SKIP, don't have data to test */
             {
                 if ( x>rcns->beg && x+len-1<=rcns->end )
                 {
diff --git a/bcftools/read_consensus.c.pysam.c b/bcftools/read_consensus.c.pysam.c
index a2612fd31..ef2ff089e 100644
--- a/bcftools/read_consensus.c.pysam.c
+++ b/bcftools/read_consensus.c.pysam.c
@@ -277,7 +277,7 @@ int rcns_set_reads(read_cns_t *rcns, bam_pileup1_t *plp, int nplp)
                 }
                 y += len;
             }
-            else if ( op==BAM_CDEL )
+            else if ( op==BAM_CDEL || op==BAM_CREF_SKIP )   /* note: unsure about BAM_CREF_SKIP, don't have data to test */
             {
                 if ( x>rcns->beg && x+len-1<=rcns->end )
                 {
diff --git a/bcftools/regidx.h b/bcftools/regidx.h
index c40bbd866..09c43f891 100644
--- a/bcftools/regidx.h
+++ b/bcftools/regidx.h
@@ -136,7 +136,7 @@ int regidx_parse_vcf(const char*,char**,char**,uint32_t*,uint32_t*,void*,void*);
  *
  *  @param fname:  input file name or NULL if regions will be added one-by-one via regidx_insert()
  *  @param parsef: regidx_parse_bed, regidx_parse_tab or see description of regidx_parse_f. If NULL,
- *                 the format will be autodected, currently either regidx_parse_tab (the default) or
+ *                 the format will be autodetected, currently either regidx_parse_tab (the default) or
  *                 regidx_parse_bed (file must be named 'bed' or 'bed.gz') will be used. Note that
  *                 the exact autodetection algorithm will change.
  *  @param freef:  NULL or see description of regidx_parse_f
diff --git a/bcftools/reheader.c b/bcftools/reheader.c
index ed852173c..37e5d965e 100644
--- a/bcftools/reheader.c
+++ b/bcftools/reheader.c
@@ -1,6 +1,6 @@
 /*  reheader.c -- reheader subcommand.
 
-    Copyright (C) 2014-2022 Genome Research Ltd.
+    Copyright (C) 2014-2022,2024 Genome Research Ltd.
 
     Author: Petr Danecek <pd3@sanger.ac.uk>
 
@@ -49,8 +49,9 @@ THE SOFTWARE.  */
 typedef struct _args_t
 {
     char **argv, *fname, *samples_fname, *header_fname, *output_fname;
-    char *fai_fname, *rm_tmpfile, *tmp_prefix;
+    char *fai_fname;
     htsFile *fp;
+    faidx_t *fai;
     htsFormat type;
     htsThreadPool *threads;
     int argc, n_threads;
@@ -168,33 +169,13 @@ char *init_tmp_prefix(const char *tmp_prefix)
     kputs("/bcftools.XXXXXX", &prefix);
     return prefix.s;
 }
-static void update_from_fai(args_t *args)
+static void update_from_fai(faidx_t *fai, kstring_t *hdr_txt)
 {
-    if ( !strcmp("-",args->fname) )
-        error("Cannot use the --fai option when reading from standard input.\n");
-
-    faidx_t *fai = fai_load3(args->fai_fname,args->fai_fname,NULL,FAI_FASTA);
-    if ( !fai ) error("Could not parse %s\n", args->fai_fname);
-    args->rm_tmpfile = init_tmp_prefix(args->tmp_prefix);
-    int fd = mkstemp(args->rm_tmpfile);
-    if ( fd<0 ) error("Could not open a temporary file for writing: %s\n", args->rm_tmpfile);
-
-    // get a template header: either from the original VCF or from --header
-    char *ori_hdr_fname = args->header_fname ? args->header_fname : args->fname;
-    htsFile *fp = hts_open(ori_hdr_fname,"r");
-    if ( !fp ) error("Failed to open: %s\n", ori_hdr_fname);
-    bcf_hdr_t *hdr = bcf_hdr_read(fp);
-    if ( !hdr ) error("Failed to read the header: %s\n", ori_hdr_fname);
-    hts_close(fp);  // no need to check the return status here
-
-    // put the header in a text buffer
-    kstring_t hdr_txt_ori = {0,0,0}, hdr_txt_new = {0,0,0};
-    bcf_hdr_format(hdr, 0, &hdr_txt_ori);
-    bcf_hdr_destroy(hdr);
+    kstring_t hdr_txt_new = {0,0,0};
 
     // update the existing contig lines and remove lines not present in the fai file
     void *chr_seen = khash_str2int_init();
-    char *tmp, *beg = hdr_txt_ori.s;
+    char *tmp, *beg = hdr_txt->s;
     while ( beg && *beg )
     {
         tmp = strstr(beg, "\n##contig=<");
@@ -216,13 +197,10 @@ static void update_from_fai(args_t *args)
     }
     kputs(tmp+1,&hdr_txt_new);
 
-    if ( write(fd, hdr_txt_new.s, hdr_txt_new.l)!=hdr_txt_new.l ) error("Failed to write %zu bytes to %s\n", hdr_txt_new.l,args->rm_tmpfile);
-    if ( close(fd)!=0 ) error("Failed to close %s\n", args->rm_tmpfile);
-    args->header_fname = args->rm_tmpfile;
+    // Switch the new header content for the old
+    free(hdr_txt->s);
+    memcpy(hdr_txt, &hdr_txt_new, sizeof(*hdr_txt));
 
-    free(hdr_txt_ori.s);
-    free(hdr_txt_new.s);
-    fai_destroy(fai);
     khash_str2int_destroy_free(chr_seen);
 }
 
@@ -420,6 +398,10 @@ static void reheader_vcf_gz(args_t *args)
         free(hdr.s); hdr.s = NULL; hdr.l = hdr.m = 0;
         read_header_file(args->header_fname, &hdr);
     }
+
+    if ( args->fai )
+        update_from_fai(args->fai, &hdr);
+
     if ( samples )
     {
         set_samples(samples, nsamples, &hdr);
@@ -433,7 +415,7 @@ static void reheader_vcf_gz(args_t *args)
     if ( bgzf_write(bgzf_out, hdr.s, hdr.l) < 0 ) error("Can't write BGZF header (code %d)\n", bgzf_out->errcode);
     free(hdr.s);
 
-    // Output all remainig data read with the header block
+    // Output all remaining data read with the header block
     if ( fp->block_length - skip_until > 0 )
     {
         if ( bgzf_write(bgzf_out, buffer+skip_until, fp->block_length-skip_until)<0 ) error("Error: %d\n",fp->errcode);
@@ -479,6 +461,10 @@ static void reheader_vcf(args_t *args)
         free(hdr.s); hdr.s = NULL; hdr.l = hdr.m = 0;
         read_header_file(args->header_fname, &hdr);
     }
+
+    if ( args->fai )
+        update_from_fai(args->fai, &hdr);
+
     if ( samples )
     {
         set_samples(samples, nsamples, &hdr);
@@ -586,6 +572,10 @@ static void reheader_bcf(args_t *args, int is_compressed)
         free(htxt.s); htxt.s = NULL; htxt.l = htxt.m = 0;
         read_header_file(args->header_fname, &htxt);
     }
+
+    if ( args->fai )
+        update_from_fai(args->fai, &htxt);
+
     if ( samples )
     {
         set_samples(samples, nsamples, &htxt);
@@ -675,11 +665,7 @@ static void usage(args_t *args)
     fprintf(stderr, "    -h, --header FILE          new header\n");
     fprintf(stderr, "    -o, --output FILE          write output to a file [standard output]\n");
     fprintf(stderr, "    -s, --samples FILE         new sample names\n");
-#ifdef _WIN32
-    fprintf(stderr, "    -T, --temp-prefix PATH     template for temporary file name [/bcftools.XXXXXX]\n");
-#else
-    fprintf(stderr, "    -T, --temp-prefix PATH     template for temporary file name [/tmp/bcftools.XXXXXX]\n");
-#endif
+    fprintf(stderr, "    -T, --temp-prefix PATH     ignored; was template for temporary file name\n");
     fprintf(stderr, "        --threads INT          use multithreading with <int> worker threads (BCF only) [0]\n");
     fprintf(stderr, "\n");
     fprintf(stderr, "Example:\n");
@@ -716,7 +702,7 @@ int main_reheader(int argc, char *argv[])
         switch (c)
         {
             case  1 : args->n_threads = strtol(optarg, 0, 0); break;
-            case 'T': args->tmp_prefix = optarg; break;
+            case 'T': break; // unused - was temp file prefix
             case 'f': args->fai_fname = optarg; break;
             case 'o': args->output_fname = optarg; break;
             case 's': args->samples_fname = optarg; break;
@@ -733,8 +719,11 @@ int main_reheader(int argc, char *argv[])
     }
     else args->fname = argv[optind];
 
-    if ( args->fai_fname ) update_from_fai(args);
-    if ( !args->samples_fname && !args->header_fname ) usage(args);
+    if ( args->fai_fname ) {
+        args->fai = fai_load3(args->fai_fname,args->fai_fname,NULL,FAI_FASTA);
+        if ( !args->fai ) error("Could not parse %s\n", args->fai_fname);
+    }
+    if ( !args->samples_fname && !args->header_fname && !args->fai) usage(args);
     if ( !args->fname ) usage(args);
 
     args->fp = hts_open(args->fname,"r");
@@ -755,11 +744,8 @@ int main_reheader(int argc, char *argv[])
     else
         reheader_bcf(args, args->type.compression==bgzf || args->type.compression==gzip);
 
-    if ( args->rm_tmpfile )
-    {
-        unlink(args->rm_tmpfile);
-        free(args->rm_tmpfile);
-    }
+    if (args->fai)
+        fai_destroy(args->fai);
     free(args);
     return 0;
 }
diff --git a/bcftools/reheader.c.pysam.c b/bcftools/reheader.c.pysam.c
index 44dff8c9c..87d460a80 100644
--- a/bcftools/reheader.c.pysam.c
+++ b/bcftools/reheader.c.pysam.c
@@ -2,7 +2,7 @@
 
 /*  reheader.c -- reheader subcommand.
 
-    Copyright (C) 2014-2022 Genome Research Ltd.
+    Copyright (C) 2014-2022,2024 Genome Research Ltd.
 
     Author: Petr Danecek <pd3@sanger.ac.uk>
 
@@ -51,8 +51,9 @@ THE SOFTWARE.  */
 typedef struct _args_t
 {
     char **argv, *fname, *samples_fname, *header_fname, *output_fname;
-    char *fai_fname, *rm_tmpfile, *tmp_prefix;
+    char *fai_fname;
     htsFile *fp;
+    faidx_t *fai;
     htsFormat type;
     htsThreadPool *threads;
     int argc, n_threads;
@@ -170,33 +171,13 @@ char *init_tmp_prefix(const char *tmp_prefix)
     kputs("/bcftools.XXXXXX", &prefix);
     return prefix.s;
 }
-static void update_from_fai(args_t *args)
+static void update_from_fai(faidx_t *fai, kstring_t *hdr_txt)
 {
-    if ( !strcmp("-",args->fname) )
-        error("Cannot use the --fai option when reading from standard input.\n");
-
-    faidx_t *fai = fai_load3(args->fai_fname,args->fai_fname,NULL,FAI_FASTA);
-    if ( !fai ) error("Could not parse %s\n", args->fai_fname);
-    args->rm_tmpfile = init_tmp_prefix(args->tmp_prefix);
-    int fd = mkstemp(args->rm_tmpfile);
-    if ( fd<0 ) error("Could not open a temporary file for writing: %s\n", args->rm_tmpfile);
-
-    // get a template header: either from the original VCF or from --header
-    char *ori_hdr_fname = args->header_fname ? args->header_fname : args->fname;
-    htsFile *fp = hts_open(ori_hdr_fname,"r");
-    if ( !fp ) error("Failed to open: %s\n", ori_hdr_fname);
-    bcf_hdr_t *hdr = bcf_hdr_read(fp);
-    if ( !hdr ) error("Failed to read the header: %s\n", ori_hdr_fname);
-    hts_close(fp);  // no need to check the return status here
-
-    // put the header in a text buffer
-    kstring_t hdr_txt_ori = {0,0,0}, hdr_txt_new = {0,0,0};
-    bcf_hdr_format(hdr, 0, &hdr_txt_ori);
-    bcf_hdr_destroy(hdr);
+    kstring_t hdr_txt_new = {0,0,0};
 
     // update the existing contig lines and remove lines not present in the fai file
     void *chr_seen = khash_str2int_init();
-    char *tmp, *beg = hdr_txt_ori.s;
+    char *tmp, *beg = hdr_txt->s;
     while ( beg && *beg )
     {
         tmp = strstr(beg, "\n##contig=<");
@@ -218,13 +199,10 @@ static void update_from_fai(args_t *args)
     }
     kputs(tmp+1,&hdr_txt_new);
 
-    if ( write(fd, hdr_txt_new.s, hdr_txt_new.l)!=hdr_txt_new.l ) error("Failed to write %zu bytes to %s\n", hdr_txt_new.l,args->rm_tmpfile);
-    if ( close(fd)!=0 ) error("Failed to close %s\n", args->rm_tmpfile);
-    args->header_fname = args->rm_tmpfile;
+    // Switch the new header content for the old
+    free(hdr_txt->s);
+    memcpy(hdr_txt, &hdr_txt_new, sizeof(*hdr_txt));
 
-    free(hdr_txt_ori.s);
-    free(hdr_txt_new.s);
-    fai_destroy(fai);
     khash_str2int_destroy_free(chr_seen);
 }
 
@@ -422,6 +400,10 @@ static void reheader_vcf_gz(args_t *args)
         free(hdr.s); hdr.s = NULL; hdr.l = hdr.m = 0;
         read_header_file(args->header_fname, &hdr);
     }
+
+    if ( args->fai )
+        update_from_fai(args->fai, &hdr);
+
     if ( samples )
     {
         set_samples(samples, nsamples, &hdr);
@@ -435,7 +417,7 @@ static void reheader_vcf_gz(args_t *args)
     if ( bgzf_write(bgzf_out, hdr.s, hdr.l) < 0 ) error("Can't write BGZF header (code %d)\n", bgzf_out->errcode);
     free(hdr.s);
 
-    // Output all remainig data read with the header block
+    // Output all remaining data read with the header block
     if ( fp->block_length - skip_until > 0 )
     {
         if ( bgzf_write(bgzf_out, buffer+skip_until, fp->block_length-skip_until)<0 ) error("Error: %d\n",fp->errcode);
@@ -481,6 +463,10 @@ static void reheader_vcf(args_t *args)
         free(hdr.s); hdr.s = NULL; hdr.l = hdr.m = 0;
         read_header_file(args->header_fname, &hdr);
     }
+
+    if ( args->fai )
+        update_from_fai(args->fai, &hdr);
+
     if ( samples )
     {
         set_samples(samples, nsamples, &hdr);
@@ -588,6 +574,10 @@ static void reheader_bcf(args_t *args, int is_compressed)
         free(htxt.s); htxt.s = NULL; htxt.l = htxt.m = 0;
         read_header_file(args->header_fname, &htxt);
     }
+
+    if ( args->fai )
+        update_from_fai(args->fai, &htxt);
+
     if ( samples )
     {
         set_samples(samples, nsamples, &htxt);
@@ -677,11 +667,7 @@ static void usage(args_t *args)
     fprintf(bcftools_stderr, "    -h, --header FILE          new header\n");
     fprintf(bcftools_stderr, "    -o, --output FILE          write output to a file [standard output]\n");
     fprintf(bcftools_stderr, "    -s, --samples FILE         new sample names\n");
-#ifdef _WIN32
-    fprintf(bcftools_stderr, "    -T, --temp-prefix PATH     template for temporary file name [/bcftools.XXXXXX]\n");
-#else
-    fprintf(bcftools_stderr, "    -T, --temp-prefix PATH     template for temporary file name [/tmp/bcftools.XXXXXX]\n");
-#endif
+    fprintf(bcftools_stderr, "    -T, --temp-prefix PATH     ignored; was template for temporary file name\n");
     fprintf(bcftools_stderr, "        --threads INT          use multithreading with <int> worker threads (BCF only) [0]\n");
     fprintf(bcftools_stderr, "\n");
     fprintf(bcftools_stderr, "Example:\n");
@@ -718,7 +704,7 @@ int main_reheader(int argc, char *argv[])
         switch (c)
         {
             case  1 : args->n_threads = strtol(optarg, 0, 0); break;
-            case 'T': args->tmp_prefix = optarg; break;
+            case 'T': break; // unused - was temp file prefix
             case 'f': args->fai_fname = optarg; break;
             case 'o': args->output_fname = optarg; break;
             case 's': args->samples_fname = optarg; break;
@@ -735,8 +721,11 @@ int main_reheader(int argc, char *argv[])
     }
     else args->fname = argv[optind];
 
-    if ( args->fai_fname ) update_from_fai(args);
-    if ( !args->samples_fname && !args->header_fname ) usage(args);
+    if ( args->fai_fname ) {
+        args->fai = fai_load3(args->fai_fname,args->fai_fname,NULL,FAI_FASTA);
+        if ( !args->fai ) error("Could not parse %s\n", args->fai_fname);
+    }
+    if ( !args->samples_fname && !args->header_fname && !args->fai) usage(args);
     if ( !args->fname ) usage(args);
 
     args->fp = hts_open(args->fname,"r");
@@ -757,11 +746,8 @@ int main_reheader(int argc, char *argv[])
     else
         reheader_bcf(args, args->type.compression==bgzf || args->type.compression==gzip);
 
-    if ( args->rm_tmpfile )
-    {
-        unlink(args->rm_tmpfile);
-        free(args->rm_tmpfile);
-    }
+    if (args->fai)
+        fai_destroy(args->fai);
     free(args);
     return 0;
 }
diff --git a/bcftools/str_finder.c b/bcftools/str_finder.c
index 800cbfef9..a9281d811 100644
--- a/bcftools/str_finder.c
+++ b/bcftools/str_finder.c
@@ -1,7 +1,7 @@
 /*  str_finder.c -- Short Tandem Repeat finder.
     Originally from Crumble (https://github.com/jkbonfield/crumble)
 
-    Copyright (C) 2015-2016, 2021 Genome Research Ltd.
+    Copyright (C) 2015-2016, 2021-2022 Genome Research Ltd.
 
     Author: James Bonfield <jkb@sanger.ac.uk>
 
@@ -50,7 +50,7 @@ static void add_rep(rep_ele **list, char *cons, int clen, int pos, int rlen,
 	    return;
     }
 
-    // Find current and last occurence of repeated word.
+    // Find current and last occurrence of repeated word.
 
     cp2 = &cons[pos+1];
     // If unpadded, this is quicker: cp1 = &cons[pos+1-rlen];
@@ -137,6 +137,86 @@ static void add_rep(rep_ele **list, char *cons, int clen, int pos, int rlen,
  * Returns a list of rep_ele structs holding the start,end tuples of repeats;
  *         NULL on failure.
  */
+rep_ele *find_STR64(char *cons, int len, int lower_only) {
+    int i, j;
+    uint64_t w = 0;
+    rep_ele *reps = NULL;
+
+    for (i = j = 0; i < len && j < 26; i++) {
+	if (cons[i] == '*') continue;
+
+	w <<= 2;
+	w |= cons[i];
+	//printf("%3d %c w=%08x\n", i, cons[i], w);
+	if (j>= 1 && (w&0x0003) == ((w>> 2)&0x0003))
+	    add_rep(&reps, cons, len, i, 1, lower_only, w);
+	if (j>= 3 && (w&0x000f) == ((w>> 4)&0x000f))
+	    add_rep(&reps, cons, len, i, 2, lower_only, w);
+	if (j>= 5 && (w&0x003f) == ((w>> 6)&0x003f))
+	    add_rep(&reps, cons, len, i, 3, lower_only, w);
+	if (j>= 7 && (w&0x00ff) == ((w>> 8)&0x00ff))
+	    add_rep(&reps, cons, len, i, 4, lower_only, w);
+	if (j>= 9 && (w&0x03ff) == ((w>>10)&0x03ff))
+	    add_rep(&reps, cons, len, i, 5, lower_only, w);
+	if (j>=11 && (w&0x0fff) == ((w>>12)&0x0fff))
+	    add_rep(&reps, cons, len, i, 6, lower_only, w);
+	if (j>=13 && (w&0x3fff) == ((w>>14)&0x3fff))
+	    add_rep(&reps, cons, len, i, 7, lower_only, w);
+	if (j>=15 && (w&0xffff) == ((w>>16)&0xffff))
+	    add_rep(&reps, cons, len, i, 8, lower_only, w);
+	if (j>=17 && (w&0x003ffff) == ((w>>18)&0x003ffff))
+	    add_rep(&reps, cons, len, i, 9, lower_only, w);
+	if (j>=19 && (w&0x00fffff) == ((w>>20)&0x00fffff))
+	    add_rep(&reps, cons, len, i,10, lower_only, w);
+	if (j>=21 && (w&0x03fffff) == ((w>>22)&0x03fffff))
+	    add_rep(&reps, cons, len, i,11, lower_only, w);
+	if (j>=23 && (w&0x0ffffff) == ((w>>24)&0x0ffffff))
+	    add_rep(&reps, cons, len, i,12, lower_only, w);
+	if (j>=24 && (w&0x3ffffff) == ((w>>26)&0x3ffffff))
+	    add_rep(&reps, cons, len, i,13, lower_only, w);
+
+	j++;
+    }
+
+    for (; i < len; i++) {	
+	if (cons[i] == '*') continue;
+
+	w <<= 2;
+	w |= cons[i];
+	//printf("%3d %c w=%08x\n", i, cons[i], w);
+	if      ((w&0xfffffff) == ((w>>28)&0xfffffff))
+	    add_rep(&reps, cons, len, i, 14, lower_only, w);
+	else if ((w&0x3ffffff) == ((w>>26)&0x3ffffff))
+	    add_rep(&reps, cons, len, i, 13, lower_only, w);
+	else if ((w&0x0ffffff) == ((w>>24)&0x0ffffff))
+	    add_rep(&reps, cons, len, i, 12, lower_only, w);
+	else if ((w&0x03fffff) == ((w>>22)&0x03fffff))
+	    add_rep(&reps, cons, len, i, 11, lower_only, w);
+	else if ((w&0x00fffff) == ((w>>20)&0x00fffff))
+	    add_rep(&reps, cons, len, i, 10, lower_only, w);
+	else if ((w&0x003ffff) == ((w>>18)&0x003ffff))
+	    add_rep(&reps, cons, len, i, 9, lower_only, w);
+	else if ((w&0xffff) == ((w>>16)&0xffff))
+	    add_rep(&reps, cons, len, i, 8, lower_only, w);
+	else if ((w&0x3fff) == ((w>>14)&0x3fff))
+	    add_rep(&reps, cons, len, i, 7, lower_only, w);
+	else if ((w&0x0fff) == ((w>>12)&0x0fff))
+	    add_rep(&reps, cons, len, i, 6, lower_only, w);
+	else if ((w&0x03ff) == ((w>>10)&0x03ff))
+	    add_rep(&reps, cons, len, i, 5, lower_only, w);
+	else if ((w&0x00ff) == ((w>> 8)&0x00ff))
+	    add_rep(&reps, cons, len, i, 4, lower_only, w);
+	else if ((w&0x003f) == ((w>> 6)&0x003f))
+	    add_rep(&reps, cons, len, i, 3, lower_only, w);
+	else if ((w&0x000f) == ((w>> 4)&0x000f))
+	    add_rep(&reps, cons, len, i, 2, lower_only, w);
+	else if ((w&0x0003) == ((w>> 2)&0x0003))
+	    add_rep(&reps, cons, len, i, 1, lower_only, w);
+    }
+
+    return reps;
+}
+
 rep_ele *find_STR(char *cons, int len, int lower_only) {
     int i, j;
     uint32_t w = 0;
@@ -172,21 +252,21 @@ rep_ele *find_STR(char *cons, int len, int lower_only) {
 	w <<= 2;
 	w |= cons[i];
 	//printf("%3d %c w=%08x\n", i, cons[i], w);
-	if ((w&0xffff) == ((w>>16)&0xffff)) 
+	if ((w&0xffff) == ((w>>16)&0xffff))
 	    add_rep(&reps, cons, len, i, 8, lower_only, w);
-	else if ((w&0x3fff) == ((w>>14)&0x3fff)) 
+	else if ((w&0x3fff) == ((w>>14)&0x3fff))
 	    add_rep(&reps, cons, len, i, 7, lower_only, w);
-	else if ((w&0x0fff) == ((w>>12)&0x0fff)) 
+	else if ((w&0x0fff) == ((w>>12)&0x0fff))
 	    add_rep(&reps, cons, len, i, 6, lower_only, w);
-	else if ((w&0x03ff) == ((w>>10)&0x03ff)) 
+	else if ((w&0x03ff) == ((w>>10)&0x03ff))
 	    add_rep(&reps, cons, len, i, 5, lower_only, w);
-	else if ((w&0x00ff) == ((w>> 8)&0x00ff)) 
+	else if ((w&0x00ff) == ((w>> 8)&0x00ff))
 	    add_rep(&reps, cons, len, i, 4, lower_only, w);
-	else if ((w&0x003f) == ((w>> 6)&0x003f)) 
+	else if ((w&0x003f) == ((w>> 6)&0x003f))
 	    add_rep(&reps, cons, len, i, 3, lower_only, w);
-	else if ((w&0x000f) == ((w>> 4)&0x000f)) 
+	else if ((w&0x000f) == ((w>> 4)&0x000f))
 	    add_rep(&reps, cons, len, i, 2, lower_only, w);
-	else if ((w&0x0003) == ((w>> 2)&0x0003)) 
+	else if ((w&0x0003) == ((w>> 2)&0x0003))
 	    add_rep(&reps, cons, len, i, 1, lower_only, w);
     }
 
diff --git a/bcftools/str_finder.c.pysam.c b/bcftools/str_finder.c.pysam.c
index 296c8673a..9214d0a01 100644
--- a/bcftools/str_finder.c.pysam.c
+++ b/bcftools/str_finder.c.pysam.c
@@ -3,7 +3,7 @@
 /*  str_finder.c -- Short Tandem Repeat finder.
     Originally from Crumble (https://github.com/jkbonfield/crumble)
 
-    Copyright (C) 2015-2016, 2021 Genome Research Ltd.
+    Copyright (C) 2015-2016, 2021-2022 Genome Research Ltd.
 
     Author: James Bonfield <jkb@sanger.ac.uk>
 
@@ -52,7 +52,7 @@ static void add_rep(rep_ele **list, char *cons, int clen, int pos, int rlen,
 	    return;
     }
 
-    // Find current and last occurence of repeated word.
+    // Find current and last occurrence of repeated word.
 
     cp2 = &cons[pos+1];
     // If unpadded, this is quicker: cp1 = &cons[pos+1-rlen];
@@ -139,6 +139,86 @@ static void add_rep(rep_ele **list, char *cons, int clen, int pos, int rlen,
  * Returns a list of rep_ele structs holding the start,end tuples of repeats;
  *         NULL on failure.
  */
+rep_ele *find_STR64(char *cons, int len, int lower_only) {
+    int i, j;
+    uint64_t w = 0;
+    rep_ele *reps = NULL;
+
+    for (i = j = 0; i < len && j < 26; i++) {
+	if (cons[i] == '*') continue;
+
+	w <<= 2;
+	w |= cons[i];
+	//printf("%3d %c w=%08x\n", i, cons[i], w);
+	if (j>= 1 && (w&0x0003) == ((w>> 2)&0x0003))
+	    add_rep(&reps, cons, len, i, 1, lower_only, w);
+	if (j>= 3 && (w&0x000f) == ((w>> 4)&0x000f))
+	    add_rep(&reps, cons, len, i, 2, lower_only, w);
+	if (j>= 5 && (w&0x003f) == ((w>> 6)&0x003f))
+	    add_rep(&reps, cons, len, i, 3, lower_only, w);
+	if (j>= 7 && (w&0x00ff) == ((w>> 8)&0x00ff))
+	    add_rep(&reps, cons, len, i, 4, lower_only, w);
+	if (j>= 9 && (w&0x03ff) == ((w>>10)&0x03ff))
+	    add_rep(&reps, cons, len, i, 5, lower_only, w);
+	if (j>=11 && (w&0x0fff) == ((w>>12)&0x0fff))
+	    add_rep(&reps, cons, len, i, 6, lower_only, w);
+	if (j>=13 && (w&0x3fff) == ((w>>14)&0x3fff))
+	    add_rep(&reps, cons, len, i, 7, lower_only, w);
+	if (j>=15 && (w&0xffff) == ((w>>16)&0xffff))
+	    add_rep(&reps, cons, len, i, 8, lower_only, w);
+	if (j>=17 && (w&0x003ffff) == ((w>>18)&0x003ffff))
+	    add_rep(&reps, cons, len, i, 9, lower_only, w);
+	if (j>=19 && (w&0x00fffff) == ((w>>20)&0x00fffff))
+	    add_rep(&reps, cons, len, i,10, lower_only, w);
+	if (j>=21 && (w&0x03fffff) == ((w>>22)&0x03fffff))
+	    add_rep(&reps, cons, len, i,11, lower_only, w);
+	if (j>=23 && (w&0x0ffffff) == ((w>>24)&0x0ffffff))
+	    add_rep(&reps, cons, len, i,12, lower_only, w);
+	if (j>=24 && (w&0x3ffffff) == ((w>>26)&0x3ffffff))
+	    add_rep(&reps, cons, len, i,13, lower_only, w);
+
+	j++;
+    }
+
+    for (; i < len; i++) {	
+	if (cons[i] == '*') continue;
+
+	w <<= 2;
+	w |= cons[i];
+	//printf("%3d %c w=%08x\n", i, cons[i], w);
+	if      ((w&0xfffffff) == ((w>>28)&0xfffffff))
+	    add_rep(&reps, cons, len, i, 14, lower_only, w);
+	else if ((w&0x3ffffff) == ((w>>26)&0x3ffffff))
+	    add_rep(&reps, cons, len, i, 13, lower_only, w);
+	else if ((w&0x0ffffff) == ((w>>24)&0x0ffffff))
+	    add_rep(&reps, cons, len, i, 12, lower_only, w);
+	else if ((w&0x03fffff) == ((w>>22)&0x03fffff))
+	    add_rep(&reps, cons, len, i, 11, lower_only, w);
+	else if ((w&0x00fffff) == ((w>>20)&0x00fffff))
+	    add_rep(&reps, cons, len, i, 10, lower_only, w);
+	else if ((w&0x003ffff) == ((w>>18)&0x003ffff))
+	    add_rep(&reps, cons, len, i, 9, lower_only, w);
+	else if ((w&0xffff) == ((w>>16)&0xffff))
+	    add_rep(&reps, cons, len, i, 8, lower_only, w);
+	else if ((w&0x3fff) == ((w>>14)&0x3fff))
+	    add_rep(&reps, cons, len, i, 7, lower_only, w);
+	else if ((w&0x0fff) == ((w>>12)&0x0fff))
+	    add_rep(&reps, cons, len, i, 6, lower_only, w);
+	else if ((w&0x03ff) == ((w>>10)&0x03ff))
+	    add_rep(&reps, cons, len, i, 5, lower_only, w);
+	else if ((w&0x00ff) == ((w>> 8)&0x00ff))
+	    add_rep(&reps, cons, len, i, 4, lower_only, w);
+	else if ((w&0x003f) == ((w>> 6)&0x003f))
+	    add_rep(&reps, cons, len, i, 3, lower_only, w);
+	else if ((w&0x000f) == ((w>> 4)&0x000f))
+	    add_rep(&reps, cons, len, i, 2, lower_only, w);
+	else if ((w&0x0003) == ((w>> 2)&0x0003))
+	    add_rep(&reps, cons, len, i, 1, lower_only, w);
+    }
+
+    return reps;
+}
+
 rep_ele *find_STR(char *cons, int len, int lower_only) {
     int i, j;
     uint32_t w = 0;
@@ -174,21 +254,21 @@ rep_ele *find_STR(char *cons, int len, int lower_only) {
 	w <<= 2;
 	w |= cons[i];
 	//printf("%3d %c w=%08x\n", i, cons[i], w);
-	if ((w&0xffff) == ((w>>16)&0xffff)) 
+	if ((w&0xffff) == ((w>>16)&0xffff))
 	    add_rep(&reps, cons, len, i, 8, lower_only, w);
-	else if ((w&0x3fff) == ((w>>14)&0x3fff)) 
+	else if ((w&0x3fff) == ((w>>14)&0x3fff))
 	    add_rep(&reps, cons, len, i, 7, lower_only, w);
-	else if ((w&0x0fff) == ((w>>12)&0x0fff)) 
+	else if ((w&0x0fff) == ((w>>12)&0x0fff))
 	    add_rep(&reps, cons, len, i, 6, lower_only, w);
-	else if ((w&0x03ff) == ((w>>10)&0x03ff)) 
+	else if ((w&0x03ff) == ((w>>10)&0x03ff))
 	    add_rep(&reps, cons, len, i, 5, lower_only, w);
-	else if ((w&0x00ff) == ((w>> 8)&0x00ff)) 
+	else if ((w&0x00ff) == ((w>> 8)&0x00ff))
 	    add_rep(&reps, cons, len, i, 4, lower_only, w);
-	else if ((w&0x003f) == ((w>> 6)&0x003f)) 
+	else if ((w&0x003f) == ((w>> 6)&0x003f))
 	    add_rep(&reps, cons, len, i, 3, lower_only, w);
-	else if ((w&0x000f) == ((w>> 4)&0x000f)) 
+	else if ((w&0x000f) == ((w>> 4)&0x000f))
 	    add_rep(&reps, cons, len, i, 2, lower_only, w);
-	else if ((w&0x0003) == ((w>> 2)&0x0003)) 
+	else if ((w&0x0003) == ((w>> 2)&0x0003))
 	    add_rep(&reps, cons, len, i, 1, lower_only, w);
     }
 
diff --git a/bcftools/str_finder.h b/bcftools/str_finder.h
index 242f59ec1..22f9f5941 100644
--- a/bcftools/str_finder.h
+++ b/bcftools/str_finder.h
@@ -1,7 +1,7 @@
 /*  str_finder.c -- Short Tandem Repeat finder.
     Originally from Crumble (https://github.com/jkbonfield/crumble)
 
-    Copyright (C) 2015-2016, 2021 Genome Research Ltd.
+    Copyright (C) 2015-2016, 2021, 2023 Genome Research Ltd.
 
     Author: James Bonfield <jkb@sanger.ac.uk>
 
@@ -48,6 +48,9 @@ typedef struct rep_ele {
  */
 rep_ele *find_STR(char *cons, int len, int lower_only);
 
+/* As above, but use a longer hash with longer STR elements found */
+rep_ele *find_STR64(char *cons, int len, int lower_only);
+
 /*
  * Returns an array of STR vs no-STR values.
  *         0  => non repetitive.
diff --git a/bcftools/variantkey.h b/bcftools/variantkey.h
index a74935fb7..3c7959674 100644
--- a/bcftools/variantkey.h
+++ b/bcftools/variantkey.h
@@ -35,7 +35,7 @@
  * @file variantkey.h
  * @brief VariantKey main functions.
  *
- * The functions provided here allows to generate and process a 64 bit Unsigned Integer Keys for Human Genetic Variants.
+ * The functions provided here allow the generation and processing of a 64 bit Unsigned Integer Keys for Human Genetic Variants.
  * The VariantKey is sortable for chromosome and position,
  * and it is also fully reversible for variants with up to 11 bases between Reference and Alternate alleles.
  * It can be used to sort, search and match variant-based data easily and very quickly.
@@ -92,7 +92,7 @@ static inline uint8_t encode_numeric_chrom(const char *chrom, size_t size)
     {
         if ((chrom[i] > '9') || (chrom[i] < '0'))
         {
-            return 0; // NA: a character that is not a numebr was found.
+            return 0; // NA: a character that is not a number was found.
         }
         v = ((v * 10) + (chrom[i] - '0'));
     }
diff --git a/bcftools/vcfannotate.c b/bcftools/vcfannotate.c
index b2e39ef7b..b66c8cf51 100644
--- a/bcftools/vcfannotate.c
+++ b/bcftools/vcfannotate.c
@@ -1,6 +1,6 @@
 /*  vcfannotate.c -- Annotate and edit VCF/BCF files.
 
-    Copyright (C) 2013-2023 Genome Research Ltd.
+    Copyright (C) 2013-2024 Genome Research Ltd.
 
     Author: Petr Danecek <pd3@sanger.ac.uk>
 
@@ -104,6 +104,19 @@ typedef struct _annot_col_t
 }
 annot_col_t;
 
+typedef struct
+{
+    char *name;     // column name
+    int ht_type;    // type, one of BCF_HT_STR,BCF_HT_INT,BCF_HT_REAL
+    int icol;       // index of the annotation column to use
+    union {         // memory area with the current annotation value to pass to filter_test_ext
+        int i;
+        float f;
+        char *s;
+    };
+}
+ext_t;
+
 // Logic of the filters: include or exclude sites which match the filters?
 #define FLT_INCLUDE 1
 #define FLT_EXCLUDE 2
@@ -125,7 +138,7 @@ typedef struct _args_t
     regitr_t *tgt_itr;
     int tgt_is_bed;
 
-    filter_t *filter;
+    filter_t *filter, *filter_ext;  // only one is initialized, the latter contains external values to set dynamically on the fly
     char *filter_str;
     int filter_logic;   // include or exclude sites which match the filters? One of FLT_INCLUDE/FLT_EXCLUDE
     int keep_sites;
@@ -149,6 +162,11 @@ typedef struct _args_t
     convert_t *set_ids;
     int set_ids_replace;
 
+    // external values for dynamic -i/-e expressions
+    int n_ext;
+    ext_t *ext;
+    void **ext_ptr;
+
     int nsmpl_annot;
     int *sample_map, nsample_map, sample_is_file;   // map[idst] -> isrc
     uint8_t *src_smpl_pld, *dst_smpl_pld;   // for Number=G format fields
@@ -170,6 +188,7 @@ typedef struct _args_t
     int argc, drop_header, record_cmd_line, tgts_is_vcf, mark_sites_logic, force, single_overlaps;
     int columns_is_file, has_append_mode, pair_logic;
     dbuf_t *header_lines;
+    bcf1_t *current_rec;    // current record for local setters
 }
 args_t;
 
@@ -510,17 +529,21 @@ static int vcf_getter_info_str2str(args_t *args, bcf1_t *rec, annot_col_t *col,
 static int vcf_getter_id2str(args_t *args, bcf1_t *rec, annot_col_t *col, void **ptr, int *mptr)
 {
     char *str = *((char**)ptr);
-    int len = strlen(rec->d.id);
+    int i, len = strlen(rec->d.id);
     if ( len >= *mptr ) str = realloc(str, len+1);
-    strcpy(str, rec->d.id);
+    for (i=0; i<len; i++)
+        str[i] = rec->d.id[i]==';' ? ',' : rec->d.id[i];
+    str[len] = 0;
     *((char**)ptr) = str;
     *mptr = len+1;
     return len;
 }
-static int vcf_getter_filter2str(args_t *args, bcf1_t *rec, annot_col_t *col, void **ptr, int *mptr)
+inline static int vcf_getter_filter2str_core(bcf_hdr_t *hdr, bcf1_t *rec, char **ptr, int *mptr)
 {
+    if ( !(rec->unpacked & BCF_UN_FLT) ) bcf_unpack(rec, BCF_UN_FLT);
+
     kstring_t str;
-    str.s = *((char**)ptr);
+    str.s = *ptr;
     str.m = *mptr;
     str.l = 0;
 
@@ -529,16 +552,24 @@ static int vcf_getter_filter2str(args_t *args, bcf1_t *rec, annot_col_t *col, vo
     {
         for (i=0; i<rec->d.n_flt; i++)
         {
-            if (i) kputc(';', &str);
-            kputs(bcf_hdr_int2id(args->tgts_hdr,BCF_DT_ID,rec->d.flt[i]), &str);
+            if (i) kputc(',', &str);
+            kputs(bcf_hdr_int2id(hdr,BCF_DT_ID,rec->d.flt[i]), &str);
         }
     }
     else kputc('.', &str);
 
-    *((char**)ptr) = str.s;
+    *ptr  = str.s;
     *mptr = str.m;
     return str.l;
 }
+static int vcf_getter_filter2str_local(args_t *args, bcf1_t *rec, annot_col_t *col, void **ptr, int *mptr)
+{
+    return vcf_getter_filter2str_core(args->hdr_out, args->current_rec, (char**)ptr, mptr);
+}
+static int vcf_getter_filter2str(args_t *args, bcf1_t *rec, annot_col_t *col, void **ptr, int *mptr)
+{
+    return vcf_getter_filter2str_core(args->tgts_hdr, rec, (char**)ptr, mptr);
+}
 static int setter_filter(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
 {
     if ( !data ) error("Error: the --merge-logic option cannot be used with FILTER (yet?)\n");
@@ -604,7 +635,7 @@ static int setter_pos(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
     char *tmp;
     int pos = strtol(tab->cols[col->icol], &tmp, 10);
     if ( tmp==tab->cols[col->icol] )
-        error("Could not parse ~POS at %s:%"PRId64" .. [%s]\n",bcf_seqname(args->hdr,line),(int64_t)line->pos+1,tab->cols[col->icol]);
+        error("Could not parse -POS at %s:%"PRId64" .. [%s]\n",bcf_seqname(args->hdr,line),(int64_t)line->pos+1,tab->cols[col->icol]);
     line->pos = pos - 1;
     return 0;
 }
@@ -1155,6 +1186,29 @@ void khash_str2int_clear_free(void *_hash)
         if (kh_exist(hash, k)) free((char*)kh_key(hash, k));
     kh_clear(str2int, hash);
 }
+static const char *escape_string(const char *str, char needle[], char **rmme, size_t *len)
+{
+    kstring_t tmp = {0,0,0};
+    const char *bp = str, *ep = str;
+    while ( *ep )
+    {
+        int i = 0;
+        while ( needle[i] && needle[i]!=*ep ) i++;
+        if ( !needle[i] ) { ep++; continue; }
+        kputsn(bp,ep-bp,&tmp);
+        ksprintf(&tmp,"%%%X",*ep);
+        bp = ++ep;
+    }
+    if ( !tmp.l )
+    {
+        *len = strlen(str);
+        return str;
+    }
+    kputs(bp,&tmp);
+    *len  = tmp.l;
+    *rmme = tmp.s;
+    return tmp.s;
+}
 static int setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
 {
     if ( (col->replace & REPLACE_MISSING) && col->number!=BCF_VL_A && col->number!=BCF_VL_R )
@@ -1168,13 +1222,17 @@ static int setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, void *d
     if ( col->replace & SET_OR_APPEND ) col->merge_method=MM_UNIQUE;
 
     annot_line_t *tab = (annot_line_t*) data;
+    const char *escaped = NULL;
+    char *rmme = NULL;
 
-    int len = 0;
+    size_t len = 0;
     if ( tab )
     {
-        len = strlen(tab->cols[col->icol]);
-        if ( !len ) return 0;
-        if ( len==1 && tab->cols[col->icol][0]=='.' && col->merge_method!=MM_APPEND_MISSING && !(col->replace & CARRY_OVER_MISSING) ) return 1;
+        char *str = tab->cols[col->icol];
+        if ( !str || !*str ) return 0;
+        if ( !str[1] && str[0]=='.' && col->merge_method!=MM_APPEND_MISSING && !(col->replace & CARRY_OVER_MISSING) ) return 1;
+        char needle[] = {';','=',0};
+        escaped = escape_string(tab->cols[col->icol],needle,&rmme,&len);
     }
 
     if ( col->merge_method!=MM_FIRST )
@@ -1188,8 +1246,12 @@ static int setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, void *d
             if ( col->merge_method==MM_UNIQUE )
             {
                 if ( !col->mm_str_hash ) col->mm_str_hash = (khash_t(str2int)*)khash_str2int_init();
-                if ( khash_str2int_has_key(col->mm_str_hash, tab->cols[col->icol]) ) return 1;
-                khash_str2int_inc(col->mm_str_hash, strdup(tab->cols[col->icol]));
+                if ( khash_str2int_has_key(col->mm_str_hash, escaped) )
+                {
+                    free(rmme);
+                    return 1;
+                }
+                khash_str2int_inc(col->mm_str_hash, strdup(escaped));
             }
 
             if ( (col->replace & SET_OR_APPEND) && !col->mm_kstr.l )
@@ -1201,17 +1263,20 @@ static int setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, void *d
             }
 
             if ( col->mm_kstr.l ) kputc(',',&col->mm_kstr);
-            kputs(tab->cols[col->icol], &col->mm_kstr);
+            kputs(escaped, &col->mm_kstr);
+            free(rmme);
             return 1;
         }
-
         if ( col->mm_kstr.l )
         {
             hts_expand(char,col->mm_kstr.l+1,args->mtmps,args->tmps);
             memcpy(args->tmps,col->mm_kstr.s,col->mm_kstr.l+1);
         }
         else
+        {
+            free(rmme);
             return 0;
+        }
 
         // flush the line
         if ( col->merge_method==MM_UNIQUE )
@@ -1222,13 +1287,13 @@ static int setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, void *d
     {
         assert(tab);
         hts_expand(char,len+1,args->mtmps,args->tmps);
-        memcpy(args->tmps,tab->cols[col->icol],len+1);
-
+        memcpy(args->tmps,escaped,len+1);
         if ( col->number==BCF_VL_A || col->number==BCF_VL_R )
             return setter_ARinfo_string(args,line,col,tab->nals,tab->als);
     }
-
-    return bcf_update_info_string(args->hdr_out,line,col->hdr_key_dst,args->tmps);
+    int ret = bcf_update_info_string(args->hdr_out,line,col->hdr_key_dst,args->tmps);
+    free(rmme);
+    return ret;
 }
 static int vcf_setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
 {
@@ -1662,11 +1727,18 @@ static int setter_format_str(args_t *args, bcf1_t *line, annot_col_t *col, void
     if ( col->icol+args->nsmpl_annot > tab->ncols )
         error("Incorrect number of values for %s at %s:%"PRId64"\n",col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
 
+    char needle[] = {':',0};
     int ismpl;
     for (ismpl=0; ismpl<args->nsmpl_annot; ismpl++)
-        args->tmpp[ismpl] = tab->cols[col->icol + ismpl];
-
-    return core_setter_format_str(args,line,col,args->tmpp);
+    {
+        size_t len;
+        char *rmme = NULL;
+        const char *str = escape_string(tab->cols[col->icol + ismpl],needle,&rmme,&len);
+        args->tmpp[ismpl] = rmme ? rmme : strdup(str);
+    }
+    int ret = core_setter_format_str(args,line,col,args->tmpp);
+    for (ismpl=0; ismpl<args->nsmpl_annot; ismpl++) free(args->tmpp[ismpl]);
+    return ret;
 }
 static int determine_ploidy(int nals, int *vals, int nvals1, uint8_t *smpl, int nsmpl)
 {
@@ -2198,7 +2270,23 @@ static void init_columns(args_t *args)
         kputsn(ss, se-ss, &str);
         if ( !str.s[0] || !strcasecmp("-",str.s) ) ;
         else if ( !strcasecmp("CHROM",str.s) ) args->chr_idx = icol;
-        else if ( !strcasecmp("POS",str.s) ) args->beg_idx = icol;
+        else if ( !strcasecmp("POS",str.s) )
+        {
+            if ( replace==REPLACE_NON_MISSING && !args->tgts_is_vcf )
+            {
+                args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
+                annot_col_t *col = &args->cols[args->ncols-1];
+                memset(col,0,sizeof(*col));
+                col->icol = icol;
+                col->replace = replace;
+                col->setter  = setter_pos;
+                col->hdr_key_src = strdup(str.s);
+                col->hdr_key_dst = strdup(str.s);
+                args->match_end = icol;
+            }
+            else
+                args->beg_idx = icol;
+        }
         else if ( !strcasecmp("FROM",str.s) || !strcasecmp("BEG",str.s) ) args->beg_idx = icol;
         else if ( !strcasecmp("TO",str.s) || !strcasecmp("END",str.s) ) args->end_idx = icol;
         else if ( !strcasecmp("REF",str.s) )
@@ -2257,9 +2345,23 @@ static void init_columns(args_t *args)
             col->hdr_key_dst = strdup(str.s);
             args->match_end = icol;
         }
-        else if ( !strcasecmp("~POS",str.s) && !args->tgts_is_vcf )
+        else if ( !strcasecmp("~POS",str.s) )
+        {
+            error("Error: the use of ~POS has been deprecated, use -POS to transfer the column POS.\n");
+        }
+        else if ( str.s[0]=='~' )
         {
-            if ( args->tgts_is_vcf ) error("Error: cannot use ~POS, position can be replaced only from a tab-delimited file\n");
+            args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
+            annot_col_t *col = &args->cols[args->ncols-1];
+            memset(col,0,sizeof(*col));
+            col->icol = icol;
+            col->replace = MATCH_VALUE;
+            col->setter  = NULL;
+            col->hdr_key_src = strdup(str.s+1);
+        }
+        else if ( !strcasecmp("-POS",str.s) && !args->tgts_is_vcf )
+        {
+            if ( args->tgts_is_vcf ) error("Error: cannot use -POS, position can be replaced only from a tab-delimited file\n");
             args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
             annot_col_t *col = &args->cols[args->ncols-1];
             memset(col,0,sizeof(*col));
@@ -2290,10 +2392,30 @@ static void init_columns(args_t *args)
             if ( bcf_hdr_id2type(args->tgts_hdr,BCF_HL_INFO,hdr_id)!=BCF_HT_STR )
                 error("Only Type=String tags can be used to annotate the ID column\n");
         }
-        else if ( (ptr=strstr(str.s,":=")) && !args->targets_fname )
+        else if ( (ptr=strstr(str.s,":=")) && (!args->targets_fname || !strncasecmp(ptr+2,"./",2)) )
         {
             *ptr = 0;
-            rename_annots_push(args,ptr+2,str.s);
+            if ( !strncasecmp(str.s,"INFO/",5) && (!strcasecmp(ptr+2,"FILTER") || !strcasecmp(ptr+2,"./FILTER")) )
+            {
+                // -a not present and transferring filter, needs to be a local transfer
+                args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
+                annot_col_t *col = &args->cols[args->ncols-1];
+                memset(col,0,sizeof(*col));
+                col->icol = icol;
+                col->replace = replace;
+                col->setter = vcf_setter_info_str;
+                col->getter = vcf_getter_filter2str_local;
+                col->hdr_key_src = strdup(ptr+2);
+                col->hdr_key_dst = strdup(str.s+5);
+                tmp.l = 0;
+                ksprintf(&tmp,"##INFO=<ID=%s,Number=1,Type=String,Description=\"Transferred FILTER column\">",col->hdr_key_dst);
+                bcf_hdr_append(args->hdr_out, tmp.s);
+                if (bcf_hdr_sync(args->hdr_out) < 0) error_errno("[%s] Failed to update header", __func__);
+                int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, col->hdr_key_dst);
+                col->number = bcf_hdr_id2length(args->hdr_out,BCF_HL_INFO,hdr_id);
+            }
+            else
+                rename_annots_push(args,ptr+2,str.s);
             *ptr = ':';
         }
         else if ( !strcasecmp("FILTER",str.s) )
@@ -2487,6 +2609,13 @@ static void init_columns(args_t *args)
                           "       (the annotation type is modified to \"Number=.\" and allele ordering is disregarded)\n");
                 fprintf(stderr,"Warning: the =INFO/TAG feature modifies the annotation to \"Number=.\" and disregards allele ordering\n");
             }
+
+            args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
+            annot_col_t *col = &args->cols[args->ncols-1];
+            memset(col,0,sizeof(*col));
+            col->icol = icol;
+            col->replace = replace;
+
             int explicit_src_info = 0;
             int explicit_dst_info = 0;
             char *key_dst;
@@ -2517,15 +2646,14 @@ static void init_columns(args_t *args)
                     key_src[-2] = ':';
                     error("Did you mean \"FMT/%s\" rather than \"%s\"?\n",str.s,str.s);
                 }
+                else if ( !strcasecmp("FILTER",key_src) && args->tgts_is_vcf )
+                {
+                    col->getter = vcf_getter_filter2str;
+                }
             }
             else
                 key_src = key_dst;
 
-            args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
-            annot_col_t *col = &args->cols[args->ncols-1];
-            memset(col,0,sizeof(*col));
-            col->icol = icol;
-            col->replace = replace;
             col->hdr_key_src = strdup(key_src);
             col->hdr_key_dst = strdup(key_dst);
 
@@ -2782,7 +2910,7 @@ static void rename_annots(args_t *args)
         while ( *ptr && isspace(*ptr) ) ptr++;
         if ( !*ptr ) { *rmme = ' '; error("Could not parse: %s\n", args->rename_annots_map[i]); }
         if ( rename_annots_core(args, args->rename_annots_map[i], ptr) < 0 )
-            error("Could not parse \"%s %s\", expected INFO, FORMAT, or FILTER prefix\n",args->rename_annots_map[i],ptr);
+            error("Cannot rename \"%s\" to \"%s\"\n",args->rename_annots_map[i],ptr);
     }
 }
 static void rename_annots_push(args_t *args, char *src, char *dst)
@@ -2793,6 +2921,106 @@ static void rename_annots_push(args_t *args, char *src, char *dst)
     ksprintf(&str,"%s %s",src,dst);
     args->rename_annots_map[ args->rename_annots_nmap - 1 ] = str.s;
 }
+static void init_filters(args_t *args)
+{
+    // Check if the -i/-e expressions contain external values that should be determined
+    // on the fly from the annotation file. The expressions can be given as
+    //      TAG={NAME}
+    //      TAG={str:NAME}
+    //      TAG={int:NAME}
+    //      TAG={float:NAME}
+    kstring_t str = {0,0,0};
+    char *src = strdup(args->filter_str);
+    int len = 0;
+    while (1)
+    {
+        char *beg = strchr(src+len,'{');
+        if ( !beg ) break;
+
+        // check if "{" appears inside quotes, in such case do not modify
+        char skip = 0;
+        char *tmp = src;
+        while ( tmp<beg )
+        {
+            if ( tmp[0]!='"' && tmp[0]!='\'' ) { tmp++; continue; }
+
+            // quote character found
+            int quote = tmp[0];
+            tmp++;
+            while ( *tmp && tmp[0]!=quote ) tmp++;
+            if ( !*tmp ) error("Could not parse the expression: %s\n",args->filter_str);    // unbalanced quotation; todo: check for escape char
+            len = tmp - src + 1;
+            skip = 1;
+        }
+        if ( skip ) continue;
+
+        char *end = ++beg;
+        while ( *end && *end!='}' ) end++;
+        if ( !*end ) error("Could not parse the expression: %s\n",args->filter_str);
+        *end = 0;
+
+        // explicit typing?
+        int type = -1;
+        tmp = beg;
+        while ( *tmp && *tmp!=':' ) tmp++;
+        if ( *tmp )
+        {
+            *tmp = 0;
+            if ( !strcasecmp(beg,"str") ) type = BCF_HT_STR;
+            else if ( !strcasecmp(beg,"int") ) type = BCF_HT_INT;
+            else if ( !strcasecmp(beg,"float") ) type = BCF_HT_REAL;
+        }
+        args->n_ext++;
+        args->ext = (ext_t*)realloc(args->ext,sizeof(*args->ext)*args->n_ext);
+        ext_t *ext = &args->ext[args->n_ext-1];
+        ext->ht_type = type;
+        ext->name = strdup(beg);
+        if ( beg-1 > src ) kputsn(src,beg-1-src,&str);
+        if ( type==-1 ) kputs("{}",&str);
+        else if ( type==BCF_HT_STR ) kputs("{str}",&str);
+        else if ( type==BCF_HT_INT ) kputs("{int}",&str);
+        else if ( type==BCF_HT_REAL ) kputs("{float}",&str);
+        len = str.l;
+        kputs(end+1,&str);
+        free(src);
+        src = strdup(str.s);
+        str.l = 0;
+    }
+    args->filter = filter_init(args->hdr, src);
+    free(src);
+    free(str.s);
+
+    int i,j,n_ext;
+    const int *ext_type = filter_ext_types(args->filter, &n_ext);
+    if ( n_ext != args->n_ext )
+        error("Failed to parse the expression, unexpected number of dynamic variables (%d vs %d): %s\n",n_ext,args->n_ext,args->filter_str);
+
+    if ( !args->n_ext ) return;
+
+    if ( !args->tgts )
+        error("Error: dynamic variables in -i/-e expressions can be currently used only with tab-delimited file, not with VCF (todo)\n");
+
+    // contains external values
+    args->ext_ptr = malloc(sizeof(*args->ext_ptr)*args->n_ext);
+    for (i=0; i<args->n_ext; i++) args->ext[i].ht_type = ext_type[i];
+    args->filter_ext = args->filter;
+    args->filter = NULL;
+
+    // set the column idx
+    if ( args->ncols )
+    {
+        for (i=0; i<args->n_ext; i++)
+        {
+            for (j=0; j<args->ncols; j++)
+            {
+                if ( strcmp(args->ext[i].name,args->cols[j].hdr_key_src) ) continue;
+                args->ext[i].icol = args->cols[j].icol;
+                break;
+            }
+            if ( j==args->ncols ) error("No such column: %s\n",args->ext[i].name);
+        }
+    }
+}
 
 static void init_data(args_t *args)
 {
@@ -2861,7 +3089,7 @@ static void init_data(args_t *args)
     args->vcmp = vcmp_init();
 
     if ( args->filter_str )
-        args->filter = filter_init(args->hdr, args->filter_str);
+        init_filters(args);
 
     if ( args->mark_sites )
     {
@@ -2890,13 +3118,22 @@ static void init_data(args_t *args)
         if ( args->n_threads )
             hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->files->p);
         if ( bcf_hdr_write(args->out_fh, args->hdr_out)!=0 ) error("[%s] Error: failed to write the header to %s\n", __func__,args->output_fname);
-        if ( args->write_index && init_index(args->out_fh,args->hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname);
+        if ( init_index2(args->out_fh,args->hdr,args->output_fname,
+                         &args->index_fn, args->write_index) < 0 )
+            error("Error: failed to initialise index for %s\n",args->output_fname);
     }
 }
 
 static void destroy_data(args_t *args)
 {
     int i;
+    for (i=0; i<args->n_ext; i++)
+    {
+        free(args->ext[i].name);
+        if ( args->ext[i].ht_type!=BCF_HT_STR ) continue;
+    }
+    free(args->ext_ptr);
+    free(args->ext);
     for (i=0; i<args->nrm; i++) free(args->rm[i].key);
     free(args->rm);
     if ( args->hdr_out ) bcf_hdr_destroy(args->hdr_out);
@@ -2951,8 +3188,8 @@ static void destroy_data(args_t *args)
     free(args->dst_smpl_pld);
     if ( args->set_ids )
         convert_destroy(args->set_ids);
-    if ( args->filter )
-        filter_destroy(args->filter);
+    if ( args->filter ) filter_destroy(args->filter);
+    if ( args->filter_ext ) filter_destroy(args->filter_ext);
     if (args->out_fh)
     {
         if ( args->write_index )
@@ -3031,7 +3268,7 @@ static void buffer_annot_lines(args_t *args, bcf1_t *line, int start_pos, int en
         }
         else i++;
     }
-    if ( args->ref_idx==-1 && args->nalines ) return;
+    if ( !args->filter_ext && args->ref_idx==-1 && args->nalines ) return;
 
     while ( !bcf_sr_regions_overlap(args->tgts, bcf_seqname(args->hdr,line), start_pos,end_pos) )
     {
@@ -3043,7 +3280,7 @@ static void buffer_annot_lines(args_t *args, bcf1_t *line, int start_pos, int en
         tmp->start = args->tgts->start;
         tmp->end   = args->tgts->end;
         parse_annot_line(args, args->tgts->line.s, tmp);
-        if ( args->ref_idx != -1 )
+        if ( args->filter_ext || args->ref_idx != -1 )
         {
             int iseq = args->tgts->iseq;
             if ( bcf_sr_regions_next(args->tgts)<0 || args->tgts->iseq!=iseq ) break;
@@ -3082,164 +3319,181 @@ static int strstr_match(char *a, char *b)
     }
     return 0;
 }
-static void annotate(args_t *args, bcf1_t *line)
+static int annotate_from_regidx(args_t *args, bcf1_t *line)
 {
-    int i, j;
-    for (i=0; i<args->nrm; i++)
-        args->rm[i].handler(args, line, &args->rm[i]);
-
+    int j;
     int has_overlap = 0;
-    if ( args->tgt_idx )
+
+    for (j=0; j<args->ncols; j++) args->cols[j].done = 0;
+    if ( regidx_overlap(args->tgt_idx, bcf_seqname(args->hdr,line),line->pos,line->pos+line->rlen-1, args->tgt_itr) )
     {
-        for (j=0; j<args->ncols; j++) args->cols[j].done = 0;
-        if ( regidx_overlap(args->tgt_idx, bcf_seqname(args->hdr,line),line->pos,line->pos+line->rlen-1, args->tgt_itr) )
+        hts_pos_t vcf_end = line->pos + line->rlen - 1;
+        while ( regitr_overlap(args->tgt_itr) )
         {
-            hts_pos_t vcf_end = line->pos + line->rlen - 1;
-            while ( regitr_overlap(args->tgt_itr) )
-            {
-                annot_line_t *tmp = &args->alines[0];
-                tmp->rid   = line->rid;
-                tmp->start = args->tgt_itr->beg;
-                tmp->end   = args->tgt_itr->end;
+            annot_line_t *tmp = &args->alines[0];
+            tmp->rid   = line->rid;
+            tmp->start = args->tgt_itr->beg;
+            tmp->end   = args->tgt_itr->end;
 
-                // Check min overlap
-                int len_ann = tmp->end - tmp->start + 1;
-                int len_vcf = line->rlen;
-                int isec = (tmp->end < vcf_end ? tmp->end : vcf_end) - (tmp->start > line->pos ? tmp->start : line->pos) + 1;
-                assert( isec > 0 );
-                if ( args->min_overlap_ann && args->min_overlap_ann > (float)isec/len_ann ) continue;
-                if ( args->min_overlap_vcf && args->min_overlap_vcf > (float)isec/len_vcf ) continue;
+            // Check min overlap
+            int len_ann = tmp->end - tmp->start + 1;
+            int len_vcf = line->rlen;
+            int isec = (tmp->end < vcf_end ? tmp->end : vcf_end) - (tmp->start > line->pos ? tmp->start : line->pos) + 1;
+            assert( isec > 0 );
+            if ( args->min_overlap_ann && args->min_overlap_ann > (float)isec/len_ann ) continue;
+            if ( args->min_overlap_vcf && args->min_overlap_vcf > (float)isec/len_vcf ) continue;
 
-                parse_annot_line(args, regitr_payload(args->tgt_itr,char*), tmp);
-                for (j=0; j<args->ncols; j++)
-                {
-                    if ( args->cols[j].done==1 ) continue;
-                    int ret = args->cols[j].setter(args,line,&args->cols[j],tmp);
-                    if ( ret < 0 )
-                        error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
-                    if ( ret==0 )
-                        args->cols[j].done = 1;
-                    has_overlap = 1;
-                }
+            parse_annot_line(args, regitr_payload(args->tgt_itr,char*), tmp);
+
+            // If a plain BED file is provided and we are asked to just mark overlapping sites, there are
+            // no additional columns. Not sure if there can be any side effects for ill-formatted BED files
+            // with variable number of columns
+            if ( !args->ncols && args->mark_sites ) has_overlap = 1;
+
+            for (j=0; j<args->ncols; j++)
+            {
+                if ( args->cols[j].done==1 ) continue;
+                int ret = args->cols[j].setter(args,line,&args->cols[j],tmp);
+                if ( ret < 0 )
+                    error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
+                if ( ret==0 )
+                    args->cols[j].done = 1;
+                has_overlap = 1;
             }
         }
-        for (j=0; j<args->ncols; j++)
+    }
+    for (j=0; j<args->ncols; j++)
+    {
+        if ( args->cols[j].done==1 || args->cols[j].merge_method == MM_FIRST ) continue;
+        if ( !args->cols[j].setter ) continue;
+        if ( args->cols[j].setter(args,line,&args->cols[j],NULL) < 0 )
+            error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
+    }
+    return has_overlap;
+}
+static int pass_filter_test_ext(args_t *args, bcf1_t *line, annot_line_t *ann)
+{
+    char *tmp;
+    int i;
+    for (i=0; i<args->n_ext; i++)
+    {
+        int j = args->ext[i].icol;
+        if ( args->ext[i].ht_type==BCF_HT_STR ) args->ext_ptr[i] = args->ext[i].s = ann->cols[j];
+        else if ( args->ext[i].ht_type==BCF_HT_INT )
         {
-            if ( args->cols[j].done==1 || args->cols[j].merge_method == MM_FIRST ) continue;
-            if ( !args->cols[j].setter ) continue;
-            if ( args->cols[j].setter(args,line,&args->cols[j],NULL) < 0 )
-                error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
+            args->ext[i].i = strtol(ann->cols[j],&tmp,10);
+            if ( *tmp )
+            {
+                if ( strcmp(".",ann->cols[j]) ) error("Error: could not parse the annotation file, expected an integer, found \"%s\"\n",ann->cols[j]);
+                args->ext_ptr[i] = NULL;
+            }
+            else
+                args->ext_ptr[i] = &args->ext[i].i;
+        }
+        else if ( args->ext[i].ht_type==BCF_HT_REAL )
+        {
+            args->ext[i].f = strtod(ann->cols[j],&tmp);
+            if ( *tmp )
+            {
+                if ( strcmp(".",ann->cols[j]) ) error("Error: could not parse the annotation file, expected a float, found \"%s\"\n",ann->cols[j]);
+                args->ext_ptr[i] = NULL;
+            }
+            else
+                args->ext_ptr[i] = &args->ext[i].f;
         }
     }
-    else if ( args->tgts )
-    {
-        // Buffer annotation lines. When multiple ALT alleles are present in the annotation file, at least one
-        // must match some of the VCF alleles. If the append-missing mode is set (and REF+ALT is requested), the
-        // buffered lines will annotate the VCF respecting the order in ALT and when no matching line is found
-        // for an ALT, missing value is appended instead.
-        int end_pos = line->pos + line->rlen - 1;
-        buffer_annot_lines(args, line, line->pos, end_pos);
+    int pass = filter_test_ext(args->filter_ext,line,NULL,(const void**)args->ext_ptr);
+    if ( args->filter_logic==FLT_EXCLUDE ) pass = pass ? 0 : 1;
+    return pass;
+}
+static int annotate_from_tab(args_t *args, bcf1_t *line)
+{
+    int i,j;
+    int has_overlap = 0;
+
+    // Buffer annotation lines. When multiple ALT alleles are present in the annotation file, at least one
+    // must match some of the VCF alleles. If the append-missing mode is set (and REF+ALT is requested), the
+    // buffered lines will annotate the VCF respecting the order in ALT and when no matching line is found
+    // for an ALT, missing value is appended instead.
+    int end_pos = line->pos + line->rlen - 1;
+    buffer_annot_lines(args, line, line->pos, end_pos);
 
-        args->nsrt_alines = 0;
-        hts_expand(uint32_t,args->nalines,args->msrt_alines,args->srt_alines);
-        if ( args->nalines >= 0xffff || line->n_allele >= 0xffff )
-            error("Error: too many alleles or annotation lines in the buffer at %s:%"PRId64" (todo:skip?)\n",bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
+    args->nsrt_alines = 0;
+    hts_expand(uint32_t,args->nalines,args->msrt_alines,args->srt_alines);
+    if ( args->nalines >= 0xffff || line->n_allele >= 0xffff )
+        error("Error: too many alleles or annotation lines in the buffer at %s:%"PRId64" (todo:skip?)\n",bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
 
-        kstring_t match_end = {0,0,0};
-        if ( args->match_end>=0 && bcf_get_info_int32(args->hdr,line,"END",&args->tmpi,&args->mtmpi)==1 )
-            kputw(args->tmpi[0],&match_end);
+    kstring_t match_end = {0,0,0};
+    if ( args->match_end>=0 && bcf_get_info_int32(args->hdr,line,"END",&args->tmpi,&args->mtmpi)==1 )
+        kputw(args->tmpi[0],&match_end);
 
-        // Find matching lines
-        for (i=0; i<args->nalines; i++)
+    // Find matching lines
+    for (i=0; i<args->nalines; i++)
+    {
+        if ( line->pos > args->alines[i].end || end_pos < args->alines[i].start ) continue;
+        if ( args->ref_idx != -1 )  // REF+ALT matching requested
         {
-            if ( line->pos > args->alines[i].end || end_pos < args->alines[i].start ) continue;
-            if ( args->ref_idx != -1 )  // REF+ALT matching requested
+            if ( line->pos!=args->alines[i].start || vcmp_set_ref(args->vcmp, line->d.allele[0], args->alines[i].als[0]) < 0 ) continue;   // refs are not compatible
+            for (j=1; j<args->alines[i].nals; j++)
             {
-                if ( line->pos!=args->alines[i].start || vcmp_set_ref(args->vcmp, line->d.allele[0], args->alines[i].als[0]) < 0 ) continue;   // refs are not compatible
-                for (j=1; j<args->alines[i].nals; j++)
+                int ialt;
+                if ( line->n_allele==1 && args->alines[i].als[j][0]=='.' && args->alines[i].als[j][1]==0 )  // match: no ALT allele in VCF and annot file has "."
+                    ialt = 0;
+                else
                 {
-                    int ialt;
-                    if ( line->n_allele==1 && args->alines[i].als[j][0]=='.' && args->alines[i].als[j][1]==0 )  // match: no ALT allele in VCF and annot file has "."
-                        ialt = 0;
-                    else
-                    {
-                        ialt = vcmp_find_allele(args->vcmp, line->d.allele+1, line->n_allele - 1, args->alines[i].als[j]);
-                        if ( ialt < 0 ) continue;
-                        ialt++;
-                    }
-                    if ( args->match_id>=0 && !strstr_match(line->d.id,args->alines[i].cols[args->match_id]) ) continue;
-                    if ( args->match_end>=0 && match_end.l && strcmp(match_end.s,args->alines[i].cols[args->match_end]) ) continue;
-                    args->srt_alines[args->nsrt_alines++] = (ialt<<16) | i;
-                    has_overlap = 1;
-                    break;
+                    ialt = vcmp_find_allele(args->vcmp, line->d.allele+1, line->n_allele - 1, args->alines[i].als[j]);
+                    if ( ialt < 0 ) continue;
+                    ialt++;
                 }
+                if ( args->match_id>=0 && !strstr_match(line->d.id,args->alines[i].cols[args->match_id]) ) continue;
+                if ( args->match_end>=0 && match_end.l && strcmp(match_end.s,args->alines[i].cols[args->match_end]) ) continue;
+                if ( args->filter_ext && !pass_filter_test_ext(args,line,&args->alines[i]) ) continue;
+                args->srt_alines[args->nsrt_alines++] = (ialt<<16) | i;
+                has_overlap = 1;
+                break;
             }
-            else    // overlap, REF+ALT matching not requested
+        }
+        else if ( args->filter_ext )
+        {
+            if ( pass_filter_test_ext(args,line,&args->alines[i]) )
             {
                 args->srt_alines[args->nsrt_alines++] = (0xffff<<16) | i;
                 has_overlap = 1;
             }
         }
+        else    // overlap, REF+ALT matching not requested
+        {
+            args->srt_alines[args->nsrt_alines++] = (0xffff<<16) | i;
+            has_overlap = 1;
+        }
+    }
 
-        free(match_end.s);
+    free(match_end.s);
+    if ( !has_overlap && args->filter_ext && !args->keep_sites ) return has_overlap;
 
-        // Sort lines if needed
+    // Sort lines if needed
+    if ( args->has_append_mode )
+    {
+        // insertion sort by VCF ALT index (top bits) and alines index (low bits)
+        uint32_t tmp;
+        for (i=1; i<args->nsrt_alines; i++)
+            for (j=i; j>0 && args->srt_alines[j] < args->srt_alines[j-1]; j--)
+                tmp = args->srt_alines[j], args->srt_alines[j] = args->srt_alines[j-1], args->srt_alines[j-1] = tmp;
+    }
+    // Annotate
+    for (j=0; j<args->ncols; j++) args->cols[j].done = 0;
+    int ialt_exp = 1;
+    for (i=0; i<args->nsrt_alines; i++)
+    {
+        int ialt = args->srt_alines[i] >> 16;
+        int ilin = args->srt_alines[i] & 0xffff;
         if ( args->has_append_mode )
         {
-            // insertion sort by VCF ALT index (top bits) and alines index (low bits)
-            uint32_t tmp;
-            for (i=1; i<args->nsrt_alines; i++)
-                for (j=i; j>0 && args->srt_alines[j] < args->srt_alines[j-1]; j--)
-                    tmp = args->srt_alines[j], args->srt_alines[j] = args->srt_alines[j-1], args->srt_alines[j-1] = tmp;
-        }
-        // Annotate
-        for (j=0; j<args->ncols; j++) args->cols[j].done = 0;
-        int ialt_exp = 1;
-        for (i=0; i<args->nsrt_alines; i++)
-        {
-            int ialt = args->srt_alines[i] >> 16;
-            int ilin = args->srt_alines[i] & 0xffff;
-            if ( args->has_append_mode )
+            if ( ialt_exp > ialt ) continue;    // multiple annotation lines for the same position
+            if ( ialt_exp < ialt )
             {
-                if ( ialt_exp > ialt ) continue;    // multiple annotation lines for the same position
-                if ( ialt_exp < ialt )
-                {
-                    // REF+ALT matching requested, append-missing mode: insert "." if no annotation line was found for the ALT
-                    while ( ialt_exp++ < ialt )
-                    {
-                        for (j=0; j<args->ncols; j++)
-                        {
-                            if ( args->cols[j].merge_method != MM_APPEND_MISSING ) continue;
-                            if ( args->cols[j].done==1 ) continue;
-                            if ( !args->cols[j].setter ) continue;
-                            int ret = args->cols[j].setter(args,line,&args->cols[j],args->aline_missing);
-                            if ( ret < 0 )
-                                error("fixme: Could not set missing %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
-                            if ( ret==0 )
-                                args->cols[j].done = 1;
-                        }
-                    }
-                }
-            }
-            for (j=0; j<args->ncols; j++)
-            {
-                if ( args->cols[j].done==1 ) continue;
-                if ( !args->cols[j].setter ) continue;
-                int ret = args->cols[j].setter(args,line,&args->cols[j],&args->alines[ilin]);
-                if ( ret < 0 )
-                    error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
-                if ( ret==0 )
-                    args->cols[j].done = 1;
-            }
-            ialt_exp = ialt + 1;
-        }
-        if ( args->nsrt_alines )
-        {
-            // In the append-missing mode fill missing values to all trailing ALTs, but only if at least one
-            // record was found. Otherwise leave the row will be left without annotation.
-            if ( args->has_append_mode && ialt_exp < line->n_allele )
-            {
-                while ( ialt_exp++ < line->n_allele )
+                // REF+ALT matching requested, append-missing mode: insert "." if no annotation line was found for the ALT
+                while ( ialt_exp++ < ialt )
                 {
                     for (j=0; j<args->ncols; j++)
                     {
@@ -3254,32 +3508,97 @@ static void annotate(args_t *args, bcf1_t *line)
                     }
                 }
             }
-            // Flush
-            for (j=0; j<args->ncols; j++)
-            {
-                if ( args->cols[j].done==1 || args->cols[j].merge_method == MM_FIRST ) continue;
-                if ( !args->cols[j].setter ) continue;
-                int ret = args->cols[j].setter(args,line,&args->cols[j],NULL);
-                if ( ret < 0 )
-                    error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
-            }
         }
+        for (j=0; j<args->ncols; j++)
+        {
+            if ( args->cols[j].done==1 ) continue;
+            if ( !args->cols[j].setter ) continue;
+            int ret = args->cols[j].setter(args,line,&args->cols[j],&args->alines[ilin]);
+            if ( ret < 0 )
+                error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
+            if ( ret==0 )
+                args->cols[j].done = 1;
+        }
+        ialt_exp = ialt + 1;
     }
-    else if ( args->files->nreaders == 2 )
+    if ( args->nsrt_alines )
     {
-        if ( bcf_sr_has_line(args->files,1) )
+        // In the append-missing mode fill missing values to all trailing ALTs, but only if at least one
+        // record was found. Otherwise leave the row will be left without annotation.
+        if ( args->has_append_mode && ialt_exp < line->n_allele )
         {
-            bcf1_t *aline = bcf_sr_get_line(args->files,1);
-            for (j=0; j<args->ncols; j++)
+            while ( ialt_exp++ < line->n_allele )
             {
-                if ( !args->cols[j].setter ) continue;
-                if ( args->cols[j].setter(args,line,&args->cols[j],aline) )
-                    error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
+                for (j=0; j<args->ncols; j++)
+                {
+                    if ( args->cols[j].merge_method != MM_APPEND_MISSING ) continue;
+                    if ( args->cols[j].done==1 ) continue;
+                    if ( !args->cols[j].setter ) continue;
+                    int ret = args->cols[j].setter(args,line,&args->cols[j],args->aline_missing);
+                    if ( ret < 0 )
+                        error("fixme: Could not set missing %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
+                    if ( ret==0 )
+                        args->cols[j].done = 1;
+                }
             }
-
-            has_overlap = 1;
         }
+        // Flush
+        for (j=0; j<args->ncols; j++)
+        {
+            if ( args->cols[j].done==1 || args->cols[j].merge_method == MM_FIRST ) continue;
+            if ( !args->cols[j].setter ) continue;
+            int ret = args->cols[j].setter(args,line,&args->cols[j],NULL);
+            if ( ret < 0 )
+                error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
+        }
+    }
+    return has_overlap;
+}
+static int annotate_from_vcf(args_t *args, bcf1_t *line)
+{
+    if ( !bcf_sr_has_line(args->files,1) ) return 0;
+    int j;
+    bcf1_t *aline = bcf_sr_get_line(args->files,1);
+    for (j=0; j<args->ncols; j++)
+    {
+        if ( !args->cols[j].setter ) continue;
+        if ( args->cols[j].setter(args,line,&args->cols[j],aline) )
+            error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
     }
+    return 1;
+}
+static int annotate_from_self(args_t *args, bcf1_t *line)
+{
+    int j;
+    for (j=0; j<args->ncols; j++)
+    {
+        if ( !args->cols[j].setter ) continue;
+        if ( args->cols[j].setter(args,line,&args->cols[j],NULL) )
+            error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
+    }
+    return 0;
+}
+static int annotate_line(args_t *args, bcf1_t *line)
+{
+    args->current_rec = line;
+
+    int i;
+    for (i=0; i<args->nrm; i++)
+        args->rm[i].handler(args, line, &args->rm[i]);
+
+    int has_overlap = 0;
+    if ( args->tgt_idx )
+        has_overlap = annotate_from_regidx(args,line);
+
+    else if ( args->tgts )
+        has_overlap = annotate_from_tab(args,line);
+
+    else if ( args->files->nreaders == 2 )
+        has_overlap = annotate_from_vcf(args,line);
+
+    else if ( args->ncols )
+        has_overlap = annotate_from_self(args,line);
+
     if ( args->set_ids )
     {
         args->tmpks.l = 0;
@@ -3304,6 +3623,8 @@ static void annotate(args_t *args, bcf1_t *line)
         else
             bcf_update_info_flag(args->hdr_out,line,args->mark_sites,NULL,has_overlap?0:1);
     }
+
+    return has_overlap;
 }
 
 static void usage(args_t *args)
@@ -3340,7 +3661,7 @@ static void usage(args_t *args)
     fprintf(stderr, "       --single-overlaps           Keep memory low by avoiding complexities arising from handling multiple overlapping intervals\n");
     fprintf(stderr, "   -x, --remove LIST               List of annotations (e.g. ID,INFO/DP,FORMAT/DP,FILTER) to remove (or keep with \"^\" prefix). See man page for details\n");
     fprintf(stderr, "       --threads INT               Number of extra output compression threads [0]\n");
-    fprintf(stderr, "       --write-index               Automatically index the output files [off]\n");
+    fprintf(stderr, "   -W, --write-index[=FMT]         Automatically index the output files [off]\n");
     fprintf(stderr, "\n");
     fprintf(stderr, "Examples:\n");
     fprintf(stderr, "   http://samtools.github.io/bcftools/howtos/annotate.html\n");
@@ -3397,11 +3718,11 @@ int main_vcfannotate(int argc, char *argv[])
         {"min-overlap",required_argument,NULL,12},
         {"no-version",no_argument,NULL,8},
         {"force",no_argument,NULL,'f'},
-        {"write-index",no_argument,NULL,13},
+        {"write-index",optional_argument,NULL,'W'},
         {NULL,0,NULL,0}
     };
     char *tmp;
-    while ((c = getopt_long(argc, argv, "h:H:?o:O:r:R:a:x:c:C:i:e:S:s:I:m:kl:f",loptions,NULL)) >= 0)
+    while ((c = getopt_long(argc, argv, "h:H:?o:O:r:R:a:x:c:C:i:e:S:s:I:m:kl:fW::",loptions,NULL)) >= 0)
     {
         switch (c) {
             case 'f': args->force = 1; break;
@@ -3474,7 +3795,10 @@ int main_vcfannotate(int argc, char *argv[])
             case 10 : args->single_overlaps = 1; break;
             case 11 : args->rename_annots = optarg; break;
             case 12 : args->min_overlap_str = optarg; break;
-            case 13 : args->write_index = 1; break;
+            case 'W':
+                if (!(args->write_index = write_index_parse(optarg)))
+                    error("Unsupported index format '%s'\n", optarg);
+                break;
             case '?': usage(args); break;
             default: error("Unknown argument: %s\n", optarg);
         }
@@ -3543,7 +3867,8 @@ int main_vcfannotate(int argc, char *argv[])
                 continue;
             }
         }
-        annotate(args, line);
+        int keep = annotate_line(args, line);
+        if ( args->filter_ext && !args->keep_sites && !keep ) continue;
         if ( bcf_write1(args->out_fh, args->hdr_out, line)!=0 ) error("[%s] Error: failed to write to %s\n", __func__,args->output_fname);
     }
     destroy_data(args);
diff --git a/bcftools/vcfannotate.c.pysam.c b/bcftools/vcfannotate.c.pysam.c
index 2234ddca9..3d4d75eeb 100644
--- a/bcftools/vcfannotate.c.pysam.c
+++ b/bcftools/vcfannotate.c.pysam.c
@@ -2,7 +2,7 @@
 
 /*  vcfannotate.c -- Annotate and edit VCF/BCF files.
 
-    Copyright (C) 2013-2023 Genome Research Ltd.
+    Copyright (C) 2013-2024 Genome Research Ltd.
 
     Author: Petr Danecek <pd3@sanger.ac.uk>
 
@@ -106,6 +106,19 @@ typedef struct _annot_col_t
 }
 annot_col_t;
 
+typedef struct
+{
+    char *name;     // column name
+    int ht_type;    // type, one of BCF_HT_STR,BCF_HT_INT,BCF_HT_REAL
+    int icol;       // index of the annotation column to use
+    union {         // memory area with the current annotation value to pass to filter_test_ext
+        int i;
+        float f;
+        char *s;
+    };
+}
+ext_t;
+
 // Logic of the filters: include or exclude sites which match the filters?
 #define FLT_INCLUDE 1
 #define FLT_EXCLUDE 2
@@ -127,7 +140,7 @@ typedef struct _args_t
     regitr_t *tgt_itr;
     int tgt_is_bed;
 
-    filter_t *filter;
+    filter_t *filter, *filter_ext;  // only one is initialized, the latter contains external values to set dynamically on the fly
     char *filter_str;
     int filter_logic;   // include or exclude sites which match the filters? One of FLT_INCLUDE/FLT_EXCLUDE
     int keep_sites;
@@ -151,6 +164,11 @@ typedef struct _args_t
     convert_t *set_ids;
     int set_ids_replace;
 
+    // external values for dynamic -i/-e expressions
+    int n_ext;
+    ext_t *ext;
+    void **ext_ptr;
+
     int nsmpl_annot;
     int *sample_map, nsample_map, sample_is_file;   // map[idst] -> isrc
     uint8_t *src_smpl_pld, *dst_smpl_pld;   // for Number=G format fields
@@ -172,6 +190,7 @@ typedef struct _args_t
     int argc, drop_header, record_cmd_line, tgts_is_vcf, mark_sites_logic, force, single_overlaps;
     int columns_is_file, has_append_mode, pair_logic;
     dbuf_t *header_lines;
+    bcf1_t *current_rec;    // current record for local setters
 }
 args_t;
 
@@ -512,17 +531,21 @@ static int vcf_getter_info_str2str(args_t *args, bcf1_t *rec, annot_col_t *col,
 static int vcf_getter_id2str(args_t *args, bcf1_t *rec, annot_col_t *col, void **ptr, int *mptr)
 {
     char *str = *((char**)ptr);
-    int len = strlen(rec->d.id);
+    int i, len = strlen(rec->d.id);
     if ( len >= *mptr ) str = realloc(str, len+1);
-    strcpy(str, rec->d.id);
+    for (i=0; i<len; i++)
+        str[i] = rec->d.id[i]==';' ? ',' : rec->d.id[i];
+    str[len] = 0;
     *((char**)ptr) = str;
     *mptr = len+1;
     return len;
 }
-static int vcf_getter_filter2str(args_t *args, bcf1_t *rec, annot_col_t *col, void **ptr, int *mptr)
+inline static int vcf_getter_filter2str_core(bcf_hdr_t *hdr, bcf1_t *rec, char **ptr, int *mptr)
 {
+    if ( !(rec->unpacked & BCF_UN_FLT) ) bcf_unpack(rec, BCF_UN_FLT);
+
     kstring_t str;
-    str.s = *((char**)ptr);
+    str.s = *ptr;
     str.m = *mptr;
     str.l = 0;
 
@@ -531,16 +554,24 @@ static int vcf_getter_filter2str(args_t *args, bcf1_t *rec, annot_col_t *col, vo
     {
         for (i=0; i<rec->d.n_flt; i++)
         {
-            if (i) kputc(';', &str);
-            kputs(bcf_hdr_int2id(args->tgts_hdr,BCF_DT_ID,rec->d.flt[i]), &str);
+            if (i) kputc(',', &str);
+            kputs(bcf_hdr_int2id(hdr,BCF_DT_ID,rec->d.flt[i]), &str);
         }
     }
     else kputc('.', &str);
 
-    *((char**)ptr) = str.s;
+    *ptr  = str.s;
     *mptr = str.m;
     return str.l;
 }
+static int vcf_getter_filter2str_local(args_t *args, bcf1_t *rec, annot_col_t *col, void **ptr, int *mptr)
+{
+    return vcf_getter_filter2str_core(args->hdr_out, args->current_rec, (char**)ptr, mptr);
+}
+static int vcf_getter_filter2str(args_t *args, bcf1_t *rec, annot_col_t *col, void **ptr, int *mptr)
+{
+    return vcf_getter_filter2str_core(args->tgts_hdr, rec, (char**)ptr, mptr);
+}
 static int setter_filter(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
 {
     if ( !data ) error("Error: the --merge-logic option cannot be used with FILTER (yet?)\n");
@@ -606,7 +637,7 @@ static int setter_pos(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
     char *tmp;
     int pos = strtol(tab->cols[col->icol], &tmp, 10);
     if ( tmp==tab->cols[col->icol] )
-        error("Could not parse ~POS at %s:%"PRId64" .. [%s]\n",bcf_seqname(args->hdr,line),(int64_t)line->pos+1,tab->cols[col->icol]);
+        error("Could not parse -POS at %s:%"PRId64" .. [%s]\n",bcf_seqname(args->hdr,line),(int64_t)line->pos+1,tab->cols[col->icol]);
     line->pos = pos - 1;
     return 0;
 }
@@ -1157,6 +1188,29 @@ void khash_str2int_clear_free(void *_hash)
         if (kh_exist(hash, k)) free((char*)kh_key(hash, k));
     kh_clear(str2int, hash);
 }
+static const char *escape_string(const char *str, char needle[], char **rmme, size_t *len)
+{
+    kstring_t tmp = {0,0,0};
+    const char *bp = str, *ep = str;
+    while ( *ep )
+    {
+        int i = 0;
+        while ( needle[i] && needle[i]!=*ep ) i++;
+        if ( !needle[i] ) { ep++; continue; }
+        kputsn(bp,ep-bp,&tmp);
+        ksprintf(&tmp,"%%%X",*ep);
+        bp = ++ep;
+    }
+    if ( !tmp.l )
+    {
+        *len = strlen(str);
+        return str;
+    }
+    kputs(bp,&tmp);
+    *len  = tmp.l;
+    *rmme = tmp.s;
+    return tmp.s;
+}
 static int setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
 {
     if ( (col->replace & REPLACE_MISSING) && col->number!=BCF_VL_A && col->number!=BCF_VL_R )
@@ -1170,13 +1224,17 @@ static int setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, void *d
     if ( col->replace & SET_OR_APPEND ) col->merge_method=MM_UNIQUE;
 
     annot_line_t *tab = (annot_line_t*) data;
+    const char *escaped = NULL;
+    char *rmme = NULL;
 
-    int len = 0;
+    size_t len = 0;
     if ( tab )
     {
-        len = strlen(tab->cols[col->icol]);
-        if ( !len ) return 0;
-        if ( len==1 && tab->cols[col->icol][0]=='.' && col->merge_method!=MM_APPEND_MISSING && !(col->replace & CARRY_OVER_MISSING) ) return 1;
+        char *str = tab->cols[col->icol];
+        if ( !str || !*str ) return 0;
+        if ( !str[1] && str[0]=='.' && col->merge_method!=MM_APPEND_MISSING && !(col->replace & CARRY_OVER_MISSING) ) return 1;
+        char needle[] = {';','=',0};
+        escaped = escape_string(tab->cols[col->icol],needle,&rmme,&len);
     }
 
     if ( col->merge_method!=MM_FIRST )
@@ -1190,8 +1248,12 @@ static int setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, void *d
             if ( col->merge_method==MM_UNIQUE )
             {
                 if ( !col->mm_str_hash ) col->mm_str_hash = (khash_t(str2int)*)khash_str2int_init();
-                if ( khash_str2int_has_key(col->mm_str_hash, tab->cols[col->icol]) ) return 1;
-                khash_str2int_inc(col->mm_str_hash, strdup(tab->cols[col->icol]));
+                if ( khash_str2int_has_key(col->mm_str_hash, escaped) )
+                {
+                    free(rmme);
+                    return 1;
+                }
+                khash_str2int_inc(col->mm_str_hash, strdup(escaped));
             }
 
             if ( (col->replace & SET_OR_APPEND) && !col->mm_kstr.l )
@@ -1203,17 +1265,20 @@ static int setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, void *d
             }
 
             if ( col->mm_kstr.l ) kputc(',',&col->mm_kstr);
-            kputs(tab->cols[col->icol], &col->mm_kstr);
+            kputs(escaped, &col->mm_kstr);
+            free(rmme);
             return 1;
         }
-
         if ( col->mm_kstr.l )
         {
             hts_expand(char,col->mm_kstr.l+1,args->mtmps,args->tmps);
             memcpy(args->tmps,col->mm_kstr.s,col->mm_kstr.l+1);
         }
         else
+        {
+            free(rmme);
             return 0;
+        }
 
         // flush the line
         if ( col->merge_method==MM_UNIQUE )
@@ -1224,13 +1289,13 @@ static int setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, void *d
     {
         assert(tab);
         hts_expand(char,len+1,args->mtmps,args->tmps);
-        memcpy(args->tmps,tab->cols[col->icol],len+1);
-
+        memcpy(args->tmps,escaped,len+1);
         if ( col->number==BCF_VL_A || col->number==BCF_VL_R )
             return setter_ARinfo_string(args,line,col,tab->nals,tab->als);
     }
-
-    return bcf_update_info_string(args->hdr_out,line,col->hdr_key_dst,args->tmps);
+    int ret = bcf_update_info_string(args->hdr_out,line,col->hdr_key_dst,args->tmps);
+    free(rmme);
+    return ret;
 }
 static int vcf_setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
 {
@@ -1664,11 +1729,18 @@ static int setter_format_str(args_t *args, bcf1_t *line, annot_col_t *col, void
     if ( col->icol+args->nsmpl_annot > tab->ncols )
         error("Incorrect number of values for %s at %s:%"PRId64"\n",col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
 
+    char needle[] = {':',0};
     int ismpl;
     for (ismpl=0; ismpl<args->nsmpl_annot; ismpl++)
-        args->tmpp[ismpl] = tab->cols[col->icol + ismpl];
-
-    return core_setter_format_str(args,line,col,args->tmpp);
+    {
+        size_t len;
+        char *rmme = NULL;
+        const char *str = escape_string(tab->cols[col->icol + ismpl],needle,&rmme,&len);
+        args->tmpp[ismpl] = rmme ? rmme : strdup(str);
+    }
+    int ret = core_setter_format_str(args,line,col,args->tmpp);
+    for (ismpl=0; ismpl<args->nsmpl_annot; ismpl++) free(args->tmpp[ismpl]);
+    return ret;
 }
 static int determine_ploidy(int nals, int *vals, int nvals1, uint8_t *smpl, int nsmpl)
 {
@@ -2200,7 +2272,23 @@ static void init_columns(args_t *args)
         kputsn(ss, se-ss, &str);
         if ( !str.s[0] || !strcasecmp("-",str.s) ) ;
         else if ( !strcasecmp("CHROM",str.s) ) args->chr_idx = icol;
-        else if ( !strcasecmp("POS",str.s) ) args->beg_idx = icol;
+        else if ( !strcasecmp("POS",str.s) )
+        {
+            if ( replace==REPLACE_NON_MISSING && !args->tgts_is_vcf )
+            {
+                args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
+                annot_col_t *col = &args->cols[args->ncols-1];
+                memset(col,0,sizeof(*col));
+                col->icol = icol;
+                col->replace = replace;
+                col->setter  = setter_pos;
+                col->hdr_key_src = strdup(str.s);
+                col->hdr_key_dst = strdup(str.s);
+                args->match_end = icol;
+            }
+            else
+                args->beg_idx = icol;
+        }
         else if ( !strcasecmp("FROM",str.s) || !strcasecmp("BEG",str.s) ) args->beg_idx = icol;
         else if ( !strcasecmp("TO",str.s) || !strcasecmp("END",str.s) ) args->end_idx = icol;
         else if ( !strcasecmp("REF",str.s) )
@@ -2259,9 +2347,23 @@ static void init_columns(args_t *args)
             col->hdr_key_dst = strdup(str.s);
             args->match_end = icol;
         }
-        else if ( !strcasecmp("~POS",str.s) && !args->tgts_is_vcf )
+        else if ( !strcasecmp("~POS",str.s) )
+        {
+            error("Error: the use of ~POS has been deprecated, use -POS to transfer the column POS.\n");
+        }
+        else if ( str.s[0]=='~' )
         {
-            if ( args->tgts_is_vcf ) error("Error: cannot use ~POS, position can be replaced only from a tab-delimited file\n");
+            args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
+            annot_col_t *col = &args->cols[args->ncols-1];
+            memset(col,0,sizeof(*col));
+            col->icol = icol;
+            col->replace = MATCH_VALUE;
+            col->setter  = NULL;
+            col->hdr_key_src = strdup(str.s+1);
+        }
+        else if ( !strcasecmp("-POS",str.s) && !args->tgts_is_vcf )
+        {
+            if ( args->tgts_is_vcf ) error("Error: cannot use -POS, position can be replaced only from a tab-delimited file\n");
             args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
             annot_col_t *col = &args->cols[args->ncols-1];
             memset(col,0,sizeof(*col));
@@ -2292,10 +2394,30 @@ static void init_columns(args_t *args)
             if ( bcf_hdr_id2type(args->tgts_hdr,BCF_HL_INFO,hdr_id)!=BCF_HT_STR )
                 error("Only Type=String tags can be used to annotate the ID column\n");
         }
-        else if ( (ptr=strstr(str.s,":=")) && !args->targets_fname )
+        else if ( (ptr=strstr(str.s,":=")) && (!args->targets_fname || !strncasecmp(ptr+2,"./",2)) )
         {
             *ptr = 0;
-            rename_annots_push(args,ptr+2,str.s);
+            if ( !strncasecmp(str.s,"INFO/",5) && (!strcasecmp(ptr+2,"FILTER") || !strcasecmp(ptr+2,"./FILTER")) )
+            {
+                // -a not present and transferring filter, needs to be a local transfer
+                args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
+                annot_col_t *col = &args->cols[args->ncols-1];
+                memset(col,0,sizeof(*col));
+                col->icol = icol;
+                col->replace = replace;
+                col->setter = vcf_setter_info_str;
+                col->getter = vcf_getter_filter2str_local;
+                col->hdr_key_src = strdup(ptr+2);
+                col->hdr_key_dst = strdup(str.s+5);
+                tmp.l = 0;
+                ksprintf(&tmp,"##INFO=<ID=%s,Number=1,Type=String,Description=\"Transferred FILTER column\">",col->hdr_key_dst);
+                bcf_hdr_append(args->hdr_out, tmp.s);
+                if (bcf_hdr_sync(args->hdr_out) < 0) error_errno("[%s] Failed to update header", __func__);
+                int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, col->hdr_key_dst);
+                col->number = bcf_hdr_id2length(args->hdr_out,BCF_HL_INFO,hdr_id);
+            }
+            else
+                rename_annots_push(args,ptr+2,str.s);
             *ptr = ':';
         }
         else if ( !strcasecmp("FILTER",str.s) )
@@ -2489,6 +2611,13 @@ static void init_columns(args_t *args)
                           "       (the annotation type is modified to \"Number=.\" and allele ordering is disregarded)\n");
                 fprintf(bcftools_stderr,"Warning: the =INFO/TAG feature modifies the annotation to \"Number=.\" and disregards allele ordering\n");
             }
+
+            args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
+            annot_col_t *col = &args->cols[args->ncols-1];
+            memset(col,0,sizeof(*col));
+            col->icol = icol;
+            col->replace = replace;
+
             int explicit_src_info = 0;
             int explicit_dst_info = 0;
             char *key_dst;
@@ -2519,15 +2648,14 @@ static void init_columns(args_t *args)
                     key_src[-2] = ':';
                     error("Did you mean \"FMT/%s\" rather than \"%s\"?\n",str.s,str.s);
                 }
+                else if ( !strcasecmp("FILTER",key_src) && args->tgts_is_vcf )
+                {
+                    col->getter = vcf_getter_filter2str;
+                }
             }
             else
                 key_src = key_dst;
 
-            args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
-            annot_col_t *col = &args->cols[args->ncols-1];
-            memset(col,0,sizeof(*col));
-            col->icol = icol;
-            col->replace = replace;
             col->hdr_key_src = strdup(key_src);
             col->hdr_key_dst = strdup(key_dst);
 
@@ -2784,7 +2912,7 @@ static void rename_annots(args_t *args)
         while ( *ptr && isspace(*ptr) ) ptr++;
         if ( !*ptr ) { *rmme = ' '; error("Could not parse: %s\n", args->rename_annots_map[i]); }
         if ( rename_annots_core(args, args->rename_annots_map[i], ptr) < 0 )
-            error("Could not parse \"%s %s\", expected INFO, FORMAT, or FILTER prefix\n",args->rename_annots_map[i],ptr);
+            error("Cannot rename \"%s\" to \"%s\"\n",args->rename_annots_map[i],ptr);
     }
 }
 static void rename_annots_push(args_t *args, char *src, char *dst)
@@ -2795,6 +2923,106 @@ static void rename_annots_push(args_t *args, char *src, char *dst)
     ksprintf(&str,"%s %s",src,dst);
     args->rename_annots_map[ args->rename_annots_nmap - 1 ] = str.s;
 }
+static void init_filters(args_t *args)
+{
+    // Check if the -i/-e expressions contain external values that should be determined
+    // on the fly from the annotation file. The expressions can be given as
+    //      TAG={NAME}
+    //      TAG={str:NAME}
+    //      TAG={int:NAME}
+    //      TAG={float:NAME}
+    kstring_t str = {0,0,0};
+    char *src = strdup(args->filter_str);
+    int len = 0;
+    while (1)
+    {
+        char *beg = strchr(src+len,'{');
+        if ( !beg ) break;
+
+        // check if "{" appears inside quotes, in such case do not modify
+        char skip = 0;
+        char *tmp = src;
+        while ( tmp<beg )
+        {
+            if ( tmp[0]!='"' && tmp[0]!='\'' ) { tmp++; continue; }
+
+            // quote character found
+            int quote = tmp[0];
+            tmp++;
+            while ( *tmp && tmp[0]!=quote ) tmp++;
+            if ( !*tmp ) error("Could not parse the expression: %s\n",args->filter_str);    // unbalanced quotation; todo: check for escape char
+            len = tmp - src + 1;
+            skip = 1;
+        }
+        if ( skip ) continue;
+
+        char *end = ++beg;
+        while ( *end && *end!='}' ) end++;
+        if ( !*end ) error("Could not parse the expression: %s\n",args->filter_str);
+        *end = 0;
+
+        // explicit typing?
+        int type = -1;
+        tmp = beg;
+        while ( *tmp && *tmp!=':' ) tmp++;
+        if ( *tmp )
+        {
+            *tmp = 0;
+            if ( !strcasecmp(beg,"str") ) type = BCF_HT_STR;
+            else if ( !strcasecmp(beg,"int") ) type = BCF_HT_INT;
+            else if ( !strcasecmp(beg,"float") ) type = BCF_HT_REAL;
+        }
+        args->n_ext++;
+        args->ext = (ext_t*)realloc(args->ext,sizeof(*args->ext)*args->n_ext);
+        ext_t *ext = &args->ext[args->n_ext-1];
+        ext->ht_type = type;
+        ext->name = strdup(beg);
+        if ( beg-1 > src ) kputsn(src,beg-1-src,&str);
+        if ( type==-1 ) kputs("{}",&str);
+        else if ( type==BCF_HT_STR ) kputs("{str}",&str);
+        else if ( type==BCF_HT_INT ) kputs("{int}",&str);
+        else if ( type==BCF_HT_REAL ) kputs("{float}",&str);
+        len = str.l;
+        kputs(end+1,&str);
+        free(src);
+        src = strdup(str.s);
+        str.l = 0;
+    }
+    args->filter = filter_init(args->hdr, src);
+    free(src);
+    free(str.s);
+
+    int i,j,n_ext;
+    const int *ext_type = filter_ext_types(args->filter, &n_ext);
+    if ( n_ext != args->n_ext )
+        error("Failed to parse the expression, unexpected number of dynamic variables (%d vs %d): %s\n",n_ext,args->n_ext,args->filter_str);
+
+    if ( !args->n_ext ) return;
+
+    if ( !args->tgts )
+        error("Error: dynamic variables in -i/-e expressions can be currently used only with tab-delimited file, not with VCF (todo)\n");
+
+    // contains external values
+    args->ext_ptr = malloc(sizeof(*args->ext_ptr)*args->n_ext);
+    for (i=0; i<args->n_ext; i++) args->ext[i].ht_type = ext_type[i];
+    args->filter_ext = args->filter;
+    args->filter = NULL;
+
+    // set the column idx
+    if ( args->ncols )
+    {
+        for (i=0; i<args->n_ext; i++)
+        {
+            for (j=0; j<args->ncols; j++)
+            {
+                if ( strcmp(args->ext[i].name,args->cols[j].hdr_key_src) ) continue;
+                args->ext[i].icol = args->cols[j].icol;
+                break;
+            }
+            if ( j==args->ncols ) error("No such column: %s\n",args->ext[i].name);
+        }
+    }
+}
 
 static void init_data(args_t *args)
 {
@@ -2863,7 +3091,7 @@ static void init_data(args_t *args)
     args->vcmp = vcmp_init();
 
     if ( args->filter_str )
-        args->filter = filter_init(args->hdr, args->filter_str);
+        init_filters(args);
 
     if ( args->mark_sites )
     {
@@ -2892,13 +3120,22 @@ static void init_data(args_t *args)
         if ( args->n_threads )
             hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->files->p);
         if ( bcf_hdr_write(args->out_fh, args->hdr_out)!=0 ) error("[%s] Error: failed to write the header to %s\n", __func__,args->output_fname);
-        if ( args->write_index && init_index(args->out_fh,args->hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname);
+        if ( init_index2(args->out_fh,args->hdr,args->output_fname,
+                         &args->index_fn, args->write_index) < 0 )
+            error("Error: failed to initialise index for %s\n",args->output_fname);
     }
 }
 
 static void destroy_data(args_t *args)
 {
     int i;
+    for (i=0; i<args->n_ext; i++)
+    {
+        free(args->ext[i].name);
+        if ( args->ext[i].ht_type!=BCF_HT_STR ) continue;
+    }
+    free(args->ext_ptr);
+    free(args->ext);
     for (i=0; i<args->nrm; i++) free(args->rm[i].key);
     free(args->rm);
     if ( args->hdr_out ) bcf_hdr_destroy(args->hdr_out);
@@ -2953,8 +3190,8 @@ static void destroy_data(args_t *args)
     free(args->dst_smpl_pld);
     if ( args->set_ids )
         convert_destroy(args->set_ids);
-    if ( args->filter )
-        filter_destroy(args->filter);
+    if ( args->filter ) filter_destroy(args->filter);
+    if ( args->filter_ext ) filter_destroy(args->filter_ext);
     if (args->out_fh)
     {
         if ( args->write_index )
@@ -3033,7 +3270,7 @@ static void buffer_annot_lines(args_t *args, bcf1_t *line, int start_pos, int en
         }
         else i++;
     }
-    if ( args->ref_idx==-1 && args->nalines ) return;
+    if ( !args->filter_ext && args->ref_idx==-1 && args->nalines ) return;
 
     while ( !bcf_sr_regions_overlap(args->tgts, bcf_seqname(args->hdr,line), start_pos,end_pos) )
     {
@@ -3045,7 +3282,7 @@ static void buffer_annot_lines(args_t *args, bcf1_t *line, int start_pos, int en
         tmp->start = args->tgts->start;
         tmp->end   = args->tgts->end;
         parse_annot_line(args, args->tgts->line.s, tmp);
-        if ( args->ref_idx != -1 )
+        if ( args->filter_ext || args->ref_idx != -1 )
         {
             int iseq = args->tgts->iseq;
             if ( bcf_sr_regions_next(args->tgts)<0 || args->tgts->iseq!=iseq ) break;
@@ -3084,164 +3321,181 @@ static int strstr_match(char *a, char *b)
     }
     return 0;
 }
-static void annotate(args_t *args, bcf1_t *line)
+static int annotate_from_regidx(args_t *args, bcf1_t *line)
 {
-    int i, j;
-    for (i=0; i<args->nrm; i++)
-        args->rm[i].handler(args, line, &args->rm[i]);
-
+    int j;
     int has_overlap = 0;
-    if ( args->tgt_idx )
+
+    for (j=0; j<args->ncols; j++) args->cols[j].done = 0;
+    if ( regidx_overlap(args->tgt_idx, bcf_seqname(args->hdr,line),line->pos,line->pos+line->rlen-1, args->tgt_itr) )
     {
-        for (j=0; j<args->ncols; j++) args->cols[j].done = 0;
-        if ( regidx_overlap(args->tgt_idx, bcf_seqname(args->hdr,line),line->pos,line->pos+line->rlen-1, args->tgt_itr) )
+        hts_pos_t vcf_end = line->pos + line->rlen - 1;
+        while ( regitr_overlap(args->tgt_itr) )
         {
-            hts_pos_t vcf_end = line->pos + line->rlen - 1;
-            while ( regitr_overlap(args->tgt_itr) )
-            {
-                annot_line_t *tmp = &args->alines[0];
-                tmp->rid   = line->rid;
-                tmp->start = args->tgt_itr->beg;
-                tmp->end   = args->tgt_itr->end;
+            annot_line_t *tmp = &args->alines[0];
+            tmp->rid   = line->rid;
+            tmp->start = args->tgt_itr->beg;
+            tmp->end   = args->tgt_itr->end;
 
-                // Check min overlap
-                int len_ann = tmp->end - tmp->start + 1;
-                int len_vcf = line->rlen;
-                int isec = (tmp->end < vcf_end ? tmp->end : vcf_end) - (tmp->start > line->pos ? tmp->start : line->pos) + 1;
-                assert( isec > 0 );
-                if ( args->min_overlap_ann && args->min_overlap_ann > (float)isec/len_ann ) continue;
-                if ( args->min_overlap_vcf && args->min_overlap_vcf > (float)isec/len_vcf ) continue;
+            // Check min overlap
+            int len_ann = tmp->end - tmp->start + 1;
+            int len_vcf = line->rlen;
+            int isec = (tmp->end < vcf_end ? tmp->end : vcf_end) - (tmp->start > line->pos ? tmp->start : line->pos) + 1;
+            assert( isec > 0 );
+            if ( args->min_overlap_ann && args->min_overlap_ann > (float)isec/len_ann ) continue;
+            if ( args->min_overlap_vcf && args->min_overlap_vcf > (float)isec/len_vcf ) continue;
 
-                parse_annot_line(args, regitr_payload(args->tgt_itr,char*), tmp);
-                for (j=0; j<args->ncols; j++)
-                {
-                    if ( args->cols[j].done==1 ) continue;
-                    int ret = args->cols[j].setter(args,line,&args->cols[j],tmp);
-                    if ( ret < 0 )
-                        error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
-                    if ( ret==0 )
-                        args->cols[j].done = 1;
-                    has_overlap = 1;
-                }
+            parse_annot_line(args, regitr_payload(args->tgt_itr,char*), tmp);
+
+            // If a plain BED file is provided and we are asked to just mark overlapping sites, there are
+            // no additional columns. Not sure if there can be any side effects for ill-formatted BED files
+            // with variable number of columns
+            if ( !args->ncols && args->mark_sites ) has_overlap = 1;
+
+            for (j=0; j<args->ncols; j++)
+            {
+                if ( args->cols[j].done==1 ) continue;
+                int ret = args->cols[j].setter(args,line,&args->cols[j],tmp);
+                if ( ret < 0 )
+                    error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
+                if ( ret==0 )
+                    args->cols[j].done = 1;
+                has_overlap = 1;
             }
         }
-        for (j=0; j<args->ncols; j++)
+    }
+    for (j=0; j<args->ncols; j++)
+    {
+        if ( args->cols[j].done==1 || args->cols[j].merge_method == MM_FIRST ) continue;
+        if ( !args->cols[j].setter ) continue;
+        if ( args->cols[j].setter(args,line,&args->cols[j],NULL) < 0 )
+            error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
+    }
+    return has_overlap;
+}
+static int pass_filter_test_ext(args_t *args, bcf1_t *line, annot_line_t *ann)
+{
+    char *tmp;
+    int i;
+    for (i=0; i<args->n_ext; i++)
+    {
+        int j = args->ext[i].icol;
+        if ( args->ext[i].ht_type==BCF_HT_STR ) args->ext_ptr[i] = args->ext[i].s = ann->cols[j];
+        else if ( args->ext[i].ht_type==BCF_HT_INT )
         {
-            if ( args->cols[j].done==1 || args->cols[j].merge_method == MM_FIRST ) continue;
-            if ( !args->cols[j].setter ) continue;
-            if ( args->cols[j].setter(args,line,&args->cols[j],NULL) < 0 )
-                error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
+            args->ext[i].i = strtol(ann->cols[j],&tmp,10);
+            if ( *tmp )
+            {
+                if ( strcmp(".",ann->cols[j]) ) error("Error: could not parse the annotation file, expected an integer, found \"%s\"\n",ann->cols[j]);
+                args->ext_ptr[i] = NULL;
+            }
+            else
+                args->ext_ptr[i] = &args->ext[i].i;
+        }
+        else if ( args->ext[i].ht_type==BCF_HT_REAL )
+        {
+            args->ext[i].f = strtod(ann->cols[j],&tmp);
+            if ( *tmp )
+            {
+                if ( strcmp(".",ann->cols[j]) ) error("Error: could not parse the annotation file, expected a float, found \"%s\"\n",ann->cols[j]);
+                args->ext_ptr[i] = NULL;
+            }
+            else
+                args->ext_ptr[i] = &args->ext[i].f;
         }
     }
-    else if ( args->tgts )
-    {
-        // Buffer annotation lines. When multiple ALT alleles are present in the annotation file, at least one
-        // must match some of the VCF alleles. If the append-missing mode is set (and REF+ALT is requested), the
-        // buffered lines will annotate the VCF respecting the order in ALT and when no matching line is found
-        // for an ALT, missing value is appended instead.
-        int end_pos = line->pos + line->rlen - 1;
-        buffer_annot_lines(args, line, line->pos, end_pos);
+    int pass = filter_test_ext(args->filter_ext,line,NULL,(const void**)args->ext_ptr);
+    if ( args->filter_logic==FLT_EXCLUDE ) pass = pass ? 0 : 1;
+    return pass;
+}
+static int annotate_from_tab(args_t *args, bcf1_t *line)
+{
+    int i,j;
+    int has_overlap = 0;
+
+    // Buffer annotation lines. When multiple ALT alleles are present in the annotation file, at least one
+    // must match some of the VCF alleles. If the append-missing mode is set (and REF+ALT is requested), the
+    // buffered lines will annotate the VCF respecting the order in ALT and when no matching line is found
+    // for an ALT, missing value is appended instead.
+    int end_pos = line->pos + line->rlen - 1;
+    buffer_annot_lines(args, line, line->pos, end_pos);
 
-        args->nsrt_alines = 0;
-        hts_expand(uint32_t,args->nalines,args->msrt_alines,args->srt_alines);
-        if ( args->nalines >= 0xffff || line->n_allele >= 0xffff )
-            error("Error: too many alleles or annotation lines in the buffer at %s:%"PRId64" (todo:skip?)\n",bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
+    args->nsrt_alines = 0;
+    hts_expand(uint32_t,args->nalines,args->msrt_alines,args->srt_alines);
+    if ( args->nalines >= 0xffff || line->n_allele >= 0xffff )
+        error("Error: too many alleles or annotation lines in the buffer at %s:%"PRId64" (todo:skip?)\n",bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
 
-        kstring_t match_end = {0,0,0};
-        if ( args->match_end>=0 && bcf_get_info_int32(args->hdr,line,"END",&args->tmpi,&args->mtmpi)==1 )
-            kputw(args->tmpi[0],&match_end);
+    kstring_t match_end = {0,0,0};
+    if ( args->match_end>=0 && bcf_get_info_int32(args->hdr,line,"END",&args->tmpi,&args->mtmpi)==1 )
+        kputw(args->tmpi[0],&match_end);
 
-        // Find matching lines
-        for (i=0; i<args->nalines; i++)
+    // Find matching lines
+    for (i=0; i<args->nalines; i++)
+    {
+        if ( line->pos > args->alines[i].end || end_pos < args->alines[i].start ) continue;
+        if ( args->ref_idx != -1 )  // REF+ALT matching requested
         {
-            if ( line->pos > args->alines[i].end || end_pos < args->alines[i].start ) continue;
-            if ( args->ref_idx != -1 )  // REF+ALT matching requested
+            if ( line->pos!=args->alines[i].start || vcmp_set_ref(args->vcmp, line->d.allele[0], args->alines[i].als[0]) < 0 ) continue;   // refs are not compatible
+            for (j=1; j<args->alines[i].nals; j++)
             {
-                if ( line->pos!=args->alines[i].start || vcmp_set_ref(args->vcmp, line->d.allele[0], args->alines[i].als[0]) < 0 ) continue;   // refs are not compatible
-                for (j=1; j<args->alines[i].nals; j++)
+                int ialt;
+                if ( line->n_allele==1 && args->alines[i].als[j][0]=='.' && args->alines[i].als[j][1]==0 )  // match: no ALT allele in VCF and annot file has "."
+                    ialt = 0;
+                else
                 {
-                    int ialt;
-                    if ( line->n_allele==1 && args->alines[i].als[j][0]=='.' && args->alines[i].als[j][1]==0 )  // match: no ALT allele in VCF and annot file has "."
-                        ialt = 0;
-                    else
-                    {
-                        ialt = vcmp_find_allele(args->vcmp, line->d.allele+1, line->n_allele - 1, args->alines[i].als[j]);
-                        if ( ialt < 0 ) continue;
-                        ialt++;
-                    }
-                    if ( args->match_id>=0 && !strstr_match(line->d.id,args->alines[i].cols[args->match_id]) ) continue;
-                    if ( args->match_end>=0 && match_end.l && strcmp(match_end.s,args->alines[i].cols[args->match_end]) ) continue;
-                    args->srt_alines[args->nsrt_alines++] = (ialt<<16) | i;
-                    has_overlap = 1;
-                    break;
+                    ialt = vcmp_find_allele(args->vcmp, line->d.allele+1, line->n_allele - 1, args->alines[i].als[j]);
+                    if ( ialt < 0 ) continue;
+                    ialt++;
                 }
+                if ( args->match_id>=0 && !strstr_match(line->d.id,args->alines[i].cols[args->match_id]) ) continue;
+                if ( args->match_end>=0 && match_end.l && strcmp(match_end.s,args->alines[i].cols[args->match_end]) ) continue;
+                if ( args->filter_ext && !pass_filter_test_ext(args,line,&args->alines[i]) ) continue;
+                args->srt_alines[args->nsrt_alines++] = (ialt<<16) | i;
+                has_overlap = 1;
+                break;
             }
-            else    // overlap, REF+ALT matching not requested
+        }
+        else if ( args->filter_ext )
+        {
+            if ( pass_filter_test_ext(args,line,&args->alines[i]) )
             {
                 args->srt_alines[args->nsrt_alines++] = (0xffff<<16) | i;
                 has_overlap = 1;
             }
         }
+        else    // overlap, REF+ALT matching not requested
+        {
+            args->srt_alines[args->nsrt_alines++] = (0xffff<<16) | i;
+            has_overlap = 1;
+        }
+    }
 
-        free(match_end.s);
+    free(match_end.s);
+    if ( !has_overlap && args->filter_ext && !args->keep_sites ) return has_overlap;
 
-        // Sort lines if needed
+    // Sort lines if needed
+    if ( args->has_append_mode )
+    {
+        // insertion sort by VCF ALT index (top bits) and alines index (low bits)
+        uint32_t tmp;
+        for (i=1; i<args->nsrt_alines; i++)
+            for (j=i; j>0 && args->srt_alines[j] < args->srt_alines[j-1]; j--)
+                tmp = args->srt_alines[j], args->srt_alines[j] = args->srt_alines[j-1], args->srt_alines[j-1] = tmp;
+    }
+    // Annotate
+    for (j=0; j<args->ncols; j++) args->cols[j].done = 0;
+    int ialt_exp = 1;
+    for (i=0; i<args->nsrt_alines; i++)
+    {
+        int ialt = args->srt_alines[i] >> 16;
+        int ilin = args->srt_alines[i] & 0xffff;
         if ( args->has_append_mode )
         {
-            // insertion sort by VCF ALT index (top bits) and alines index (low bits)
-            uint32_t tmp;
-            for (i=1; i<args->nsrt_alines; i++)
-                for (j=i; j>0 && args->srt_alines[j] < args->srt_alines[j-1]; j--)
-                    tmp = args->srt_alines[j], args->srt_alines[j] = args->srt_alines[j-1], args->srt_alines[j-1] = tmp;
-        }
-        // Annotate
-        for (j=0; j<args->ncols; j++) args->cols[j].done = 0;
-        int ialt_exp = 1;
-        for (i=0; i<args->nsrt_alines; i++)
-        {
-            int ialt = args->srt_alines[i] >> 16;
-            int ilin = args->srt_alines[i] & 0xffff;
-            if ( args->has_append_mode )
+            if ( ialt_exp > ialt ) continue;    // multiple annotation lines for the same position
+            if ( ialt_exp < ialt )
             {
-                if ( ialt_exp > ialt ) continue;    // multiple annotation lines for the same position
-                if ( ialt_exp < ialt )
-                {
-                    // REF+ALT matching requested, append-missing mode: insert "." if no annotation line was found for the ALT
-                    while ( ialt_exp++ < ialt )
-                    {
-                        for (j=0; j<args->ncols; j++)
-                        {
-                            if ( args->cols[j].merge_method != MM_APPEND_MISSING ) continue;
-                            if ( args->cols[j].done==1 ) continue;
-                            if ( !args->cols[j].setter ) continue;
-                            int ret = args->cols[j].setter(args,line,&args->cols[j],args->aline_missing);
-                            if ( ret < 0 )
-                                error("fixme: Could not set missing %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
-                            if ( ret==0 )
-                                args->cols[j].done = 1;
-                        }
-                    }
-                }
-            }
-            for (j=0; j<args->ncols; j++)
-            {
-                if ( args->cols[j].done==1 ) continue;
-                if ( !args->cols[j].setter ) continue;
-                int ret = args->cols[j].setter(args,line,&args->cols[j],&args->alines[ilin]);
-                if ( ret < 0 )
-                    error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
-                if ( ret==0 )
-                    args->cols[j].done = 1;
-            }
-            ialt_exp = ialt + 1;
-        }
-        if ( args->nsrt_alines )
-        {
-            // In the append-missing mode fill missing values to all trailing ALTs, but only if at least one
-            // record was found. Otherwise leave the row will be left without annotation.
-            if ( args->has_append_mode && ialt_exp < line->n_allele )
-            {
-                while ( ialt_exp++ < line->n_allele )
+                // REF+ALT matching requested, append-missing mode: insert "." if no annotation line was found for the ALT
+                while ( ialt_exp++ < ialt )
                 {
                     for (j=0; j<args->ncols; j++)
                     {
@@ -3256,32 +3510,97 @@ static void annotate(args_t *args, bcf1_t *line)
                     }
                 }
             }
-            // Flush
-            for (j=0; j<args->ncols; j++)
-            {
-                if ( args->cols[j].done==1 || args->cols[j].merge_method == MM_FIRST ) continue;
-                if ( !args->cols[j].setter ) continue;
-                int ret = args->cols[j].setter(args,line,&args->cols[j],NULL);
-                if ( ret < 0 )
-                    error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
-            }
         }
+        for (j=0; j<args->ncols; j++)
+        {
+            if ( args->cols[j].done==1 ) continue;
+            if ( !args->cols[j].setter ) continue;
+            int ret = args->cols[j].setter(args,line,&args->cols[j],&args->alines[ilin]);
+            if ( ret < 0 )
+                error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
+            if ( ret==0 )
+                args->cols[j].done = 1;
+        }
+        ialt_exp = ialt + 1;
     }
-    else if ( args->files->nreaders == 2 )
+    if ( args->nsrt_alines )
     {
-        if ( bcf_sr_has_line(args->files,1) )
+        // In the append-missing mode fill missing values to all trailing ALTs, but only if at least one
+        // record was found. Otherwise leave the row will be left without annotation.
+        if ( args->has_append_mode && ialt_exp < line->n_allele )
         {
-            bcf1_t *aline = bcf_sr_get_line(args->files,1);
-            for (j=0; j<args->ncols; j++)
+            while ( ialt_exp++ < line->n_allele )
             {
-                if ( !args->cols[j].setter ) continue;
-                if ( args->cols[j].setter(args,line,&args->cols[j],aline) )
-                    error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
+                for (j=0; j<args->ncols; j++)
+                {
+                    if ( args->cols[j].merge_method != MM_APPEND_MISSING ) continue;
+                    if ( args->cols[j].done==1 ) continue;
+                    if ( !args->cols[j].setter ) continue;
+                    int ret = args->cols[j].setter(args,line,&args->cols[j],args->aline_missing);
+                    if ( ret < 0 )
+                        error("fixme: Could not set missing %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
+                    if ( ret==0 )
+                        args->cols[j].done = 1;
+                }
             }
-
-            has_overlap = 1;
         }
+        // Flush
+        for (j=0; j<args->ncols; j++)
+        {
+            if ( args->cols[j].done==1 || args->cols[j].merge_method == MM_FIRST ) continue;
+            if ( !args->cols[j].setter ) continue;
+            int ret = args->cols[j].setter(args,line,&args->cols[j],NULL);
+            if ( ret < 0 )
+                error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
+        }
+    }
+    return has_overlap;
+}
+static int annotate_from_vcf(args_t *args, bcf1_t *line)
+{
+    if ( !bcf_sr_has_line(args->files,1) ) return 0;
+    int j;
+    bcf1_t *aline = bcf_sr_get_line(args->files,1);
+    for (j=0; j<args->ncols; j++)
+    {
+        if ( !args->cols[j].setter ) continue;
+        if ( args->cols[j].setter(args,line,&args->cols[j],aline) )
+            error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
     }
+    return 1;
+}
+static int annotate_from_self(args_t *args, bcf1_t *line)
+{
+    int j;
+    for (j=0; j<args->ncols; j++)
+    {
+        if ( !args->cols[j].setter ) continue;
+        if ( args->cols[j].setter(args,line,&args->cols[j],NULL) )
+            error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
+    }
+    return 0;
+}
+static int annotate_line(args_t *args, bcf1_t *line)
+{
+    args->current_rec = line;
+
+    int i;
+    for (i=0; i<args->nrm; i++)
+        args->rm[i].handler(args, line, &args->rm[i]);
+
+    int has_overlap = 0;
+    if ( args->tgt_idx )
+        has_overlap = annotate_from_regidx(args,line);
+
+    else if ( args->tgts )
+        has_overlap = annotate_from_tab(args,line);
+
+    else if ( args->files->nreaders == 2 )
+        has_overlap = annotate_from_vcf(args,line);
+
+    else if ( args->ncols )
+        has_overlap = annotate_from_self(args,line);
+
     if ( args->set_ids )
     {
         args->tmpks.l = 0;
@@ -3306,6 +3625,8 @@ static void annotate(args_t *args, bcf1_t *line)
         else
             bcf_update_info_flag(args->hdr_out,line,args->mark_sites,NULL,has_overlap?0:1);
     }
+
+    return has_overlap;
 }
 
 static void usage(args_t *args)
@@ -3342,7 +3663,7 @@ static void usage(args_t *args)
     fprintf(bcftools_stderr, "       --single-overlaps           Keep memory low by avoiding complexities arising from handling multiple overlapping intervals\n");
     fprintf(bcftools_stderr, "   -x, --remove LIST               List of annotations (e.g. ID,INFO/DP,FORMAT/DP,FILTER) to remove (or keep with \"^\" prefix). See man page for details\n");
     fprintf(bcftools_stderr, "       --threads INT               Number of extra output compression threads [0]\n");
-    fprintf(bcftools_stderr, "       --write-index               Automatically index the output files [off]\n");
+    fprintf(bcftools_stderr, "   -W, --write-index[=FMT]         Automatically index the output files [off]\n");
     fprintf(bcftools_stderr, "\n");
     fprintf(bcftools_stderr, "Examples:\n");
     fprintf(bcftools_stderr, "   http://samtools.github.io/bcftools/howtos/annotate.html\n");
@@ -3399,11 +3720,11 @@ int main_vcfannotate(int argc, char *argv[])
         {"min-overlap",required_argument,NULL,12},
         {"no-version",no_argument,NULL,8},
         {"force",no_argument,NULL,'f'},
-        {"write-index",no_argument,NULL,13},
+        {"write-index",optional_argument,NULL,'W'},
         {NULL,0,NULL,0}
     };
     char *tmp;
-    while ((c = getopt_long(argc, argv, "h:H:?o:O:r:R:a:x:c:C:i:e:S:s:I:m:kl:f",loptions,NULL)) >= 0)
+    while ((c = getopt_long(argc, argv, "h:H:?o:O:r:R:a:x:c:C:i:e:S:s:I:m:kl:fW::",loptions,NULL)) >= 0)
     {
         switch (c) {
             case 'f': args->force = 1; break;
@@ -3476,7 +3797,10 @@ int main_vcfannotate(int argc, char *argv[])
             case 10 : args->single_overlaps = 1; break;
             case 11 : args->rename_annots = optarg; break;
             case 12 : args->min_overlap_str = optarg; break;
-            case 13 : args->write_index = 1; break;
+            case 'W':
+                if (!(args->write_index = write_index_parse(optarg)))
+                    error("Unsupported index format '%s'\n", optarg);
+                break;
             case '?': usage(args); break;
             default: error("Unknown argument: %s\n", optarg);
         }
@@ -3545,7 +3869,8 @@ int main_vcfannotate(int argc, char *argv[])
                 continue;
             }
         }
-        annotate(args, line);
+        int keep = annotate_line(args, line);
+        if ( args->filter_ext && !args->keep_sites && !keep ) continue;
         if ( bcf_write1(args->out_fh, args->hdr_out, line)!=0 ) error("[%s] Error: failed to write to %s\n", __func__,args->output_fname);
     }
     destroy_data(args);
diff --git a/bcftools/vcfbuf.c b/bcftools/vcfbuf.c
index 9d60c493c..22390d0fa 100644
--- a/bcftools/vcfbuf.c
+++ b/bcftools/vcfbuf.c
@@ -1,6 +1,6 @@
 /* The MIT License
 
-   Copyright (c) 2016-2022 Genome Research Ltd.
+   Copyright (c) 2016-2024 Genome Research Ltd.
 
    Author: Petr Danecek <pd3@sanger.ac.uk>
 
@@ -29,6 +29,7 @@
 #include <htslib/vcf.h>
 #include <htslib/vcfutils.h>
 #include <htslib/hts_os.h>
+#include <htslib/kbitset.h>
 #include "bcftools.h"
 #include "vcfbuf.h"
 #include "rbuf.h"
@@ -44,7 +45,7 @@ typedef struct
 {
     bcf1_t *rec;
     double af;
-    int af_set:1, filter:1, idx:30;
+    unsigned int af_set:1, filter:1, idx:30;
 }
 vcfrec_t;
 
@@ -61,28 +62,61 @@ typedef struct
 }
 prune_t;
 
+
+#define MARK_OVERLAP 1
+#define MARK_DUP     2
+#define MARK_EXPR    3
+
+#define MARK_MISSING_SCALAR 0   // actual value to use
+#define MARK_MISSING_MAX_DP 1   // max overlap_t.value scaled by INFO/DP
+
+// temporary internal structure for iterative overlap removal by mark_t.expr
 typedef struct
 {
-    int active;
+    double value;       // the sort value
+    int rmme, idx;      // mark for removal, index in vcfbuf_t.rbuf
+    int dp;             // with MARK_MISSING_MAX_DP, INFO/DP is used extrapolate missing QUAL
+    kbitset_t *bset;    // mark which records it overlaps with, given as 0-based indexes to vcfbuf_t.rbuf
+    bcf1_t *rec;
 }
-rmdup_t;
-
+overlap_t;
 typedef struct
 {
-    int active, rid, end;
+    // modes
+    int mode;
+    char *expr;
+
+    // sites marked according to expr, returned to the caller via vcfbuf_get()
+    rbuf_t rbuf;
+    uint8_t *mark;
+    int last;
+
+    // MARK_OVERLAP
+    int overlap_rid, overlap_end;
+
+    // MARK_EXPR
+    int nbuf;
+    overlap_t *buf, **buf_ptr;
+    int missing_expr;       // the value to use when min(QUAL) encounters a missing value
+    float missing_value;    // the default missing value
+    float max_qual;         // with MARK_MISSING_MAX_DP
+    int max_qual_dp;        //
+    int ntmpi;              // temporary int array and the allocated memory
+    int32_t *tmpi;
 }
-overlap_t;
+mark_t;
 
 struct _vcfbuf_t
 {
-    int win, dummy;
+    int win,            // maximum number of sites in the buffer, either number of sites (<0) or bp (<0)
+        dummy;          // the caller maintains the buffer via push/peek/flush
     bcf_hdr_t *hdr;
     vcfrec_t *vcf;
     rbuf_t rbuf;
     ld_t ld;
     prune_t prune;
-    overlap_t overlap;
-    rmdup_t rmdup;
+    mark_t mark;
+    enum { clean, dirty } status;
 };
 
 vcfbuf_t *vcfbuf_init(bcf_hdr_t *hdr, int win)
@@ -90,7 +124,8 @@ vcfbuf_t *vcfbuf_init(bcf_hdr_t *hdr, int win)
     vcfbuf_t *buf = (vcfbuf_t*) calloc(1,sizeof(vcfbuf_t));
     buf->hdr = hdr;
     buf->win = win;
-    buf->overlap.rid = -1;
+    buf->status = clean;
+    buf->mark.overlap_rid = -1;
     int i;
     for (i=0; i<VCFBUF_LD_N; i++) buf->ld.max[i] = HUGE_VAL;
     rbuf_init(&buf->rbuf, 0);
@@ -106,38 +141,119 @@ void vcfbuf_destroy(vcfbuf_t *buf)
     free(buf->prune.farr);
     free(buf->prune.vrec);
     free(buf->prune.ac);
+    free(buf->prune.af_tag);
     free(buf->prune.idx);
+    free(buf->mark.mark);
+    free(buf->mark.expr);
+    for (i=0; i<buf->mark.nbuf; i++) kbs_destroy(buf->mark.buf[i].bset);
+    free(buf->mark.buf);
+    free(buf->mark.buf_ptr);
+    free(buf->mark.tmpi);
     free(buf);
 }
 
-void vcfbuf_set(vcfbuf_t *buf, vcfbuf_opt_t key, void *value)
+int vcfbuf_set(vcfbuf_t *buf, vcfbuf_opt_t key, ...)
 {
-    if ( key==LD_FILTER1 ) { buf->ld.filter1 = *((int*)value); return; }
-    if ( key==LD_RAND_MISSING ) { buf->ld.rand_missing = *((int*)value); return; }
-    if ( key==LD_MAX_R2 ) { buf->ld.max[VCFBUF_LD_IDX_R2] = *((double*)value); return; }
-    if ( key==LD_MAX_LD ) { buf->ld.max[VCFBUF_LD_IDX_LD] = *((double*)value); return; }
-    if ( key==LD_MAX_HD ) { buf->ld.max[VCFBUF_LD_IDX_HD] = *((double*)value); return; }
-
-    if ( key==VCFBUF_DUMMY ) { buf->dummy = *((int*)value); return; }
-    if ( key==VCFBUF_NSITES )
+    va_list args;
+    switch (key)
     {
-        buf->prune.max_sites = *((int*)value);
-        if ( !buf->prune.mode ) buf->prune.mode = PRUNE_MODE_MAX_AF;
-        return;
+        case LD_FILTER1:
+            va_start(args, key);
+            buf->ld.filter1 = va_arg(args,int);
+            va_end(args);
+            return 0;
+
+        case LD_RAND_MISSING:
+            va_start(args, key);
+            buf->ld.rand_missing = va_arg(args,int);
+            va_end(args);
+            return 0;
+
+        case LD_MAX_R2:
+            va_start(args, key);
+            buf->ld.max[VCFBUF_LD_IDX_R2] = va_arg(args,double);
+            va_end(args);
+            return 0;
+
+        case LD_MAX_LD:
+            va_start(args, key);
+            buf->ld.max[VCFBUF_LD_IDX_LD] = va_arg(args,double);
+            va_end(args);
+            return 0;
+
+        case LD_MAX_HD:
+            va_start(args, key);
+            buf->ld.max[VCFBUF_LD_IDX_HD] = va_arg(args,double);
+            va_end(args);
+            return 0;
+
+        case VCFBUF_DUMMY:
+            va_start(args, key);
+            buf->dummy = va_arg(args,int);
+            va_end(args);
+            return 0;
+
+        case PRUNE_NSITES:
+            va_start(args, key);
+            buf->prune.max_sites = va_arg(args,int);
+            if ( !buf->prune.mode ) buf->prune.mode = PRUNE_MODE_MAX_AF;
+            va_end(args);
+            return 0;
+
+        case PRUNE_NSITES_MODE:
+            va_start(args, key);
+            char *mode = va_arg(args,char*);
+            va_end(args);
+            if ( !strcasecmp(mode,"maxAF") ) buf->prune.mode = PRUNE_MODE_MAX_AF;
+            else if ( !strcasecmp(mode,"1st") ) buf->prune.mode = PRUNE_MODE_1ST;
+            else if ( !strcasecmp(mode,"rand") ) buf->prune.mode = PRUNE_MODE_RAND;
+            else error("The mode \"%s\" is not recognised\n",mode);
+            return 0;
+
+        case PRUNE_AF_TAG:
+            va_start(args, key);
+            buf->prune.af_tag = strdup(va_arg(args,char*));
+            va_end(args);
+            return 0;
+
+        case MARK:
+            va_start(args, key);
+            buf->mark.expr = strdup(va_arg(args,char*));
+            if ( !strcasecmp(buf->mark.expr,"overlap") ) buf->mark.mode = MARK_OVERLAP;
+            else if ( !strcasecmp(buf->mark.expr,"dup") ) buf->mark.mode = MARK_DUP;
+            else buf->mark.mode = MARK_EXPR;
+            va_end(args);
+            return 0;
+
+        case MARK_MISSING_EXPR:
+            va_start(args, key);
+            char *expr = va_arg(args,char*);
+            if ( !strcasecmp(expr,"0") )
+            {
+                buf->mark.missing_expr = MARK_MISSING_SCALAR;
+                buf->mark.missing_value = 0;
+            }
+            else if ( !strcasecmp(expr,"DP") )
+            {
+                if ( buf->mark.mode!=MARK_EXPR ) error("Only the combination of --mark 'min(QUAL)' with --missing DP is currently supported\n");
+                buf->mark.missing_expr = MARK_MISSING_MAX_DP;
+            }
+            else
+                error("todo: MARK_MISSING_EXPR=%s\n",expr);
+            va_end(args);
+            return 0;
     }
-    if ( key==VCFBUF_AF_TAG ) { buf->prune.af_tag = *((char**)value); return; }
-    if ( key==VCFBUF_OVERLAP_WIN ) { buf->overlap.active = *((int*)value); return; }
-    if ( key==VCFBUF_RMDUP) { buf->rmdup.active = *((int*)value); return; }
+    return 0;
+}
 
-    if ( key==VCFBUF_NSITES_MODE )
-    {
-        char *mode = *((char**)value);
-        if ( !strcasecmp(mode,"maxAF") ) buf->prune.mode = PRUNE_MODE_MAX_AF;
-        else if ( !strcasecmp(mode,"1st") ) buf->prune.mode = PRUNE_MODE_1ST;
-        else if ( !strcasecmp(mode,"rand") ) buf->prune.mode = PRUNE_MODE_RAND;
-        else error("The mode \"%s\" is not recognised\n",mode);
-        return;
-    }
+void *vcfbuf_get(vcfbuf_t *buf, vcfbuf_opt_t key, ...)
+{
+    va_list args;
+    va_start(args, key);
+    if ( key==MARK )
+        return &buf->mark.last;
+    va_end(args);
+    return NULL;
 }
 
 int vcfbuf_nsites(vcfbuf_t *buf)
@@ -147,8 +263,12 @@ int vcfbuf_nsites(vcfbuf_t *buf)
 
 bcf1_t *vcfbuf_push(vcfbuf_t *buf, bcf1_t *rec)
 {
-    rbuf_expand0(&buf->rbuf, vcfrec_t, buf->rbuf.n+1, buf->vcf);
+    // make sure the caller is using the buffer correctly and calls vcfbuf_flush()
+    // before placing next vcfbuf_push() call
+    assert(buf->status!=dirty);
+    if ( !buf->dummy ) buf->status = dirty;
 
+    rbuf_expand0(&buf->rbuf, vcfrec_t, buf->rbuf.n+1, buf->vcf);
     int i = rbuf_append(&buf->rbuf);
     if ( !buf->vcf[i].rec ) buf->vcf[i].rec = bcf_init1();
 
@@ -163,6 +283,7 @@ bcf1_t *vcfbuf_push(vcfbuf_t *buf, bcf1_t *rec)
 
 bcf1_t *vcfbuf_peek(vcfbuf_t *buf, int idx)
 {
+    buf->status = clean;
     int i = rbuf_kth(&buf->rbuf, idx);
     return i<0 ? NULL : buf->vcf[i].rec;
 }
@@ -195,6 +316,7 @@ static int cmpint_desc(const void *_a, const void *_b)
 
 static void _prune_sites(vcfbuf_t *buf, int flush_all)
 {
+
     int nbuf = flush_all ? buf->rbuf.n : buf->rbuf.n - 1;
 
     int nprune = nbuf - buf->prune.max_sites;
@@ -266,37 +388,75 @@ static void _prune_sites(vcfbuf_t *buf, int flush_all)
         rbuf_remove_kth(&buf->rbuf, vcfrec_t, buf->prune.idx[i], buf->vcf);
 }
 
-static int _rmdup_can_flush(vcfbuf_t *buf, int flush_all)
+static int mark_dup_can_flush_(vcfbuf_t *buf, int flush_all)
 {
-    if ( flush_all ) return 1;
+    int flush = flush_all;
+    mark_t *mark = &buf->mark;
+    if  ( buf->status==dirty )
+    {
+        // a new site was just added by vcfbuf_push()
+        rbuf_expand0(&mark->rbuf, uint8_t, buf->rbuf.n, mark->mark);
+        int i = rbuf_append(&mark->rbuf);
+        mark->mark[i] = 0;
+
+        if ( buf->rbuf.n==1 ) goto flush;
 
-    if ( buf->rbuf.n==1 ) return 0;
+        // there is at least one previous site, check if it's a duplicate
+        int k1 = rbuf_kth(&buf->rbuf, -1);
+        int k2 = rbuf_kth(&buf->rbuf, -2);
+        vcfrec_t *rec1 = &buf->vcf[k1];
+        vcfrec_t *rec2 = &buf->vcf[k2];
 
-    int k1 = rbuf_kth(&buf->rbuf, -1);
-    int k2 = rbuf_kth(&buf->rbuf, -2);
+        int is_dup = 1;
+        if ( rec1->rec->rid!=rec2->rec->rid ) is_dup = 0;
+        else if ( rec1->rec->pos!=rec2->rec->pos ) is_dup = 0;
 
-    vcfrec_t *rec1 = &buf->vcf[k1];
-    vcfrec_t *rec2 = &buf->vcf[k2];
+        if ( is_dup )
+        {
+            // it is, mark the last two sites as duplicates
+            int k1 = rbuf_kth(&mark->rbuf, -1);
+            int k2 = rbuf_kth(&mark->rbuf, -2);
+            mark->mark[k1] = 1;
+            mark->mark[k2] = 1;
+            goto flush;
+        }
 
-    if ( rec1->rec->rid!=rec2->rec->rid ) return 1;
-    if ( rec1->rec->pos!=rec2->rec->pos ) return 1;
+        // the last site is not a duplicate with the previous, all sites but the last one can be flushed
+        flush = 1;
+    }
+    else if ( buf->rbuf.n > 1 ) flush = 1;
 
-    return 0;
+flush:
+    if ( !flush ) return 0;
+
+    int i = rbuf_shift(&mark->rbuf);
+    mark->last = mark->mark[i];
+    return 1;
 }
 
-static int _overlap_can_flush(vcfbuf_t *buf, int flush_all)
+static int mark_overlap_helper_(vcfbuf_t *buf, int flush_all)
 {
-    if ( flush_all ) { buf->overlap.rid = -1; return 1; }
+    if ( buf->status!=dirty ) return flush_all;
 
-    int i = rbuf_last(&buf->rbuf);
-    vcfrec_t *last = &buf->vcf[i];
-    if ( buf->overlap.rid != last->rec->rid ) buf->overlap.end = 0;
+    int flush = flush_all;
+    mark_t *mark = &buf->mark;
 
+    // a new site was just added by vcfbuf_push()
+    buf->status = clean;
+
+    rbuf_expand0(&mark->rbuf, uint8_t, buf->rbuf.n, mark->mark);
+    int i = rbuf_append(&mark->rbuf);
+    mark->mark[i] = 0;
+
+    // determine beg and end of the last record that was just added
+    i = rbuf_last(&buf->rbuf);
+    vcfrec_t *last = &buf->vcf[i];
+    if ( mark->overlap_rid != last->rec->rid ) mark->overlap_end = 0;
     int beg_pos = last->rec->pos;
     int end_pos = last->rec->pos + last->rec->rlen - 1;
 
     // Assuming left-aligned indels. In case it is a deletion, the real variant
-    // starts one base after. If an insertion, the overlap with previous zero length.
+    // starts one base after. If an insertion, the overlap with previous is zero
     int imin = last->rec->rlen;
     for (i=0; i<last->rec->n_allele; i++)
     {
@@ -306,24 +466,175 @@ static int _overlap_can_flush(vcfbuf_t *buf, int flush_all)
         while ( *ref && *alt && nt_to_upper(*ref)==nt_to_upper(*alt) ) { ref++; alt++; }
         if ( imin > ref - last->rec->d.allele[0] ) imin = ref - last->rec->d.allele[0];
     }
-
-    if ( beg_pos <= buf->overlap.end )
+    if ( beg_pos <= mark->overlap_end )
     {
+        // the new site overlaps with the previous
         beg_pos += imin;
         if ( beg_pos > end_pos ) end_pos = beg_pos;
     }
-
     if ( buf->rbuf.n==1 )
     {
-        buf->overlap.rid = last->rec->rid;
-        buf->overlap.end = end_pos;
-        return 0;
+        mark->overlap_rid = last->rec->rid;
+        mark->overlap_end = end_pos;
+        return flush;
+    }
+    if ( beg_pos <= mark->overlap_end )
+    {
+        if ( mark->overlap_end < end_pos ) mark->overlap_end = end_pos;
+        int k1 = rbuf_kth(&mark->rbuf, -1);
+        int k2 = rbuf_kth(&mark->rbuf, -2);
+        mark->mark[k1] = 1;
+        mark->mark[k2] = 1;
+    }
+    else
+    {
+        if ( mark->overlap_end < end_pos ) mark->overlap_end = end_pos;
+        flush = 1;
+    }
+    return flush;
+}
+
+
+static int mark_overlap_can_flush_(vcfbuf_t *buf, int flush_all)
+{
+    int flush = flush_all;
+    if  ( buf->status==dirty ) flush = mark_overlap_helper_(buf,flush_all);
+    else if ( buf->rbuf.n > 1 ) flush = 1;
+    if ( !flush ) return 0;
+
+    mark_t *mark = &buf->mark;
+    int i = rbuf_shift(&mark->rbuf);
+    mark->last = mark->mark[i];
+    return 1;
+}
+
+
+static int records_overlap(bcf1_t *a, bcf1_t *b)
+{
+    if ( a->rid != b->rid ) return 0;
+    if ( a->pos + a->rlen - 1 < b->pos ) return 0;
+    return 1;
+}
+
+static int cmp_overlap_ptr_asc(const void *aptr, const void *bptr)
+{
+    overlap_t *a = *((overlap_t**)aptr);
+    overlap_t *b = *((overlap_t**)bptr);
+    if ( a->value < b->value ) return -1;
+    if ( a->value > b->value ) return 1;
+    return 0;
+}
+static void mark_expr_missing_reset_(vcfbuf_t *buf)
+{
+    buf->mark.max_qual = 0;
+    buf->mark.max_qual_dp = 0;
+}
+static void mark_expr_missing_prep_(vcfbuf_t *buf, overlap_t *olap)
+{
+    int nval = bcf_get_info_int32(buf->hdr,olap->rec,"DP",&buf->mark.tmpi,&buf->mark.ntmpi);
+    if ( nval!=1 ) return;
+
+    olap->dp = buf->mark.tmpi[0];
+    if ( bcf_float_is_missing(olap->rec->qual) ) return;
+    if ( buf->mark.max_qual < olap->rec->qual )
+    {
+        buf->mark.max_qual = olap->rec->qual;
+        buf->mark.max_qual_dp = olap->dp;
     }
-    if ( beg_pos <= buf->overlap.end )
+}
+static void mark_expr_missing_set_(vcfbuf_t *buf, overlap_t *olap)
+{
+    if ( !bcf_float_is_missing(olap->rec->qual) ) return;
+    if ( !buf->mark.max_qual_dp ) return;
+
+    // scale QUAL of the most confident variant in the overlap proportionally to the coverage
+    // and use that to prioritize the records
+    olap->value = buf->mark.max_qual * olap->dp / buf->mark.max_qual_dp;
+}
+static int mark_expr_can_flush_(vcfbuf_t *buf, int flush_all)
+{
+    mark_t *mark = &buf->mark;
+    if ( strcasecmp("min(QUAL)",mark->expr) ) error("Todo; at this time only min(QUAL) is supported\n");
+
+    int flush = flush_all;
+    if  ( buf->status==dirty )
     {
-        if ( buf->overlap.end < end_pos ) buf->overlap.end = end_pos;
-        return 0;
+        flush = mark_overlap_helper_(buf,flush_all);
+        if ( !flush ) return 0;
+
+        if ( mark->missing_expr==MARK_MISSING_MAX_DP ) mark_expr_missing_reset_(buf);
+
+        // init overlaps, each overlap_t structure keeps a list of overlapping records, symmetrical
+        size_t nori = mark->nbuf;
+        hts_resize(overlap_t,  buf->rbuf.n, &mark->nbuf, &mark->buf, HTS_RESIZE_CLEAR);
+        hts_resize(overlap_t*, buf->rbuf.n, &nori, &mark->buf_ptr, HTS_RESIZE_CLEAR);
+        int i;
+        for (i=0; i<buf->rbuf.n; i++)
+        {
+            overlap_t *oi = &mark->buf[i];
+            int j = rbuf_kth(&buf->rbuf, i);
+            assert(j>=0);
+            bcf1_t *rec = buf->vcf[j].rec;
+            assert(rec);
+            oi->rec = rec;
+
+            // todo: other than QUAL values
+            oi->value = bcf_float_is_missing(rec->qual) ? mark->missing_value : rec->qual;
+            if ( mark->missing_expr==MARK_MISSING_MAX_DP ) mark_expr_missing_prep_(buf,oi);
+            if ( oi->bset )
+            {
+                kbs_resize(&oi->bset,buf->rbuf.n);
+                kbs_clear(oi->bset);
+            }
+            else
+                oi->bset = kbs_init(buf->rbuf.n);
+            oi->idx  = i;
+            mark->buf_ptr[i] = oi;
+            mark->mark[oi->idx] = 0;
+        }
+        int nolap = 0;
+        for (i=0; i<buf->rbuf.n; i++)
+        {
+            overlap_t *oi = &mark->buf[i];
+            if ( mark->missing_expr==MARK_MISSING_MAX_DP ) mark_expr_missing_set_(buf,oi);
+            int j;
+            for (j=i+1; j<buf->rbuf.n; j++)
+            {
+                overlap_t *oj = &mark->buf[j];
+                if ( !records_overlap(oi->rec,oj->rec) ) continue;
+                kbs_insert(oi->bset,j);
+                kbs_insert(oj->bset,i);
+                nolap++;
+            }
+        }
+
+        // sort according to the requested criteria, currently only min(QUAL)
+        qsort(mark->buf_ptr,buf->rbuf.n,sizeof(*mark->buf_ptr),cmp_overlap_ptr_asc);   // todo: other than min()
+
+        // go through the list sorted by overlap_t.value, eg QUAL
+        for (i=0; nolap && i<buf->rbuf.n; i++)
+        {
+            kbitset_iter_t itr;
+            overlap_t *oi = mark->buf_ptr[i];
+            kbs_start(&itr);
+            int j;
+            while ((j=kbs_next(oi->bset, &itr)) >= 0)
+            {
+                kbs_delete(oi->bset,j);
+                assert(nolap);
+                assert(kbs_exists(mark->buf[j].bset,oi->idx));
+                kbs_delete(mark->buf[j].bset,oi->idx);
+                nolap--;
+            }
+            j = rbuf_kth(&mark->rbuf,oi->idx);
+            mark->mark[j] = 1;
+        }
     }
+    else if ( buf->rbuf.n > 1 ) flush = 1;
+    if ( !flush ) return 0;
+
+    int i = rbuf_shift(&mark->rbuf);
+    mark->last = mark->mark[i];
     return 1;
 }
 
@@ -331,32 +642,56 @@ bcf1_t *vcfbuf_flush(vcfbuf_t *buf, int flush_all)
 {
     int i,j;
 
+    // nothing to do, no lines in the buffer
     if ( buf->rbuf.n==0 ) return NULL;
-    if ( flush_all || buf->dummy ) goto ret;
-
-    i = rbuf_kth(&buf->rbuf, 0);    // first
-    j = rbuf_last(&buf->rbuf);      // last
 
-    if ( buf->vcf[i].rec->rid != buf->vcf[j].rec->rid ) goto ret;
-    if ( buf->overlap.active && _overlap_can_flush(buf, flush_all) ) goto ret;
-    if ( buf->rmdup.active && _rmdup_can_flush(buf, flush_all) ) goto ret;
+    // dummy mode, always flushing
+    if ( buf->dummy ) goto ret;
 
-    if ( buf->win > 0 )
+    // pruning mode
+    if ( buf->win )
     {
-        if ( buf->rbuf.n <= buf->win ) return NULL;
+        int can_flush = flush_all;
+        i = rbuf_kth(&buf->rbuf, 0);    // first
+        j = rbuf_last(&buf->rbuf);      // last
+        if ( buf->vcf[i].rec->rid != buf->vcf[j].rec->rid ) can_flush = 1;
+        else if ( buf->win > 0 )
+        {
+            if ( buf->rbuf.n > buf->win ) can_flush = 1;
+        }
+        else if ( buf->win < 0 )
+        {
+            if ( !(buf->vcf[i].rec->pos - buf->vcf[j].rec->pos > buf->win) ) can_flush = 1;
+        }
+        buf->status = clean;
+        if ( !can_flush ) return NULL;
+        if ( buf->prune.max_sites && buf->prune.max_sites < buf->rbuf.n ) _prune_sites(buf, flush_all);
         goto ret;
     }
-    else if ( buf->win < 0 )
+
+    // overlaps and duplicates
+    if ( buf->mark.mode )
     {
-        if ( buf->vcf[i].rec->pos - buf->vcf[j].rec->pos > buf->win ) return NULL;
+        int can_flush = 0;
+        if ( buf->mark.mode==MARK_OVERLAP )
+        {
+            if ( mark_overlap_can_flush_(buf,flush_all) ) can_flush = 1;
+        }
+        else if ( buf->mark.mode==MARK_DUP )
+        {
+            if ( mark_dup_can_flush_(buf,flush_all) ) can_flush = 1;
+        }
+        if ( buf->mark.mode==MARK_EXPR )
+        {
+            if ( mark_expr_can_flush_(buf,flush_all) ) can_flush = 1;
+        }
+        buf->status = clean;
+        if ( !can_flush ) return NULL;
         goto ret;
     }
-    else
-        return NULL;
 
 ret:
-    if ( buf->prune.max_sites && buf->prune.max_sites < buf->rbuf.n ) _prune_sites(buf, flush_all);
-
+    buf->status = clean;
     i = rbuf_shift(&buf->rbuf);
     return buf->vcf[i].rec;
 }
diff --git a/bcftools/vcfbuf.c.pysam.c b/bcftools/vcfbuf.c.pysam.c
index 7b1c40ed9..b74a5c49b 100644
--- a/bcftools/vcfbuf.c.pysam.c
+++ b/bcftools/vcfbuf.c.pysam.c
@@ -2,7 +2,7 @@
 
 /* The MIT License
 
-   Copyright (c) 2016-2022 Genome Research Ltd.
+   Copyright (c) 2016-2024 Genome Research Ltd.
 
    Author: Petr Danecek <pd3@sanger.ac.uk>
 
@@ -31,6 +31,7 @@
 #include <htslib/vcf.h>
 #include <htslib/vcfutils.h>
 #include <htslib/hts_os.h>
+#include <htslib/kbitset.h>
 #include "bcftools.h"
 #include "vcfbuf.h"
 #include "rbuf.h"
@@ -46,7 +47,7 @@ typedef struct
 {
     bcf1_t *rec;
     double af;
-    int af_set:1, filter:1, idx:30;
+    unsigned int af_set:1, filter:1, idx:30;
 }
 vcfrec_t;
 
@@ -63,28 +64,61 @@ typedef struct
 }
 prune_t;
 
+
+#define MARK_OVERLAP 1
+#define MARK_DUP     2
+#define MARK_EXPR    3
+
+#define MARK_MISSING_SCALAR 0   // actual value to use
+#define MARK_MISSING_MAX_DP 1   // max overlap_t.value scaled by INFO/DP
+
+// temporary internal structure for iterative overlap removal by mark_t.expr
 typedef struct
 {
-    int active;
+    double value;       // the sort value
+    int rmme, idx;      // mark for removal, index in vcfbuf_t.rbuf
+    int dp;             // with MARK_MISSING_MAX_DP, INFO/DP is used extrapolate missing QUAL
+    kbitset_t *bset;    // mark which records it overlaps with, given as 0-based indexes to vcfbuf_t.rbuf
+    bcf1_t *rec;
 }
-rmdup_t;
-
+overlap_t;
 typedef struct
 {
-    int active, rid, end;
+    // modes
+    int mode;
+    char *expr;
+
+    // sites marked according to expr, returned to the caller via vcfbuf_get()
+    rbuf_t rbuf;
+    uint8_t *mark;
+    int last;
+
+    // MARK_OVERLAP
+    int overlap_rid, overlap_end;
+
+    // MARK_EXPR
+    int nbuf;
+    overlap_t *buf, **buf_ptr;
+    int missing_expr;       // the value to use when min(QUAL) encounters a missing value
+    float missing_value;    // the default missing value
+    float max_qual;         // with MARK_MISSING_MAX_DP
+    int max_qual_dp;        //
+    int ntmpi;              // temporary int array and the allocated memory
+    int32_t *tmpi;
 }
-overlap_t;
+mark_t;
 
 struct _vcfbuf_t
 {
-    int win, dummy;
+    int win,            // maximum number of sites in the buffer, either number of sites (<0) or bp (<0)
+        dummy;          // the caller maintains the buffer via push/peek/flush
     bcf_hdr_t *hdr;
     vcfrec_t *vcf;
     rbuf_t rbuf;
     ld_t ld;
     prune_t prune;
-    overlap_t overlap;
-    rmdup_t rmdup;
+    mark_t mark;
+    enum { clean, dirty } status;
 };
 
 vcfbuf_t *vcfbuf_init(bcf_hdr_t *hdr, int win)
@@ -92,7 +126,8 @@ vcfbuf_t *vcfbuf_init(bcf_hdr_t *hdr, int win)
     vcfbuf_t *buf = (vcfbuf_t*) calloc(1,sizeof(vcfbuf_t));
     buf->hdr = hdr;
     buf->win = win;
-    buf->overlap.rid = -1;
+    buf->status = clean;
+    buf->mark.overlap_rid = -1;
     int i;
     for (i=0; i<VCFBUF_LD_N; i++) buf->ld.max[i] = HUGE_VAL;
     rbuf_init(&buf->rbuf, 0);
@@ -108,38 +143,119 @@ void vcfbuf_destroy(vcfbuf_t *buf)
     free(buf->prune.farr);
     free(buf->prune.vrec);
     free(buf->prune.ac);
+    free(buf->prune.af_tag);
     free(buf->prune.idx);
+    free(buf->mark.mark);
+    free(buf->mark.expr);
+    for (i=0; i<buf->mark.nbuf; i++) kbs_destroy(buf->mark.buf[i].bset);
+    free(buf->mark.buf);
+    free(buf->mark.buf_ptr);
+    free(buf->mark.tmpi);
     free(buf);
 }
 
-void vcfbuf_set(vcfbuf_t *buf, vcfbuf_opt_t key, void *value)
+int vcfbuf_set(vcfbuf_t *buf, vcfbuf_opt_t key, ...)
 {
-    if ( key==LD_FILTER1 ) { buf->ld.filter1 = *((int*)value); return; }
-    if ( key==LD_RAND_MISSING ) { buf->ld.rand_missing = *((int*)value); return; }
-    if ( key==LD_MAX_R2 ) { buf->ld.max[VCFBUF_LD_IDX_R2] = *((double*)value); return; }
-    if ( key==LD_MAX_LD ) { buf->ld.max[VCFBUF_LD_IDX_LD] = *((double*)value); return; }
-    if ( key==LD_MAX_HD ) { buf->ld.max[VCFBUF_LD_IDX_HD] = *((double*)value); return; }
-
-    if ( key==VCFBUF_DUMMY ) { buf->dummy = *((int*)value); return; }
-    if ( key==VCFBUF_NSITES )
+    va_list args;
+    switch (key)
     {
-        buf->prune.max_sites = *((int*)value);
-        if ( !buf->prune.mode ) buf->prune.mode = PRUNE_MODE_MAX_AF;
-        return;
+        case LD_FILTER1:
+            va_start(args, key);
+            buf->ld.filter1 = va_arg(args,int);
+            va_end(args);
+            return 0;
+
+        case LD_RAND_MISSING:
+            va_start(args, key);
+            buf->ld.rand_missing = va_arg(args,int);
+            va_end(args);
+            return 0;
+
+        case LD_MAX_R2:
+            va_start(args, key);
+            buf->ld.max[VCFBUF_LD_IDX_R2] = va_arg(args,double);
+            va_end(args);
+            return 0;
+
+        case LD_MAX_LD:
+            va_start(args, key);
+            buf->ld.max[VCFBUF_LD_IDX_LD] = va_arg(args,double);
+            va_end(args);
+            return 0;
+
+        case LD_MAX_HD:
+            va_start(args, key);
+            buf->ld.max[VCFBUF_LD_IDX_HD] = va_arg(args,double);
+            va_end(args);
+            return 0;
+
+        case VCFBUF_DUMMY:
+            va_start(args, key);
+            buf->dummy = va_arg(args,int);
+            va_end(args);
+            return 0;
+
+        case PRUNE_NSITES:
+            va_start(args, key);
+            buf->prune.max_sites = va_arg(args,int);
+            if ( !buf->prune.mode ) buf->prune.mode = PRUNE_MODE_MAX_AF;
+            va_end(args);
+            return 0;
+
+        case PRUNE_NSITES_MODE:
+            va_start(args, key);
+            char *mode = va_arg(args,char*);
+            va_end(args);
+            if ( !strcasecmp(mode,"maxAF") ) buf->prune.mode = PRUNE_MODE_MAX_AF;
+            else if ( !strcasecmp(mode,"1st") ) buf->prune.mode = PRUNE_MODE_1ST;
+            else if ( !strcasecmp(mode,"rand") ) buf->prune.mode = PRUNE_MODE_RAND;
+            else error("The mode \"%s\" is not recognised\n",mode);
+            return 0;
+
+        case PRUNE_AF_TAG:
+            va_start(args, key);
+            buf->prune.af_tag = strdup(va_arg(args,char*));
+            va_end(args);
+            return 0;
+
+        case MARK:
+            va_start(args, key);
+            buf->mark.expr = strdup(va_arg(args,char*));
+            if ( !strcasecmp(buf->mark.expr,"overlap") ) buf->mark.mode = MARK_OVERLAP;
+            else if ( !strcasecmp(buf->mark.expr,"dup") ) buf->mark.mode = MARK_DUP;
+            else buf->mark.mode = MARK_EXPR;
+            va_end(args);
+            return 0;
+
+        case MARK_MISSING_EXPR:
+            va_start(args, key);
+            char *expr = va_arg(args,char*);
+            if ( !strcasecmp(expr,"0") )
+            {
+                buf->mark.missing_expr = MARK_MISSING_SCALAR;
+                buf->mark.missing_value = 0;
+            }
+            else if ( !strcasecmp(expr,"DP") )
+            {
+                if ( buf->mark.mode!=MARK_EXPR ) error("Only the combination of --mark 'min(QUAL)' with --missing DP is currently supported\n");
+                buf->mark.missing_expr = MARK_MISSING_MAX_DP;
+            }
+            else
+                error("todo: MARK_MISSING_EXPR=%s\n",expr);
+            va_end(args);
+            return 0;
     }
-    if ( key==VCFBUF_AF_TAG ) { buf->prune.af_tag = *((char**)value); return; }
-    if ( key==VCFBUF_OVERLAP_WIN ) { buf->overlap.active = *((int*)value); return; }
-    if ( key==VCFBUF_RMDUP) { buf->rmdup.active = *((int*)value); return; }
+    return 0;
+}
 
-    if ( key==VCFBUF_NSITES_MODE )
-    {
-        char *mode = *((char**)value);
-        if ( !strcasecmp(mode,"maxAF") ) buf->prune.mode = PRUNE_MODE_MAX_AF;
-        else if ( !strcasecmp(mode,"1st") ) buf->prune.mode = PRUNE_MODE_1ST;
-        else if ( !strcasecmp(mode,"rand") ) buf->prune.mode = PRUNE_MODE_RAND;
-        else error("The mode \"%s\" is not recognised\n",mode);
-        return;
-    }
+void *vcfbuf_get(vcfbuf_t *buf, vcfbuf_opt_t key, ...)
+{
+    va_list args;
+    va_start(args, key);
+    if ( key==MARK )
+        return &buf->mark.last;
+    va_end(args);
+    return NULL;
 }
 
 int vcfbuf_nsites(vcfbuf_t *buf)
@@ -149,8 +265,12 @@ int vcfbuf_nsites(vcfbuf_t *buf)
 
 bcf1_t *vcfbuf_push(vcfbuf_t *buf, bcf1_t *rec)
 {
-    rbuf_expand0(&buf->rbuf, vcfrec_t, buf->rbuf.n+1, buf->vcf);
+    // make sure the caller is using the buffer correctly and calls vcfbuf_flush()
+    // before placing next vcfbuf_push() call
+    assert(buf->status!=dirty);
+    if ( !buf->dummy ) buf->status = dirty;
 
+    rbuf_expand0(&buf->rbuf, vcfrec_t, buf->rbuf.n+1, buf->vcf);
     int i = rbuf_append(&buf->rbuf);
     if ( !buf->vcf[i].rec ) buf->vcf[i].rec = bcf_init1();
 
@@ -165,6 +285,7 @@ bcf1_t *vcfbuf_push(vcfbuf_t *buf, bcf1_t *rec)
 
 bcf1_t *vcfbuf_peek(vcfbuf_t *buf, int idx)
 {
+    buf->status = clean;
     int i = rbuf_kth(&buf->rbuf, idx);
     return i<0 ? NULL : buf->vcf[i].rec;
 }
@@ -197,6 +318,7 @@ static int cmpint_desc(const void *_a, const void *_b)
 
 static void _prune_sites(vcfbuf_t *buf, int flush_all)
 {
+
     int nbuf = flush_all ? buf->rbuf.n : buf->rbuf.n - 1;
 
     int nprune = nbuf - buf->prune.max_sites;
@@ -268,37 +390,75 @@ static void _prune_sites(vcfbuf_t *buf, int flush_all)
         rbuf_remove_kth(&buf->rbuf, vcfrec_t, buf->prune.idx[i], buf->vcf);
 }
 
-static int _rmdup_can_flush(vcfbuf_t *buf, int flush_all)
+static int mark_dup_can_flush_(vcfbuf_t *buf, int flush_all)
 {
-    if ( flush_all ) return 1;
+    int flush = flush_all;
+    mark_t *mark = &buf->mark;
+    if  ( buf->status==dirty )
+    {
+        // a new site was just added by vcfbuf_push()
+        rbuf_expand0(&mark->rbuf, uint8_t, buf->rbuf.n, mark->mark);
+        int i = rbuf_append(&mark->rbuf);
+        mark->mark[i] = 0;
+
+        if ( buf->rbuf.n==1 ) goto flush;
 
-    if ( buf->rbuf.n==1 ) return 0;
+        // there is at least one previous site, check if it's a duplicate
+        int k1 = rbuf_kth(&buf->rbuf, -1);
+        int k2 = rbuf_kth(&buf->rbuf, -2);
+        vcfrec_t *rec1 = &buf->vcf[k1];
+        vcfrec_t *rec2 = &buf->vcf[k2];
 
-    int k1 = rbuf_kth(&buf->rbuf, -1);
-    int k2 = rbuf_kth(&buf->rbuf, -2);
+        int is_dup = 1;
+        if ( rec1->rec->rid!=rec2->rec->rid ) is_dup = 0;
+        else if ( rec1->rec->pos!=rec2->rec->pos ) is_dup = 0;
 
-    vcfrec_t *rec1 = &buf->vcf[k1];
-    vcfrec_t *rec2 = &buf->vcf[k2];
+        if ( is_dup )
+        {
+            // it is, mark the last two sites as duplicates
+            int k1 = rbuf_kth(&mark->rbuf, -1);
+            int k2 = rbuf_kth(&mark->rbuf, -2);
+            mark->mark[k1] = 1;
+            mark->mark[k2] = 1;
+            goto flush;
+        }
 
-    if ( rec1->rec->rid!=rec2->rec->rid ) return 1;
-    if ( rec1->rec->pos!=rec2->rec->pos ) return 1;
+        // the last site is not a duplicate with the previous, all sites but the last one can be flushed
+        flush = 1;
+    }
+    else if ( buf->rbuf.n > 1 ) flush = 1;
 
-    return 0;
+flush:
+    if ( !flush ) return 0;
+
+    int i = rbuf_shift(&mark->rbuf);
+    mark->last = mark->mark[i];
+    return 1;
 }
 
-static int _overlap_can_flush(vcfbuf_t *buf, int flush_all)
+static int mark_overlap_helper_(vcfbuf_t *buf, int flush_all)
 {
-    if ( flush_all ) { buf->overlap.rid = -1; return 1; }
+    if ( buf->status!=dirty ) return flush_all;
 
-    int i = rbuf_last(&buf->rbuf);
-    vcfrec_t *last = &buf->vcf[i];
-    if ( buf->overlap.rid != last->rec->rid ) buf->overlap.end = 0;
+    int flush = flush_all;
+    mark_t *mark = &buf->mark;
 
+    // a new site was just added by vcfbuf_push()
+    buf->status = clean;
+
+    rbuf_expand0(&mark->rbuf, uint8_t, buf->rbuf.n, mark->mark);
+    int i = rbuf_append(&mark->rbuf);
+    mark->mark[i] = 0;
+
+    // determine beg and end of the last record that was just added
+    i = rbuf_last(&buf->rbuf);
+    vcfrec_t *last = &buf->vcf[i];
+    if ( mark->overlap_rid != last->rec->rid ) mark->overlap_end = 0;
     int beg_pos = last->rec->pos;
     int end_pos = last->rec->pos + last->rec->rlen - 1;
 
     // Assuming left-aligned indels. In case it is a deletion, the real variant
-    // starts one base after. If an insertion, the overlap with previous zero length.
+    // starts one base after. If an insertion, the overlap with previous is zero
     int imin = last->rec->rlen;
     for (i=0; i<last->rec->n_allele; i++)
     {
@@ -308,24 +468,175 @@ static int _overlap_can_flush(vcfbuf_t *buf, int flush_all)
         while ( *ref && *alt && nt_to_upper(*ref)==nt_to_upper(*alt) ) { ref++; alt++; }
         if ( imin > ref - last->rec->d.allele[0] ) imin = ref - last->rec->d.allele[0];
     }
-
-    if ( beg_pos <= buf->overlap.end )
+    if ( beg_pos <= mark->overlap_end )
     {
+        // the new site overlaps with the previous
         beg_pos += imin;
         if ( beg_pos > end_pos ) end_pos = beg_pos;
     }
-
     if ( buf->rbuf.n==1 )
     {
-        buf->overlap.rid = last->rec->rid;
-        buf->overlap.end = end_pos;
-        return 0;
+        mark->overlap_rid = last->rec->rid;
+        mark->overlap_end = end_pos;
+        return flush;
+    }
+    if ( beg_pos <= mark->overlap_end )
+    {
+        if ( mark->overlap_end < end_pos ) mark->overlap_end = end_pos;
+        int k1 = rbuf_kth(&mark->rbuf, -1);
+        int k2 = rbuf_kth(&mark->rbuf, -2);
+        mark->mark[k1] = 1;
+        mark->mark[k2] = 1;
+    }
+    else
+    {
+        if ( mark->overlap_end < end_pos ) mark->overlap_end = end_pos;
+        flush = 1;
+    }
+    return flush;
+}
+
+
+static int mark_overlap_can_flush_(vcfbuf_t *buf, int flush_all)
+{
+    int flush = flush_all;
+    if  ( buf->status==dirty ) flush = mark_overlap_helper_(buf,flush_all);
+    else if ( buf->rbuf.n > 1 ) flush = 1;
+    if ( !flush ) return 0;
+
+    mark_t *mark = &buf->mark;
+    int i = rbuf_shift(&mark->rbuf);
+    mark->last = mark->mark[i];
+    return 1;
+}
+
+
+static int records_overlap(bcf1_t *a, bcf1_t *b)
+{
+    if ( a->rid != b->rid ) return 0;
+    if ( a->pos + a->rlen - 1 < b->pos ) return 0;
+    return 1;
+}
+
+static int cmp_overlap_ptr_asc(const void *aptr, const void *bptr)
+{
+    overlap_t *a = *((overlap_t**)aptr);
+    overlap_t *b = *((overlap_t**)bptr);
+    if ( a->value < b->value ) return -1;
+    if ( a->value > b->value ) return 1;
+    return 0;
+}
+static void mark_expr_missing_reset_(vcfbuf_t *buf)
+{
+    buf->mark.max_qual = 0;
+    buf->mark.max_qual_dp = 0;
+}
+static void mark_expr_missing_prep_(vcfbuf_t *buf, overlap_t *olap)
+{
+    int nval = bcf_get_info_int32(buf->hdr,olap->rec,"DP",&buf->mark.tmpi,&buf->mark.ntmpi);
+    if ( nval!=1 ) return;
+
+    olap->dp = buf->mark.tmpi[0];
+    if ( bcf_float_is_missing(olap->rec->qual) ) return;
+    if ( buf->mark.max_qual < olap->rec->qual )
+    {
+        buf->mark.max_qual = olap->rec->qual;
+        buf->mark.max_qual_dp = olap->dp;
     }
-    if ( beg_pos <= buf->overlap.end )
+}
+static void mark_expr_missing_set_(vcfbuf_t *buf, overlap_t *olap)
+{
+    if ( !bcf_float_is_missing(olap->rec->qual) ) return;
+    if ( !buf->mark.max_qual_dp ) return;
+
+    // scale QUAL of the most confident variant in the overlap proportionally to the coverage
+    // and use that to prioritize the records
+    olap->value = buf->mark.max_qual * olap->dp / buf->mark.max_qual_dp;
+}
+static int mark_expr_can_flush_(vcfbuf_t *buf, int flush_all)
+{
+    mark_t *mark = &buf->mark;
+    if ( strcasecmp("min(QUAL)",mark->expr) ) error("Todo; at this time only min(QUAL) is supported\n");
+
+    int flush = flush_all;
+    if  ( buf->status==dirty )
     {
-        if ( buf->overlap.end < end_pos ) buf->overlap.end = end_pos;
-        return 0;
+        flush = mark_overlap_helper_(buf,flush_all);
+        if ( !flush ) return 0;
+
+        if ( mark->missing_expr==MARK_MISSING_MAX_DP ) mark_expr_missing_reset_(buf);
+
+        // init overlaps, each overlap_t structure keeps a list of overlapping records, symmetrical
+        size_t nori = mark->nbuf;
+        hts_resize(overlap_t,  buf->rbuf.n, &mark->nbuf, &mark->buf, HTS_RESIZE_CLEAR);
+        hts_resize(overlap_t*, buf->rbuf.n, &nori, &mark->buf_ptr, HTS_RESIZE_CLEAR);
+        int i;
+        for (i=0; i<buf->rbuf.n; i++)
+        {
+            overlap_t *oi = &mark->buf[i];
+            int j = rbuf_kth(&buf->rbuf, i);
+            assert(j>=0);
+            bcf1_t *rec = buf->vcf[j].rec;
+            assert(rec);
+            oi->rec = rec;
+
+            // todo: other than QUAL values
+            oi->value = bcf_float_is_missing(rec->qual) ? mark->missing_value : rec->qual;
+            if ( mark->missing_expr==MARK_MISSING_MAX_DP ) mark_expr_missing_prep_(buf,oi);
+            if ( oi->bset )
+            {
+                kbs_resize(&oi->bset,buf->rbuf.n);
+                kbs_clear(oi->bset);
+            }
+            else
+                oi->bset = kbs_init(buf->rbuf.n);
+            oi->idx  = i;
+            mark->buf_ptr[i] = oi;
+            mark->mark[oi->idx] = 0;
+        }
+        int nolap = 0;
+        for (i=0; i<buf->rbuf.n; i++)
+        {
+            overlap_t *oi = &mark->buf[i];
+            if ( mark->missing_expr==MARK_MISSING_MAX_DP ) mark_expr_missing_set_(buf,oi);
+            int j;
+            for (j=i+1; j<buf->rbuf.n; j++)
+            {
+                overlap_t *oj = &mark->buf[j];
+                if ( !records_overlap(oi->rec,oj->rec) ) continue;
+                kbs_insert(oi->bset,j);
+                kbs_insert(oj->bset,i);
+                nolap++;
+            }
+        }
+
+        // sort according to the requested criteria, currently only min(QUAL)
+        qsort(mark->buf_ptr,buf->rbuf.n,sizeof(*mark->buf_ptr),cmp_overlap_ptr_asc);   // todo: other than min()
+
+        // go through the list sorted by overlap_t.value, eg QUAL
+        for (i=0; nolap && i<buf->rbuf.n; i++)
+        {
+            kbitset_iter_t itr;
+            overlap_t *oi = mark->buf_ptr[i];
+            kbs_start(&itr);
+            int j;
+            while ((j=kbs_next(oi->bset, &itr)) >= 0)
+            {
+                kbs_delete(oi->bset,j);
+                assert(nolap);
+                assert(kbs_exists(mark->buf[j].bset,oi->idx));
+                kbs_delete(mark->buf[j].bset,oi->idx);
+                nolap--;
+            }
+            j = rbuf_kth(&mark->rbuf,oi->idx);
+            mark->mark[j] = 1;
+        }
     }
+    else if ( buf->rbuf.n > 1 ) flush = 1;
+    if ( !flush ) return 0;
+
+    int i = rbuf_shift(&mark->rbuf);
+    mark->last = mark->mark[i];
     return 1;
 }
 
@@ -333,32 +644,56 @@ bcf1_t *vcfbuf_flush(vcfbuf_t *buf, int flush_all)
 {
     int i,j;
 
+    // nothing to do, no lines in the buffer
     if ( buf->rbuf.n==0 ) return NULL;
-    if ( flush_all || buf->dummy ) goto ret;
-
-    i = rbuf_kth(&buf->rbuf, 0);    // first
-    j = rbuf_last(&buf->rbuf);      // last
 
-    if ( buf->vcf[i].rec->rid != buf->vcf[j].rec->rid ) goto ret;
-    if ( buf->overlap.active && _overlap_can_flush(buf, flush_all) ) goto ret;
-    if ( buf->rmdup.active && _rmdup_can_flush(buf, flush_all) ) goto ret;
+    // dummy mode, always flushing
+    if ( buf->dummy ) goto ret;
 
-    if ( buf->win > 0 )
+    // pruning mode
+    if ( buf->win )
     {
-        if ( buf->rbuf.n <= buf->win ) return NULL;
+        int can_flush = flush_all;
+        i = rbuf_kth(&buf->rbuf, 0);    // first
+        j = rbuf_last(&buf->rbuf);      // last
+        if ( buf->vcf[i].rec->rid != buf->vcf[j].rec->rid ) can_flush = 1;
+        else if ( buf->win > 0 )
+        {
+            if ( buf->rbuf.n > buf->win ) can_flush = 1;
+        }
+        else if ( buf->win < 0 )
+        {
+            if ( !(buf->vcf[i].rec->pos - buf->vcf[j].rec->pos > buf->win) ) can_flush = 1;
+        }
+        buf->status = clean;
+        if ( !can_flush ) return NULL;
+        if ( buf->prune.max_sites && buf->prune.max_sites < buf->rbuf.n ) _prune_sites(buf, flush_all);
         goto ret;
     }
-    else if ( buf->win < 0 )
+
+    // overlaps and duplicates
+    if ( buf->mark.mode )
     {
-        if ( buf->vcf[i].rec->pos - buf->vcf[j].rec->pos > buf->win ) return NULL;
+        int can_flush = 0;
+        if ( buf->mark.mode==MARK_OVERLAP )
+        {
+            if ( mark_overlap_can_flush_(buf,flush_all) ) can_flush = 1;
+        }
+        else if ( buf->mark.mode==MARK_DUP )
+        {
+            if ( mark_dup_can_flush_(buf,flush_all) ) can_flush = 1;
+        }
+        if ( buf->mark.mode==MARK_EXPR )
+        {
+            if ( mark_expr_can_flush_(buf,flush_all) ) can_flush = 1;
+        }
+        buf->status = clean;
+        if ( !can_flush ) return NULL;
         goto ret;
     }
-    else
-        return NULL;
 
 ret:
-    if ( buf->prune.max_sites && buf->prune.max_sites < buf->rbuf.n ) _prune_sites(buf, flush_all);
-
+    buf->status = clean;
     i = rbuf_shift(&buf->rbuf);
     return buf->vcf[i].rec;
 }
diff --git a/bcftools/vcfbuf.h b/bcftools/vcfbuf.h
index 878fd1044..96d7115c2 100644
--- a/bcftools/vcfbuf.h
+++ b/bcftools/vcfbuf.h
@@ -1,6 +1,6 @@
 /* The MIT License
 
-   Copyright (c) 2017-2022 Genome Research Ltd.
+   Copyright (c) 2017-2024 Genome Research Ltd.
 
    Author: Petr Danecek <pd3@sanger.ac.uk>
 
@@ -38,13 +38,27 @@ typedef struct _vcfbuf_t vcfbuf_t;
 // Modes of operation
 typedef enum
 {
-    VCFBUF_DUMMY,           // the caller maintains the buffer via push/peek/flush, nothing is removed by vcfbuf
-
-    VCFBUF_OVERLAP_WIN,     // keep only overlapping variants in the window
-    VCFBUF_RMDUP,           // remove duplicate sites (completely)
-    VCFBUF_NSITES,          // leave at max this many sites in the window
-    VCFBUF_NSITES_MODE,     // one of: maxAF (keep sites with max AF), 1st (sites that come first), rand (pick randomly)
-    VCFBUF_AF_TAG,          // use this INFO tag with VCFBUF_NSITES
+    VCFBUF_DUMMY,           // int {0,1}, the caller maintains the buffer via push/peek/flush, nothing is removed by vcfbuf
+
+    // pruning
+    PRUNE_NSITES,           // int, leave max this many sites in the window
+    PRUNE_NSITES_MODE,      // char *, maxAF (keep sites with max AF), 1st (sites that come first), rand (pick randomly)
+    PRUNE_AF_TAG,           // char *, use this INFO/AF tag with VCFBUF_NSITES
+
+    // duplicates and overlaps
+    MARK,                   // w: char *, resolve overlaps by preferentially removing sites according to EXPR:
+                            //      min(QUAL) .. remove sites with lowest QUAL until overlaps are resolved
+                            //      overlap   .. select all overlapping sites
+                            //      dup       .. select duplicate sites
+                            // r: use as
+                            //      while ( (rec=vcfbuf_flush(buf,flush_all)) )
+                            //      {
+                            //          int is_marked = vcfbuf_get_val(buf,int,MARK);
+                            //          if ( is_marked ) do_something(rec);
+                            //      }
+    MARK_MISSING_EXPR,      // char *, what to do when missing value are encountered with min(QUAL)
+                            //      0   .. set to 0 (the default)
+                            //      DP  .. scale max quality in the window proportionally to INFO/DP
 
     // LD related options
     LD_RAND_MISSING,        // randomize rather than ignore missing genotypes
@@ -55,8 +69,23 @@ typedef enum
 }
 vcfbuf_opt_t;
 
-#define vcfbuf_set_opt(buf,type,key,value) { type tmp = value; vcfbuf_set(buf, key, (void*)&tmp); }
-void vcfbuf_set(vcfbuf_t *buf, vcfbuf_opt_t key, void *value);
+
+/**
+ *  vcfbuf_set() - set various options, see the vcfbuf_opt_t keys for the complete list
+ *
+ *  Returns 0 if the call succeeded, or negative number on error.
+ */
+int vcfbuf_set(vcfbuf_t *buf, vcfbuf_opt_t key, ...);   // returns 0 on success
+
+/**
+ *  vcfbuf_get()     - get various options, see the vcfbuf_opt_t keys
+ *  vcfbuf_get_val() - wrapper for `vcfbuf_get()` to return typed value
+ *
+ *  The former returns pointer to the memory area populated by the requested setting,
+ *  its type can be inferred from the vcfbuf_opt_t documentation.
+ */
+void *vcfbuf_get(vcfbuf_t *buf, vcfbuf_opt_t key, ...);
+#define vcfbuf_get_val(buf,type,key) (*(type*)vcfbuf_get(buf, key))
 
 
 /*
@@ -67,7 +96,9 @@ vcfbuf_t *vcfbuf_init(bcf_hdr_t *hdr, int win);
 void vcfbuf_destroy(vcfbuf_t *buf);
 
 /*
- *  vcfbuf_push() - push a new site for analysis
+ *  vcfbuf_push() - push a new site for analysis.
+ *
+ *  Note that vcfbuf_flush() or vcfbuf_peek() must be called before next site is pushed.
  */
 bcf1_t *vcfbuf_push(vcfbuf_t *buf, bcf1_t *rec);
 
@@ -86,6 +117,8 @@ bcf1_t *vcfbuf_remove(vcfbuf_t *buf, int idx);
 /*
  *  vcfbuf_flush() - returns the next record or NULL, depending on the mode of operation and
  *      the content of the buffer
+ *
+ *  @flush_all: 1 if no more vcfbuf_push() calls will follow, 0 otherwise
  */
 bcf1_t *vcfbuf_flush(vcfbuf_t *buf, int flush_all);
 
diff --git a/bcftools/vcfcall.c b/bcftools/vcfcall.c
index d2f6e2c5f..13e516f83 100644
--- a/bcftools/vcfcall.c
+++ b/bcftools/vcfcall.c
@@ -1,6 +1,6 @@
 /*  vcfcall.c -- SNP/indel variant calling from VCF/BCF.
 
-    Copyright (C) 2013-2023 Genome Research Ltd.
+    Copyright (C) 2013-2024 Genome Research Ltd.
 
     Author: Petr Danecek <pd3@sanger.ac.uk>
 
@@ -397,15 +397,16 @@ static int tgt_parse(const char *line, char **chr_beg, char **chr_end, uint32_t
     while ( *ss )
     {
         se = ss;
-        while ( *se && *se!=',' ) se++;
+        while ( *se && *se!=',' && !isspace(*se) ) se++;
         als->n++;
         als->allele = (char**)realloc(als->allele,als->n*sizeof(*als->allele));
         als->allele[als->n-1] = (char*)malloc(se-ss+1);
         memcpy(als->allele[als->n-1],ss,se-ss);
         als->allele[als->n-1][se-ss] = 0;
         ss = se+1;
-        if ( !*se ) break;
+        if ( !*se || isspace(*se) ) break;
     }
+    if ( als->n<2 ) error("Unable to parse the -T file; expected CHROM\\tPOS\\tREF,ALT with -C alleles but found instead:\n\t%s\n",line);
     return 0;
 }
 static void tgt_free(void *payload)
@@ -695,7 +696,10 @@ static void init_data(args_t *args)
     }
 
     if ( args->aux.flag & CALL_CONSTR_ALLELES )
+    {
         args->vcfbuf = vcfbuf_init(args->aux.hdr, 0);
+        vcfbuf_set(args->vcfbuf,VCFBUF_DUMMY,1);
+    }
 
     char wmode[8];
     set_wmode(wmode,args->output_type,args->output_fname,args->clevel);
@@ -717,7 +721,9 @@ static void init_data(args_t *args)
 
     if (args->record_cmd_line) bcf_hdr_append_version(args->aux.hdr, args->argc, args->argv, "bcftools_call");
     if ( bcf_hdr_write(args->out_fh, args->aux.hdr)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->output_fname);
-    if ( args->write_index && init_index(args->out_fh,args->aux.hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname);
+    if ( init_index2(args->out_fh,args->aux.hdr,args->output_fname,
+                     &args->index_fn, args->write_index)<0 )
+        error("Error: failed to initialise index for %s\n",args->output_fname);
 
     if ( args->flag&CF_INS_MISSED ) init_missed_line(args);
 }
@@ -910,6 +916,7 @@ static void usage(args_t *args)
     fprintf(stderr, "\n");
     fprintf(stderr, "Input/output options:\n");
     fprintf(stderr, "   -A, --keep-alts                 Keep all possible alternate alleles at variant sites\n");
+    fprintf(stderr, "   -*, --keep-unseen-allele        Keep the unobserved allele <*> or <NON_REF>\n");
     fprintf(stderr, "   -a, --annotate LIST             Optional tags to output (lowercase allowed); '?' to list available tags\n");
     fprintf(stderr, "   -F, --prior-freqs AN,AC         Use prior allele frequencies, determined from these pre-filled tags\n");
     fprintf(stderr, "   -G, --group-samples FILE|-      Group samples by population (file with \"sample\\tgroup\") or \"-\" for single-sample calling.\n");
@@ -920,7 +927,7 @@ static void usage(args_t *args)
     fprintf(stderr, "   -M, --keep-masked-ref           Keep sites with masked reference allele (REF=N)\n");
     fprintf(stderr, "   -V, --skip-variants TYPE        Skip indels/snps\n");
     fprintf(stderr, "   -v, --variants-only             Output variant sites only\n");
-    fprintf(stderr, "       --write-index               Automatically index the output files [off]\n");
+    fprintf(stderr, "   -W, --write-index[=FMT]         Automatically index the output files [off]\n");
     fprintf(stderr, "\n");
     fprintf(stderr, "Consensus/variant calling options:\n");
     fprintf(stderr, "   -c, --consensus-caller          The original calling method (conflicts with -m)\n");
@@ -937,7 +944,7 @@ static void usage(args_t *args)
     // todo (and more)
     // fprintf(stderr, "\nContrast calling and association test options:\n");
     // fprintf(stderr, "       -1 INT    number of group-1 samples [0]\n");
-    // fprintf(stderr, "       -C FLOAT  posterior constrast for LRT<FLOAT and P(ref|D)<0.5 [%g]\n", args->aux.min_lrt);
+    // fprintf(stderr, "       -C FLOAT  posterior contrast for LRT<FLOAT and P(ref|D)<0.5 [%g]\n", args->aux.min_lrt);
     // fprintf(stderr, "       -U INT    number of permutations for association testing (effective with -1) [0]\n");
     // fprintf(stderr, "       -X FLOAT  only perform permutations for P(chi^2)<FLOAT [%g]\n", args->aux.min_perm_p);
     fprintf(stderr, "\n");
@@ -987,6 +994,7 @@ int main_vcfcall(int argc, char *argv[])
         {"targets-file",required_argument,NULL,'T'},
         {"threads",required_argument,NULL,9},
         {"keep-alts",no_argument,NULL,'A'},
+        {"keep-unseen-allele",no_argument,NULL,'*'},
         {"insert-missed",no_argument,NULL,'i'},
         {"skip-Ns",no_argument,NULL,'N'},            // now the new default
         {"keep-masked-refs",no_argument,NULL,'M'},
@@ -1003,12 +1011,12 @@ int main_vcfcall(int argc, char *argv[])
         {"chromosome-X",no_argument,NULL,'X'},
         {"chromosome-Y",no_argument,NULL,'Y'},
         {"no-version",no_argument,NULL,8},
-        {"write-index",no_argument,NULL,10},
+        {"write-index",optional_argument,NULL,'W'},
         {NULL,0,NULL,0}
     };
 
     char *tmp = NULL;
-    while ((c = getopt_long(argc, argv, "h?o:O:r:R:s:S:t:T:ANMV:vcmp:C:n:P:f:a:ig:XYF:G:", loptions, NULL)) >= 0)
+    while ((c = getopt_long(argc, argv, "h?o:O:r:R:s:S:t:T:A*NMV:vcmp:C:n:P:f:a:ig:XYF:G:W::", loptions, NULL)) >= 0)
     {
         switch (c)
         {
@@ -1026,6 +1034,7 @@ int main_vcfcall(int argc, char *argv[])
             case 'M': args.flag &= ~CF_ACGT_ONLY; break;     // keep sites where REF is N
             case 'N': args.flag |= CF_ACGT_ONLY; break;      // omit sites where first base in REF is N (the new default)
             case 'A': args.aux.flag |= CALL_KEEPALT; break;
+            case '*': args.aux.flag |= CALL_KEEP_UNSEEN; break;
             case 'c': args.flag |= CF_CCALL; break;          // the original EM based calling method
             case 'i': args.flag |= CF_INS_MISSED; break;
             case 'v': args.aux.flag |= CALL_VARONLY; break;
@@ -1090,7 +1099,10 @@ int main_vcfcall(int argc, char *argv[])
                 args.regions_overlap = parse_overlap_option(optarg);
                 if ( args.regions_overlap < 0 ) error("Could not parse: --regions-overlap %s\n",optarg);
                 break;
-            case  10: args.write_index = 1; break;
+            case 'W':
+                if (!(args.write_index = write_index_parse(optarg)))
+                    error("Unsupported index format '%s'\n", optarg);
+                break;
             default: usage(&args);
         }
     }
diff --git a/bcftools/vcfcall.c.pysam.c b/bcftools/vcfcall.c.pysam.c
index a955342f4..0da344ef1 100644
--- a/bcftools/vcfcall.c.pysam.c
+++ b/bcftools/vcfcall.c.pysam.c
@@ -2,7 +2,7 @@
 
 /*  vcfcall.c -- SNP/indel variant calling from VCF/BCF.
 
-    Copyright (C) 2013-2023 Genome Research Ltd.
+    Copyright (C) 2013-2024 Genome Research Ltd.
 
     Author: Petr Danecek <pd3@sanger.ac.uk>
 
@@ -399,15 +399,16 @@ static int tgt_parse(const char *line, char **chr_beg, char **chr_end, uint32_t
     while ( *ss )
     {
         se = ss;
-        while ( *se && *se!=',' ) se++;
+        while ( *se && *se!=',' && !isspace(*se) ) se++;
         als->n++;
         als->allele = (char**)realloc(als->allele,als->n*sizeof(*als->allele));
         als->allele[als->n-1] = (char*)malloc(se-ss+1);
         memcpy(als->allele[als->n-1],ss,se-ss);
         als->allele[als->n-1][se-ss] = 0;
         ss = se+1;
-        if ( !*se ) break;
+        if ( !*se || isspace(*se) ) break;
     }
+    if ( als->n<2 ) error("Unable to parse the -T file; expected CHROM\\tPOS\\tREF,ALT with -C alleles but found instead:\n\t%s\n",line);
     return 0;
 }
 static void tgt_free(void *payload)
@@ -697,7 +698,10 @@ static void init_data(args_t *args)
     }
 
     if ( args->aux.flag & CALL_CONSTR_ALLELES )
+    {
         args->vcfbuf = vcfbuf_init(args->aux.hdr, 0);
+        vcfbuf_set(args->vcfbuf,VCFBUF_DUMMY,1);
+    }
 
     char wmode[8];
     set_wmode(wmode,args->output_type,args->output_fname,args->clevel);
@@ -719,7 +723,9 @@ static void init_data(args_t *args)
 
     if (args->record_cmd_line) bcf_hdr_append_version(args->aux.hdr, args->argc, args->argv, "bcftools_call");
     if ( bcf_hdr_write(args->out_fh, args->aux.hdr)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->output_fname);
-    if ( args->write_index && init_index(args->out_fh,args->aux.hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname);
+    if ( init_index2(args->out_fh,args->aux.hdr,args->output_fname,
+                     &args->index_fn, args->write_index)<0 )
+        error("Error: failed to initialise index for %s\n",args->output_fname);
 
     if ( args->flag&CF_INS_MISSED ) init_missed_line(args);
 }
@@ -912,6 +918,7 @@ static void usage(args_t *args)
     fprintf(bcftools_stderr, "\n");
     fprintf(bcftools_stderr, "Input/output options:\n");
     fprintf(bcftools_stderr, "   -A, --keep-alts                 Keep all possible alternate alleles at variant sites\n");
+    fprintf(bcftools_stderr, "   -*, --keep-unseen-allele        Keep the unobserved allele <*> or <NON_REF>\n");
     fprintf(bcftools_stderr, "   -a, --annotate LIST             Optional tags to output (lowercase allowed); '?' to list available tags\n");
     fprintf(bcftools_stderr, "   -F, --prior-freqs AN,AC         Use prior allele frequencies, determined from these pre-filled tags\n");
     fprintf(bcftools_stderr, "   -G, --group-samples FILE|-      Group samples by population (file with \"sample\\tgroup\") or \"-\" for single-sample calling.\n");
@@ -922,7 +929,7 @@ static void usage(args_t *args)
     fprintf(bcftools_stderr, "   -M, --keep-masked-ref           Keep sites with masked reference allele (REF=N)\n");
     fprintf(bcftools_stderr, "   -V, --skip-variants TYPE        Skip indels/snps\n");
     fprintf(bcftools_stderr, "   -v, --variants-only             Output variant sites only\n");
-    fprintf(bcftools_stderr, "       --write-index               Automatically index the output files [off]\n");
+    fprintf(bcftools_stderr, "   -W, --write-index[=FMT]         Automatically index the output files [off]\n");
     fprintf(bcftools_stderr, "\n");
     fprintf(bcftools_stderr, "Consensus/variant calling options:\n");
     fprintf(bcftools_stderr, "   -c, --consensus-caller          The original calling method (conflicts with -m)\n");
@@ -939,7 +946,7 @@ static void usage(args_t *args)
     // todo (and more)
     // fprintf(bcftools_stderr, "\nContrast calling and association test options:\n");
     // fprintf(bcftools_stderr, "       -1 INT    number of group-1 samples [0]\n");
-    // fprintf(bcftools_stderr, "       -C FLOAT  posterior constrast for LRT<FLOAT and P(ref|D)<0.5 [%g]\n", args->aux.min_lrt);
+    // fprintf(bcftools_stderr, "       -C FLOAT  posterior contrast for LRT<FLOAT and P(ref|D)<0.5 [%g]\n", args->aux.min_lrt);
     // fprintf(bcftools_stderr, "       -U INT    number of permutations for association testing (effective with -1) [0]\n");
     // fprintf(bcftools_stderr, "       -X FLOAT  only perform permutations for P(chi^2)<FLOAT [%g]\n", args->aux.min_perm_p);
     fprintf(bcftools_stderr, "\n");
@@ -989,6 +996,7 @@ int main_vcfcall(int argc, char *argv[])
         {"targets-file",required_argument,NULL,'T'},
         {"threads",required_argument,NULL,9},
         {"keep-alts",no_argument,NULL,'A'},
+        {"keep-unseen-allele",no_argument,NULL,'*'},
         {"insert-missed",no_argument,NULL,'i'},
         {"skip-Ns",no_argument,NULL,'N'},            // now the new default
         {"keep-masked-refs",no_argument,NULL,'M'},
@@ -1005,12 +1013,12 @@ int main_vcfcall(int argc, char *argv[])
         {"chromosome-X",no_argument,NULL,'X'},
         {"chromosome-Y",no_argument,NULL,'Y'},
         {"no-version",no_argument,NULL,8},
-        {"write-index",no_argument,NULL,10},
+        {"write-index",optional_argument,NULL,'W'},
         {NULL,0,NULL,0}
     };
 
     char *tmp = NULL;
-    while ((c = getopt_long(argc, argv, "h?o:O:r:R:s:S:t:T:ANMV:vcmp:C:n:P:f:a:ig:XYF:G:", loptions, NULL)) >= 0)
+    while ((c = getopt_long(argc, argv, "h?o:O:r:R:s:S:t:T:A*NMV:vcmp:C:n:P:f:a:ig:XYF:G:W::", loptions, NULL)) >= 0)
     {
         switch (c)
         {
@@ -1028,6 +1036,7 @@ int main_vcfcall(int argc, char *argv[])
             case 'M': args.flag &= ~CF_ACGT_ONLY; break;     // keep sites where REF is N
             case 'N': args.flag |= CF_ACGT_ONLY; break;      // omit sites where first base in REF is N (the new default)
             case 'A': args.aux.flag |= CALL_KEEPALT; break;
+            case '*': args.aux.flag |= CALL_KEEP_UNSEEN; break;
             case 'c': args.flag |= CF_CCALL; break;          // the original EM based calling method
             case 'i': args.flag |= CF_INS_MISSED; break;
             case 'v': args.aux.flag |= CALL_VARONLY; break;
@@ -1092,7 +1101,10 @@ int main_vcfcall(int argc, char *argv[])
                 args.regions_overlap = parse_overlap_option(optarg);
                 if ( args.regions_overlap < 0 ) error("Could not parse: --regions-overlap %s\n",optarg);
                 break;
-            case  10: args.write_index = 1; break;
+            case 'W':
+                if (!(args.write_index = write_index_parse(optarg)))
+                    error("Unsupported index format '%s'\n", optarg);
+                break;
             default: usage(&args);
         }
     }
diff --git a/bcftools/vcfcnv.c b/bcftools/vcfcnv.c
index 0302261d5..e970b043b 100644
--- a/bcftools/vcfcnv.c
+++ b/bcftools/vcfcnv.c
@@ -41,6 +41,7 @@
 #include <htslib/kstring.h>
 #include <htslib/kfunc.h>
 #include <htslib/khash_str2int.h>
+#include <htslib/hts_defs.h>
 #include "bcftools.h"
 #include "HMM.h"
 #include "rbuf.h"
@@ -105,7 +106,7 @@ typedef struct _args_t
 }
 args_t;
 
-FILE *open_file(char **fname, const char *mode, const char *fmt, ...);
+FILE *open_file(char **fname, const char *mode, const char *fmt, ...) HTS_FORMAT(HTS_PRINTF_FMT, 3, 4);
 
 static inline void hmm2cn_state(int nstates, int i, int *a, int *b)
 {
@@ -879,7 +880,7 @@ static int update_sample_args(args_t *args, sample_t *smpl, int ismpl)
 
     /*
         A noisy CN2 band is hard to distinguish from two CN3 bands which are
-        close to each other. Set a treshold on the minimum separation based
+        close to each other. Set a threshold on the minimum separation based
         on the BAF deviation at p=0.95
     */
     baf_dev2 /= norm_cn3;
@@ -1297,7 +1298,7 @@ int main_vcfcnv(int argc, char *argv[])
         {0,0,0,0}
     };
     char *tmp = NULL;
-    while ((c = getopt_long(argc, argv, "h?r:R:t:T:s:o:p:l:T:c:b:P:x:e:O:W:f:a:L:d:k:",loptions,NULL)) >= 0) {
+    while ((c = getopt_long(argc, argv, "h?r:R:t:T:s:o:p:l:T:c:b:P:x:e:O:W::f:a:L:d:k:",loptions,NULL)) >= 0) {
         switch (c) {
             case 'L': 
                 args->lrr_smooth_win = strtol(optarg,&tmp,10);
diff --git a/bcftools/vcfcnv.c.pysam.c b/bcftools/vcfcnv.c.pysam.c
index fd2e3bba1..d6ee42121 100644
--- a/bcftools/vcfcnv.c.pysam.c
+++ b/bcftools/vcfcnv.c.pysam.c
@@ -43,6 +43,7 @@
 #include <htslib/kstring.h>
 #include <htslib/kfunc.h>
 #include <htslib/khash_str2int.h>
+#include <htslib/hts_defs.h>
 #include "bcftools.h"
 #include "HMM.h"
 #include "rbuf.h"
@@ -107,7 +108,7 @@ typedef struct _args_t
 }
 args_t;
 
-FILE *open_file(char **fname, const char *mode, const char *fmt, ...);
+FILE *open_file(char **fname, const char *mode, const char *fmt, ...) HTS_FORMAT(HTS_PRINTF_FMT, 3, 4);
 
 static inline void hmm2cn_state(int nstates, int i, int *a, int *b)
 {
@@ -881,7 +882,7 @@ static int update_sample_args(args_t *args, sample_t *smpl, int ismpl)
 
     /*
         A noisy CN2 band is hard to distinguish from two CN3 bands which are
-        close to each other. Set a treshold on the minimum separation based
+        close to each other. Set a threshold on the minimum separation based
         on the BAF deviation at p=0.95
     */
     baf_dev2 /= norm_cn3;
@@ -1299,7 +1300,7 @@ int main_vcfcnv(int argc, char *argv[])
         {0,0,0,0}
     };
     char *tmp = NULL;
-    while ((c = getopt_long(argc, argv, "h?r:R:t:T:s:o:p:l:T:c:b:P:x:e:O:W:f:a:L:d:k:",loptions,NULL)) >= 0) {
+    while ((c = getopt_long(argc, argv, "h?r:R:t:T:s:o:p:l:T:c:b:P:x:e:O:W::f:a:L:d:k:",loptions,NULL)) >= 0) {
         switch (c) {
             case 'L': 
                 args->lrr_smooth_win = strtol(optarg,&tmp,10);
diff --git a/bcftools/vcfconcat.c b/bcftools/vcfconcat.c
index 8e25cc590..232b3ae39 100644
--- a/bcftools/vcfconcat.c
+++ b/bcftools/vcfconcat.c
@@ -29,6 +29,7 @@ THE SOFTWARE.  */
 #include <assert.h>
 #include <errno.h>
 #include <math.h>
+#include <stdint.h>
 #include <inttypes.h>
 #include <htslib/vcf.h>
 #include <htslib/synced_bcf_reader.h>
@@ -36,6 +37,7 @@ THE SOFTWARE.  */
 #include <htslib/bgzf.h>
 #include <htslib/tbx.h> // for hts_get_bgzfp()
 #include <htslib/thread_pool.h>
+#include <htslib/hts_endian.h>
 #include <sys/time.h>
 #include "bcftools.h"
 
@@ -157,7 +159,9 @@ static void init_data(args_t *args)
         hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->tpool);
     }
     if ( bcf_hdr_write(args->out_fh, args->out_hdr)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->output_fname);
-    if ( args->write_index && init_index(args->out_fh,args->out_hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname);
+    if ( init_index2(args->out_fh,args->out_hdr,args->output_fname,
+                     &args->index_fn, args->write_index)<0 )
+        error("Error: failed to initialise index for %s\n",args->output_fname);
 
     if ( args->allow_overlaps )
     {
@@ -781,7 +785,7 @@ static void _check_hrecs(const bcf_hdr_t *hdr0, const bcf_hdr_t *hdr, char *fnam
     for (j=0; j<hdr0->nhrec; j++)
     {
         bcf_hrec_t *hrec0 = hdr0->hrec[j];
-        if ( hrec0->type!=BCF_HL_FLT && hrec0->type!=BCF_HL_INFO && hrec0->type!=BCF_HL_FMT && hrec0->type!=BCF_HL_CTG ) continue;    // skip fiels w/o IDX
+        if ( hrec0->type!=BCF_HL_FLT && hrec0->type!=BCF_HL_INFO && hrec0->type!=BCF_HL_FMT && hrec0->type!=BCF_HL_CTG ) continue;    // skip fields w/o IDX
         int itag = bcf_hrec_find_key(hrec0, "ID");
         bcf_hrec_t *hrec = bcf_hdr_get_hrec(hdr, hrec0->type, "ID", hrec0->vals[itag], NULL);
 
@@ -887,19 +891,20 @@ static void naive_concat(args_t *args)
         int nskip;
         if ( type.format==bcf )
         {
-            uint8_t magic[5];
-            if ( bgzf_read(fp, magic, 5) != 5 ) error("\nFailed to read the BCF header in %s\n", args->fnames[i]);
+            const size_t magic_len = 5 + 4; // "Magic" string + header length
+            uint8_t magic[magic_len];
+            if ( bgzf_read(fp, magic, magic_len) != magic_len ) error("\nFailed to read the BCF header in %s\n", args->fnames[i]);
+            // First five bytes are the "Magic" string
             if (strncmp((char*)magic, "BCF\2\2", 5) != 0) error("\nInvalid BCF magic string in %s\n", args->fnames[i]);
-
-            if ( bgzf_read(fp, &tmp.l, 4) != 4 ) error("\nFailed to read the BCF header in %s\n", args->fnames[i]);
+            // Next four are the header length (little-endian)
+            tmp.l = le_to_u32(magic + 5);
             hts_expand(char,tmp.l,tmp.m,tmp.s);
             if ( bgzf_read(fp, tmp.s, tmp.l) != tmp.l ) error("\nFailed to read the BCF header in %s\n", args->fnames[i]);
 
             // write only the first header
             if ( i==0 )
             {
-                if ( bgzf_write(bgzf_out, "BCF\2\2", 5) !=5 ) error("\nFailed to write %d bytes to %s\n", 5,args->output_fname);
-                if ( bgzf_write(bgzf_out, &tmp.l, 4) !=4 ) error("\nFailed to write %d bytes to %s\n", 4,args->output_fname);
+                if ( bgzf_write(bgzf_out, magic, magic_len) != magic_len ) error("\nFailed to write %zu bytes to %s\n", magic_len,args->output_fname);
                 if ( bgzf_write(bgzf_out, tmp.s, tmp.l) != tmp.l) error("\nFailed to write %"PRId64" bytes to %s\n", (uint64_t)tmp.l,args->output_fname);
             }
             nskip = fp->block_offset;
@@ -982,7 +987,7 @@ static void usage(args_t *args)
     fprintf(stderr, "       --regions-overlap 0|1|2    Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n");
     fprintf(stderr, "       --threads INT              Use multithreading with <int> worker threads [0]\n");
     fprintf(stderr, "   -v, --verbose 0|1              Set verbosity level [1]\n");
-    fprintf(stderr, "       --write-index              Automatically index the output files [off]\n");
+    fprintf(stderr, "   -W, --write-index[=FMT]        Automatically index the output files [off]\n");
     fprintf(stderr, "\n");
     exit(1);
 }
@@ -1021,12 +1026,12 @@ int main_vcfconcat(int argc, char *argv[])
         {"file-list",required_argument,NULL,'f'},
         {"min-PQ",required_argument,NULL,'q'},
         {"no-version",no_argument,NULL,8},
-        {"write-index",no_argument,NULL,13},
+        {"write-index",optional_argument,NULL,'W'},
         {"drop-genotypes",no_argument,NULL,'G'},
         {NULL,0,NULL,0}
     };
     char *tmp;
-    while ((c = getopt_long(argc, argv, "h:?o:O:f:alq:Dd:Gr:R:cnv:",loptions,NULL)) >= 0)
+    while ((c = getopt_long(argc, argv, "h:?o:O:f:alq:Dd:Gr:R:cnv:W::",loptions,NULL)) >= 0)
     {
         switch (c) {
             case 'c': args->compact_PS = 1; break;
@@ -1076,7 +1081,10 @@ int main_vcfconcat(int argc, char *argv[])
                       args->verbose = strtol(optarg, &tmp, 0);
                       if ( *tmp || args->verbose<0 || args->verbose>1 ) error("Error: currently only --verbose 0 or --verbose 1 is supported\n");
                       break;
-            case 13 : args->write_index = 1; break;
+            case 'W':
+                if (!(args->write_index = write_index_parse(optarg)))
+                    error("Unsupported index format '%s'\n", optarg);
+                break;
             case 'h':
             case '?': usage(args); break;
             default: error("Unknown argument: %s\n", optarg);
@@ -1104,6 +1112,7 @@ int main_vcfconcat(int argc, char *argv[])
     if ( args->regions_list && !args->allow_overlaps ) error("The -r/-R option is supported only with -a\n");
     if ( args->naive_concat )
     {
+        if ( args->write_index ) error("Error: cannot --write-index in the %s mode\n",args->naive_concat_trust_headers?"--naive-force":"--naive");
         if ( args->allow_overlaps ) error("The option --naive cannot be combined with --allow-overlaps\n");
         if ( args->phased_concat ) error("The option --naive cannot be combined with --ligate\n");
         if ( args->sites_only ) error("The option --naive cannot be combined with --drop-genotypes\n");
diff --git a/bcftools/vcfconcat.c.pysam.c b/bcftools/vcfconcat.c.pysam.c
index 0d3b3943c..d238dc045 100644
--- a/bcftools/vcfconcat.c.pysam.c
+++ b/bcftools/vcfconcat.c.pysam.c
@@ -31,6 +31,7 @@ THE SOFTWARE.  */
 #include <assert.h>
 #include <errno.h>
 #include <math.h>
+#include <stdint.h>
 #include <inttypes.h>
 #include <htslib/vcf.h>
 #include <htslib/synced_bcf_reader.h>
@@ -38,6 +39,7 @@ THE SOFTWARE.  */
 #include <htslib/bgzf.h>
 #include <htslib/tbx.h> // for hts_get_bgzfp()
 #include <htslib/thread_pool.h>
+#include <htslib/hts_endian.h>
 #include <sys/time.h>
 #include "bcftools.h"
 
@@ -159,7 +161,9 @@ static void init_data(args_t *args)
         hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->tpool);
     }
     if ( bcf_hdr_write(args->out_fh, args->out_hdr)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->output_fname);
-    if ( args->write_index && init_index(args->out_fh,args->out_hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname);
+    if ( init_index2(args->out_fh,args->out_hdr,args->output_fname,
+                     &args->index_fn, args->write_index)<0 )
+        error("Error: failed to initialise index for %s\n",args->output_fname);
 
     if ( args->allow_overlaps )
     {
@@ -783,7 +787,7 @@ static void _check_hrecs(const bcf_hdr_t *hdr0, const bcf_hdr_t *hdr, char *fnam
     for (j=0; j<hdr0->nhrec; j++)
     {
         bcf_hrec_t *hrec0 = hdr0->hrec[j];
-        if ( hrec0->type!=BCF_HL_FLT && hrec0->type!=BCF_HL_INFO && hrec0->type!=BCF_HL_FMT && hrec0->type!=BCF_HL_CTG ) continue;    // skip fiels w/o IDX
+        if ( hrec0->type!=BCF_HL_FLT && hrec0->type!=BCF_HL_INFO && hrec0->type!=BCF_HL_FMT && hrec0->type!=BCF_HL_CTG ) continue;    // skip fields w/o IDX
         int itag = bcf_hrec_find_key(hrec0, "ID");
         bcf_hrec_t *hrec = bcf_hdr_get_hrec(hdr, hrec0->type, "ID", hrec0->vals[itag], NULL);
 
@@ -889,19 +893,20 @@ static void naive_concat(args_t *args)
         int nskip;
         if ( type.format==bcf )
         {
-            uint8_t magic[5];
-            if ( bgzf_read(fp, magic, 5) != 5 ) error("\nFailed to read the BCF header in %s\n", args->fnames[i]);
+            const size_t magic_len = 5 + 4; // "Magic" string + header length
+            uint8_t magic[magic_len];
+            if ( bgzf_read(fp, magic, magic_len) != magic_len ) error("\nFailed to read the BCF header in %s\n", args->fnames[i]);
+            // First five bytes are the "Magic" string
             if (strncmp((char*)magic, "BCF\2\2", 5) != 0) error("\nInvalid BCF magic string in %s\n", args->fnames[i]);
-
-            if ( bgzf_read(fp, &tmp.l, 4) != 4 ) error("\nFailed to read the BCF header in %s\n", args->fnames[i]);
+            // Next four are the header length (little-endian)
+            tmp.l = le_to_u32(magic + 5);
             hts_expand(char,tmp.l,tmp.m,tmp.s);
             if ( bgzf_read(fp, tmp.s, tmp.l) != tmp.l ) error("\nFailed to read the BCF header in %s\n", args->fnames[i]);
 
             // write only the first header
             if ( i==0 )
             {
-                if ( bgzf_write(bgzf_out, "BCF\2\2", 5) !=5 ) error("\nFailed to write %d bytes to %s\n", 5,args->output_fname);
-                if ( bgzf_write(bgzf_out, &tmp.l, 4) !=4 ) error("\nFailed to write %d bytes to %s\n", 4,args->output_fname);
+                if ( bgzf_write(bgzf_out, magic, magic_len) != magic_len ) error("\nFailed to write %zu bytes to %s\n", magic_len,args->output_fname);
                 if ( bgzf_write(bgzf_out, tmp.s, tmp.l) != tmp.l) error("\nFailed to write %"PRId64" bytes to %s\n", (uint64_t)tmp.l,args->output_fname);
             }
             nskip = fp->block_offset;
@@ -984,7 +989,7 @@ static void usage(args_t *args)
     fprintf(bcftools_stderr, "       --regions-overlap 0|1|2    Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n");
     fprintf(bcftools_stderr, "       --threads INT              Use multithreading with <int> worker threads [0]\n");
     fprintf(bcftools_stderr, "   -v, --verbose 0|1              Set verbosity level [1]\n");
-    fprintf(bcftools_stderr, "       --write-index              Automatically index the output files [off]\n");
+    fprintf(bcftools_stderr, "   -W, --write-index[=FMT]        Automatically index the output files [off]\n");
     fprintf(bcftools_stderr, "\n");
     bcftools_exit(1);
 }
@@ -1023,12 +1028,12 @@ int main_vcfconcat(int argc, char *argv[])
         {"file-list",required_argument,NULL,'f'},
         {"min-PQ",required_argument,NULL,'q'},
         {"no-version",no_argument,NULL,8},
-        {"write-index",no_argument,NULL,13},
+        {"write-index",optional_argument,NULL,'W'},
         {"drop-genotypes",no_argument,NULL,'G'},
         {NULL,0,NULL,0}
     };
     char *tmp;
-    while ((c = getopt_long(argc, argv, "h:?o:O:f:alq:Dd:Gr:R:cnv:",loptions,NULL)) >= 0)
+    while ((c = getopt_long(argc, argv, "h:?o:O:f:alq:Dd:Gr:R:cnv:W::",loptions,NULL)) >= 0)
     {
         switch (c) {
             case 'c': args->compact_PS = 1; break;
@@ -1078,7 +1083,10 @@ int main_vcfconcat(int argc, char *argv[])
                       args->verbose = strtol(optarg, &tmp, 0);
                       if ( *tmp || args->verbose<0 || args->verbose>1 ) error("Error: currently only --verbose 0 or --verbose 1 is supported\n");
                       break;
-            case 13 : args->write_index = 1; break;
+            case 'W':
+                if (!(args->write_index = write_index_parse(optarg)))
+                    error("Unsupported index format '%s'\n", optarg);
+                break;
             case 'h':
             case '?': usage(args); break;
             default: error("Unknown argument: %s\n", optarg);
@@ -1106,6 +1114,7 @@ int main_vcfconcat(int argc, char *argv[])
     if ( args->regions_list && !args->allow_overlaps ) error("The -r/-R option is supported only with -a\n");
     if ( args->naive_concat )
     {
+        if ( args->write_index ) error("Error: cannot --write-index in the %s mode\n",args->naive_concat_trust_headers?"--naive-force":"--naive");
         if ( args->allow_overlaps ) error("The option --naive cannot be combined with --allow-overlaps\n");
         if ( args->phased_concat ) error("The option --naive cannot be combined with --ligate\n");
         if ( args->sites_only ) error("The option --naive cannot be combined with --drop-genotypes\n");
diff --git a/bcftools/vcfconvert.c b/bcftools/vcfconvert.c
index 76c4a325a..f75085aab 100644
--- a/bcftools/vcfconvert.c
+++ b/bcftools/vcfconvert.c
@@ -31,6 +31,7 @@ THE SOFTWARE.  */
 #include <errno.h>
 #include <sys/stat.h>
 #include <sys/types.h>
+#include <stdint.h>
 #include <inttypes.h>
 #include <htslib/faidx.h>
 #include <htslib/vcf.h>
@@ -38,6 +39,7 @@ THE SOFTWARE.  */
 #include <htslib/synced_bcf_reader.h>
 #include <htslib/vcfutils.h>
 #include <htslib/kseq.h>
+#include <htslib/hts_endian.h>
 #include "bcftools.h"
 #include "filter.h"
 #include "convert.h"
@@ -209,7 +211,10 @@ static int _set_chrom_pos_ref_alt(tsv_t *tsv, bcf1_t *rec, void *usr)
     {
         long end = strtol(se+1,&ss,10);
         if ( ss==se+1 ) return -1;
-        bcf_update_info_int32(args->header, rec, "END", &end, 1);
+        if (end < 1 || end > INT32_MAX)
+            return -1;
+        int32_t e = end; // bcf_update_info_int32 needs an int32_t pointer
+        bcf_update_info_int32(args->header, rec, "END", &e, 1);
     }
 
     rec->rid = rid;
@@ -490,7 +495,9 @@ static void gensample_to_vcf(args_t *args)
     if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno));
     if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
     if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->outfname);
-    if ( args->write_index && init_index(out_fh,args->header,args->outfname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->outfname);
+    if ( init_index2(out_fh,args->header,args->outfname,&args->index_fn,
+                     args->write_index)<0 )
+        error("Error: failed to initialise index for %s\n",args->outfname);
     bcf1_t *rec = bcf_init();
 
     nsamples -= 2;
@@ -634,7 +641,9 @@ static void haplegendsample_to_vcf(args_t *args)
     if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno));
     if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
     if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->outfname);
-    if ( args->write_index && init_index(out_fh,args->header,args->outfname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->outfname);
+    if ( init_index2(out_fh,args->header,args->outfname,&args->index_fn,
+                     args->write_index)<0 )
+        error("Error: failed to initialise index for %s\n",args->outfname);
     bcf1_t *rec = bcf_init();
 
     args->gts = (int32_t *) malloc(sizeof(int32_t)*nsamples*2);
@@ -786,7 +795,9 @@ static void hapsample_to_vcf(args_t *args)
     if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno));
     if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
     if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname);
-    if ( args->write_index && init_index(out_fh,args->header,args->outfname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->outfname);
+    if ( init_index2(out_fh,args->header,args->outfname,&args->index_fn,
+                     args->write_index)<0 )
+        error("Error: failed to initialise index for %s\n",args->outfname);
     bcf1_t *rec = bcf_init();
 
     nsamples -= 2;
@@ -1389,7 +1400,9 @@ static void tsv_to_vcf(args_t *args)
     if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno));
     if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
     if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname);
-    if ( args->write_index && init_index(out_fh,args->header,args->outfname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->outfname);
+    if ( init_index2(out_fh,args->header,args->outfname,&args->index_fn,
+                     args->write_index)<0 )
+        error("Error: failed to initialise index for %s\n",args->outfname);
 
     tsv_t *tsv = tsv_init(args->columns ? args->columns : "ID,CHROM,POS,AA");
     if ( tsv_register(tsv, "CHROM", tsv_setter_chrom, args->header) < 0 ) error("Expected CHROM column\n");
@@ -1468,7 +1481,9 @@ static void vcf_to_vcf(args_t *args)
 
     bcf_hdr_t *hdr = bcf_sr_get_header(args->files,0);
     if ( bcf_hdr_write(out_fh,hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname);
-    if ( args->write_index && init_index(out_fh,args->header,args->outfname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->outfname);
+    if ( init_index2(out_fh,args->header,args->outfname,&args->index_fn,
+                     args->write_index)<0 )
+        error("Error: failed to initialise index for %s\n",args->outfname);
 
     while ( bcf_sr_next_line(args->files) )
     {
@@ -1510,7 +1525,9 @@ static void gvcf_to_vcf(args_t *args)
     bcf_hdr_t *hdr = bcf_sr_get_header(args->files,0);
     if (args->record_cmd_line) bcf_hdr_append_version(hdr, args->argc, args->argv, "bcftools_convert");
     if ( bcf_hdr_write(out_fh,hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname);
-    if ( args->write_index && init_index(out_fh,hdr,args->outfname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->outfname);
+    if ( init_index2(out_fh,hdr,args->outfname,&args->index_fn,
+                     args->write_index)<0 )
+        error("Error: failed to initialise index for %s\n",args->outfname);
 
     int32_t *itmp = NULL, nitmp = 0;
 
@@ -1608,7 +1625,7 @@ static void usage(void)
     fprintf(stderr, "   -o, --output FILE              Output file name [stdout]\n");
     fprintf(stderr, "   -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n");
     fprintf(stderr, "       --threads INT              Use multithreading with INT worker threads [0]\n");
-    fprintf(stderr, "       --write-index              Automatically index the output files [off]\n");
+    fprintf(stderr, "   -W, --write-index[=FMT]        Automatically index the output files [off]\n");
     fprintf(stderr, "\n");
     fprintf(stderr, "GEN/SAMPLE conversion (input/output from IMPUTE2):\n");
     fprintf(stderr, "   -G, --gensample2vcf ...        <PREFIX>|<GEN-FILE>,<SAMPLE-FILE>\n");
@@ -1702,11 +1719,11 @@ int main_vcfconvert(int argc, char *argv[])
         {"fasta-ref",required_argument,NULL,'f'},
         {"no-version",no_argument,NULL,10},
         {"keep-duplicates",no_argument,NULL,12},
-        {"write-index",no_argument,NULL,16},
+        {"write-index",optional_argument,NULL,'W'},
         {NULL,0,NULL,0}
     };
     char *tmp;
-    while ((c = getopt_long(argc, argv, "?h:r:R:s:S:t:T:i:e:g:G:o:O:c:f:H:",loptions,NULL)) >= 0) {
+    while ((c = getopt_long(argc, argv, "?h:r:R:s:S:t:T:i:e:g:G:o:O:c:f:H:W::",loptions,NULL)) >= 0) {
         switch (c) {
             case 'e':
                 if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
@@ -1731,7 +1748,10 @@ int main_vcfconvert(int argc, char *argv[])
             case  7 : args->convert_func = vcf_to_hapsample; args->outfname = optarg; break;
             case  8 : error("The --chrom option has been deprecated, please use --3N6 instead\n"); break;
             case 15 : args->gen_3N6 = 1; break;
-            case 16 : args->write_index = 1; break;
+            case 'W':
+                if (!(args->write_index = write_index_parse(optarg)))
+                    error("Unsupported index format '%s'\n", optarg);
+                break;
             case 'H': args->convert_func = haplegendsample_to_vcf; args->infname = optarg; break;
             case 'f': args->ref_fname = optarg; break;
             case 'c': args->columns = optarg; break;
diff --git a/bcftools/vcfconvert.c.pysam.c b/bcftools/vcfconvert.c.pysam.c
index 16bb3be68..f8921bf63 100644
--- a/bcftools/vcfconvert.c.pysam.c
+++ b/bcftools/vcfconvert.c.pysam.c
@@ -33,6 +33,7 @@ THE SOFTWARE.  */
 #include <errno.h>
 #include <sys/stat.h>
 #include <sys/types.h>
+#include <stdint.h>
 #include <inttypes.h>
 #include <htslib/faidx.h>
 #include <htslib/vcf.h>
@@ -40,6 +41,7 @@ THE SOFTWARE.  */
 #include <htslib/synced_bcf_reader.h>
 #include <htslib/vcfutils.h>
 #include <htslib/kseq.h>
+#include <htslib/hts_endian.h>
 #include "bcftools.h"
 #include "filter.h"
 #include "convert.h"
@@ -211,7 +213,10 @@ static int _set_chrom_pos_ref_alt(tsv_t *tsv, bcf1_t *rec, void *usr)
     {
         long end = strtol(se+1,&ss,10);
         if ( ss==se+1 ) return -1;
-        bcf_update_info_int32(args->header, rec, "END", &end, 1);
+        if (end < 1 || end > INT32_MAX)
+            return -1;
+        int32_t e = end; // bcf_update_info_int32 needs an int32_t pointer
+        bcf_update_info_int32(args->header, rec, "END", &e, 1);
     }
 
     rec->rid = rid;
@@ -492,7 +497,9 @@ static void gensample_to_vcf(args_t *args)
     if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno));
     if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
     if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->outfname);
-    if ( args->write_index && init_index(out_fh,args->header,args->outfname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->outfname);
+    if ( init_index2(out_fh,args->header,args->outfname,&args->index_fn,
+                     args->write_index)<0 )
+        error("Error: failed to initialise index for %s\n",args->outfname);
     bcf1_t *rec = bcf_init();
 
     nsamples -= 2;
@@ -636,7 +643,9 @@ static void haplegendsample_to_vcf(args_t *args)
     if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno));
     if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
     if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->outfname);
-    if ( args->write_index && init_index(out_fh,args->header,args->outfname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->outfname);
+    if ( init_index2(out_fh,args->header,args->outfname,&args->index_fn,
+                     args->write_index)<0 )
+        error("Error: failed to initialise index for %s\n",args->outfname);
     bcf1_t *rec = bcf_init();
 
     args->gts = (int32_t *) malloc(sizeof(int32_t)*nsamples*2);
@@ -788,7 +797,9 @@ static void hapsample_to_vcf(args_t *args)
     if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno));
     if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
     if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname);
-    if ( args->write_index && init_index(out_fh,args->header,args->outfname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->outfname);
+    if ( init_index2(out_fh,args->header,args->outfname,&args->index_fn,
+                     args->write_index)<0 )
+        error("Error: failed to initialise index for %s\n",args->outfname);
     bcf1_t *rec = bcf_init();
 
     nsamples -= 2;
@@ -1391,7 +1402,9 @@ static void tsv_to_vcf(args_t *args)
     if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno));
     if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
     if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname);
-    if ( args->write_index && init_index(out_fh,args->header,args->outfname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->outfname);
+    if ( init_index2(out_fh,args->header,args->outfname,&args->index_fn,
+                     args->write_index)<0 )
+        error("Error: failed to initialise index for %s\n",args->outfname);
 
     tsv_t *tsv = tsv_init(args->columns ? args->columns : "ID,CHROM,POS,AA");
     if ( tsv_register(tsv, "CHROM", tsv_setter_chrom, args->header) < 0 ) error("Expected CHROM column\n");
@@ -1470,7 +1483,9 @@ static void vcf_to_vcf(args_t *args)
 
     bcf_hdr_t *hdr = bcf_sr_get_header(args->files,0);
     if ( bcf_hdr_write(out_fh,hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname);
-    if ( args->write_index && init_index(out_fh,args->header,args->outfname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->outfname);
+    if ( init_index2(out_fh,args->header,args->outfname,&args->index_fn,
+                     args->write_index)<0 )
+        error("Error: failed to initialise index for %s\n",args->outfname);
 
     while ( bcf_sr_next_line(args->files) )
     {
@@ -1512,7 +1527,9 @@ static void gvcf_to_vcf(args_t *args)
     bcf_hdr_t *hdr = bcf_sr_get_header(args->files,0);
     if (args->record_cmd_line) bcf_hdr_append_version(hdr, args->argc, args->argv, "bcftools_convert");
     if ( bcf_hdr_write(out_fh,hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname);
-    if ( args->write_index && init_index(out_fh,hdr,args->outfname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->outfname);
+    if ( init_index2(out_fh,hdr,args->outfname,&args->index_fn,
+                     args->write_index)<0 )
+        error("Error: failed to initialise index for %s\n",args->outfname);
 
     int32_t *itmp = NULL, nitmp = 0;
 
@@ -1610,7 +1627,7 @@ static void usage(void)
     fprintf(bcftools_stderr, "   -o, --output FILE              Output file name [bcftools_stdout]\n");
     fprintf(bcftools_stderr, "   -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n");
     fprintf(bcftools_stderr, "       --threads INT              Use multithreading with INT worker threads [0]\n");
-    fprintf(bcftools_stderr, "       --write-index              Automatically index the output files [off]\n");
+    fprintf(bcftools_stderr, "   -W, --write-index[=FMT]        Automatically index the output files [off]\n");
     fprintf(bcftools_stderr, "\n");
     fprintf(bcftools_stderr, "GEN/SAMPLE conversion (input/output from IMPUTE2):\n");
     fprintf(bcftools_stderr, "   -G, --gensample2vcf ...        <PREFIX>|<GEN-FILE>,<SAMPLE-FILE>\n");
@@ -1704,11 +1721,11 @@ int main_vcfconvert(int argc, char *argv[])
         {"fasta-ref",required_argument,NULL,'f'},
         {"no-version",no_argument,NULL,10},
         {"keep-duplicates",no_argument,NULL,12},
-        {"write-index",no_argument,NULL,16},
+        {"write-index",optional_argument,NULL,'W'},
         {NULL,0,NULL,0}
     };
     char *tmp;
-    while ((c = getopt_long(argc, argv, "?h:r:R:s:S:t:T:i:e:g:G:o:O:c:f:H:",loptions,NULL)) >= 0) {
+    while ((c = getopt_long(argc, argv, "?h:r:R:s:S:t:T:i:e:g:G:o:O:c:f:H:W::",loptions,NULL)) >= 0) {
         switch (c) {
             case 'e':
                 if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
@@ -1733,7 +1750,10 @@ int main_vcfconvert(int argc, char *argv[])
             case  7 : args->convert_func = vcf_to_hapsample; args->outfname = optarg; break;
             case  8 : error("The --chrom option has been deprecated, please use --3N6 instead\n"); break;
             case 15 : args->gen_3N6 = 1; break;
-            case 16 : args->write_index = 1; break;
+            case 'W':
+                if (!(args->write_index = write_index_parse(optarg)))
+                    error("Unsupported index format '%s'\n", optarg);
+                break;
             case 'H': args->convert_func = haplegendsample_to_vcf; args->infname = optarg; break;
             case 'f': args->ref_fname = optarg; break;
             case 'c': args->columns = optarg; break;
diff --git a/bcftools/vcffilter.c b/bcftools/vcffilter.c
index 8665409d1..52d4f9455 100644
--- a/bcftools/vcffilter.c
+++ b/bcftools/vcffilter.c
@@ -493,7 +493,7 @@ static void usage(args_t *args)
     fprintf(stderr, "    -T, --targets-file FILE        Similar to -R but streams rather than index-jumps\n");
     fprintf(stderr, "        --targets-overlap 0|1|2    Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n");
     fprintf(stderr, "        --threads INT              Use multithreading with <int> worker threads [0]\n");
-    fprintf(stderr, "        --write-index              Automatically index the output files [off]\n");
+    fprintf(stderr, "    -W, --write-index[=FMT]        Automatically index the output files [off]\n");
     fprintf(stderr, "\n");
     exit(1);
 }
@@ -536,11 +536,11 @@ int main_vcffilter(int argc, char *argv[])
         {"SnpGap",required_argument,NULL,'g'},
         {"IndelGap",required_argument,NULL,'G'},
         {"no-version",no_argument,NULL,8},
-        {"write-index",no_argument,NULL,12},
+        {"write-index",optional_argument,NULL,'W'},
         {NULL,0,NULL,0}
     };
     char *tmp;
-    while ((c = getopt_long(argc, argv, "e:i:t:T:r:R:h?s:m:M:o:O:g:G:S:",loptions,NULL)) >= 0) {
+    while ((c = getopt_long(argc, argv, "e:i:t:T:r:R:h?s:m:M:o:O:g:G:S:W::",loptions,NULL)) >= 0) {
         switch (c) {
             case 'g':
                 args->snp_gap = strtol(optarg,&tmp,10);
@@ -629,7 +629,10 @@ int main_vcffilter(int argc, char *argv[])
                 else if ( !strcasecmp(optarg,"2") ) args->mask_overlap = 2;
                 else error("Could not parse: --mask-overlap %s\n",optarg);
                 break;
-            case  12 : args->write_index = 1; break;
+            case 'W':
+                if (!(args->write_index = write_index_parse(optarg)))
+                    error("Unsupported index format '%s'\n", optarg);
+                break;
             case 'h':
             case '?': usage(args); break;
             default: error("Unknown argument: %s\n", optarg);
@@ -677,7 +680,9 @@ int main_vcffilter(int argc, char *argv[])
 
     init_data(args);
     if ( bcf_hdr_write(args->out_fh, args->hdr)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->output_fname);
-    if ( args->write_index && init_index(args->out_fh,args->hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname);
+    if ( init_index2(args->out_fh,args->hdr,args->output_fname,&args->index_fn,
+                     args->write_index)<0 )
+        error("Error: failed to initialise index for %s\n",args->output_fname);
     while ( bcf_sr_next_line(args->files) )
     {
         bcf1_t *line = bcf_sr_get_line(args->files, 0);
diff --git a/bcftools/vcffilter.c.pysam.c b/bcftools/vcffilter.c.pysam.c
index 6d17151e9..c240f799d 100644
--- a/bcftools/vcffilter.c.pysam.c
+++ b/bcftools/vcffilter.c.pysam.c
@@ -495,7 +495,7 @@ static void usage(args_t *args)
     fprintf(bcftools_stderr, "    -T, --targets-file FILE        Similar to -R but streams rather than index-jumps\n");
     fprintf(bcftools_stderr, "        --targets-overlap 0|1|2    Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n");
     fprintf(bcftools_stderr, "        --threads INT              Use multithreading with <int> worker threads [0]\n");
-    fprintf(bcftools_stderr, "        --write-index              Automatically index the output files [off]\n");
+    fprintf(bcftools_stderr, "    -W, --write-index[=FMT]        Automatically index the output files [off]\n");
     fprintf(bcftools_stderr, "\n");
     bcftools_exit(1);
 }
@@ -538,11 +538,11 @@ int main_vcffilter(int argc, char *argv[])
         {"SnpGap",required_argument,NULL,'g'},
         {"IndelGap",required_argument,NULL,'G'},
         {"no-version",no_argument,NULL,8},
-        {"write-index",no_argument,NULL,12},
+        {"write-index",optional_argument,NULL,'W'},
         {NULL,0,NULL,0}
     };
     char *tmp;
-    while ((c = getopt_long(argc, argv, "e:i:t:T:r:R:h?s:m:M:o:O:g:G:S:",loptions,NULL)) >= 0) {
+    while ((c = getopt_long(argc, argv, "e:i:t:T:r:R:h?s:m:M:o:O:g:G:S:W::",loptions,NULL)) >= 0) {
         switch (c) {
             case 'g':
                 args->snp_gap = strtol(optarg,&tmp,10);
@@ -631,7 +631,10 @@ int main_vcffilter(int argc, char *argv[])
                 else if ( !strcasecmp(optarg,"2") ) args->mask_overlap = 2;
                 else error("Could not parse: --mask-overlap %s\n",optarg);
                 break;
-            case  12 : args->write_index = 1; break;
+            case 'W':
+                if (!(args->write_index = write_index_parse(optarg)))
+                    error("Unsupported index format '%s'\n", optarg);
+                break;
             case 'h':
             case '?': usage(args); break;
             default: error("Unknown argument: %s\n", optarg);
@@ -679,7 +682,9 @@ int main_vcffilter(int argc, char *argv[])
 
     init_data(args);
     if ( bcf_hdr_write(args->out_fh, args->hdr)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->output_fname);
-    if ( args->write_index && init_index(args->out_fh,args->hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname);
+    if ( init_index2(args->out_fh,args->hdr,args->output_fname,&args->index_fn,
+                     args->write_index)<0 )
+        error("Error: failed to initialise index for %s\n",args->output_fname);
     while ( bcf_sr_next_line(args->files) )
     {
         bcf1_t *line = bcf_sr_get_line(args->files, 0);
diff --git a/bcftools/vcfgtcheck.c b/bcftools/vcfgtcheck.c
index 561be62a5..be886db34 100644
--- a/bcftools/vcfgtcheck.c
+++ b/bcftools/vcfgtcheck.c
@@ -1,6 +1,6 @@
 /*  vcfgtcheck.c -- Check sample identity.
 
-    Copyright (C) 2013-2023 Genome Research Ltd.
+    Copyright (C) 2013-2024 Genome Research Ltd.
 
     Author: Petr Danecek <pd3@sanger.ac.uk>
 
@@ -39,11 +39,17 @@ THE SOFTWARE.  */
 #include <htslib/vcfutils.h>
 #include <htslib/kbitset.h>
 #include <htslib/hts_os.h>
+#include <htslib/bgzf.h>
 #include <inttypes.h>
 #include <sys/time.h>
 #include "bcftools.h"
 #include "extsort.h"
 //#include "hclust.h"
+#include "filter.h"
+
+// Logic of the filters: include or exclude sites which match the filters?
+#define FLT_INCLUDE 1
+#define FLT_EXCLUDE 2
 
 typedef struct
 {
@@ -56,20 +62,22 @@ typedef struct
     bcf_srs_t *files;           // first reader is the query VCF - single sample normally or multi-sample for cross-check
     bcf_hdr_t *gt_hdr, *qry_hdr; // VCF with genotypes to compare against and the query VCF
     char *cwd, **argv, *gt_samples, *qry_samples, *regions, *targets, *qry_fname, *gt_fname, *pair_samples;
-    int argc, gt_samples_is_file, qry_samples_is_file, regions_is_file, targets_is_file, pair_samples_is_file;
-    int regions_overlap, targets_overlap;
+    char *output_fname;
+    int argc, gt_samples_is_file, qry_samples_is_file, regions_is_file, targets_is_file, pair_samples_is_file, output_type;;
+    int regions_overlap, targets_overlap, clevel;
     int qry_use_GT,gt_use_GT, nqry_smpl,ngt_smpl, *qry_smpl,*gt_smpl;
     int nused[2][2];
     double *pdiff, *qry_prob, *gt_prob;
-    uint32_t *ndiff,*ncnt,ncmp, npairs;
+    uint32_t *ndiff,*ncnt,*nmatch,ncmp, npairs;
     int32_t *qry_arr,*gt_arr, nqry_arr,ngt_arr;
     uint8_t *qry_dsg, *gt_dsg;
     pair_t *pairs;
     double *hwe_prob, dsg2prob[8][3], pl2prob[256];
     double min_inter_err, max_intra_err;
-    int all_sites, hom_only, ntop, cross_check, calc_hwe_prob, sort_by_hwe, dry_run, use_PLs;
-    FILE *fp;
-    unsigned int nskip_no_match, nskip_not_ba, nskip_mono, nskip_no_data, nskip_dip_GT, nskip_dip_PL;
+    int all_sites, hom_only, ntop, cross_check, calc_hwe_prob, sort_by_hwe, dry_run, gt_err;
+    BGZF *out_fh;
+    unsigned int nskip_no_match, nskip_not_ba, nskip_mono, nskip_no_data, nskip_dip_GT, nskip_dip_PL, nskip_filter;
+    kstring_t kstr;
 
     // for --distinctive-sites
     double distinctive_sites;
@@ -77,6 +85,11 @@ typedef struct
     size_t diff_sites_size;
     extsort_t *es;
     char *es_tmp_prefix, *es_max_mem;
+
+    // include or exclude sites which match the filters
+    filter_t *qry_filter, *gt_filter;
+    char *qry_filter_str, *gt_filter_str;
+    int qry_filter_logic, gt_filter_logic;       // FLT_INCLUDE or FLT_EXCLUDE
 }
 args_t;
 
@@ -94,15 +107,17 @@ static void set_cwd(args_t *args)
     }
     assert(buf);
 }
-static void print_header(args_t *args, FILE *fp)
+static void print_header(args_t *args)
 {
-    fprintf(fp, "# This file was produced by bcftools (%s+htslib-%s), the command line was:\n", bcftools_version(), hts_version());
-    fprintf(fp, "# \t bcftools %s ", args->argv[0]);
+    args->kstr.l = 0;
+    ksprintf(&args->kstr, "# This file was produced by bcftools (%s+htslib-%s), the command line was:\n", bcftools_version(), hts_version());
+    ksprintf(&args->kstr, "# \t bcftools %s ", args->argv[0]);
     int i;
     for (i=1; i<args->argc; i++)
-        fprintf(fp, " %s",args->argv[i]);
-    fprintf(fp, "\n# and the working directory was:\n");
-    fprintf(fp, "# \t %s\n#\n", args->cwd);
+        ksprintf(&args->kstr, " %s",args->argv[i]);
+    ksprintf(&args->kstr, "\n# and the working directory was:\n");
+    ksprintf(&args->kstr, "# \t %s\n#\n", args->cwd);
+    if ( bgzf_write(args->out_fh, args->kstr.s, args->kstr.l)!=args->kstr.l ) error("Failed to write to %s\n", args->output_fname);
 }
 
 static int cmp_int(const void *_a, const void *_b)
@@ -126,7 +141,7 @@ static int cmp_pair(const void *_a, const void *_b)
 
 typedef struct
 {
-    uint32_t ndiff,rid,pos,rand; // rand is to shuffle sites with the same ndiff from across all chromosoms
+    uint32_t ndiff,rid,pos,rand; // rand is to shuffle sites with the same ndiff from across all chromosomes
     unsigned long kbs_dat[1];
 }
 diff_sites_t;
@@ -262,6 +277,9 @@ static void init_data(args_t *args)
         if ( !bcf_hdr_nsamples(args->gt_hdr) ) error("No samples in %s?\n", args->gt_fname);
     }
 
+    if ( args->gt_hdr && args->gt_filter_str ) args->gt_filter = filter_init(args->gt_hdr, args->gt_filter_str);
+    if ( args->qry_hdr && args->qry_filter_str ) args->qry_filter = filter_init(args->qry_hdr, args->qry_filter_str);
+
     // Determine whether GT or PL will be used
     if ( args->qry_use_GT==-1 ) // not set by -u, qry uses PL by default
     {
@@ -371,56 +389,76 @@ static void init_data(args_t *args)
         args->qry_dsg = (uint8_t*) malloc(args->nqry_smpl);
         args->gt_dsg  = args->cross_check ? args->qry_dsg : (uint8_t*) malloc(args->ngt_smpl);
     }
-    if ( args->use_PLs )
+    if ( args->gt_err )
     {
         args->pdiff = (double*) calloc(args->npairs,sizeof(*args->pdiff));      // log probability of pair samples being the same
         args->qry_prob = (double*) malloc(3*args->nqry_smpl*sizeof(*args->qry_prob));
         args->gt_prob  = args->cross_check ? args->qry_prob : (double*) malloc(3*args->ngt_smpl*sizeof(*args->gt_prob));
 
+        // Convert genotypes to genotype likelihoods given by -E, the probability of reading one allele incorrectly. In this
+        // simple model we have:
+        //     - probability of reading an allele incorrectly, eg. 0 as 1 or 1 as 0
+        //         P(0|1) = P(1|0) = e
+        //     - probability of genotype G={00,01,11} being correct given observed dosage {0,1,2} and the
+        //       genotyping error probability e:
+        //          P(00|0) = 1       P(00|1) = e       P(00|2) = e^2
+        //          P(01|0) = e       P(01|1) = 1       P(01|2) = e
+        //          P(11|0) = e^2     P(11|1) = e       P(11|2) = 1
+        //
         // dsg2prob: the first index is bitmask of 8 possible dsg combinations (only 1<<0,1<<2,1<<3 are set, accessing
-        // anything else indicated an error, this is just to reuse gt_to_dsg()); the second index are the corresponding
+        // anything else indicated an error, this is just to reuse gt_to_dsg(); the second index are the corresponding
         // probabilities of 0/0, 0/1, and 1/1 genotypes
+        //
         for (i=0; i<8; i++)
             for (j=0; j<3; j++)
                 args->dsg2prob[i][j] = HUGE_VAL;
-        args->dsg2prob[1][0] = -log(1-pow(10,-0.1*args->use_PLs));
-        args->dsg2prob[1][1] = -log(0.5*pow(10,-0.1*args->use_PLs));
-        args->dsg2prob[1][2] = -log(0.5*pow(10,-0.1*args->use_PLs));
-        args->dsg2prob[2][0] = -log(0.5*pow(10,-0.1*args->use_PLs));
-        args->dsg2prob[2][1] = -log(1-pow(10,-0.1*args->use_PLs));
-        args->dsg2prob[2][2] = -log(0.5*pow(10,-0.1*args->use_PLs));
-        args->dsg2prob[4][0] = -log(0.5*pow(10,-0.1*args->use_PLs));
-        args->dsg2prob[4][1] = -log(0.5*pow(10,-0.1*args->use_PLs));
-        args->dsg2prob[4][2] = -log(1-pow(10,-0.1*args->use_PLs));
+        double eprob = pow(10,-0.1*args->gt_err);      // convert from phred score to probability
+        args->dsg2prob[1][0] = 0;               // P(00|0) = 1
+        args->dsg2prob[1][1] = -log(eprob);     // P(01|0) = e
+        args->dsg2prob[1][2] = -2*log(eprob);   // P(11|0) = e^2
+        args->dsg2prob[2][0] = -log(eprob);     // P(00|1) = e
+        args->dsg2prob[2][1] = 0;               // P(01|1) = 1
+        args->dsg2prob[2][2] = -log(eprob);     // P(11|1) = e
+        args->dsg2prob[4][0] = -2*log(eprob);   // P(00|2) = e^2
+        args->dsg2prob[4][1] = -log(eprob);     // P(01|2) = e
+        args->dsg2prob[4][2] = 0;               // P(11|2) = 1
 
         // lookup table to avoid exponentiation
         for (i=0; i<256; i++) args->pl2prob[i] = pow(10,-0.1*i);
     }
     else
         args->ndiff = (uint32_t*) calloc(args->npairs,sizeof(*args->ndiff));    // number of differing genotypes for each pair of samples
-    args->ncnt  = (uint32_t*) calloc(args->npairs,sizeof(*args->ncnt));         // number of comparisons performed (non-missing data)
+    args->ncnt   = (uint32_t*) calloc(args->npairs,sizeof(*args->ncnt));         // number of comparisons performed (non-missing data)
     if ( !args->ncnt ) error("Error: failed to allocate %.1f Mb\n", args->npairs*sizeof(*args->ncnt)/1e6);
     if ( args->calc_hwe_prob )
     {
         // prob of the observed sequence of matches given site AFs and HWE
         args->hwe_prob = (double*) calloc(args->npairs,sizeof(*args->hwe_prob));
         if ( !args->hwe_prob ) error("Error: failed to allocate %.1f Mb. Run with --no-HWE-prob to save some memory.\n", args->npairs*sizeof(*args->hwe_prob)/1e6);
+        args->nmatch = (uint32_t*) calloc(args->npairs,sizeof(*args->ncnt));         // number of matches, used only with calc_hwe_prob
+        if ( !args->nmatch ) error("Error: failed to allocate %.1f Mb.\n", args->npairs*sizeof(*args->ncnt)/1e6);
     }
 
     if ( args->distinctive_sites ) diff_sites_init(args);
 
-    args->fp = stdout;
-    print_header(args, args->fp);
+    args->out_fh = bgzf_open(args->output_fname, args->output_type&FT_GZ ? "wg" : "wu");
+    if ( args->out_fh == NULL )
+        error("[%s] Error: cannot write to %s: %s\n", __func__,args->output_fname ? args->output_fname : "standard output", strerror(errno));
+
+    print_header(args);
 }
 
 static void destroy_data(args_t *args)
 {
+    free(args->kstr.s);
+    if ( args->gt_filter ) filter_destroy(args->gt_filter);
+    if ( args->qry_filter ) filter_destroy(args->qry_filter);
     if ( args->gt_dsg!=args->qry_dsg ) free(args->gt_dsg);
     free(args->qry_dsg);
     if ( args->gt_prob!=args->qry_prob ) free(args->gt_prob);
     free(args->qry_prob);
     free(args->es_max_mem);
-    fclose(args->fp);
+    if ( bgzf_close(args->out_fh)!=0 )  error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout");
     if ( args->distinctive_sites ) diff_sites_destroy(args);
     free(args->hwe_prob);
     free(args->cwd);
@@ -429,6 +467,7 @@ static void destroy_data(args_t *args)
     free(args->pdiff);
     free(args->ndiff);
     free(args->ncnt);
+    free(args->nmatch);
     free(args->qry_smpl);
     if ( args->gt_smpl!=args->qry_smpl ) free(args->gt_smpl);
     free(args->pairs);
@@ -538,6 +577,13 @@ static void process_line(args_t *args)
     int i,j,k, nqry1, ngt1, ret;
 
     bcf1_t *gt_rec = NULL, *qry_rec = bcf_sr_get_line(args->files,0);   // the query file
+    if ( args->qry_filter )
+    {
+        int pass = filter_test(args->qry_filter, qry_rec, NULL);
+        if ( args->qry_filter_logic==FLT_EXCLUDE ) pass = pass ? 0 : 1;
+        if ( !pass ) { args->nskip_filter++; return; }
+    }
+
     int qry_use_GT = args->qry_use_GT;
     int gt_use_GT  = args->gt_use_GT;
 
@@ -547,6 +593,12 @@ static void process_line(args_t *args)
     if ( args->gt_hdr )
     {
         gt_rec = bcf_sr_get_line(args->files,1);
+        if ( args->gt_filter )
+        {
+            int pass = filter_test(args->gt_filter, gt_rec, NULL);
+            if ( args->gt_filter_logic==FLT_EXCLUDE ) pass = pass ? 0 : 1;
+            if ( !pass ) { args->nskip_filter++; return; }
+        }
         ret = set_data(args, args->gt_hdr, gt_rec, &args->gt_arr, &args->ngt_arr, &ngt1, &gt_use_GT);
         if ( ret<0 ) return;
     }
@@ -560,7 +612,7 @@ static void process_line(args_t *args)
     args->ncmp++;
     args->nused[qry_use_GT][gt_use_GT]++;
 
-    double af,hwe_dsg[8];
+    double hwe_dsg[8];
     if ( args->calc_hwe_prob )
     {
         int ac[2];
@@ -570,18 +622,28 @@ static void process_line(args_t *args)
         }
         else if ( bcf_calc_ac(args->qry_hdr, qry_rec, ac, BCF_UN_INFO|BCF_UN_FMT)!=1 ) error("todo: bcf_calc_ac() failed\n");
 
-        // hwe indexes correspond to the bitmask of eight dsg combinations to account for PL uncertainty
-        // for in the extreme case we can have uninformative PL=0,0,0. So the values are the minima of e.g.
-        //      hwe[1,2,4] ..  dsg=0,1,2
-        //      hwe[3]     ..  dsg=0 or 1
-        //      hwe[6]     ..  dsg=1 or 2
-
-        double hwe[3];
-        const double min_af = 1e-5;             // cap the AF in case we get unrealistic values
-        af = (double)ac[1]/(ac[0]+ac[1]);
-        hwe[0] = af>min_af ? -log(af*af) : -log(min_af*min_af);
-        hwe[1] = af>min_af && af<1-min_af ? -log(2*af*(1-af)) : -log(2*min_af*(1-min_af));
-        hwe[2] = af<(1-min_af) ? -log((1-af)*(1-af)) : -log(min_af*min_af);
+        // Calculate HWE probability for each possible qry+gt dosage combination. The alternate allele dosage
+        // values returned by gt_to_prob() below are 0,1,2,4 (0=missing, 1<<0, 1<<1, 1<<2). We consider only
+        // biallelic sites, therefore we work with eight genotype combinations.
+        //
+        // The array hwe_dsg is accessed with hwe_dsg[qry_dsg & gt_dsg] and is constructed to account for PL uncertainty
+        // when we encounter less informative PL, such as PL=0,0,10, where multiple dosage values are equally
+        // likely. If we allowed complete uncertainty (PL=0,0,0), we'd have up to eight possible genotype
+        // mask combinations: from e.g. 0=(gt_dsg=1<<0 & qry_dsg=1<<1) to 7=(gt_dsg=1<<0|1<<1|1<<2 & qry_dsg=1<<0|1<<1|1<<2).
+        // Note the extreme case of 1|2|4 is skipped, see pl_to_dsg().
+        //
+        // When the dosage is uncertain, we take the minimum of their corresponding HWE value, for example
+        //      hwe[0] = 0
+        //      hwe[1] = (1-AF)**2
+        //      hwe[2] = 2*AF*(1-AF)
+        //      hwe[4] = AF**2
+        //      hwe[3] = min{hwe[1],hwe[2]}
+
+        double hwe[3];  // while hwe_dsg iterates over dsg bitmasks (0..7), hwe iterates over dsg (0,1,2)
+        double af = ac[0]+ac[1] ? (double)ac[1]/(ac[0]+ac[1]) : 1e-6;
+        hwe[0] = -log((1-af)*(1-af));
+        hwe[1] = -log(2*af*(1-af));
+        hwe[2] = -log(af*af);
         hwe_dsg[0] = 0;
         for (i=1; i<8; i++)
         {
@@ -596,7 +658,7 @@ static void process_line(args_t *args)
     // The sample pairs were given explicitly via -p/-P options
     if ( args->pairs )
     {
-        if ( !args->use_PLs )
+        if ( !args->gt_err )
         {
             int ndiff = 0;
             if ( args->kbs_diff ) diff_sites_reset(args);
@@ -621,13 +683,17 @@ static void process_line(args_t *args)
                     args->ndiff[i]++;
                     if ( args->kbs_diff ) { ndiff++; kbs_insert(args->kbs_diff, i); }
                 }
-                else if ( args->calc_hwe_prob ) args->hwe_prob[i] += hwe_dsg[match];
+                else if ( args->calc_hwe_prob )
+                {
+                    args->hwe_prob[i] += hwe_dsg[match];
+                    args->nmatch[i]++;
+                }
                 args->ncnt[i]++;
             }
 
             if ( ndiff ) diff_sites_push(args, ndiff, qry_rec->rid, qry_rec->pos);
         }
-        else    // use_PLs set
+        else    // gt_err set
         {
             for (i=0; i<args->npairs; i++)
             {
@@ -655,6 +721,7 @@ static void process_line(args_t *args)
                 {
                     int match = qry_dsg & gt_dsg;
                     args->hwe_prob[i] += hwe_dsg[match];
+                    if ( match ) args->nmatch[i]++;
                 }
                 args->ncnt[i]++;
             }
@@ -663,7 +730,7 @@ static void process_line(args_t *args)
     }
 
     int idx=0;
-    if ( !args->use_PLs )
+    if ( !args->gt_err )
     {
         for (i=0; i<args->nqry_smpl; i++)
         {
@@ -690,13 +757,17 @@ static void process_line(args_t *args)
                 if ( !args->gt_dsg[j] ) { idx++; continue; }        // missing value
                 int match = args->qry_dsg[i] & args->gt_dsg[j];
                 if ( !match ) args->ndiff[idx]++;
-                else if ( args->calc_hwe_prob ) args->hwe_prob[idx] += hwe_dsg[match];
+                else if ( args->calc_hwe_prob )
+                {
+                    args->hwe_prob[idx] += hwe_dsg[match];
+                    args->nmatch[idx]++;
+                }
                 args->ncnt[idx]++;
                 idx++;
             }
         }
     }
-    else    // use_PLs set
+    else    // gt_err set
     {
         for (i=0; i<args->nqry_smpl; i++)
         {
@@ -731,6 +802,7 @@ static void process_line(args_t *args)
                 {
                     int match = args->qry_dsg[i] & args->gt_dsg[j];
                     args->hwe_prob[idx] += hwe_dsg[match];
+                    if ( match ) args->nmatch[idx]++;
                 }
                 args->ncnt[idx]++;
                 idx++;
@@ -758,12 +830,13 @@ static void report_distinctive_sites(args_t *args)
 {
     extsort_sort(args->es);
 
-    fprintf(args->fp,"# DS, distinctive sites:\n");
-    fprintf(args->fp,"#     - chromosome\n");
-    fprintf(args->fp,"#     - position\n");
-    fprintf(args->fp,"#     - cumulative number of pairs distinguished by this block\n");
-    fprintf(args->fp,"#     - block id\n");
-    fprintf(args->fp,"#DS\t[2]Chromosome\t[3]Position\t[4]Cumulative number of distinct pairs\t[5]Block id\n");
+    args->kstr.l = 0;
+    ksprintf(&args->kstr,"# DS, distinctive sites:\n");
+    ksprintf(&args->kstr,"#     - chromosome\n");
+    ksprintf(&args->kstr,"#     - position\n");
+    ksprintf(&args->kstr,"#     - cumulative number of pairs distinguished by this block\n");
+    ksprintf(&args->kstr,"#     - block id\n");
+    ksprintf(&args->kstr,"#DS\t[2]Chromosome\t[3]Position\t[4]Cumulative number of distinct pairs\t[5]Block id\n");
 
     kbitset_t *kbs_blk = kbs_init(args->npairs);
     kbitset_iter_t itr;
@@ -783,7 +856,9 @@ static void report_distinctive_sites(args_t *args)
         if ( ndiff_dbg!=ndiff ) error("Corrupted data, fixme: %d vs %d\n",ndiff_dbg,ndiff);
         if ( !ndiff_new ) continue;     // no new pair distinguished by this site
         ndiff_tot += ndiff_new;
-        fprintf(args->fp,"DS\t%s\t%d\t%d\t%d\n",bcf_hdr_id2name(args->qry_hdr,rid),pos+1,ndiff_tot,iblock);
+        args->kstr.l = 0;
+        ksprintf(&args->kstr,"DS\t%s\t%d\t%d\t%d\n",bcf_hdr_id2name(args->qry_hdr,rid),pos+1,ndiff_tot,iblock);
+        if ( bgzf_write(args->out_fh, args->kstr.s, args->kstr.l)!=args->kstr.l ) error("Failed to write to %s\n", args->output_fname);
         if ( ndiff_tot < ndiff_min ) continue;   // fewer than the requested number of pairs can be distinguished at this point
         iblock++;
         ndiff_tot = 0;
@@ -793,24 +868,35 @@ static void report_distinctive_sites(args_t *args)
 }
 static void report(args_t *args)
 {
-    fprintf(args->fp,"INFO\tsites-compared\t%u\n",args->ncmp);
-    fprintf(args->fp,"INFO\tsites-skipped-no-match\t%u\n",args->nskip_no_match);
-    fprintf(args->fp,"INFO\tsites-skipped-multiallelic\t%u\n",args->nskip_not_ba);
-    fprintf(args->fp,"INFO\tsites-skipped-monoallelic\t%u\n",args->nskip_mono);
-    fprintf(args->fp,"INFO\tsites-skipped-no-data\t%u\n",args->nskip_no_data);
-    fprintf(args->fp,"INFO\tsites-skipped-GT-not-diploid\t%u\n",args->nskip_dip_GT);
-    fprintf(args->fp,"INFO\tsites-skipped-PL-not-diploid\t%u\n",args->nskip_dip_PL);
-    fprintf(args->fp,"INFO\tsites-used-PL-vs-PL\t%u\n",args->nused[0][0]);
-    fprintf(args->fp,"INFO\tsites-used-PL-vs-GT\t%u\n",args->nused[0][1]);
-    fprintf(args->fp,"INFO\tsites-used-GT-vs-PL\t%u\n",args->nused[1][0]);
-    fprintf(args->fp,"INFO\tsites-used-GT-vs-GT\t%u\n",args->nused[1][1]);
-    fprintf(args->fp,"# DC, discordance:\n");
-    fprintf(args->fp,"#     - query sample\n");
-    fprintf(args->fp,"#     - genotyped sample\n");
-    fprintf(args->fp,"#     - discordance (either an abstract score or number of mismatches, see -e/-u in the man page for details; smaller is better)\n");
-    fprintf(args->fp,"#     - negative log of HWE probability at matching sites (rare genotypes matches are more informative, bigger is better)\n");
-    fprintf(args->fp,"#     - number of sites compared (bigger is better)\n");
-    fprintf(args->fp,"#DC\t[2]Query Sample\t[3]Genotyped Sample\t[4]Discordance\t[5]-log P(HWE)\t[6]Number of sites compared\n");
+    args->kstr.l = 0;
+    ksprintf(&args->kstr,"INFO\tsites-compared\t%u\n",args->ncmp);
+    ksprintf(&args->kstr,"INFO\tsites-skipped-no-match\t%u\n",args->nskip_no_match);
+    ksprintf(&args->kstr,"INFO\tsites-skipped-multiallelic\t%u\n",args->nskip_not_ba);
+    ksprintf(&args->kstr,"INFO\tsites-skipped-monoallelic\t%u\n",args->nskip_mono);
+    ksprintf(&args->kstr,"INFO\tsites-skipped-no-data\t%u\n",args->nskip_no_data);
+    ksprintf(&args->kstr,"INFO\tsites-skipped-GT-not-diploid\t%u\n",args->nskip_dip_GT);
+    ksprintf(&args->kstr,"INFO\tsites-skipped-PL-not-diploid\t%u\n",args->nskip_dip_PL);
+    ksprintf(&args->kstr,"INFO\tsites-skipped-filtering-expression\t%u\n",args->nskip_filter);
+    ksprintf(&args->kstr,"INFO\tsites-used-PL-vs-PL\t%u\n",args->nused[0][0]);
+    ksprintf(&args->kstr,"INFO\tsites-used-PL-vs-GT\t%u\n",args->nused[0][1]);
+    ksprintf(&args->kstr,"INFO\tsites-used-GT-vs-PL\t%u\n",args->nused[1][0]);
+    ksprintf(&args->kstr,"INFO\tsites-used-GT-vs-GT\t%u\n",args->nused[1][1]);
+    ksprintf(&args->kstr,"# DCv2, discordance version 2:\n");
+    ksprintf(&args->kstr,"#     - Query sample\n");
+    ksprintf(&args->kstr,"#     - Genotyped sample\n");
+    ksprintf(&args->kstr,"#     - Discordance, given either as an abstract score or number of mismatches, see the options -E/-u\n"
+                         "#       in man page for details. Note that samples with high missingness have fewer sites compared,\n"
+                         "#       which results in lower overall discordance. Therefore it is advisable to use the average score\n"
+                         "#       per site rather than the absolute value, i.e. divide the value by the number of sites compared\n"
+                         "#       (smaller value = better match)\n");
+    ksprintf(&args->kstr,"#     - Average negative log of HWE probability at matching sites, attempts to quantify the following\n"
+                         "#       intuition: rare genotype matches are more informative than common genotype matches, hence two\n"
+                         "#       samples with similar discordance can be further stratified by the HWE score (bigger value = better\n"
+                         "#       match, the observed concordance was less likely to occur by chance)\n");
+    ksprintf(&args->kstr,"#     - Number of sites compared for this pair of samples (bigger = more informative)\n");
+    ksprintf(&args->kstr,"#     - Number of matching genotypes\n");
+    ksprintf(&args->kstr,"#DCv2\t[2]Query Sample\t[3]Genotyped Sample\t[4]Discordance\t[5]Average -log P(HWE)\t[6]Number of sites compared\t[7]Number of matching genotypes\n");
+    if ( bgzf_write(args->out_fh, args->kstr.s, args->kstr.l)!=args->kstr.l ) error("Failed to write to %s\n", args->output_fname);
 
     int trim = args->ntop;
     if ( !args->pairs )
@@ -824,26 +910,30 @@ static void report(args_t *args)
         int i;
         for (i=0; i<args->npairs; i++)
         {
+            args->kstr.l = 0;
             int iqry = args->pairs[i].iqry;
             int igt  = args->pairs[i].igt;
             if ( args->ndiff )
             {
-                fprintf(args->fp,"DC\t%s\t%s\t%u\t%e\t%u\n",
+                ksprintf(&args->kstr,"DCv2\t%s\t%s\t%u\t%e\t%u\t%u\n",
                         args->qry_hdr->samples[iqry],
                         args->gt_hdr?args->gt_hdr->samples[igt]:args->qry_hdr->samples[igt],
                         args->ndiff[i],
-                        args->calc_hwe_prob ? args->hwe_prob[i] : 0,
-                        args->ncnt[i]);
+                        (args->calc_hwe_prob && args->nmatch[i]) ? args->hwe_prob[i]/args->nmatch[i] : 0,
+                        args->ncnt[i],
+                        args->nmatch[i]);
             }
             else
             {
-                fprintf(args->fp,"DC\t%s\t%s\t%e\t%e\t%u\n",
+                ksprintf(&args->kstr,"DCv2\t%s\t%s\t%e\t%e\t%u\t%u\n",
                         args->qry_hdr->samples[iqry],
                         args->gt_hdr?args->gt_hdr->samples[igt]:args->qry_hdr->samples[igt],
                         args->pdiff[i],
-                        args->calc_hwe_prob ? args->hwe_prob[i] : 0,
-                        args->ncnt[i]);
+                        (args->calc_hwe_prob && args->nmatch[i]) ? args->hwe_prob[i]/args->nmatch[i] : 0,
+                        args->ncnt[i],
+                        args->nmatch[i]);
             }
+            if ( bgzf_write(args->out_fh, args->kstr.s, args->kstr.l)!=args->kstr.l ) error("Failed to write to %s\n", args->output_fname);
         }
     }
     else if ( !trim )
@@ -855,25 +945,29 @@ static void report(args_t *args)
             int ngt  = args->cross_check ? i : args->ngt_smpl;
             for (j=0; j<ngt; j++)
             {
+                args->kstr.l = 0;
                 int igt = args->gt_smpl ? args->gt_smpl[j] : j;
                 if ( args->ndiff )
                 {
-                    fprintf(args->fp,"DC\t%s\t%s\t%u\t%e\t%u\n",
+                    ksprintf(&args->kstr,"DCv2\t%s\t%s\t%u\t%e\t%u\t%u\n",
                             args->qry_hdr->samples[iqry],
                             args->gt_hdr?args->gt_hdr->samples[igt]:args->qry_hdr->samples[igt],
                             args->ndiff[idx],
-                            args->calc_hwe_prob ? args->hwe_prob[idx] : 0,
-                            args->ncnt[idx]);
+                            (args->calc_hwe_prob && args->nmatch[idx]) ? args->hwe_prob[idx]/args->nmatch[idx] : 0,
+                            args->ncnt[idx],
+                            args->calc_hwe_prob ? args->nmatch[idx] : 0);
                 }
                 else
                 {
-                    fprintf(args->fp,"DC\t%s\t%s\t%e\t%e\t%u\n",
+                    ksprintf(&args->kstr,"DCv2\t%s\t%s\t%e\t%e\t%u\t%u\n",
                             args->qry_hdr->samples[iqry],
                             args->gt_hdr?args->gt_hdr->samples[igt]:args->qry_hdr->samples[igt],
                             args->pdiff[idx],
-                            args->calc_hwe_prob ? args->hwe_prob[idx] : 0,
-                            args->ncnt[idx]);
+                            (args->calc_hwe_prob && args->nmatch[idx]) ? args->hwe_prob[idx]/args->nmatch[idx] : 0,
+                            args->ncnt[idx],
+                            args->calc_hwe_prob ? args->nmatch[idx] : 0);
                 }
+                if ( bgzf_write(args->out_fh, args->kstr.s, args->kstr.l)!=args->kstr.l ) error("Failed to write to %s\n", args->output_fname);
                 idx++;
             }
         }
@@ -888,7 +982,7 @@ static void report(args_t *args)
             for (j=0; j<args->ngt_smpl; j++)
             {
                 if ( args->sort_by_hwe )
-                    arr[j].val = -args->hwe_prob[idx];
+                    arr[j].val = args->nmatch[idx] ? -args->hwe_prob[idx]/args->nmatch[idx] : 0;  // -args->hwe_prob[idx];
                 else if ( args->ndiff )
                     arr[j].val = args->ncnt[idx] ? (double)args->ndiff[idx]/args->ncnt[idx] : 0;
                 else
@@ -901,26 +995,30 @@ static void report(args_t *args)
             int iqry = args->qry_smpl ? args->qry_smpl[i] : i;
             for (j=0; j<args->ntop; j++)
             {
+                args->kstr.l = 0;
                 int idx = arr[j].idx;
                 int igt = args->gt_smpl ? args->gt_smpl[arr[j].ism] : arr[j].ism;
                 if ( args->ndiff )
                 {
-                    fprintf(args->fp,"DC\t%s\t%s\t%u\t%e\t%u\n",
+                    ksprintf(&args->kstr,"DCv2\t%s\t%s\t%u\t%e\t%u\t%u\n",
                             args->qry_hdr->samples[iqry],
                             args->gt_hdr?args->gt_hdr->samples[igt]:args->qry_hdr->samples[igt],
                             args->ndiff[idx],
-                            args->calc_hwe_prob ? args->hwe_prob[idx] : 0,
-                            args->ncnt[idx]);
+                            (args->calc_hwe_prob && args->nmatch[idx]) ? args->hwe_prob[idx]/args->nmatch[idx] : 0,
+                            args->ncnt[idx],
+                            args->calc_hwe_prob ? args->nmatch[idx] : 0);
                 }
                 else
                 {
-                    fprintf(args->fp,"DC\t%s\t%s\t%e\t%e\t%u\n",
+                    ksprintf(&args->kstr,"DCv2\t%s\t%s\t%e\t%e\t%u\t%u\n",
                             args->qry_hdr->samples[iqry],
                             args->gt_hdr?args->gt_hdr->samples[igt]:args->qry_hdr->samples[igt],
                             args->pdiff[idx],
-                            args->calc_hwe_prob ? args->hwe_prob[idx] : 0,
-                            args->ncnt[idx]);
+                            (args->calc_hwe_prob && args->nmatch[idx]) ? args->hwe_prob[idx]/args->nmatch[idx] : 0,
+                            args->ncnt[idx],
+                            args->calc_hwe_prob ? args->nmatch[idx] : 0);
                 }
+                if ( bgzf_write(args->out_fh, args->kstr.s, args->kstr.l)!=args->kstr.l ) error("Failed to write to %s\n", args->output_fname);
             }
         }
         free(arr);
@@ -936,7 +1034,7 @@ static void report(args_t *args)
             for (j=0; j<i; j++)
             {
                 if ( args->sort_by_hwe )
-                    arr[k].val = -args->hwe_prob[idx];
+                    arr[k].val = args->nmatch[idx] ? -args->hwe_prob[idx]/args->nmatch[idx] : 0;
                 else if ( args->ndiff )
                     arr[k].val = args->ncnt[idx] ? (double)args->ndiff[idx]/args->ncnt[idx] : 0;
                 else
@@ -950,7 +1048,7 @@ static void report(args_t *args)
             {
                 idx = j*(j+1)/2 + i;
                 if ( args->sort_by_hwe )
-                    arr[k].val = -args->hwe_prob[idx];
+                    arr[k].val = args->nmatch[idx] ? -args->hwe_prob[idx]/args->nmatch[idx] : 0;
                 else if ( args->ndiff )
                     arr[k].val = args->ncnt[idx] ? (double)args->ndiff[idx]/args->ncnt[idx] : 0;
                 else
@@ -963,27 +1061,31 @@ static void report(args_t *args)
             int iqry = args->qry_smpl ? args->qry_smpl[i] : i;
             for (j=0; j<args->ntop; j++)
             {
+                args->kstr.l = 0;
                 if ( i <= arr[j].ism ) continue;
                 int idx = arr[j].idx;
                 int igt = args->qry_smpl ? args->qry_smpl[arr[j].ism] : arr[j].ism;
                 if ( args->ndiff )
                 {
-                    fprintf(args->fp,"DC\t%s\t%s\t%u\t%e\t%u\n",
+                    ksprintf(&args->kstr,"DCv2\t%s\t%s\t%u\t%e\t%u\t%u\n",
                             args->qry_hdr->samples[iqry],
                             args->qry_hdr->samples[igt],
                             args->ndiff[idx],
-                            args->calc_hwe_prob ? args->hwe_prob[idx] : 0,
-                            args->ncnt[idx]);
+                            (args->calc_hwe_prob && args->nmatch[idx]) ? args->hwe_prob[idx]/args->nmatch[idx] : 0,
+                            args->ncnt[idx],
+                            args->calc_hwe_prob ? args->nmatch[idx] : 0);
                 }
                 else
                 {
-                    fprintf(args->fp,"DC\t%s\t%s\t%e\t%e\t%u\n",
+                    ksprintf(&args->kstr,"DCv2\t%s\t%s\t%e\t%e\t%u\t%u\n",
                             args->qry_hdr->samples[iqry],
                             args->qry_hdr->samples[igt],
                             args->pdiff[idx],
-                            args->calc_hwe_prob ? args->hwe_prob[idx] : 0,
-                            args->ncnt[idx]);
+                            (args->calc_hwe_prob && args->nmatch[idx]) ? args->hwe_prob[idx]/args->nmatch[idx] : 0,
+                            args->ncnt[idx],
+                            args->calc_hwe_prob ? args->nmatch[idx] : 0);
                 }
+                if ( bgzf_write(args->out_fh, args->kstr.s, args->kstr.l)!=args->kstr.l ) error("Failed to write to %s\n", args->output_fname);
             }
         }
         free(arr);
@@ -1053,12 +1155,16 @@ static void usage(void)
     fprintf(stderr, "                                           and TMP is a prefix of temporary files used by external sorting [/tmp/bcftools.XXXXXX]\n");
 #endif
     fprintf(stderr, "        --dry-run                      Stop after first record to estimate required time\n");
-    fprintf(stderr, "    -e, --error-probability INT        Phred-scaled probability of genotyping error, 0 for faster but less accurate results [40]\n");
+    fprintf(stderr, "    -E, --error-probability INT        Phred-scaled probability of genotyping error, 0 for faster but less accurate results [40]\n");
+    fprintf(stderr, "    -e, --exclude [qry|gt]:EXPR        Exclude sites for which the expression is true\n");
     fprintf(stderr, "    -g, --genotypes FILE               Genotypes to compare against\n");
     fprintf(stderr, "    -H, --homs-only                    Homozygous genotypes only, useful with low coverage data (requires -g)\n");
+    fprintf(stderr, "    -i, --include [qry|gt]:EXPR        Include sites for which the expression is true\n");
     fprintf(stderr, "        --n-matches INT                Print only top INT matches for each sample (sorted by average score), 0 for unlimited.\n");
     fprintf(stderr, "                                           Use negative value to sort by HWE probability rather than by discordance [0]\n");
     fprintf(stderr, "        --no-HWE-prob                  Disable calculation of HWE probability\n");
+    fprintf(stderr, "    -o, --output FILE                  Write output to a file [standard output]\n");
+    fprintf(stderr, "    -O, --output-type t|z              t: plain tab-delimited text output, z: compressed [t]\n");
     fprintf(stderr, "    -p, --pairs LIST                   Comma-separated sample pairs to compare (qry,gt[,qry,gt..] with -g or qry,qry[,qry,qry..] w/o)\n");
     fprintf(stderr, "    -P, --pairs-file FILE              File with tab-delimited sample pairs to compare (qry,gt with -g or qry,qry w/o)\n");
     fprintf(stderr, "    -r, --regions REGION               Restrict to comma-separated list of regions\n");
@@ -1071,10 +1177,10 @@ static void usage(void)
     fprintf(stderr, "        --targets-overlap 0|1|2        Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n");
     fprintf(stderr, "    -u, --use TAG1[,TAG2]              Which tag to use in the query file (TAG1) and the -g file (TAG2) [PL,GT]\n");
     fprintf(stderr, "Examples:\n");
-    fprintf(stderr, "   # Check discordance of all samples from B against all sample in A\n");
+    fprintf(stderr, "   # Check discordance of all samples from B against all samples in A\n");
     fprintf(stderr, "   bcftools gtcheck -g A.bcf B.bcf\n");
     fprintf(stderr, "\n");
-    fprintf(stderr, "   # Limit comparisons to the fiven list of samples\n");
+    fprintf(stderr, "   # Limit comparisons to the given list of samples\n");
     fprintf(stderr, "   bcftools gtcheck -s gt:a1,a2,a3 -s qry:b1,b2 -g A.bcf B.bcf\n");
     fprintf(stderr, "\n");
     fprintf(stderr, "   # Compare only two pairs a1,b1 and a1,b2\n");
@@ -1091,9 +1197,10 @@ int main_vcfgtcheck(int argc, char *argv[])
     args->qry_use_GT = -1;
     args->gt_use_GT  = -1;
     args->calc_hwe_prob = 1;
-    args->use_PLs = 40;
+    args->gt_err = 40;
     args->regions_overlap = 1;
     args->targets_overlap = 0;
+    args->output_fname = "-";
 
     // external sort for --distinctive-sites
 #ifdef _WIN32
@@ -1112,7 +1219,11 @@ int main_vcfgtcheck(int argc, char *argv[])
 
     static struct option loptions[] =
     {
-        {"error-probability",1,0,'e'},
+        {"error-probability",1,0,'E'},  // note this used to be 'e', but can easily auto-detect to assure backward compatibility
+        {"exclude",required_argument,0,'e'},
+        {"include",required_argument,0,'i'},
+        {"output",required_argument,0,'o'},
+        {"output-type",required_argument,NULL,'O'},
         {"use",1,0,'u'},
         {"cluster",1,0,'c'},
         {"GTs-only",1,0,'G'},
@@ -1139,10 +1250,79 @@ int main_vcfgtcheck(int argc, char *argv[])
         {0,0,0,0}
     };
     char *tmp;
-    while ((c = getopt_long(argc, argv, "hg:p:s:S:p:P:Hr:R:at:T:G:c:u:e:",loptions,NULL)) >= 0) {
+    while ((c = getopt_long(argc, argv, "hg:p:s:S:p:P:Hr:R:at:T:G:c:u:e:E:i:o:O:",loptions,NULL)) >= 0) {
         switch (c) {
+            case 'o': args->output_fname = optarg; break;
+            case 'O':
+                switch (optarg[0]) {
+                    case 't': args->output_type = FT_TAB_TEXT; break;
+                    case 'z': args->output_type = FT_VCF_GZ; break;
+                    default:
+                    {
+                        args->clevel = strtol(optarg,&tmp,10);
+                        if ( *tmp || args->clevel<0 || args->clevel>9 ) error("The output type \"%s\" not recognised\n", optarg);
+                    }
+                }
+                if ( optarg[1] )
+                {
+                    args->clevel = strtol(optarg+1,&tmp,10);
+                    if ( *tmp || args->clevel<0 || args->clevel>9 ) error("Could not parse argument: --output-type %s\n", optarg+1);
+                }
+                break;
             case 'e':
-                args->use_PLs = strtol(optarg,&tmp,10);
+                if ( !strncasecmp("gt:",optarg,3) )
+                {
+                    if ( args->gt_filter_str ) error("Error: only one -i or -e expression can be given for gt:, and they cannot be combined\n");
+                    args->gt_filter_str = optarg;
+                    args->gt_filter_logic |= FLT_EXCLUDE;
+                }
+                else if ( !strncasecmp("qry:",optarg,4) )
+                {
+                    if ( args->qry_filter_str ) error("Error: only one -i or -e expression can be given for qry:, and they cannot be combined\n");
+                    args->qry_filter_str = optarg;
+                    args->qry_filter_logic |= FLT_EXCLUDE;
+                }
+                else
+                {
+                    // this could be the old -e, --error-probability option
+                    args->gt_err = strtol(optarg,&tmp,10);
+                    if ( !tmp || *tmp )
+                    {
+                        // it is not
+                        args->gt_filter_str  = optarg;
+                        args->qry_filter_str = optarg;
+                        args->gt_filter_logic  |= FLT_EXCLUDE;
+                        args->qry_filter_logic |= FLT_EXCLUDE;
+                    }
+                    else
+                    {
+                        fprintf(stderr,"[warning] auto-detected the old format --error-probability option, please switch from -e to -E.\n");
+                    }
+                }
+                break;
+            case 'i':
+                if ( !strncasecmp("gt:",optarg,3) )
+                {
+                    if ( args->gt_filter_str ) error("Error: only one -i or -e expression can be given for gt:, and they cannot be combined\n");
+                    args->gt_filter_str = optarg;
+                    args->gt_filter_logic |= FLT_INCLUDE;
+                }
+                else if ( !strncasecmp("qry:",optarg,4) )
+                {
+                    if ( args->qry_filter_str ) error("Error: only one -i or -e expression can be given for qry:, and they cannot be combined\n");
+                    args->qry_filter_str = optarg;
+                    args->qry_filter_logic |= FLT_INCLUDE;
+                }
+                else
+                {
+                    args->gt_filter_str  = optarg;
+                    args->qry_filter_str = optarg;
+                    args->gt_filter_logic  |= FLT_INCLUDE;
+                    args->qry_filter_logic |= FLT_INCLUDE;
+                }
+                break;
+            case 'E':
+                args->gt_err = strtol(optarg,&tmp,10);
                 if ( !tmp || *tmp ) error("Could not parse: --error-probability %s\n", optarg);
                 break;
             case 'u':
@@ -1187,7 +1367,7 @@ int main_vcfgtcheck(int argc, char *argv[])
                     while ( *tmp && *tmp!=',' ) tmp++;
                     if ( *tmp ) { *tmp = 0; args->es_tmp_prefix = tmp+1; }
                 }
-                args->use_PLs = 0;
+                args->gt_err = 0;
                 break;
             case 'c':
                 error("The -c option is to be implemented, please open an issue on github\n");
@@ -1247,7 +1427,7 @@ int main_vcfgtcheck(int argc, char *argv[])
     }
     if ( args->distinctive_sites && !args->pair_samples ) error("The experimental option --distinctive-sites requires -p/-P\n");
     if ( args->hom_only && !args->gt_fname ) error("The option --homs-only requires --genotypes\n");
-    if ( args->distinctive_sites && args->use_PLs ) error("The option --distinctive-sites cannot be combined with --error-probability\n");
+    if ( args->distinctive_sites && args->gt_err ) error("The option --distinctive-sites cannot be combined with --error-probability\n");
 
     init_data(args);
 
@@ -1267,7 +1447,9 @@ int main_vcfgtcheck(int argc, char *argv[])
             gettimeofday(&t1, NULL);
             double delta = (t1.tv_sec - t0.tv_sec) * 1e6 + (t1.tv_usec - t0.tv_usec);
             fprintf(stderr,"INFO:\tTime required to process one record .. %f seconds\n",delta/1e6);
-            fprintf(args->fp,"INFO\tTime required to process one record .. %f seconds\n",delta/1e6);
+            args->kstr.l = 0;
+            ksprintf(&args->kstr,"INFO\tTime required to process one record .. %f seconds\n",delta/1e6);
+            if ( bgzf_write(args->out_fh, args->kstr.s, args->kstr.l)!=args->kstr.l ) error("Failed to write to %s\n", args->output_fname);
             if ( args->dry_run ) break;
         }
     }
diff --git a/bcftools/vcfgtcheck.c.pysam.c b/bcftools/vcfgtcheck.c.pysam.c
index 54568b054..de7c61624 100644
--- a/bcftools/vcfgtcheck.c.pysam.c
+++ b/bcftools/vcfgtcheck.c.pysam.c
@@ -2,7 +2,7 @@
 
 /*  vcfgtcheck.c -- Check sample identity.
 
-    Copyright (C) 2013-2023 Genome Research Ltd.
+    Copyright (C) 2013-2024 Genome Research Ltd.
 
     Author: Petr Danecek <pd3@sanger.ac.uk>
 
@@ -41,11 +41,17 @@ THE SOFTWARE.  */
 #include <htslib/vcfutils.h>
 #include <htslib/kbitset.h>
 #include <htslib/hts_os.h>
+#include <htslib/bgzf.h>
 #include <inttypes.h>
 #include <sys/time.h>
 #include "bcftools.h"
 #include "extsort.h"
 //#include "hclust.h"
+#include "filter.h"
+
+// Logic of the filters: include or exclude sites which match the filters?
+#define FLT_INCLUDE 1
+#define FLT_EXCLUDE 2
 
 typedef struct
 {
@@ -58,20 +64,22 @@ typedef struct
     bcf_srs_t *files;           // first reader is the query VCF - single sample normally or multi-sample for cross-check
     bcf_hdr_t *gt_hdr, *qry_hdr; // VCF with genotypes to compare against and the query VCF
     char *cwd, **argv, *gt_samples, *qry_samples, *regions, *targets, *qry_fname, *gt_fname, *pair_samples;
-    int argc, gt_samples_is_file, qry_samples_is_file, regions_is_file, targets_is_file, pair_samples_is_file;
-    int regions_overlap, targets_overlap;
+    char *output_fname;
+    int argc, gt_samples_is_file, qry_samples_is_file, regions_is_file, targets_is_file, pair_samples_is_file, output_type;;
+    int regions_overlap, targets_overlap, clevel;
     int qry_use_GT,gt_use_GT, nqry_smpl,ngt_smpl, *qry_smpl,*gt_smpl;
     int nused[2][2];
     double *pdiff, *qry_prob, *gt_prob;
-    uint32_t *ndiff,*ncnt,ncmp, npairs;
+    uint32_t *ndiff,*ncnt,*nmatch,ncmp, npairs;
     int32_t *qry_arr,*gt_arr, nqry_arr,ngt_arr;
     uint8_t *qry_dsg, *gt_dsg;
     pair_t *pairs;
     double *hwe_prob, dsg2prob[8][3], pl2prob[256];
     double min_inter_err, max_intra_err;
-    int all_sites, hom_only, ntop, cross_check, calc_hwe_prob, sort_by_hwe, dry_run, use_PLs;
-    FILE *fp;
-    unsigned int nskip_no_match, nskip_not_ba, nskip_mono, nskip_no_data, nskip_dip_GT, nskip_dip_PL;
+    int all_sites, hom_only, ntop, cross_check, calc_hwe_prob, sort_by_hwe, dry_run, gt_err;
+    BGZF *out_fh;
+    unsigned int nskip_no_match, nskip_not_ba, nskip_mono, nskip_no_data, nskip_dip_GT, nskip_dip_PL, nskip_filter;
+    kstring_t kstr;
 
     // for --distinctive-sites
     double distinctive_sites;
@@ -79,6 +87,11 @@ typedef struct
     size_t diff_sites_size;
     extsort_t *es;
     char *es_tmp_prefix, *es_max_mem;
+
+    // include or exclude sites which match the filters
+    filter_t *qry_filter, *gt_filter;
+    char *qry_filter_str, *gt_filter_str;
+    int qry_filter_logic, gt_filter_logic;       // FLT_INCLUDE or FLT_EXCLUDE
 }
 args_t;
 
@@ -96,15 +109,17 @@ static void set_cwd(args_t *args)
     }
     assert(buf);
 }
-static void print_header(args_t *args, FILE *fp)
+static void print_header(args_t *args)
 {
-    fprintf(fp, "# This file was produced by bcftools (%s+htslib-%s), the command line was:\n", bcftools_version(), hts_version());
-    fprintf(fp, "# \t bcftools %s ", args->argv[0]);
+    args->kstr.l = 0;
+    ksprintf(&args->kstr, "# This file was produced by bcftools (%s+htslib-%s), the command line was:\n", bcftools_version(), hts_version());
+    ksprintf(&args->kstr, "# \t bcftools %s ", args->argv[0]);
     int i;
     for (i=1; i<args->argc; i++)
-        fprintf(fp, " %s",args->argv[i]);
-    fprintf(fp, "\n# and the working directory was:\n");
-    fprintf(fp, "# \t %s\n#\n", args->cwd);
+        ksprintf(&args->kstr, " %s",args->argv[i]);
+    ksprintf(&args->kstr, "\n# and the working directory was:\n");
+    ksprintf(&args->kstr, "# \t %s\n#\n", args->cwd);
+    if ( bgzf_write(args->out_fh, args->kstr.s, args->kstr.l)!=args->kstr.l ) error("Failed to write to %s\n", args->output_fname);
 }
 
 static int cmp_int(const void *_a, const void *_b)
@@ -128,7 +143,7 @@ static int cmp_pair(const void *_a, const void *_b)
 
 typedef struct
 {
-    uint32_t ndiff,rid,pos,rand; // rand is to shuffle sites with the same ndiff from across all chromosoms
+    uint32_t ndiff,rid,pos,rand; // rand is to shuffle sites with the same ndiff from across all chromosomes
     unsigned long kbs_dat[1];
 }
 diff_sites_t;
@@ -264,6 +279,9 @@ static void init_data(args_t *args)
         if ( !bcf_hdr_nsamples(args->gt_hdr) ) error("No samples in %s?\n", args->gt_fname);
     }
 
+    if ( args->gt_hdr && args->gt_filter_str ) args->gt_filter = filter_init(args->gt_hdr, args->gt_filter_str);
+    if ( args->qry_hdr && args->qry_filter_str ) args->qry_filter = filter_init(args->qry_hdr, args->qry_filter_str);
+
     // Determine whether GT or PL will be used
     if ( args->qry_use_GT==-1 ) // not set by -u, qry uses PL by default
     {
@@ -373,56 +391,76 @@ static void init_data(args_t *args)
         args->qry_dsg = (uint8_t*) malloc(args->nqry_smpl);
         args->gt_dsg  = args->cross_check ? args->qry_dsg : (uint8_t*) malloc(args->ngt_smpl);
     }
-    if ( args->use_PLs )
+    if ( args->gt_err )
     {
         args->pdiff = (double*) calloc(args->npairs,sizeof(*args->pdiff));      // log probability of pair samples being the same
         args->qry_prob = (double*) malloc(3*args->nqry_smpl*sizeof(*args->qry_prob));
         args->gt_prob  = args->cross_check ? args->qry_prob : (double*) malloc(3*args->ngt_smpl*sizeof(*args->gt_prob));
 
+        // Convert genotypes to genotype likelihoods given by -E, the probability of reading one allele incorrectly. In this
+        // simple model we have:
+        //     - probability of reading an allele incorrectly, eg. 0 as 1 or 1 as 0
+        //         P(0|1) = P(1|0) = e
+        //     - probability of genotype G={00,01,11} being correct given observed dosage {0,1,2} and the
+        //       genotyping error probability e:
+        //          P(00|0) = 1       P(00|1) = e       P(00|2) = e^2
+        //          P(01|0) = e       P(01|1) = 1       P(01|2) = e
+        //          P(11|0) = e^2     P(11|1) = e       P(11|2) = 1
+        //
         // dsg2prob: the first index is bitmask of 8 possible dsg combinations (only 1<<0,1<<2,1<<3 are set, accessing
-        // anything else indicated an error, this is just to reuse gt_to_dsg()); the second index are the corresponding
+        // anything else indicated an error, this is just to reuse gt_to_dsg(); the second index are the corresponding
         // probabilities of 0/0, 0/1, and 1/1 genotypes
+        //
         for (i=0; i<8; i++)
             for (j=0; j<3; j++)
                 args->dsg2prob[i][j] = HUGE_VAL;
-        args->dsg2prob[1][0] = -log(1-pow(10,-0.1*args->use_PLs));
-        args->dsg2prob[1][1] = -log(0.5*pow(10,-0.1*args->use_PLs));
-        args->dsg2prob[1][2] = -log(0.5*pow(10,-0.1*args->use_PLs));
-        args->dsg2prob[2][0] = -log(0.5*pow(10,-0.1*args->use_PLs));
-        args->dsg2prob[2][1] = -log(1-pow(10,-0.1*args->use_PLs));
-        args->dsg2prob[2][2] = -log(0.5*pow(10,-0.1*args->use_PLs));
-        args->dsg2prob[4][0] = -log(0.5*pow(10,-0.1*args->use_PLs));
-        args->dsg2prob[4][1] = -log(0.5*pow(10,-0.1*args->use_PLs));
-        args->dsg2prob[4][2] = -log(1-pow(10,-0.1*args->use_PLs));
+        double eprob = pow(10,-0.1*args->gt_err);      // convert from phred score to probability
+        args->dsg2prob[1][0] = 0;               // P(00|0) = 1
+        args->dsg2prob[1][1] = -log(eprob);     // P(01|0) = e
+        args->dsg2prob[1][2] = -2*log(eprob);   // P(11|0) = e^2
+        args->dsg2prob[2][0] = -log(eprob);     // P(00|1) = e
+        args->dsg2prob[2][1] = 0;               // P(01|1) = 1
+        args->dsg2prob[2][2] = -log(eprob);     // P(11|1) = e
+        args->dsg2prob[4][0] = -2*log(eprob);   // P(00|2) = e^2
+        args->dsg2prob[4][1] = -log(eprob);     // P(01|2) = e
+        args->dsg2prob[4][2] = 0;               // P(11|2) = 1
 
         // lookup table to avoid exponentiation
         for (i=0; i<256; i++) args->pl2prob[i] = pow(10,-0.1*i);
     }
     else
         args->ndiff = (uint32_t*) calloc(args->npairs,sizeof(*args->ndiff));    // number of differing genotypes for each pair of samples
-    args->ncnt  = (uint32_t*) calloc(args->npairs,sizeof(*args->ncnt));         // number of comparisons performed (non-missing data)
+    args->ncnt   = (uint32_t*) calloc(args->npairs,sizeof(*args->ncnt));         // number of comparisons performed (non-missing data)
     if ( !args->ncnt ) error("Error: failed to allocate %.1f Mb\n", args->npairs*sizeof(*args->ncnt)/1e6);
     if ( args->calc_hwe_prob )
     {
         // prob of the observed sequence of matches given site AFs and HWE
         args->hwe_prob = (double*) calloc(args->npairs,sizeof(*args->hwe_prob));
         if ( !args->hwe_prob ) error("Error: failed to allocate %.1f Mb. Run with --no-HWE-prob to save some memory.\n", args->npairs*sizeof(*args->hwe_prob)/1e6);
+        args->nmatch = (uint32_t*) calloc(args->npairs,sizeof(*args->ncnt));         // number of matches, used only with calc_hwe_prob
+        if ( !args->nmatch ) error("Error: failed to allocate %.1f Mb.\n", args->npairs*sizeof(*args->ncnt)/1e6);
     }
 
     if ( args->distinctive_sites ) diff_sites_init(args);
 
-    args->fp = bcftools_stdout;
-    print_header(args, args->fp);
+    args->out_fh = bgzf_open(args->output_fname, args->output_type&FT_GZ ? "wg" : "wu");
+    if ( args->out_fh == NULL )
+        error("[%s] Error: cannot write to %s: %s\n", __func__,args->output_fname ? args->output_fname : "standard output", strerror(errno));
+
+    print_header(args);
 }
 
 static void destroy_data(args_t *args)
 {
+    free(args->kstr.s);
+    if ( args->gt_filter ) filter_destroy(args->gt_filter);
+    if ( args->qry_filter ) filter_destroy(args->qry_filter);
     if ( args->gt_dsg!=args->qry_dsg ) free(args->gt_dsg);
     free(args->qry_dsg);
     if ( args->gt_prob!=args->qry_prob ) free(args->gt_prob);
     free(args->qry_prob);
     free(args->es_max_mem);
-    fclose(args->fp);
+    if ( bgzf_close(args->out_fh)!=0 )  error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"bcftools_stdout");
     if ( args->distinctive_sites ) diff_sites_destroy(args);
     free(args->hwe_prob);
     free(args->cwd);
@@ -431,6 +469,7 @@ static void destroy_data(args_t *args)
     free(args->pdiff);
     free(args->ndiff);
     free(args->ncnt);
+    free(args->nmatch);
     free(args->qry_smpl);
     if ( args->gt_smpl!=args->qry_smpl ) free(args->gt_smpl);
     free(args->pairs);
@@ -540,6 +579,13 @@ static void process_line(args_t *args)
     int i,j,k, nqry1, ngt1, ret;
 
     bcf1_t *gt_rec = NULL, *qry_rec = bcf_sr_get_line(args->files,0);   // the query file
+    if ( args->qry_filter )
+    {
+        int pass = filter_test(args->qry_filter, qry_rec, NULL);
+        if ( args->qry_filter_logic==FLT_EXCLUDE ) pass = pass ? 0 : 1;
+        if ( !pass ) { args->nskip_filter++; return; }
+    }
+
     int qry_use_GT = args->qry_use_GT;
     int gt_use_GT  = args->gt_use_GT;
 
@@ -549,6 +595,12 @@ static void process_line(args_t *args)
     if ( args->gt_hdr )
     {
         gt_rec = bcf_sr_get_line(args->files,1);
+        if ( args->gt_filter )
+        {
+            int pass = filter_test(args->gt_filter, gt_rec, NULL);
+            if ( args->gt_filter_logic==FLT_EXCLUDE ) pass = pass ? 0 : 1;
+            if ( !pass ) { args->nskip_filter++; return; }
+        }
         ret = set_data(args, args->gt_hdr, gt_rec, &args->gt_arr, &args->ngt_arr, &ngt1, &gt_use_GT);
         if ( ret<0 ) return;
     }
@@ -562,7 +614,7 @@ static void process_line(args_t *args)
     args->ncmp++;
     args->nused[qry_use_GT][gt_use_GT]++;
 
-    double af,hwe_dsg[8];
+    double hwe_dsg[8];
     if ( args->calc_hwe_prob )
     {
         int ac[2];
@@ -572,18 +624,28 @@ static void process_line(args_t *args)
         }
         else if ( bcf_calc_ac(args->qry_hdr, qry_rec, ac, BCF_UN_INFO|BCF_UN_FMT)!=1 ) error("todo: bcf_calc_ac() failed\n");
 
-        // hwe indexes correspond to the bitmask of eight dsg combinations to account for PL uncertainty
-        // for in the extreme case we can have uninformative PL=0,0,0. So the values are the minima of e.g.
-        //      hwe[1,2,4] ..  dsg=0,1,2
-        //      hwe[3]     ..  dsg=0 or 1
-        //      hwe[6]     ..  dsg=1 or 2
-
-        double hwe[3];
-        const double min_af = 1e-5;             // cap the AF in case we get unrealistic values
-        af = (double)ac[1]/(ac[0]+ac[1]);
-        hwe[0] = af>min_af ? -log(af*af) : -log(min_af*min_af);
-        hwe[1] = af>min_af && af<1-min_af ? -log(2*af*(1-af)) : -log(2*min_af*(1-min_af));
-        hwe[2] = af<(1-min_af) ? -log((1-af)*(1-af)) : -log(min_af*min_af);
+        // Calculate HWE probability for each possible qry+gt dosage combination. The alternate allele dosage
+        // values returned by gt_to_prob() below are 0,1,2,4 (0=missing, 1<<0, 1<<1, 1<<2). We consider only
+        // biallelic sites, therefore we work with eight genotype combinations.
+        //
+        // The array hwe_dsg is accessed with hwe_dsg[qry_dsg & gt_dsg] and is constructed to account for PL uncertainty
+        // when we encounter less informative PL, such as PL=0,0,10, where multiple dosage values are equally
+        // likely. If we allowed complete uncertainty (PL=0,0,0), we'd have up to eight possible genotype
+        // mask combinations: from e.g. 0=(gt_dsg=1<<0 & qry_dsg=1<<1) to 7=(gt_dsg=1<<0|1<<1|1<<2 & qry_dsg=1<<0|1<<1|1<<2).
+        // Note the extreme case of 1|2|4 is skipped, see pl_to_dsg().
+        //
+        // When the dosage is uncertain, we take the minimum of their corresponding HWE value, for example
+        //      hwe[0] = 0
+        //      hwe[1] = (1-AF)**2
+        //      hwe[2] = 2*AF*(1-AF)
+        //      hwe[4] = AF**2
+        //      hwe[3] = min{hwe[1],hwe[2]}
+
+        double hwe[3];  // while hwe_dsg iterates over dsg bitmasks (0..7), hwe iterates over dsg (0,1,2)
+        double af = ac[0]+ac[1] ? (double)ac[1]/(ac[0]+ac[1]) : 1e-6;
+        hwe[0] = -log((1-af)*(1-af));
+        hwe[1] = -log(2*af*(1-af));
+        hwe[2] = -log(af*af);
         hwe_dsg[0] = 0;
         for (i=1; i<8; i++)
         {
@@ -598,7 +660,7 @@ static void process_line(args_t *args)
     // The sample pairs were given explicitly via -p/-P options
     if ( args->pairs )
     {
-        if ( !args->use_PLs )
+        if ( !args->gt_err )
         {
             int ndiff = 0;
             if ( args->kbs_diff ) diff_sites_reset(args);
@@ -623,13 +685,17 @@ static void process_line(args_t *args)
                     args->ndiff[i]++;
                     if ( args->kbs_diff ) { ndiff++; kbs_insert(args->kbs_diff, i); }
                 }
-                else if ( args->calc_hwe_prob ) args->hwe_prob[i] += hwe_dsg[match];
+                else if ( args->calc_hwe_prob )
+                {
+                    args->hwe_prob[i] += hwe_dsg[match];
+                    args->nmatch[i]++;
+                }
                 args->ncnt[i]++;
             }
 
             if ( ndiff ) diff_sites_push(args, ndiff, qry_rec->rid, qry_rec->pos);
         }
-        else    // use_PLs set
+        else    // gt_err set
         {
             for (i=0; i<args->npairs; i++)
             {
@@ -657,6 +723,7 @@ static void process_line(args_t *args)
                 {
                     int match = qry_dsg & gt_dsg;
                     args->hwe_prob[i] += hwe_dsg[match];
+                    if ( match ) args->nmatch[i]++;
                 }
                 args->ncnt[i]++;
             }
@@ -665,7 +732,7 @@ static void process_line(args_t *args)
     }
 
     int idx=0;
-    if ( !args->use_PLs )
+    if ( !args->gt_err )
     {
         for (i=0; i<args->nqry_smpl; i++)
         {
@@ -692,13 +759,17 @@ static void process_line(args_t *args)
                 if ( !args->gt_dsg[j] ) { idx++; continue; }        // missing value
                 int match = args->qry_dsg[i] & args->gt_dsg[j];
                 if ( !match ) args->ndiff[idx]++;
-                else if ( args->calc_hwe_prob ) args->hwe_prob[idx] += hwe_dsg[match];
+                else if ( args->calc_hwe_prob )
+                {
+                    args->hwe_prob[idx] += hwe_dsg[match];
+                    args->nmatch[idx]++;
+                }
                 args->ncnt[idx]++;
                 idx++;
             }
         }
     }
-    else    // use_PLs set
+    else    // gt_err set
     {
         for (i=0; i<args->nqry_smpl; i++)
         {
@@ -733,6 +804,7 @@ static void process_line(args_t *args)
                 {
                     int match = args->qry_dsg[i] & args->gt_dsg[j];
                     args->hwe_prob[idx] += hwe_dsg[match];
+                    if ( match ) args->nmatch[idx]++;
                 }
                 args->ncnt[idx]++;
                 idx++;
@@ -760,12 +832,13 @@ static void report_distinctive_sites(args_t *args)
 {
     extsort_sort(args->es);
 
-    fprintf(args->fp,"# DS, distinctive sites:\n");
-    fprintf(args->fp,"#     - chromosome\n");
-    fprintf(args->fp,"#     - position\n");
-    fprintf(args->fp,"#     - cumulative number of pairs distinguished by this block\n");
-    fprintf(args->fp,"#     - block id\n");
-    fprintf(args->fp,"#DS\t[2]Chromosome\t[3]Position\t[4]Cumulative number of distinct pairs\t[5]Block id\n");
+    args->kstr.l = 0;
+    ksprintf(&args->kstr,"# DS, distinctive sites:\n");
+    ksprintf(&args->kstr,"#     - chromosome\n");
+    ksprintf(&args->kstr,"#     - position\n");
+    ksprintf(&args->kstr,"#     - cumulative number of pairs distinguished by this block\n");
+    ksprintf(&args->kstr,"#     - block id\n");
+    ksprintf(&args->kstr,"#DS\t[2]Chromosome\t[3]Position\t[4]Cumulative number of distinct pairs\t[5]Block id\n");
 
     kbitset_t *kbs_blk = kbs_init(args->npairs);
     kbitset_iter_t itr;
@@ -785,7 +858,9 @@ static void report_distinctive_sites(args_t *args)
         if ( ndiff_dbg!=ndiff ) error("Corrupted data, fixme: %d vs %d\n",ndiff_dbg,ndiff);
         if ( !ndiff_new ) continue;     // no new pair distinguished by this site
         ndiff_tot += ndiff_new;
-        fprintf(args->fp,"DS\t%s\t%d\t%d\t%d\n",bcf_hdr_id2name(args->qry_hdr,rid),pos+1,ndiff_tot,iblock);
+        args->kstr.l = 0;
+        ksprintf(&args->kstr,"DS\t%s\t%d\t%d\t%d\n",bcf_hdr_id2name(args->qry_hdr,rid),pos+1,ndiff_tot,iblock);
+        if ( bgzf_write(args->out_fh, args->kstr.s, args->kstr.l)!=args->kstr.l ) error("Failed to write to %s\n", args->output_fname);
         if ( ndiff_tot < ndiff_min ) continue;   // fewer than the requested number of pairs can be distinguished at this point
         iblock++;
         ndiff_tot = 0;
@@ -795,24 +870,35 @@ static void report_distinctive_sites(args_t *args)
 }
 static void report(args_t *args)
 {
-    fprintf(args->fp,"INFO\tsites-compared\t%u\n",args->ncmp);
-    fprintf(args->fp,"INFO\tsites-skipped-no-match\t%u\n",args->nskip_no_match);
-    fprintf(args->fp,"INFO\tsites-skipped-multiallelic\t%u\n",args->nskip_not_ba);
-    fprintf(args->fp,"INFO\tsites-skipped-monoallelic\t%u\n",args->nskip_mono);
-    fprintf(args->fp,"INFO\tsites-skipped-no-data\t%u\n",args->nskip_no_data);
-    fprintf(args->fp,"INFO\tsites-skipped-GT-not-diploid\t%u\n",args->nskip_dip_GT);
-    fprintf(args->fp,"INFO\tsites-skipped-PL-not-diploid\t%u\n",args->nskip_dip_PL);
-    fprintf(args->fp,"INFO\tsites-used-PL-vs-PL\t%u\n",args->nused[0][0]);
-    fprintf(args->fp,"INFO\tsites-used-PL-vs-GT\t%u\n",args->nused[0][1]);
-    fprintf(args->fp,"INFO\tsites-used-GT-vs-PL\t%u\n",args->nused[1][0]);
-    fprintf(args->fp,"INFO\tsites-used-GT-vs-GT\t%u\n",args->nused[1][1]);
-    fprintf(args->fp,"# DC, discordance:\n");
-    fprintf(args->fp,"#     - query sample\n");
-    fprintf(args->fp,"#     - genotyped sample\n");
-    fprintf(args->fp,"#     - discordance (either an abstract score or number of mismatches, see -e/-u in the man page for details; smaller is better)\n");
-    fprintf(args->fp,"#     - negative log of HWE probability at matching sites (rare genotypes matches are more informative, bigger is better)\n");
-    fprintf(args->fp,"#     - number of sites compared (bigger is better)\n");
-    fprintf(args->fp,"#DC\t[2]Query Sample\t[3]Genotyped Sample\t[4]Discordance\t[5]-log P(HWE)\t[6]Number of sites compared\n");
+    args->kstr.l = 0;
+    ksprintf(&args->kstr,"INFO\tsites-compared\t%u\n",args->ncmp);
+    ksprintf(&args->kstr,"INFO\tsites-skipped-no-match\t%u\n",args->nskip_no_match);
+    ksprintf(&args->kstr,"INFO\tsites-skipped-multiallelic\t%u\n",args->nskip_not_ba);
+    ksprintf(&args->kstr,"INFO\tsites-skipped-monoallelic\t%u\n",args->nskip_mono);
+    ksprintf(&args->kstr,"INFO\tsites-skipped-no-data\t%u\n",args->nskip_no_data);
+    ksprintf(&args->kstr,"INFO\tsites-skipped-GT-not-diploid\t%u\n",args->nskip_dip_GT);
+    ksprintf(&args->kstr,"INFO\tsites-skipped-PL-not-diploid\t%u\n",args->nskip_dip_PL);
+    ksprintf(&args->kstr,"INFO\tsites-skipped-filtering-expression\t%u\n",args->nskip_filter);
+    ksprintf(&args->kstr,"INFO\tsites-used-PL-vs-PL\t%u\n",args->nused[0][0]);
+    ksprintf(&args->kstr,"INFO\tsites-used-PL-vs-GT\t%u\n",args->nused[0][1]);
+    ksprintf(&args->kstr,"INFO\tsites-used-GT-vs-PL\t%u\n",args->nused[1][0]);
+    ksprintf(&args->kstr,"INFO\tsites-used-GT-vs-GT\t%u\n",args->nused[1][1]);
+    ksprintf(&args->kstr,"# DCv2, discordance version 2:\n");
+    ksprintf(&args->kstr,"#     - Query sample\n");
+    ksprintf(&args->kstr,"#     - Genotyped sample\n");
+    ksprintf(&args->kstr,"#     - Discordance, given either as an abstract score or number of mismatches, see the options -E/-u\n"
+                         "#       in man page for details. Note that samples with high missingness have fewer sites compared,\n"
+                         "#       which results in lower overall discordance. Therefore it is advisable to use the average score\n"
+                         "#       per site rather than the absolute value, i.e. divide the value by the number of sites compared\n"
+                         "#       (smaller value = better match)\n");
+    ksprintf(&args->kstr,"#     - Average negative log of HWE probability at matching sites, attempts to quantify the following\n"
+                         "#       intuition: rare genotype matches are more informative than common genotype matches, hence two\n"
+                         "#       samples with similar discordance can be further stratified by the HWE score (bigger value = better\n"
+                         "#       match, the observed concordance was less likely to occur by chance)\n");
+    ksprintf(&args->kstr,"#     - Number of sites compared for this pair of samples (bigger = more informative)\n");
+    ksprintf(&args->kstr,"#     - Number of matching genotypes\n");
+    ksprintf(&args->kstr,"#DCv2\t[2]Query Sample\t[3]Genotyped Sample\t[4]Discordance\t[5]Average -log P(HWE)\t[6]Number of sites compared\t[7]Number of matching genotypes\n");
+    if ( bgzf_write(args->out_fh, args->kstr.s, args->kstr.l)!=args->kstr.l ) error("Failed to write to %s\n", args->output_fname);
 
     int trim = args->ntop;
     if ( !args->pairs )
@@ -826,26 +912,30 @@ static void report(args_t *args)
         int i;
         for (i=0; i<args->npairs; i++)
         {
+            args->kstr.l = 0;
             int iqry = args->pairs[i].iqry;
             int igt  = args->pairs[i].igt;
             if ( args->ndiff )
             {
-                fprintf(args->fp,"DC\t%s\t%s\t%u\t%e\t%u\n",
+                ksprintf(&args->kstr,"DCv2\t%s\t%s\t%u\t%e\t%u\t%u\n",
                         args->qry_hdr->samples[iqry],
                         args->gt_hdr?args->gt_hdr->samples[igt]:args->qry_hdr->samples[igt],
                         args->ndiff[i],
-                        args->calc_hwe_prob ? args->hwe_prob[i] : 0,
-                        args->ncnt[i]);
+                        (args->calc_hwe_prob && args->nmatch[i]) ? args->hwe_prob[i]/args->nmatch[i] : 0,
+                        args->ncnt[i],
+                        args->nmatch[i]);
             }
             else
             {
-                fprintf(args->fp,"DC\t%s\t%s\t%e\t%e\t%u\n",
+                ksprintf(&args->kstr,"DCv2\t%s\t%s\t%e\t%e\t%u\t%u\n",
                         args->qry_hdr->samples[iqry],
                         args->gt_hdr?args->gt_hdr->samples[igt]:args->qry_hdr->samples[igt],
                         args->pdiff[i],
-                        args->calc_hwe_prob ? args->hwe_prob[i] : 0,
-                        args->ncnt[i]);
+                        (args->calc_hwe_prob && args->nmatch[i]) ? args->hwe_prob[i]/args->nmatch[i] : 0,
+                        args->ncnt[i],
+                        args->nmatch[i]);
             }
+            if ( bgzf_write(args->out_fh, args->kstr.s, args->kstr.l)!=args->kstr.l ) error("Failed to write to %s\n", args->output_fname);
         }
     }
     else if ( !trim )
@@ -857,25 +947,29 @@ static void report(args_t *args)
             int ngt  = args->cross_check ? i : args->ngt_smpl;
             for (j=0; j<ngt; j++)
             {
+                args->kstr.l = 0;
                 int igt = args->gt_smpl ? args->gt_smpl[j] : j;
                 if ( args->ndiff )
                 {
-                    fprintf(args->fp,"DC\t%s\t%s\t%u\t%e\t%u\n",
+                    ksprintf(&args->kstr,"DCv2\t%s\t%s\t%u\t%e\t%u\t%u\n",
                             args->qry_hdr->samples[iqry],
                             args->gt_hdr?args->gt_hdr->samples[igt]:args->qry_hdr->samples[igt],
                             args->ndiff[idx],
-                            args->calc_hwe_prob ? args->hwe_prob[idx] : 0,
-                            args->ncnt[idx]);
+                            (args->calc_hwe_prob && args->nmatch[idx]) ? args->hwe_prob[idx]/args->nmatch[idx] : 0,
+                            args->ncnt[idx],
+                            args->calc_hwe_prob ? args->nmatch[idx] : 0);
                 }
                 else
                 {
-                    fprintf(args->fp,"DC\t%s\t%s\t%e\t%e\t%u\n",
+                    ksprintf(&args->kstr,"DCv2\t%s\t%s\t%e\t%e\t%u\t%u\n",
                             args->qry_hdr->samples[iqry],
                             args->gt_hdr?args->gt_hdr->samples[igt]:args->qry_hdr->samples[igt],
                             args->pdiff[idx],
-                            args->calc_hwe_prob ? args->hwe_prob[idx] : 0,
-                            args->ncnt[idx]);
+                            (args->calc_hwe_prob && args->nmatch[idx]) ? args->hwe_prob[idx]/args->nmatch[idx] : 0,
+                            args->ncnt[idx],
+                            args->calc_hwe_prob ? args->nmatch[idx] : 0);
                 }
+                if ( bgzf_write(args->out_fh, args->kstr.s, args->kstr.l)!=args->kstr.l ) error("Failed to write to %s\n", args->output_fname);
                 idx++;
             }
         }
@@ -890,7 +984,7 @@ static void report(args_t *args)
             for (j=0; j<args->ngt_smpl; j++)
             {
                 if ( args->sort_by_hwe )
-                    arr[j].val = -args->hwe_prob[idx];
+                    arr[j].val = args->nmatch[idx] ? -args->hwe_prob[idx]/args->nmatch[idx] : 0;  // -args->hwe_prob[idx];
                 else if ( args->ndiff )
                     arr[j].val = args->ncnt[idx] ? (double)args->ndiff[idx]/args->ncnt[idx] : 0;
                 else
@@ -903,26 +997,30 @@ static void report(args_t *args)
             int iqry = args->qry_smpl ? args->qry_smpl[i] : i;
             for (j=0; j<args->ntop; j++)
             {
+                args->kstr.l = 0;
                 int idx = arr[j].idx;
                 int igt = args->gt_smpl ? args->gt_smpl[arr[j].ism] : arr[j].ism;
                 if ( args->ndiff )
                 {
-                    fprintf(args->fp,"DC\t%s\t%s\t%u\t%e\t%u\n",
+                    ksprintf(&args->kstr,"DCv2\t%s\t%s\t%u\t%e\t%u\t%u\n",
                             args->qry_hdr->samples[iqry],
                             args->gt_hdr?args->gt_hdr->samples[igt]:args->qry_hdr->samples[igt],
                             args->ndiff[idx],
-                            args->calc_hwe_prob ? args->hwe_prob[idx] : 0,
-                            args->ncnt[idx]);
+                            (args->calc_hwe_prob && args->nmatch[idx]) ? args->hwe_prob[idx]/args->nmatch[idx] : 0,
+                            args->ncnt[idx],
+                            args->calc_hwe_prob ? args->nmatch[idx] : 0);
                 }
                 else
                 {
-                    fprintf(args->fp,"DC\t%s\t%s\t%e\t%e\t%u\n",
+                    ksprintf(&args->kstr,"DCv2\t%s\t%s\t%e\t%e\t%u\t%u\n",
                             args->qry_hdr->samples[iqry],
                             args->gt_hdr?args->gt_hdr->samples[igt]:args->qry_hdr->samples[igt],
                             args->pdiff[idx],
-                            args->calc_hwe_prob ? args->hwe_prob[idx] : 0,
-                            args->ncnt[idx]);
+                            (args->calc_hwe_prob && args->nmatch[idx]) ? args->hwe_prob[idx]/args->nmatch[idx] : 0,
+                            args->ncnt[idx],
+                            args->calc_hwe_prob ? args->nmatch[idx] : 0);
                 }
+                if ( bgzf_write(args->out_fh, args->kstr.s, args->kstr.l)!=args->kstr.l ) error("Failed to write to %s\n", args->output_fname);
             }
         }
         free(arr);
@@ -938,7 +1036,7 @@ static void report(args_t *args)
             for (j=0; j<i; j++)
             {
                 if ( args->sort_by_hwe )
-                    arr[k].val = -args->hwe_prob[idx];
+                    arr[k].val = args->nmatch[idx] ? -args->hwe_prob[idx]/args->nmatch[idx] : 0;
                 else if ( args->ndiff )
                     arr[k].val = args->ncnt[idx] ? (double)args->ndiff[idx]/args->ncnt[idx] : 0;
                 else
@@ -952,7 +1050,7 @@ static void report(args_t *args)
             {
                 idx = j*(j+1)/2 + i;
                 if ( args->sort_by_hwe )
-                    arr[k].val = -args->hwe_prob[idx];
+                    arr[k].val = args->nmatch[idx] ? -args->hwe_prob[idx]/args->nmatch[idx] : 0;
                 else if ( args->ndiff )
                     arr[k].val = args->ncnt[idx] ? (double)args->ndiff[idx]/args->ncnt[idx] : 0;
                 else
@@ -965,27 +1063,31 @@ static void report(args_t *args)
             int iqry = args->qry_smpl ? args->qry_smpl[i] : i;
             for (j=0; j<args->ntop; j++)
             {
+                args->kstr.l = 0;
                 if ( i <= arr[j].ism ) continue;
                 int idx = arr[j].idx;
                 int igt = args->qry_smpl ? args->qry_smpl[arr[j].ism] : arr[j].ism;
                 if ( args->ndiff )
                 {
-                    fprintf(args->fp,"DC\t%s\t%s\t%u\t%e\t%u\n",
+                    ksprintf(&args->kstr,"DCv2\t%s\t%s\t%u\t%e\t%u\t%u\n",
                             args->qry_hdr->samples[iqry],
                             args->qry_hdr->samples[igt],
                             args->ndiff[idx],
-                            args->calc_hwe_prob ? args->hwe_prob[idx] : 0,
-                            args->ncnt[idx]);
+                            (args->calc_hwe_prob && args->nmatch[idx]) ? args->hwe_prob[idx]/args->nmatch[idx] : 0,
+                            args->ncnt[idx],
+                            args->calc_hwe_prob ? args->nmatch[idx] : 0);
                 }
                 else
                 {
-                    fprintf(args->fp,"DC\t%s\t%s\t%e\t%e\t%u\n",
+                    ksprintf(&args->kstr,"DCv2\t%s\t%s\t%e\t%e\t%u\t%u\n",
                             args->qry_hdr->samples[iqry],
                             args->qry_hdr->samples[igt],
                             args->pdiff[idx],
-                            args->calc_hwe_prob ? args->hwe_prob[idx] : 0,
-                            args->ncnt[idx]);
+                            (args->calc_hwe_prob && args->nmatch[idx]) ? args->hwe_prob[idx]/args->nmatch[idx] : 0,
+                            args->ncnt[idx],
+                            args->calc_hwe_prob ? args->nmatch[idx] : 0);
                 }
+                if ( bgzf_write(args->out_fh, args->kstr.s, args->kstr.l)!=args->kstr.l ) error("Failed to write to %s\n", args->output_fname);
             }
         }
         free(arr);
@@ -1055,12 +1157,16 @@ static void usage(void)
     fprintf(bcftools_stderr, "                                           and TMP is a prefix of temporary files used by external sorting [/tmp/bcftools.XXXXXX]\n");
 #endif
     fprintf(bcftools_stderr, "        --dry-run                      Stop after first record to estimate required time\n");
-    fprintf(bcftools_stderr, "    -e, --error-probability INT        Phred-scaled probability of genotyping error, 0 for faster but less accurate results [40]\n");
+    fprintf(bcftools_stderr, "    -E, --error-probability INT        Phred-scaled probability of genotyping error, 0 for faster but less accurate results [40]\n");
+    fprintf(bcftools_stderr, "    -e, --exclude [qry|gt]:EXPR        Exclude sites for which the expression is true\n");
     fprintf(bcftools_stderr, "    -g, --genotypes FILE               Genotypes to compare against\n");
     fprintf(bcftools_stderr, "    -H, --homs-only                    Homozygous genotypes only, useful with low coverage data (requires -g)\n");
+    fprintf(bcftools_stderr, "    -i, --include [qry|gt]:EXPR        Include sites for which the expression is true\n");
     fprintf(bcftools_stderr, "        --n-matches INT                Print only top INT matches for each sample (sorted by average score), 0 for unlimited.\n");
     fprintf(bcftools_stderr, "                                           Use negative value to sort by HWE probability rather than by discordance [0]\n");
     fprintf(bcftools_stderr, "        --no-HWE-prob                  Disable calculation of HWE probability\n");
+    fprintf(bcftools_stderr, "    -o, --output FILE                  Write output to a file [standard output]\n");
+    fprintf(bcftools_stderr, "    -O, --output-type t|z              t: plain tab-delimited text output, z: compressed [t]\n");
     fprintf(bcftools_stderr, "    -p, --pairs LIST                   Comma-separated sample pairs to compare (qry,gt[,qry,gt..] with -g or qry,qry[,qry,qry..] w/o)\n");
     fprintf(bcftools_stderr, "    -P, --pairs-file FILE              File with tab-delimited sample pairs to compare (qry,gt with -g or qry,qry w/o)\n");
     fprintf(bcftools_stderr, "    -r, --regions REGION               Restrict to comma-separated list of regions\n");
@@ -1073,10 +1179,10 @@ static void usage(void)
     fprintf(bcftools_stderr, "        --targets-overlap 0|1|2        Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n");
     fprintf(bcftools_stderr, "    -u, --use TAG1[,TAG2]              Which tag to use in the query file (TAG1) and the -g file (TAG2) [PL,GT]\n");
     fprintf(bcftools_stderr, "Examples:\n");
-    fprintf(bcftools_stderr, "   # Check discordance of all samples from B against all sample in A\n");
+    fprintf(bcftools_stderr, "   # Check discordance of all samples from B against all samples in A\n");
     fprintf(bcftools_stderr, "   bcftools gtcheck -g A.bcf B.bcf\n");
     fprintf(bcftools_stderr, "\n");
-    fprintf(bcftools_stderr, "   # Limit comparisons to the fiven list of samples\n");
+    fprintf(bcftools_stderr, "   # Limit comparisons to the given list of samples\n");
     fprintf(bcftools_stderr, "   bcftools gtcheck -s gt:a1,a2,a3 -s qry:b1,b2 -g A.bcf B.bcf\n");
     fprintf(bcftools_stderr, "\n");
     fprintf(bcftools_stderr, "   # Compare only two pairs a1,b1 and a1,b2\n");
@@ -1093,9 +1199,10 @@ int main_vcfgtcheck(int argc, char *argv[])
     args->qry_use_GT = -1;
     args->gt_use_GT  = -1;
     args->calc_hwe_prob = 1;
-    args->use_PLs = 40;
+    args->gt_err = 40;
     args->regions_overlap = 1;
     args->targets_overlap = 0;
+    args->output_fname = "-";
 
     // external sort for --distinctive-sites
 #ifdef _WIN32
@@ -1114,7 +1221,11 @@ int main_vcfgtcheck(int argc, char *argv[])
 
     static struct option loptions[] =
     {
-        {"error-probability",1,0,'e'},
+        {"error-probability",1,0,'E'},  // note this used to be 'e', but can easily auto-detect to assure backward compatibility
+        {"exclude",required_argument,0,'e'},
+        {"include",required_argument,0,'i'},
+        {"output",required_argument,0,'o'},
+        {"output-type",required_argument,NULL,'O'},
         {"use",1,0,'u'},
         {"cluster",1,0,'c'},
         {"GTs-only",1,0,'G'},
@@ -1141,10 +1252,79 @@ int main_vcfgtcheck(int argc, char *argv[])
         {0,0,0,0}
     };
     char *tmp;
-    while ((c = getopt_long(argc, argv, "hg:p:s:S:p:P:Hr:R:at:T:G:c:u:e:",loptions,NULL)) >= 0) {
+    while ((c = getopt_long(argc, argv, "hg:p:s:S:p:P:Hr:R:at:T:G:c:u:e:E:i:o:O:",loptions,NULL)) >= 0) {
         switch (c) {
+            case 'o': args->output_fname = optarg; break;
+            case 'O':
+                switch (optarg[0]) {
+                    case 't': args->output_type = FT_TAB_TEXT; break;
+                    case 'z': args->output_type = FT_VCF_GZ; break;
+                    default:
+                    {
+                        args->clevel = strtol(optarg,&tmp,10);
+                        if ( *tmp || args->clevel<0 || args->clevel>9 ) error("The output type \"%s\" not recognised\n", optarg);
+                    }
+                }
+                if ( optarg[1] )
+                {
+                    args->clevel = strtol(optarg+1,&tmp,10);
+                    if ( *tmp || args->clevel<0 || args->clevel>9 ) error("Could not parse argument: --output-type %s\n", optarg+1);
+                }
+                break;
             case 'e':
-                args->use_PLs = strtol(optarg,&tmp,10);
+                if ( !strncasecmp("gt:",optarg,3) )
+                {
+                    if ( args->gt_filter_str ) error("Error: only one -i or -e expression can be given for gt:, and they cannot be combined\n");
+                    args->gt_filter_str = optarg;
+                    args->gt_filter_logic |= FLT_EXCLUDE;
+                }
+                else if ( !strncasecmp("qry:",optarg,4) )
+                {
+                    if ( args->qry_filter_str ) error("Error: only one -i or -e expression can be given for qry:, and they cannot be combined\n");
+                    args->qry_filter_str = optarg;
+                    args->qry_filter_logic |= FLT_EXCLUDE;
+                }
+                else
+                {
+                    // this could be the old -e, --error-probability option
+                    args->gt_err = strtol(optarg,&tmp,10);
+                    if ( !tmp || *tmp )
+                    {
+                        // it is not
+                        args->gt_filter_str  = optarg;
+                        args->qry_filter_str = optarg;
+                        args->gt_filter_logic  |= FLT_EXCLUDE;
+                        args->qry_filter_logic |= FLT_EXCLUDE;
+                    }
+                    else
+                    {
+                        fprintf(bcftools_stderr,"[warning] auto-detected the old format --error-probability option, please switch from -e to -E.\n");
+                    }
+                }
+                break;
+            case 'i':
+                if ( !strncasecmp("gt:",optarg,3) )
+                {
+                    if ( args->gt_filter_str ) error("Error: only one -i or -e expression can be given for gt:, and they cannot be combined\n");
+                    args->gt_filter_str = optarg;
+                    args->gt_filter_logic |= FLT_INCLUDE;
+                }
+                else if ( !strncasecmp("qry:",optarg,4) )
+                {
+                    if ( args->qry_filter_str ) error("Error: only one -i or -e expression can be given for qry:, and they cannot be combined\n");
+                    args->qry_filter_str = optarg;
+                    args->qry_filter_logic |= FLT_INCLUDE;
+                }
+                else
+                {
+                    args->gt_filter_str  = optarg;
+                    args->qry_filter_str = optarg;
+                    args->gt_filter_logic  |= FLT_INCLUDE;
+                    args->qry_filter_logic |= FLT_INCLUDE;
+                }
+                break;
+            case 'E':
+                args->gt_err = strtol(optarg,&tmp,10);
                 if ( !tmp || *tmp ) error("Could not parse: --error-probability %s\n", optarg);
                 break;
             case 'u':
@@ -1189,7 +1369,7 @@ int main_vcfgtcheck(int argc, char *argv[])
                     while ( *tmp && *tmp!=',' ) tmp++;
                     if ( *tmp ) { *tmp = 0; args->es_tmp_prefix = tmp+1; }
                 }
-                args->use_PLs = 0;
+                args->gt_err = 0;
                 break;
             case 'c':
                 error("The -c option is to be implemented, please open an issue on github\n");
@@ -1249,7 +1429,7 @@ int main_vcfgtcheck(int argc, char *argv[])
     }
     if ( args->distinctive_sites && !args->pair_samples ) error("The experimental option --distinctive-sites requires -p/-P\n");
     if ( args->hom_only && !args->gt_fname ) error("The option --homs-only requires --genotypes\n");
-    if ( args->distinctive_sites && args->use_PLs ) error("The option --distinctive-sites cannot be combined with --error-probability\n");
+    if ( args->distinctive_sites && args->gt_err ) error("The option --distinctive-sites cannot be combined with --error-probability\n");
 
     init_data(args);
 
@@ -1269,7 +1449,9 @@ int main_vcfgtcheck(int argc, char *argv[])
             gettimeofday(&t1, NULL);
             double delta = (t1.tv_sec - t0.tv_sec) * 1e6 + (t1.tv_usec - t0.tv_usec);
             fprintf(bcftools_stderr,"INFO:\tTime required to process one record .. %f seconds\n",delta/1e6);
-            fprintf(args->fp,"INFO\tTime required to process one record .. %f seconds\n",delta/1e6);
+            args->kstr.l = 0;
+            ksprintf(&args->kstr,"INFO\tTime required to process one record .. %f seconds\n",delta/1e6);
+            if ( bgzf_write(args->out_fh, args->kstr.s, args->kstr.l)!=args->kstr.l ) error("Failed to write to %s\n", args->output_fname);
             if ( args->dry_run ) break;
         }
     }
diff --git a/bcftools/vcfhead.c b/bcftools/vcfhead.c
index 20be2a947..0b0222b52 100644
--- a/bcftools/vcfhead.c
+++ b/bcftools/vcfhead.c
@@ -1,6 +1,7 @@
 /*  vcfhead.c -- view VCF/BCF file headers.
 
     Copyright (C) 2021 University of Glasgow.
+    Copyright (C) 2023 Genome Research Ltd.
 
     Author: John Marshall <jmarshall@hey.com>
 
@@ -41,30 +42,36 @@ int main_vcfhead(int argc, char *argv[])
 "Usage: bcftools head [OPTION]... [FILE]\n"
 "\n"
 "Options:\n"
-"  -h, --headers INT   Display INT header lines [all]\n"
-"  -n, --records INT   Display INT variant record lines [none]\n"
+"  -h, --headers INT    Display INT header lines [all]\n"
+"  -n, --records INT    Display INT variant record lines [none]\n"
+"  -s, --samples INT    Display INT records starting with the #CHROM header line [none]\n"
 "\n";
 
     static const struct option loptions[] = {
         { "headers", required_argument, NULL, 'h' },
         { "records", required_argument, NULL, 'n' },
+        { "samples", required_argument, NULL, 's' },
         { NULL, 0, NULL, 0 }
     };
 
     int all_headers = 1;
+    int samples = 0;
     uint64_t nheaders = 0;
     uint64_t nrecords = 0;
 
     int c, nargs;
-    while ((c = getopt_long(argc, argv, "h:n:", loptions, NULL)) >= 0)
+    while ((c = getopt_long(argc, argv, "h:n:s:", loptions, NULL)) >= 0)
         switch (c) {
         case 'h': all_headers = 0; nheaders = strtoull(optarg, NULL, 0); break;
         case 'n': nrecords = strtoull(optarg, NULL, 0); break;
+        case 's': nrecords = strtoull(optarg, NULL, 0); samples = 1; break;
         default:
             fputs(usage, stderr);
             return EXIT_FAILURE;
         }
 
+    if ( samples && all_headers ) all_headers = 0;
+
     nargs = argc - optind;
     if (nargs == 0 && isatty(STDIN_FILENO)) {
         fputs(usage, stdout);
@@ -99,17 +106,34 @@ int main_vcfhead(int argc, char *argv[])
         bcf_hdr_format(hdr, 0, &str);
         fputs(ks_str(&str), stdout);
     }
-    else if (nheaders > 0) {
+    else if (nheaders > 0 || samples ) {
         bcf_hdr_format(hdr, 0, &str);
         char *lim = str.s;
         uint64_t n;
+        int samples_printed = 0;
         for (n = 0; n < nheaders; n++) {
+            if ( samples && !strncmp(lim,"#CHROM\t",7) ) samples_printed = 1;
             lim = strchr(lim, '\n');
             if (lim) lim++;
             else break;
         }
-        if (lim) *lim = '\0';
-        fputs(ks_str(&str), stdout);
+        if ( nheaders )
+        {
+            char tmp;
+            if (lim) { tmp = *lim; *lim = '\0'; }
+            fputs(ks_str(&str), stdout);
+            if (lim) *lim = tmp;
+        }
+        if ( lim && samples && !samples_printed )
+        {
+            while ( lim && *lim )
+            {
+                if ( !strncmp(lim,"#CHROM\t",7) ) { fputs(lim, stdout); break; }
+                lim = strchr(lim, '\n');
+                if (lim) lim++;
+                else break;
+            }
+        }
     }
 
     if (nrecords > 0) {
diff --git a/bcftools/vcfhead.c.pysam.c b/bcftools/vcfhead.c.pysam.c
index 09744f23e..832c9bd74 100644
--- a/bcftools/vcfhead.c.pysam.c
+++ b/bcftools/vcfhead.c.pysam.c
@@ -3,6 +3,7 @@
 /*  vcfhead.c -- view VCF/BCF file headers.
 
     Copyright (C) 2021 University of Glasgow.
+    Copyright (C) 2023 Genome Research Ltd.
 
     Author: John Marshall <jmarshall@hey.com>
 
@@ -43,30 +44,36 @@ int main_vcfhead(int argc, char *argv[])
 "Usage: bcftools head [OPTION]... [FILE]\n"
 "\n"
 "Options:\n"
-"  -h, --headers INT   Display INT header lines [all]\n"
-"  -n, --records INT   Display INT variant record lines [none]\n"
+"  -h, --headers INT    Display INT header lines [all]\n"
+"  -n, --records INT    Display INT variant record lines [none]\n"
+"  -s, --samples INT    Display INT records starting with the #CHROM header line [none]\n"
 "\n";
 
     static const struct option loptions[] = {
         { "headers", required_argument, NULL, 'h' },
         { "records", required_argument, NULL, 'n' },
+        { "samples", required_argument, NULL, 's' },
         { NULL, 0, NULL, 0 }
     };
 
     int all_headers = 1;
+    int samples = 0;
     uint64_t nheaders = 0;
     uint64_t nrecords = 0;
 
     int c, nargs;
-    while ((c = getopt_long(argc, argv, "h:n:", loptions, NULL)) >= 0)
+    while ((c = getopt_long(argc, argv, "h:n:s:", loptions, NULL)) >= 0)
         switch (c) {
         case 'h': all_headers = 0; nheaders = strtoull(optarg, NULL, 0); break;
         case 'n': nrecords = strtoull(optarg, NULL, 0); break;
+        case 's': nrecords = strtoull(optarg, NULL, 0); samples = 1; break;
         default:
             fputs(usage, bcftools_stderr);
             return EXIT_FAILURE;
         }
 
+    if ( samples && all_headers ) all_headers = 0;
+
     nargs = argc - optind;
     if (nargs == 0 && isatty(STDIN_FILENO)) {
         fputs(usage, bcftools_stdout);
@@ -101,17 +108,34 @@ int main_vcfhead(int argc, char *argv[])
         bcf_hdr_format(hdr, 0, &str);
         fputs(ks_str(&str), bcftools_stdout);
     }
-    else if (nheaders > 0) {
+    else if (nheaders > 0 || samples ) {
         bcf_hdr_format(hdr, 0, &str);
         char *lim = str.s;
         uint64_t n;
+        int samples_printed = 0;
         for (n = 0; n < nheaders; n++) {
+            if ( samples && !strncmp(lim,"#CHROM\t",7) ) samples_printed = 1;
             lim = strchr(lim, '\n');
             if (lim) lim++;
             else break;
         }
-        if (lim) *lim = '\0';
-        fputs(ks_str(&str), bcftools_stdout);
+        if ( nheaders )
+        {
+            char tmp;
+            if (lim) { tmp = *lim; *lim = '\0'; }
+            fputs(ks_str(&str), bcftools_stdout);
+            if (lim) *lim = tmp;
+        }
+        if ( lim && samples && !samples_printed )
+        {
+            while ( lim && *lim )
+            {
+                if ( !strncmp(lim,"#CHROM\t",7) ) { fputs(lim, bcftools_stdout); break; }
+                lim = strchr(lim, '\n');
+                if (lim) lim++;
+                else break;
+            }
+        }
     }
 
     if (nrecords > 0) {
diff --git a/bcftools/vcfindex.c b/bcftools/vcfindex.c
index 1dd960ea7..17eac5f32 100644
--- a/bcftools/vcfindex.c
+++ b/bcftools/vcfindex.c
@@ -1,6 +1,6 @@
 /*  vcfindex.c -- Index bgzip compressed VCF/BCF files for random access.
 
-    Copyright (C) 2014-2021 Genome Research Ltd.
+    Copyright (C) 2014-2024 Genome Research Ltd.
 
     Author: Shane McCarthy <sm15@sanger.ac.uk>
 
@@ -264,6 +264,7 @@ int main_vcfindex(int argc, char *argv[])
             default: usage();
         }
     }
+    if (!min_shift) tbi = 1;
     if (stats > total)
     {
         fprintf(stderr, "[E::%s] expected only one of --stats or --nrecords options\n", __func__);
diff --git a/bcftools/vcfindex.c.pysam.c b/bcftools/vcfindex.c.pysam.c
index ac9e3ba2b..8f6932f1f 100644
--- a/bcftools/vcfindex.c.pysam.c
+++ b/bcftools/vcfindex.c.pysam.c
@@ -2,7 +2,7 @@
 
 /*  vcfindex.c -- Index bgzip compressed VCF/BCF files for random access.
 
-    Copyright (C) 2014-2021 Genome Research Ltd.
+    Copyright (C) 2014-2024 Genome Research Ltd.
 
     Author: Shane McCarthy <sm15@sanger.ac.uk>
 
@@ -266,6 +266,7 @@ int main_vcfindex(int argc, char *argv[])
             default: usage();
         }
     }
+    if (!min_shift) tbi = 1;
     if (stats > total)
     {
         fprintf(bcftools_stderr, "[E::%s] expected only one of --stats or --nrecords options\n", __func__);
diff --git a/bcftools/vcfisec.c b/bcftools/vcfisec.c
index 4ee29b4c8..24a45685b 100644
--- a/bcftools/vcfisec.c
+++ b/bcftools/vcfisec.c
@@ -34,6 +34,7 @@ THE SOFTWARE.  */
 #include <htslib/synced_bcf_reader.h>
 #include <htslib/vcfutils.h>
 #include <htslib/hts_os.h>
+#include <htslib/hts_defs.h>
 #include "bcftools.h"
 #include "filter.h"
 
@@ -58,7 +59,7 @@ typedef struct
     FILE *fh_log, *fh_sites;
     htsFile **fh_out;
     char **argv, *prefix, *output_fname, **fnames, *write_files, *targets_list, *regions_list;
-    char *isec_exact;
+    char *isec_exact, *file_list;
     int argc, record_cmd_line;
     char *index_fn;
     int write_index;
@@ -69,19 +70,21 @@ args_t;
  *  mkdir_p() - create new directory for a file $fname
  *  @fname:   the file name to create the directory for, the part after last "/" is ignored
  */
-void mkdir_p(const char *fmt, ...)
+void HTS_FORMAT(HTS_PRINTF_FMT, 1, 2)
+mkdir_p(const char *fmt, ...)
 {
     va_list ap;
     va_start(ap, fmt);
     int n = vsnprintf(NULL, 0, fmt, ap) + 2;
     va_end(ap);
 
-    char *path = (char*)malloc(n);
+    char *tmp = (char*)malloc(n);
+    if (!tmp) error("Couldn't allocate space for path: %s\n", strerror(errno));
     va_start(ap, fmt);
-    vsnprintf(path, n, fmt, ap);
+    vsnprintf(tmp, n, fmt, ap);
     va_end(ap);
 
-    char *tmp = strdup(path), *p = tmp+1;
+    char *p = tmp+1;
     while (*p)
     {
         while (*p && *p!='/') p++;
@@ -89,12 +92,11 @@ void mkdir_p(const char *fmt, ...)
         char ctmp = *p;
         *p = 0;
         int ret = mkdir(tmp,S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH);
-        if ( ret!=0 && errno!=EEXIST ) error("Error creating directory %s: %s\n", path,strerror(errno));
+        if ( ret!=0 && errno!=EEXIST ) error("Error creating directory %s: %s\n", tmp,strerror(errno));
         *p = ctmp;
         while ( *p && *p=='/' ) p++;
     }
     free(tmp);
-    free(path);
 }
 
 /**
@@ -105,7 +107,8 @@ void mkdir_p(const char *fmt, ...)
  *
  *  Returns open file descriptor or NULL if mode is NULL.
  */
-FILE *open_file(char **fname, const char *mode, const char *fmt, ...)
+FILE * HTS_FORMAT(HTS_PRINTF_FMT, 3, 4)
+open_file(char **fname, const char *mode, const char *fmt, ...)
 {
     va_list ap;
     va_start(ap, fmt);
@@ -117,7 +120,7 @@ FILE *open_file(char **fname, const char *mode, const char *fmt, ...)
     vsnprintf(str, n, fmt, ap);
     va_end(ap);
 
-    mkdir_p(str);
+    mkdir_p("%s", str);
     if ( !mode )
     {
         if ( !fname ) error("Uh: expected fname or mode\n");
@@ -150,8 +153,11 @@ void isec_vcf(args_t *args)
         if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
         if (args->record_cmd_line) bcf_hdr_append_version(files->readers[args->iwrite].header,args->argc,args->argv,"bcftools_isec");
         if ( bcf_hdr_write(out_fh, files->readers[args->iwrite].header)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname?args->output_fname:"standard output");
-        if ( args->write_index && init_index(out_fh,files->readers[args->iwrite].header,args->output_fname,&args->index_fn)<0 )
-            error("Error: failed to initialise index for %s\n",args->output_fname?args->output_fname:"standard output");
+        if ( init_index2(out_fh,files->readers[args->iwrite].header,
+                         args->output_fname,&args->index_fn,
+                         args->write_index)<0 )
+            error("Error: failed to initialise index for %s\n",
+                  args->output_fname?args->output_fname:"standard output");
     }
     if ( !args->nwrite && !out_std && !args->prefix )
         fprintf(stderr,"Note: -w option not given, printing list of sites...\n");
@@ -454,12 +460,14 @@ static void destroy_data(args_t *args)
         {
             if ( !args->fnames[i] ) continue;
             if ( hts_close(args->fh_out[i])!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->fnames[i]);
-            if ( args->output_type==FT_VCF_GZ )
+            int is_tbi = !args->write_index 
+                      || (args->write_index&127) == HTS_FMT_TBI;
+            if ( args->output_type==FT_VCF_GZ && is_tbi )
             {
                 tbx_conf_t conf = tbx_conf_vcf;
                 tbx_index_build(args->fnames[i], -1, &conf);
             }
-            else if ( args->output_type==FT_BCF_GZ )
+            else if ( args->output_type==FT_BCF_GZ || !is_tbi )
             {
                 if ( bcf_index_build(args->fnames[i],14) ) error("Could not index %s\n", args->fnames[i]);
             }
@@ -484,6 +492,7 @@ static void usage(void)
     fprintf(stderr, "    -e, --exclude EXPR             Exclude sites for which the expression is true\n");
     fprintf(stderr, "    -f, --apply-filters LIST       Require at least one of the listed FILTER strings (e.g. \"PASS,.\")\n");
     fprintf(stderr, "    -i, --include EXPR             Include only sites for which the expression is true\n");
+    fprintf(stderr, "    -l, --file-list FILE           Read the input file names from the file\n");
     fprintf(stderr, "        --no-version               Do not append version and command line to the header\n");
     fprintf(stderr, "    -n, --nfiles [+-=~]INT         Output positions present in this many (=), this many or more (+), this many or fewer (-), the exact (~) files\n");
     fprintf(stderr, "    -o, --output FILE              Write output to a file [standard output]\n");
@@ -497,7 +506,7 @@ static void usage(void)
     fprintf(stderr, "        --targets-overlap 0|1|2    Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n");
     fprintf(stderr, "        --threads INT              Use multithreading with <int> worker threads [0]\n");
     fprintf(stderr, "    -w, --write LIST               List of files to write with -p given as 1-based indexes. By default, all files are written\n");
-    fprintf(stderr, "        --write-index              Automatically index the output files [off]\n");
+    fprintf(stderr, "    -W, --write-index[=FMT]        Automatically index the output files [off]\n");
     fprintf(stderr, "\n");
     fprintf(stderr, "Examples:\n");
     fprintf(stderr, "   # Create intersection and complements of two sets saving the output in dir/*\n");
@@ -541,6 +550,7 @@ int main_vcfisec(int argc, char *argv[])
         {"collapse",required_argument,NULL,'c'},
         {"complement",no_argument,NULL,'C'},
         {"apply-filters",required_argument,NULL,'f'},
+        {"file-list",required_argument,NULL,'l'},
         {"nfiles",required_argument,NULL,'n'},
         {"prefix",required_argument,NULL,'p'},
         {"write",required_argument,NULL,'w'},
@@ -554,11 +564,11 @@ int main_vcfisec(int argc, char *argv[])
         {"output-type",required_argument,NULL,'O'},
         {"threads",required_argument,NULL,9},
         {"no-version",no_argument,NULL,8},
-        {"write-index",no_argument,NULL,10},
+        {"write-index",optional_argument,NULL,'W'},
         {NULL,0,NULL,0}
     };
     char *tmp;
-    while ((c = getopt_long(argc, argv, "hc:r:R:p:n:w:t:T:Cf:o:O:i:e:",loptions,NULL)) >= 0) {
+    while ((c = getopt_long(argc, argv, "hc:r:R:p:n:w:t:T:Cf:o:O:i:e:l:W::",loptions,NULL)) >= 0) {
         switch (c) {
             case 'o': args->output_fname = optarg; break;
             case 'O':
@@ -593,12 +603,16 @@ int main_vcfisec(int argc, char *argv[])
             case 'C':
                 if ( args->isec_op!=0 && args->isec_op!=OP_COMPLEMENT ) error("Error: either -C or -n should be given, not both.\n");
                 args->isec_op = OP_COMPLEMENT; break;
+            case 'l': args->file_list = optarg; break;
             case 'r': args->regions_list = optarg; break;
             case 'R': args->regions_list = optarg; regions_is_file = 1; break;
             case 't': args->targets_list = optarg; break;
             case 'T': args->targets_list = optarg; targets_is_file = 1; break;
             case 'p': args->prefix = optarg; break;
-            case 'w': args->write_files = optarg; break;
+            case 'w':
+                if ( args->write_files ) error("The option -w accepts a list of indices and can be given only once\n");
+                args->write_files = optarg;
+                break;
             case 'i': add_filter(args, optarg, FLT_INCLUDE); break;
             case 'e': add_filter(args, optarg, FLT_EXCLUDE); break;
             case 'n':
@@ -626,13 +640,33 @@ int main_vcfisec(int argc, char *argv[])
                 break;
             case  9 : args->n_threads = strtol(optarg, 0, 0); break;
             case  8 : args->record_cmd_line = 0; break;
-            case 10 : args->write_index = 1; break;
+            case 'W':
+                if (!(args->write_index = write_index_parse(optarg)))
+                    error("Unsupported index format '%s'\n", optarg);
+                break;
             case 'h':
             case '?': usage(); break;
             default: error("Unknown argument: %s\n", optarg);
         }
     }
-    if ( argc-optind<1 ) usage();   // no file given
+    if ( argc-optind<1 && !args->file_list ) usage();   // no file given
+
+    int nfiles = 0,i;
+    char **files = NULL;
+    if ( args->file_list )
+    {
+        files = hts_readlines(args->file_list, &nfiles);
+        if ( !files ) error("Failed to read from %s\n", args->file_list);
+    }
+    if ( optind<argc )
+    {
+        int n = argc - optind;
+        files = (char**)realloc(files,sizeof(*files)*(n+nfiles));
+        for (i=nfiles; i>0; i--) files[n+i-1] = files[n+i-2];
+        for (i=0; i<n; i++) files[i] = strdup(argv[optind+i]);
+        nfiles += n;
+    }
+
     if ( args->targets_list )
     {
         bcf_sr_set_opt(args->files,BCF_SR_TARGETS_OVERLAP,targets_overlap);
@@ -645,7 +679,7 @@ int main_vcfisec(int argc, char *argv[])
         if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 )
             error("Failed to read the regions: %s\n", args->regions_list);
     }
-    if ( argc-optind==2 && !args->isec_op )
+    if ( nfiles==2 && !args->isec_op )
     {
         args->isec_op = OP_VENN;
         if ( !args->prefix ) error("Expected the -p option\n");
@@ -656,11 +690,13 @@ int main_vcfisec(int argc, char *argv[])
         args->isec_n  = 1;
     }
     args->files->require_index = 1;
-    while (optind<argc)
+    for (i=0; i<nfiles; i++)
     {
-        if ( !bcf_sr_add_reader(args->files, argv[optind]) ) error("Failed to open %s: %s\n", argv[optind],bcf_sr_strerror(args->files->errnum));
-        optind++;
+        if ( !bcf_sr_add_reader(args->files, files[i]) ) error("Failed to open %s: %s\n", files[i],bcf_sr_strerror(args->files->errnum));
+        free(files[i]);
     }
+    free(files);
+
     init_data(args);
     isec_vcf(args);
     destroy_data(args);
diff --git a/bcftools/vcfisec.c.pysam.c b/bcftools/vcfisec.c.pysam.c
index 76e4d3a9f..f4727c1d3 100644
--- a/bcftools/vcfisec.c.pysam.c
+++ b/bcftools/vcfisec.c.pysam.c
@@ -36,6 +36,7 @@ THE SOFTWARE.  */
 #include <htslib/synced_bcf_reader.h>
 #include <htslib/vcfutils.h>
 #include <htslib/hts_os.h>
+#include <htslib/hts_defs.h>
 #include "bcftools.h"
 #include "filter.h"
 
@@ -60,7 +61,7 @@ typedef struct
     FILE *fh_log, *fh_sites;
     htsFile **fh_out;
     char **argv, *prefix, *output_fname, **fnames, *write_files, *targets_list, *regions_list;
-    char *isec_exact;
+    char *isec_exact, *file_list;
     int argc, record_cmd_line;
     char *index_fn;
     int write_index;
@@ -71,19 +72,21 @@ args_t;
  *  mkdir_p() - create new directory for a file $fname
  *  @fname:   the file name to create the directory for, the part after last "/" is ignored
  */
-void mkdir_p(const char *fmt, ...)
+void HTS_FORMAT(HTS_PRINTF_FMT, 1, 2)
+mkdir_p(const char *fmt, ...)
 {
     va_list ap;
     va_start(ap, fmt);
     int n = vsnprintf(NULL, 0, fmt, ap) + 2;
     va_end(ap);
 
-    char *path = (char*)malloc(n);
+    char *tmp = (char*)malloc(n);
+    if (!tmp) error("Couldn't allocate space for path: %s\n", strerror(errno));
     va_start(ap, fmt);
-    vsnprintf(path, n, fmt, ap);
+    vsnprintf(tmp, n, fmt, ap);
     va_end(ap);
 
-    char *tmp = strdup(path), *p = tmp+1;
+    char *p = tmp+1;
     while (*p)
     {
         while (*p && *p!='/') p++;
@@ -91,12 +94,11 @@ void mkdir_p(const char *fmt, ...)
         char ctmp = *p;
         *p = 0;
         int ret = mkdir(tmp,S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH);
-        if ( ret!=0 && errno!=EEXIST ) error("Error creating directory %s: %s\n", path,strerror(errno));
+        if ( ret!=0 && errno!=EEXIST ) error("Error creating directory %s: %s\n", tmp,strerror(errno));
         *p = ctmp;
         while ( *p && *p=='/' ) p++;
     }
     free(tmp);
-    free(path);
 }
 
 /**
@@ -107,7 +109,8 @@ void mkdir_p(const char *fmt, ...)
  *
  *  Returns open file descriptor or NULL if mode is NULL.
  */
-FILE *open_file(char **fname, const char *mode, const char *fmt, ...)
+FILE * HTS_FORMAT(HTS_PRINTF_FMT, 3, 4)
+open_file(char **fname, const char *mode, const char *fmt, ...)
 {
     va_list ap;
     va_start(ap, fmt);
@@ -119,7 +122,7 @@ FILE *open_file(char **fname, const char *mode, const char *fmt, ...)
     vsnprintf(str, n, fmt, ap);
     va_end(ap);
 
-    mkdir_p(str);
+    mkdir_p("%s", str);
     if ( !mode )
     {
         if ( !fname ) error("Uh: expected fname or mode\n");
@@ -152,8 +155,11 @@ void isec_vcf(args_t *args)
         if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
         if (args->record_cmd_line) bcf_hdr_append_version(files->readers[args->iwrite].header,args->argc,args->argv,"bcftools_isec");
         if ( bcf_hdr_write(out_fh, files->readers[args->iwrite].header)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname?args->output_fname:"standard output");
-        if ( args->write_index && init_index(out_fh,files->readers[args->iwrite].header,args->output_fname,&args->index_fn)<0 )
-            error("Error: failed to initialise index for %s\n",args->output_fname?args->output_fname:"standard output");
+        if ( init_index2(out_fh,files->readers[args->iwrite].header,
+                         args->output_fname,&args->index_fn,
+                         args->write_index)<0 )
+            error("Error: failed to initialise index for %s\n",
+                  args->output_fname?args->output_fname:"standard output");
     }
     if ( !args->nwrite && !out_std && !args->prefix )
         fprintf(bcftools_stderr,"Note: -w option not given, printing list of sites...\n");
@@ -456,12 +462,14 @@ static void destroy_data(args_t *args)
         {
             if ( !args->fnames[i] ) continue;
             if ( hts_close(args->fh_out[i])!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->fnames[i]);
-            if ( args->output_type==FT_VCF_GZ )
+            int is_tbi = !args->write_index 
+                      || (args->write_index&127) == HTS_FMT_TBI;
+            if ( args->output_type==FT_VCF_GZ && is_tbi )
             {
                 tbx_conf_t conf = tbx_conf_vcf;
                 tbx_index_build(args->fnames[i], -1, &conf);
             }
-            else if ( args->output_type==FT_BCF_GZ )
+            else if ( args->output_type==FT_BCF_GZ || !is_tbi )
             {
                 if ( bcf_index_build(args->fnames[i],14) ) error("Could not index %s\n", args->fnames[i]);
             }
@@ -486,6 +494,7 @@ static void usage(void)
     fprintf(bcftools_stderr, "    -e, --exclude EXPR             Exclude sites for which the expression is true\n");
     fprintf(bcftools_stderr, "    -f, --apply-filters LIST       Require at least one of the listed FILTER strings (e.g. \"PASS,.\")\n");
     fprintf(bcftools_stderr, "    -i, --include EXPR             Include only sites for which the expression is true\n");
+    fprintf(bcftools_stderr, "    -l, --file-list FILE           Read the input file names from the file\n");
     fprintf(bcftools_stderr, "        --no-version               Do not append version and command line to the header\n");
     fprintf(bcftools_stderr, "    -n, --nfiles [+-=~]INT         Output positions present in this many (=), this many or more (+), this many or fewer (-), the exact (~) files\n");
     fprintf(bcftools_stderr, "    -o, --output FILE              Write output to a file [standard output]\n");
@@ -499,7 +508,7 @@ static void usage(void)
     fprintf(bcftools_stderr, "        --targets-overlap 0|1|2    Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n");
     fprintf(bcftools_stderr, "        --threads INT              Use multithreading with <int> worker threads [0]\n");
     fprintf(bcftools_stderr, "    -w, --write LIST               List of files to write with -p given as 1-based indexes. By default, all files are written\n");
-    fprintf(bcftools_stderr, "        --write-index              Automatically index the output files [off]\n");
+    fprintf(bcftools_stderr, "    -W, --write-index[=FMT]        Automatically index the output files [off]\n");
     fprintf(bcftools_stderr, "\n");
     fprintf(bcftools_stderr, "Examples:\n");
     fprintf(bcftools_stderr, "   # Create intersection and complements of two sets saving the output in dir/*\n");
@@ -543,6 +552,7 @@ int main_vcfisec(int argc, char *argv[])
         {"collapse",required_argument,NULL,'c'},
         {"complement",no_argument,NULL,'C'},
         {"apply-filters",required_argument,NULL,'f'},
+        {"file-list",required_argument,NULL,'l'},
         {"nfiles",required_argument,NULL,'n'},
         {"prefix",required_argument,NULL,'p'},
         {"write",required_argument,NULL,'w'},
@@ -556,11 +566,11 @@ int main_vcfisec(int argc, char *argv[])
         {"output-type",required_argument,NULL,'O'},
         {"threads",required_argument,NULL,9},
         {"no-version",no_argument,NULL,8},
-        {"write-index",no_argument,NULL,10},
+        {"write-index",optional_argument,NULL,'W'},
         {NULL,0,NULL,0}
     };
     char *tmp;
-    while ((c = getopt_long(argc, argv, "hc:r:R:p:n:w:t:T:Cf:o:O:i:e:",loptions,NULL)) >= 0) {
+    while ((c = getopt_long(argc, argv, "hc:r:R:p:n:w:t:T:Cf:o:O:i:e:l:W::",loptions,NULL)) >= 0) {
         switch (c) {
             case 'o': args->output_fname = optarg; break;
             case 'O':
@@ -595,12 +605,16 @@ int main_vcfisec(int argc, char *argv[])
             case 'C':
                 if ( args->isec_op!=0 && args->isec_op!=OP_COMPLEMENT ) error("Error: either -C or -n should be given, not both.\n");
                 args->isec_op = OP_COMPLEMENT; break;
+            case 'l': args->file_list = optarg; break;
             case 'r': args->regions_list = optarg; break;
             case 'R': args->regions_list = optarg; regions_is_file = 1; break;
             case 't': args->targets_list = optarg; break;
             case 'T': args->targets_list = optarg; targets_is_file = 1; break;
             case 'p': args->prefix = optarg; break;
-            case 'w': args->write_files = optarg; break;
+            case 'w':
+                if ( args->write_files ) error("The option -w accepts a list of indices and can be given only once\n");
+                args->write_files = optarg;
+                break;
             case 'i': add_filter(args, optarg, FLT_INCLUDE); break;
             case 'e': add_filter(args, optarg, FLT_EXCLUDE); break;
             case 'n':
@@ -628,13 +642,33 @@ int main_vcfisec(int argc, char *argv[])
                 break;
             case  9 : args->n_threads = strtol(optarg, 0, 0); break;
             case  8 : args->record_cmd_line = 0; break;
-            case 10 : args->write_index = 1; break;
+            case 'W':
+                if (!(args->write_index = write_index_parse(optarg)))
+                    error("Unsupported index format '%s'\n", optarg);
+                break;
             case 'h':
             case '?': usage(); break;
             default: error("Unknown argument: %s\n", optarg);
         }
     }
-    if ( argc-optind<1 ) usage();   // no file given
+    if ( argc-optind<1 && !args->file_list ) usage();   // no file given
+
+    int nfiles = 0,i;
+    char **files = NULL;
+    if ( args->file_list )
+    {
+        files = hts_readlines(args->file_list, &nfiles);
+        if ( !files ) error("Failed to read from %s\n", args->file_list);
+    }
+    if ( optind<argc )
+    {
+        int n = argc - optind;
+        files = (char**)realloc(files,sizeof(*files)*(n+nfiles));
+        for (i=nfiles; i>0; i--) files[n+i-1] = files[n+i-2];
+        for (i=0; i<n; i++) files[i] = strdup(argv[optind+i]);
+        nfiles += n;
+    }
+
     if ( args->targets_list )
     {
         bcf_sr_set_opt(args->files,BCF_SR_TARGETS_OVERLAP,targets_overlap);
@@ -647,7 +681,7 @@ int main_vcfisec(int argc, char *argv[])
         if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 )
             error("Failed to read the regions: %s\n", args->regions_list);
     }
-    if ( argc-optind==2 && !args->isec_op )
+    if ( nfiles==2 && !args->isec_op )
     {
         args->isec_op = OP_VENN;
         if ( !args->prefix ) error("Expected the -p option\n");
@@ -658,11 +692,13 @@ int main_vcfisec(int argc, char *argv[])
         args->isec_n  = 1;
     }
     args->files->require_index = 1;
-    while (optind<argc)
+    for (i=0; i<nfiles; i++)
     {
-        if ( !bcf_sr_add_reader(args->files, argv[optind]) ) error("Failed to open %s: %s\n", argv[optind],bcf_sr_strerror(args->files->errnum));
-        optind++;
+        if ( !bcf_sr_add_reader(args->files, files[i]) ) error("Failed to open %s: %s\n", files[i],bcf_sr_strerror(args->files->errnum));
+        free(files[i]);
     }
+    free(files);
+
     init_data(args);
     isec_vcf(args);
     destroy_data(args);
diff --git a/bcftools/vcfmerge.c b/bcftools/vcfmerge.c
index 87b6b8a39..3ca5f287a 100644
--- a/bcftools/vcfmerge.c
+++ b/bcftools/vcfmerge.c
@@ -1,6 +1,6 @@
 /*  vcfmerge.c -- Merge multiple VCF/BCF files to create one multi-sample file.
 
-    Copyright (C) 2012-2023 Genome Research Ltd.
+    Copyright (C) 2012-2024 Genome Research Ltd.
 
     Author: Petr Danecek <pd3@sanger.ac.uk>
 
@@ -34,6 +34,8 @@ THE SOFTWARE.  */
 #include <htslib/synced_bcf_reader.h>
 #include <htslib/vcfutils.h>
 #include <htslib/faidx.h>
+#include <htslib/kbitset.h>
+#include <htslib/hts_endian.h>
 #include <math.h>
 #include <ctype.h>
 #include <time.h>
@@ -172,7 +174,7 @@ typedef struct
     maux_t *maux;
     regidx_t *regs;    // apply regions only after the blocks are expanded
     regitr_t *regs_itr;
-    int header_only, collapse, output_type, force_samples, merge_by_id, do_gvcf, filter_logic, missing_to_ref, no_index;
+    int header_only, collapse, output_type, force_samples, force_single, merge_by_id, do_gvcf, filter_logic, missing_to_ref, no_index;
     char *header_fname, *output_fname, *regions_list, *info_rules, *file_list;
     faidx_t *gvcf_fai;
     info_rule_t *rules;
@@ -192,6 +194,7 @@ typedef struct
     int keep_AC_AN;
     char *index_fn;
     int write_index;
+    int trim_star_allele;   // 0=don't trim; 1=trim at variant sites; 2=trim at all sites
 }
 args_t;
 
@@ -437,6 +440,11 @@ static void info_rules_init(args_t *args)
             if ( str.l ) kputc(',',&str);
             kputs("QS:sum",&str);
         }
+        if ( args->do_gvcf && bcf_hdr_idinfo_exists(args->out_hdr,BCF_HL_INFO,bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, "MIN_DP")) )
+        {
+            if ( str.l ) kputc(',',&str);
+            kputs("MIN_DP:min",&str);
+        }
         if ( args->do_gvcf && bcf_hdr_idinfo_exists(args->out_hdr,BCF_HL_INFO,bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, "MinDP")) )
         {
             if ( str.l ) kputc(',',&str);
@@ -1272,32 +1280,32 @@ static void merge_AGR_info_tag(bcf_hdr_t *hdr, bcf1_t *line, bcf_info_t *info, i
         if ( len==BCF_VL_A || len==BCF_VL_R )
         {
             int ifrom = len==BCF_VL_A ? 1 : 0;
-            #define BRANCH(type_t, is_missing, is_vector_end, out_type_t) { \
-                type_t *src = (type_t *) info->vptr; \
+            #define BRANCH(type_t, convert, is_missing, is_vector_end, out_type_t) { \
+                uint8_t *src = info->vptr; \
                 out_type_t *tgt = (out_type_t *) agr->buf; \
                 int iori, inew; \
-                for (iori=ifrom; iori<line->n_allele; iori++) \
+                for (iori=ifrom; iori<line->n_allele; iori++, src += sizeof(type_t)) \
                 { \
+                    type_t val = convert(src); \
                     if ( is_vector_end ) break; \
                     if ( is_missing ) continue; \
                     inew = als->map[iori] - ifrom; \
-                    tgt[inew] = *src; \
-                    src++; \
+                    tgt[inew] = val; \
                 } \
             }
             switch (info->type) {
-                case BCF_BT_INT8:  BRANCH(int8_t,  *src==bcf_int8_missing,  *src==bcf_int8_vector_end,  int); break;
-                case BCF_BT_INT16: BRANCH(int16_t, *src==bcf_int16_missing, *src==bcf_int16_vector_end, int); break;
-                case BCF_BT_INT32: BRANCH(int32_t, *src==bcf_int32_missing, *src==bcf_int32_vector_end, int); break;
-                case BCF_BT_FLOAT: BRANCH(float,   bcf_float_is_missing(*src), bcf_float_is_vector_end(*src), float); break;
+                case BCF_BT_INT8:  BRANCH(int8_t,  le_to_i8,  val==bcf_int8_missing,  val==bcf_int8_vector_end,  int); break;
+                case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, val==bcf_int16_missing, val==bcf_int16_vector_end, int); break;
+                case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, val==bcf_int32_missing, val==bcf_int32_vector_end, int); break;
+                case BCF_BT_FLOAT: BRANCH(float,   le_to_float, bcf_float_is_missing(val), bcf_float_is_vector_end(val), float); break;
                 default: fprintf(stderr,"TODO: %s:%d .. info->type=%d\n", __FILE__,__LINE__, info->type); exit(1);
             }
             #undef BRANCH
         }
         else
         {
-            #define BRANCH(type_t, is_missing, is_vector_end, out_type_t) { \
-                type_t *src = (type_t *) info->vptr; \
+            #define BRANCH(type_t, convert, is_missing, is_vector_end, out_type_t) { \
+                uint8_t *src = info->vptr; \
                 out_type_t *tgt = (out_type_t *) agr->buf; \
                 int iori,jori, inew,jnew; \
                 for (iori=0; iori<line->n_allele; iori++) \
@@ -1307,19 +1315,20 @@ static void merge_AGR_info_tag(bcf_hdr_t *hdr, bcf1_t *line, bcf_info_t *info, i
                     { \
                         jnew = als->map[jori]; \
                         int kori = iori*(iori+1)/2 + jori; \
+                        type_t val = convert(&src[kori * sizeof(type_t)]); \
                         if ( is_vector_end ) break; \
                         if ( is_missing ) continue; \
                         int knew = inew>jnew ? inew*(inew+1)/2 + jnew : jnew*(jnew+1)/2 + inew; \
-                        tgt[knew] = src[kori]; \
+                        tgt[knew] = val; \
                     } \
                     if ( jori<=iori ) break; \
                 } \
             }
             switch (info->type) {
-                case BCF_BT_INT8:  BRANCH(int8_t,  src[kori]==bcf_int8_missing,  src[kori]==bcf_int8_vector_end,  int); break;
-                case BCF_BT_INT16: BRANCH(int16_t, src[kori]==bcf_int16_missing, src[kori]==bcf_int16_vector_end, int); break;
-                case BCF_BT_INT32: BRANCH(int32_t, src[kori]==bcf_int32_missing, src[kori]==bcf_int32_vector_end, int); break;
-                case BCF_BT_FLOAT: BRANCH(float,   bcf_float_is_missing(src[kori]), bcf_float_is_vector_end(src[kori]), float); break;
+                case BCF_BT_INT8:  BRANCH(int8_t,  le_to_i8,  val==bcf_int8_missing,  val==bcf_int8_vector_end,  int); break;
+                case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, val==bcf_int16_missing, val==bcf_int16_vector_end, int); break;
+                case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, val==bcf_int32_missing, val==bcf_int32_vector_end, int); break;
+                case BCF_BT_FLOAT: BRANCH(float,   le_to_float, bcf_float_is_missing(val), bcf_float_is_vector_end(val), float); break;
                 default: fprintf(stderr,"TODO: %s:%d .. info->type=%d\n", __FILE__,__LINE__, info->type); exit(1);
             }
             #undef BRANCH
@@ -1488,12 +1497,12 @@ static inline int max_used_gt_ploidy(bcf_fmt_t *fmt, int nsmpl)
 {
     int i,j, max_ploidy = 0;
 
-    #define BRANCH(type_t, vector_end) { \
-        type_t *ptr  = (type_t*) fmt->p; \
+    #define BRANCH(type_t, convert, vector_end) { \
+        uint8_t *ptr  = fmt->p; \
         for (i=0; i<nsmpl; i++) \
         { \
             for (j=0; j<fmt->n; j++) \
-                if ( ptr[j]==vector_end ) break; \
+                if ( convert(&ptr[j * sizeof(type_t)])==vector_end ) break; \
             if ( j==fmt->n ) \
             { \
                 /* all fields were used */ \
@@ -1501,14 +1510,14 @@ static inline int max_used_gt_ploidy(bcf_fmt_t *fmt, int nsmpl)
                 break; \
             } \
             if ( max_ploidy < j ) max_ploidy = j; \
-            ptr += fmt->n; \
+            ptr += fmt->n * sizeof(type_t); \
         } \
     }
     switch (fmt->type)
     {
-        case BCF_BT_INT8:  BRANCH(int8_t,   bcf_int8_vector_end); break;
-        case BCF_BT_INT16: BRANCH(int16_t, bcf_int16_vector_end); break;
-        case BCF_BT_INT32: BRANCH(int32_t, bcf_int32_vector_end); break;
+        case BCF_BT_INT8:  BRANCH(int8_t,  le_to_i8,  bcf_int8_vector_end); break;
+        case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, bcf_int16_vector_end); break;
+        case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, bcf_int32_vector_end); break;
         default: error("Unexpected case: %d\n", fmt->type);
     }
     #undef BRANCH
@@ -1598,19 +1607,22 @@ void init_local_alleles(args_t *args, bcf1_t *out, int ifmt_PL)
         int *map = ma->buf[i].rec[ma->buf[i].cur].map;
         double *allele_prob = ma->tmpd;
         int *idx = ma->tmpi;
-        #define BRANCH(src_type_t, src_is_missing, src_is_vector_end, pl2prob_idx) { \
-            src_type_t *src = (src_type_t*) fmt_ori->p; \
+        #define BRANCH(src_type_t, convert, src_is_missing, src_is_vector_end, pl2prob_idx) { \
+            uint8_t *src = fmt_ori->p; \
             for (j=0; j<nsmpl; j++) \
             { \
                 for (k=0; k<line->n_allele; k++) allele_prob[k] = 0; \
                 for (k=0; k<line->n_allele; k++) \
                     for (l=0; l<=k; l++) \
                     { \
-                        if ( src_is_missing || src_is_vector_end ) { src++; continue; } \
-                        double prob = ma->pl2prob[pl2prob_idx]; \
-                        allele_prob[k] += prob; \
-                        allele_prob[l] += prob; \
-                        src++; \
+                        src_type_t val = convert(src); \
+                        if ( !(src_is_missing) && !(src_is_vector_end) ) \
+                        { \
+                            double prob = ma->pl2prob[pl2prob_idx]; \
+                            allele_prob[k] += prob; \
+                            allele_prob[l] += prob; \
+                        } \
+                        src += sizeof(src_type_t); \
                     } \
                 /* insertion sort by allele probability, descending order, with the twist that REF (idx=0) always comes first */ \
                 allele_prob++; idx[0] = -1; idx++; /* keep REF first */ \
@@ -1637,9 +1649,9 @@ void init_local_alleles(args_t *args, bcf1_t *out, int ifmt_PL)
         }
         switch (fmt_ori->type)
         {
-            case BCF_BT_INT8:  BRANCH( int8_t, *src==bcf_int8_missing,  *src==bcf_int8_vector_end,  *src); break;
-            case BCF_BT_INT16: BRANCH(int16_t, *src==bcf_int16_missing, *src==bcf_int16_vector_end, *src>=0 && *src<PL2PROB_MAX ? *src : PL2PROB_MAX-1); break;
-            case BCF_BT_INT32: BRANCH(int32_t, *src==bcf_int32_missing, *src==bcf_int32_vector_end, *src>=0 && *src<PL2PROB_MAX ? *src : PL2PROB_MAX-1); break;
+            case BCF_BT_INT8:  BRANCH( int8_t, le_to_i8,  val==bcf_int8_missing,  val==bcf_int8_vector_end,  val); break;
+            case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, val==bcf_int16_missing, val==bcf_int16_vector_end, val>=0 && val<PL2PROB_MAX ? val : PL2PROB_MAX-1); break;
+            case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, val==bcf_int32_missing, val==bcf_int32_vector_end, val>=0 && val<PL2PROB_MAX ? val : PL2PROB_MAX-1); break;
             default: error("Unexpected case: %d, PL\n", fmt_ori->type);
         }
         #undef BRANCH
@@ -1735,8 +1747,8 @@ void merge_GT(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out)
             continue;
         }
 
-        #define BRANCH(type_t, vector_end) { \
-            type_t *p_ori  = (type_t*) fmt_ori->p; \
+        #define BRANCH(type_t, convert, vector_end) { \
+            uint8_t *p_ori = fmt_ori->p; \
             if ( !ma->buf[i].rec[irec].als_differ ) \
             { \
                 /* the allele numbering is unchanged */ \
@@ -1744,14 +1756,15 @@ void merge_GT(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out)
                 { \
                     for (k=0; k<fmt_ori->n; k++) \
                     { \
-                        if ( p_ori[k]==vector_end ) break; /* smaller ploidy */ \
+                        type_t val = convert(&p_ori[k * sizeof(type_t)]); \
+                        if ( val==vector_end ) break; /* smaller ploidy */ \
                         ma->smpl_ploidy[ismpl+j]++; \
-                        if ( bcf_gt_is_missing(p_ori[k]) ) tmp[k] = 0; /* missing allele */ \
-                        else tmp[k] = p_ori[k]; \
+                        if ( bcf_gt_is_missing(val) ) tmp[k] = 0; /* missing allele */ \
+                        else tmp[k] = val; \
                     } \
                     for (; k<nsize; k++) tmp[k] = bcf_int32_vector_end; \
                     tmp += nsize; \
-                    p_ori += fmt_ori->n; \
+                    p_ori += fmt_ori->n * sizeof(type_t); \
                 } \
                 ismpl += bcf_hdr_nsamples(hdr); \
                 continue; \
@@ -1761,27 +1774,28 @@ void merge_GT(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out)
             { \
                 for (k=0; k<fmt_ori->n; k++) \
                 { \
-                    if ( p_ori[k]==vector_end ) break; /* smaller ploidy */ \
+                    type_t val = convert(&p_ori[k * sizeof(type_t)]); \
+                    if ( val==vector_end ) break; /* smaller ploidy */ \
                     ma->smpl_ploidy[ismpl+j]++; \
-                    if ( bcf_gt_is_missing(p_ori[k]) ) tmp[k] = 0; /* missing allele */ \
+                    if ( bcf_gt_is_missing(val) ) tmp[k] = 0; /* missing allele */ \
                     else \
                     { \
-                        int al = (p_ori[k]>>1) - 1; \
+                        int al = (val>>1) - 1; \
                         al = al<=0 ? al + 1 : ma->buf[i].rec[irec].map[al] + 1; \
-                        tmp[k] = (al << 1) | ((p_ori[k])&1); \
+                        tmp[k] = (al << 1) | ((val)&1); \
                     } \
                 } \
                 for (; k<nsize; k++) tmp[k] = bcf_int32_vector_end; \
                 tmp += nsize; \
-                p_ori += fmt_ori->n; \
+                p_ori += fmt_ori->n * sizeof(type_t); \
             } \
             ismpl += bcf_hdr_nsamples(hdr); \
         }
         switch (fmt_ori->type)
         {
-            case BCF_BT_INT8: BRANCH(int8_t,   bcf_int8_vector_end); break;
-            case BCF_BT_INT16: BRANCH(int16_t, bcf_int16_vector_end); break;
-            case BCF_BT_INT32: BRANCH(int32_t, bcf_int32_vector_end); break;
+            case BCF_BT_INT8: BRANCH(int8_t,   le_to_i8,  bcf_int8_vector_end); break;
+            case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, bcf_int16_vector_end); break;
+            case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, bcf_int32_vector_end); break;
             default: error("Unexpected case: %d\n", fmt_ori->type);
         }
         #undef BRANCH
@@ -1959,10 +1973,10 @@ void merge_localized_numberG_format_field(args_t *args, bcf_fmt_t **fmt_map, bcf
         if ( 2*fmt_ori->n!=line->n_allele*(line->n_allele+1) ) error("Todo: localization of missing or haploid Number=G tags\n");
 
         // localize
-        #define BRANCH(tgt_type_t, src_type_t, src_is_missing, src_is_vector_end, tgt_set_missing, tgt_set_vector_end) { \
+        #define BRANCH(tgt_type_t, src_type_t, convert, src_is_missing, src_is_vector_end, tgt_set_missing, tgt_set_vector_end) { \
             for (j=0; j<nsmpl; j++) \
             { \
-                src_type_t *src = (src_type_t*) fmt_ori->p + j*fmt_ori->n; \
+                uint8_t *src = fmt_ori->p + sizeof(src_type_t)*j*fmt_ori->n; \
                 tgt_type_t *tgt = (tgt_type_t *) ma->tmp_arr + ismpl*nsize; \
                 int *laa = ma->laa + (1+args->local_alleles)*ismpl; \
                 int ii,ij,tgt_idx = 0; \
@@ -1972,9 +1986,10 @@ void merge_localized_numberG_format_field(args_t *args, bcf_fmt_t **fmt_map, bcf
                     for (ij=0; ij<=ii; ij++) \
                     { \
                         int src_idx = bcf_alleles2gt(laa[ii],laa[ij]); \
+                        src_type_t val = convert(&src[src_idx * sizeof(src_type_t)]); \
                         if ( src_is_missing ) tgt_set_missing; \
                         else if ( src_is_vector_end ) break; \
-                        else tgt[tgt_idx] = src[src_idx]; \
+                        else tgt[tgt_idx] = val; \
                         tgt_idx++; \
                     } \
                 } \
@@ -1985,10 +2000,10 @@ void merge_localized_numberG_format_field(args_t *args, bcf_fmt_t **fmt_map, bcf
         }
         switch (fmt_ori->type)
         {
-            case BCF_BT_INT8:  BRANCH(int32_t,  int8_t, src[src_idx]==bcf_int8_missing,  src[src_idx]==bcf_int8_vector_end,  tgt[tgt_idx]=bcf_int32_missing, tgt[tgt_idx]=bcf_int32_vector_end); break;
-            case BCF_BT_INT16: BRANCH(int32_t, int16_t, src[src_idx]==bcf_int16_missing, src[src_idx]==bcf_int16_vector_end, tgt[tgt_idx]=bcf_int32_missing, tgt[tgt_idx]=bcf_int32_vector_end); break;
-            case BCF_BT_INT32: BRANCH(int32_t, int32_t, src[src_idx]==bcf_int32_missing, src[src_idx]==bcf_int32_vector_end, tgt[tgt_idx]=bcf_int32_missing, tgt[tgt_idx]=bcf_int32_vector_end); break;
-            case BCF_BT_FLOAT: BRANCH(float, float, bcf_float_is_missing(src[src_idx]), bcf_float_is_vector_end(src[src_idx]), bcf_float_set_missing(tgt[tgt_idx]), bcf_float_set_vector_end(tgt[tgt_idx])); break;
+            case BCF_BT_INT8:  BRANCH(int32_t, int8_t,  le_to_i8,  val==bcf_int8_missing,  val==bcf_int8_vector_end,  tgt[tgt_idx]=bcf_int32_missing, tgt[tgt_idx]=bcf_int32_vector_end); break;
+            case BCF_BT_INT16: BRANCH(int32_t, int16_t, le_to_i16, val==bcf_int16_missing, val==bcf_int16_vector_end, tgt[tgt_idx]=bcf_int32_missing, tgt[tgt_idx]=bcf_int32_vector_end); break;
+            case BCF_BT_INT32: BRANCH(int32_t, int32_t, le_to_i16, val==bcf_int32_missing, val==bcf_int32_vector_end, tgt[tgt_idx]=bcf_int32_missing, tgt[tgt_idx]=bcf_int32_vector_end); break;
+            case BCF_BT_FLOAT: BRANCH(float, float, le_to_float, bcf_float_is_missing(val), bcf_float_is_vector_end(val), bcf_float_set_missing(tgt[tgt_idx]), bcf_float_set_vector_end(tgt[tgt_idx])); break;
             default: error("Unexpected case: %d, %s\n", fmt_ori->type, key);
         }
         #undef BRANCH
@@ -2058,10 +2073,10 @@ void merge_localized_numberAR_format_field(args_t *args, bcf_fmt_t **fmt_map, bc
         }
 
         // localize
-        #define BRANCH(tgt_type_t, src_type_t, src_is_missing, src_is_vector_end, tgt_set_missing, tgt_set_vector_end) { \
+        #define BRANCH(tgt_type_t, src_type_t, convert, src_is_missing, src_is_vector_end, tgt_set_missing, tgt_set_vector_end) { \
             for (j=0; j<nsmpl; j++) \
             { \
-                src_type_t *src = (src_type_t*) fmt_ori->p + j*fmt_ori->n; \
+                uint8_t *src = fmt_ori->p + sizeof(src_type_t)*j*fmt_ori->n; \
                 tgt_type_t *tgt = (tgt_type_t *) ma->tmp_arr + ismpl*nsize; \
                 int *laa = ma->laa + (1+args->local_alleles)*ismpl; \
                 int ii,tgt_idx = 0; \
@@ -2069,9 +2084,10 @@ void merge_localized_numberAR_format_field(args_t *args, bcf_fmt_t **fmt_map, bc
                 { \
                     if ( laa[ii]==bcf_int32_missing || laa[ii]==bcf_int32_vector_end ) break; \
                     int src_idx = laa[ii] - ibeg; \
+                    src_type_t val = convert(&src[src_idx * sizeof(src_type_t)]); \
                     if ( src_is_missing ) tgt_set_missing; \
                     else if ( src_is_vector_end ) break; \
-                    else tgt[tgt_idx] = src[src_idx]; \
+                    else tgt[tgt_idx] = val; \
                     tgt_idx++; \
                 } \
                 if ( !tgt_idx ) { tgt_set_missing; tgt_idx++; } \
@@ -2081,10 +2097,10 @@ void merge_localized_numberAR_format_field(args_t *args, bcf_fmt_t **fmt_map, bc
         }
         switch (fmt_ori->type)
         {
-            case BCF_BT_INT8:  BRANCH(int32_t,  int8_t, src[src_idx]==bcf_int8_missing,  src[src_idx]==bcf_int8_vector_end,  tgt[tgt_idx]=bcf_int32_missing, tgt[tgt_idx]=bcf_int32_vector_end); break;
-            case BCF_BT_INT16: BRANCH(int32_t, int16_t, src[src_idx]==bcf_int16_missing, src[src_idx]==bcf_int16_vector_end, tgt[tgt_idx]=bcf_int32_missing, tgt[tgt_idx]=bcf_int32_vector_end); break;
-            case BCF_BT_INT32: BRANCH(int32_t, int32_t, src[src_idx]==bcf_int32_missing, src[src_idx]==bcf_int32_vector_end, tgt[tgt_idx]=bcf_int32_missing, tgt[tgt_idx]=bcf_int32_vector_end); break;
-            case BCF_BT_FLOAT: BRANCH(float, float, bcf_float_is_missing(src[src_idx]), bcf_float_is_vector_end(src[src_idx]), bcf_float_set_missing(tgt[tgt_idx]), bcf_float_set_vector_end(tgt[tgt_idx])); break;
+            case BCF_BT_INT8:  BRANCH(int32_t, int8_t,  le_to_i8,  val==bcf_int8_missing,  val==bcf_int8_vector_end,  tgt[tgt_idx]=bcf_int32_missing, tgt[tgt_idx]=bcf_int32_vector_end); break;
+            case BCF_BT_INT16: BRANCH(int32_t, int16_t, le_to_i16, val==bcf_int16_missing, val==bcf_int16_vector_end, tgt[tgt_idx]=bcf_int32_missing, tgt[tgt_idx]=bcf_int32_vector_end); break;
+            case BCF_BT_INT32: BRANCH(int32_t, int32_t, le_to_i32, val==bcf_int32_missing, val==bcf_int32_vector_end, tgt[tgt_idx]=bcf_int32_missing, tgt[tgt_idx]=bcf_int32_vector_end); break;
+            case BCF_BT_FLOAT: BRANCH(float, float, le_to_float, bcf_float_is_missing(val), bcf_float_is_vector_end(val), bcf_float_set_missing(tgt[tgt_idx]), bcf_float_set_vector_end(tgt[tgt_idx])); break;
             default: error("Unexpected case: %d, %s\n", fmt_ori->type, key);
         }
         #undef BRANCH
@@ -2201,7 +2217,7 @@ void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, missing_rule_t *mrule
         }
 
         // set the values
-        #define BRANCH(tgt_type_t, src_type_t, src_is_missing, src_is_vector_end, tgt_set_missing, tgt_set_vector_end) { \
+        #define BRANCH(tgt_type_t, src_type_t, convert, src_is_missing, src_is_vector_end, tgt_set_missing, tgt_set_vector_end) { \
             int j, l, k; \
             tgt_type_t *tgt = (tgt_type_t *) ma->tmp_arr + ismpl*nsize; \
             if ( !fmt_ori ) \
@@ -2214,7 +2230,7 @@ void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, missing_rule_t *mrule
                 ismpl += bcf_hdr_nsamples(hdr); \
                 continue; \
             } \
-            src_type_t *src = (src_type_t*) fmt_ori->p; \
+            uint8_t *src = fmt_ori->p; \
             if ( (length!=BCF_VL_G && length!=BCF_VL_A && length!=BCF_VL_R) || (line->n_allele==out->n_allele && !ma->buf[i].rec[irec].als_differ) ) \
             { \
                 /* alleles unchanged, copy over */ \
@@ -2224,11 +2240,11 @@ void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, missing_rule_t *mrule
                     { \
                         if ( src_is_vector_end ) break; \
                         else if ( src_is_missing ) tgt_set_missing; \
-                        else *tgt = *src; \
-                        tgt++; src++; \
+                        else *tgt = convert(src); \
+                        tgt++; src += sizeof(src_type_t); \
                     } \
                     for (k=l; k<nsize; k++) { tgt_set_vector_end; tgt++; } \
-                    src += fmt_ori->n - l; \
+                    src += sizeof(src_type_t) * (fmt_ori->n - l); \
                 } \
                 ismpl += bcf_hdr_nsamples(hdr); \
                 continue; \
@@ -2240,8 +2256,13 @@ void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, missing_rule_t *mrule
                 for (j=0; j<bcf_hdr_nsamples(hdr); j++) \
                 { \
                     tgt = (tgt_type_t *) ma->tmp_arr + (ismpl+j)*nsize; \
-                    src = (src_type_t*) fmt_ori->p + j*fmt_ori->n; \
-                    if ( (src_is_missing && fmt_ori->n==1) || (++src && src_is_vector_end) ) \
+                    src = fmt_ori->p + sizeof(src_type_t) * j * fmt_ori->n; \
+                    int tag_missing = src_is_missing && fmt_ori->n==1;  \
+                    if ( src_is_missing && fmt_ori->n>1 ) { \
+                        src += sizeof(src_type_t); \
+                        tag_missing = src_is_vector_end ; \
+                    } \
+                    if ( tag_missing ) \
                     { \
                         /* tag with missing value "." */ \
                         tgt_set_missing; \
@@ -2252,9 +2273,10 @@ void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, missing_rule_t *mrule
                     int ngsize = haploid ? out->n_allele : out->n_allele*(out->n_allele + 1)/2; \
                     if ( ma->buf[i].unkn_allele )  /* Use value from the unknown allele when available */ \
                     {  \
-                        src = (src_type_t*) fmt_ori->p + j*fmt_ori->n; \
+                        src = fmt_ori->p + sizeof(src_type_t)*j*fmt_ori->n; \
                         int iunkn = haploid ? ma->buf[i].unkn_allele : (ma->buf[i].unkn_allele+1)*(ma->buf[i].unkn_allele + 2)/2 - 1; \
-                        for (l=0; l<ngsize; l++) { *tgt = src[iunkn]; tgt++; } \
+                        src_type_t val = convert(&src[iunkn * sizeof(src_type_t)]); \
+                        for (l=0; l<ngsize; l++) { *tgt = val; tgt++; } \
                     } \
                     else if ( mrule && mrule->type==MERGE_MISSING_CONST ) \
                     { \
@@ -2262,9 +2284,13 @@ void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, missing_rule_t *mrule
                     } \
                     else if ( mrule && mrule->type==MERGE_MISSING_MAX ) \
                     { \
-                        src = (src_type_t*) fmt_ori->p + j*fmt_ori->n; \
-                        src_type_t max = src[0]; \
-                        for (l=1; l<fmt_ori->n; l++) if ( max < src[l] ) max = src[l]; \
+                        src = fmt_ori->p + sizeof(src_type_t)*j*fmt_ori->n; \
+                        src_type_t max = convert(src); \
+                        for (l=1; l<fmt_ori->n; l++) \
+                        { \
+                            src_type_t val = convert(&src[l * sizeof(src_type_t)]); \
+                            if ( max < val ) max = val; \
+                        } \
                         for (l=0; l<ngsize; l++) { *tgt = max; tgt++; } \
                     } \
                     else \
@@ -2278,11 +2304,11 @@ void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, missing_rule_t *mrule
                         for (iori=0; iori<line->n_allele; iori++) \
                         { \
                             inew = ma->buf[i].rec[irec].map[iori]; \
-                            src = (src_type_t*) fmt_ori->p + j*fmt_ori->n + iori; \
+                            src = fmt_ori->p + (j*fmt_ori->n + iori) * sizeof(src_type_t); \
                             tgt = (tgt_type_t *) ma->tmp_arr + (ismpl+j)*nsize + inew; \
                             if ( src_is_vector_end ) break; \
                             if ( src_is_missing ) tgt_set_missing; \
-                            else *tgt = *src; \
+                            else *tgt = convert(src); \
                         } \
                     } \
                     else \
@@ -2297,7 +2323,7 @@ void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, missing_rule_t *mrule
                                 jnew = ma->buf[i].rec[irec].map[jori]; \
                                 int kori = iori*(iori+1)/2 + jori; \
                                 int knew = inew>jnew ? inew*(inew+1)/2 + jnew : jnew*(jnew+1)/2 + inew; \
-                                src = (src_type_t*) fmt_ori->p + j*fmt_ori->n + kori; \
+                                src = fmt_ori->p + (j*fmt_ori->n + kori) * sizeof(src_type_t); \
                                 tgt = (tgt_type_t *) ma->tmp_arr + (ismpl+j)*nsize + knew; \
                                 if ( src_is_vector_end ) \
                                 { \
@@ -2305,7 +2331,7 @@ void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, missing_rule_t *mrule
                                     break; \
                                 } \
                                 if ( src_is_missing ) tgt_set_missing; \
-                                else *tgt = *src; \
+                                else *tgt = convert(src); \
                             } \
                         } \
                     } \
@@ -2318,19 +2344,25 @@ void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, missing_rule_t *mrule
                 for (j=0; j<bcf_hdr_nsamples(hdr); j++) \
                 { \
                     tgt = (tgt_type_t *) ma->tmp_arr + (ismpl+j)*nsize; \
-                    src = (src_type_t*) (fmt_ori->p + j*fmt_ori->size); \
-                    if ( (src_is_missing && fmt_ori->n==1) || (++src && src_is_vector_end) ) \
+                    src = fmt_ori->p + j*fmt_ori->size; \
+                    int tag_missing = src_is_missing && fmt_ori->n==1;  \
+                    if ( src_is_missing && fmt_ori->n>1 ) { \
+                        src += sizeof(src_type_t); \
+                        tag_missing = src_is_vector_end ; \
+                    } \
+                    if ( tag_missing ) \
                     { \
                         /* tag with missing value "." */ \
                         tgt_set_missing; \
                         for (l=1; l<nsize; l++) { tgt++; tgt_set_vector_end; } \
                         continue; \
                     } \
-                    src = (src_type_t*) (fmt_ori->p + j*fmt_ori->size); \
+                    src = fmt_ori->p + j*fmt_ori->size; \
                     if ( ma->buf[i].unkn_allele )  /* Use value from the unknown allele when available */ \
                     { \
                         int iunkn = ma->buf[i].unkn_allele; \
-                        for (l=0; l<nsize; l++) { *tgt = src[iunkn]; tgt++; } \
+                        src_type_t val = convert(&src[iunkn * sizeof(src_type_t)]); \
+                        for (l=0; l<nsize; l++) { *tgt = val; tgt++; } \
                     } \
                     else if ( mrule && mrule->type==MERGE_MISSING_CONST ) \
                     { \
@@ -2338,9 +2370,13 @@ void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, missing_rule_t *mrule
                     } \
                     else if ( mrule && mrule->type==MERGE_MISSING_MAX ) \
                     { \
-                        src = (src_type_t*) fmt_ori->p + j*fmt_ori->n; \
-                        src_type_t max = src[0]; \
-                        for (l=1; l<fmt_ori->n; l++) if ( max < src[l] ) max = src[l]; \
+                        src = fmt_ori->p + sizeof(src_type_t)*j*fmt_ori->n; \
+                        src_type_t max = convert(src); \
+                        for (l=1; l<fmt_ori->n; l++) \
+                        { \
+                            src_type_t val = convert(&src[l * sizeof(src_type_t)]); \
+                            if ( max < val ) max = val; \
+                        } \
                         for (l=0; l<nsize; l++) { *tgt = max; tgt++; } \
                     } \
                     else \
@@ -2354,8 +2390,8 @@ void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, missing_rule_t *mrule
                         tgt = (tgt_type_t *) ma->tmp_arr + (ismpl+j)*nsize + inew; \
                         if ( src_is_vector_end ) break; \
                         if ( src_is_missing ) tgt_set_missing; \
-                        else *tgt = *src; \
-                        src++; \
+                        else *tgt = convert(src); \
+                        src += sizeof(src_type_t); \
                     } \
                 } \
             } \
@@ -2363,10 +2399,10 @@ void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, missing_rule_t *mrule
         }
         switch (type)
         {
-            case BCF_BT_INT8:  BRANCH(int32_t,  int8_t, *src==bcf_int8_missing,  *src==bcf_int8_vector_end,  *tgt=bcf_int32_missing, *tgt=bcf_int32_vector_end); break;
-            case BCF_BT_INT16: BRANCH(int32_t, int16_t, *src==bcf_int16_missing, *src==bcf_int16_vector_end, *tgt=bcf_int32_missing, *tgt=bcf_int32_vector_end); break;
-            case BCF_BT_INT32: BRANCH(int32_t, int32_t, *src==bcf_int32_missing, *src==bcf_int32_vector_end, *tgt=bcf_int32_missing, *tgt=bcf_int32_vector_end); break;
-            case BCF_BT_FLOAT: BRANCH(float, float, bcf_float_is_missing(*src), bcf_float_is_vector_end(*src), bcf_float_set_missing(*tgt), bcf_float_set_vector_end(*tgt)); break;
+            case BCF_BT_INT8:  BRANCH(int32_t, int8_t,  le_to_i8,  le_to_i8(src)==bcf_int8_missing,  le_to_i8(src)==bcf_int8_vector_end,  *tgt=bcf_int32_missing, *tgt=bcf_int32_vector_end); break;
+            case BCF_BT_INT16: BRANCH(int32_t, int16_t, le_to_i16, le_to_i16(src)==bcf_int16_missing, le_to_i16(src)==bcf_int16_vector_end, *tgt=bcf_int32_missing, *tgt=bcf_int32_vector_end); break;
+            case BCF_BT_INT32: BRANCH(int32_t, int32_t, le_to_i32, le_to_i32(src)==bcf_int32_missing, le_to_i32(src)==bcf_int32_vector_end, *tgt=bcf_int32_missing, *tgt=bcf_int32_vector_end); break;
+            case BCF_BT_FLOAT: BRANCH(float, float, le_to_float, bcf_float_is_missing(le_to_float(src)), bcf_float_is_vector_end(le_to_float(src)), bcf_float_set_missing(*tgt), bcf_float_set_vector_end(*tgt)); break;
             default: error("Unexpected case: %d, %s\n", type, key);
         }
         #undef BRANCH
@@ -2582,10 +2618,20 @@ void gvcf_write_block(args_t *args, int start, int end)
     }
     else
         bcf_update_info_int32(args->out_hdr, out, "END", NULL, 0);
+
+    int iunseen;
+    if ( args->trim_star_allele && (out->n_allele > 2 || args->trim_star_allele > 1) && (iunseen=get_unseen_allele(out)) && iunseen>0 )
+    {
+        // the unobserved star allele should be trimmed, either it is variant site or trimming of all sites was requested
+        kbitset_t *rm_set = kbs_init(out->n_allele);
+        kbs_insert(rm_set, iunseen);
+        if ( bcf_remove_allele_set(args->out_hdr,out,rm_set) )
+            error("[%s] Error: failed to trim the unobserved allele at %s:%"PRIhts_pos"\n",__func__,bcf_seqname(args->out_hdr,out),out->pos+1);
+        kbs_destroy(rm_set);
+    }
     if ( bcf_write1(args->out_fh, args->out_hdr, out)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
     bcf_clear1(out);
 
-
     // Inactivate blocks which do not extend beyond END and find new gvcf_min
     min = INT_MAX;
     for (i=0; i<args->files->nreaders; i++)
@@ -3215,6 +3261,16 @@ void merge_line(args_t *args)
     if ( args->do_gvcf )
         bcf_update_info_int32(args->out_hdr, out, "END", NULL, 0);
     merge_format(args, out);
+    int iunseen;
+    if ( args->trim_star_allele && (out->n_allele > 2 || args->trim_star_allele > 1) && (iunseen=get_unseen_allele(out)) && iunseen>0 )
+    {
+        // the unobserved star allele should be trimmed, either it is variant site or trimming of all sites was requested
+        kbitset_t *rm_set = kbs_init(out->n_allele);
+        kbs_insert(rm_set, iunseen);
+        if ( bcf_remove_allele_set(args->out_hdr,out,rm_set) )
+            error("[%s] Error: failed to trim the unobserved allele at %s:%"PRIhts_pos"\n",__func__,bcf_seqname(args->out_hdr,out),out->pos+1);
+        kbs_destroy(rm_set);
+    }
     if ( bcf_write1(args->out_fh, args->out_hdr, out)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
     bcf_clear1(out);
 }
@@ -3344,9 +3400,11 @@ void merge_vcf(args_t *args)
         if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname);
         return;
     }
-    else if ( args->write_index && init_index(args->out_fh,args->out_hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname);
+    else if ( init_index2(args->out_fh,args->out_hdr,args->output_fname,
+                          &args->index_fn, args->write_index)<0 )
+        error("Error: failed to initialise index for %s\n",args->output_fname);
 
-    if ( args->collapse==COLLAPSE_NONE ) args->vcmp = vcmp_init();
+    args->vcmp = vcmp_init();
     args->maux = maux_init(args);
     args->out_line = bcf_init1();
     args->tmph = kh_init(strdict);
@@ -3408,17 +3466,19 @@ static void usage(void)
     fprintf(stderr, "Usage:   bcftools merge [options] <A.vcf.gz> <B.vcf.gz> [...]\n");
     fprintf(stderr, "\n");
     fprintf(stderr, "Options:\n");
+    fprintf(stderr, "        --force-no-index              Merge unindexed files, synonymous to --no-index\n");
     fprintf(stderr, "        --force-samples               Resolve duplicate sample names\n");
+    fprintf(stderr, "        --force-single                Run even if there is only one file on input\n");
     fprintf(stderr, "        --print-header                Print only the merged header and exit\n");
     fprintf(stderr, "        --use-header FILE             Use the provided header\n");
     fprintf(stderr, "    -0  --missing-to-ref              Assume genotypes at missing sites are 0/0\n");
     fprintf(stderr, "    -f, --apply-filters LIST          Require at least one of the listed FILTER strings (e.g. \"PASS,.\")\n");
     fprintf(stderr, "    -F, --filter-logic x|+            Remove filters if some input is PASS (\"x\"), or apply all filters (\"+\") [+]\n");
-    fprintf(stderr, "    -g, --gvcf -|REF.FA               Merge gVCF blocks, INFO/END tag is expected. Implies -i QS:sum,MinDP:min,I16:sum,IDV:max,IMF:max -M PL:max,AD:0\n");
+    fprintf(stderr, "    -g, --gvcf -|REF.FA               Merge gVCF blocks, INFO/END tag is expected. Implies -i QS:sum,MinDP:min,MIN_DP:min,I16:sum,IDV:max,IMF:max -M PL:max,AD:0\n");
     fprintf(stderr, "    -i, --info-rules TAG:METHOD,..    Rules for merging INFO fields (method is one of sum,avg,min,max,join) or \"-\" to turn off the default [DP:sum,DP4:sum]\n");
     fprintf(stderr, "    -l, --file-list FILE              Read file names from the file\n");
-    fprintf(stderr, "    -L, --local-alleles INT           EXPERIMENTAL: if more than <int> ALT alleles are encountered, drop FMT/PL and output LAA+LPL instead; 0=unlimited [0]\n");
-    fprintf(stderr, "    -m, --merge STRING                Allow multiallelic records for <snps|indels|both|snp-ins-del|all|none|id>, see man page for details [both]\n");
+    fprintf(stderr, "    -L, --local-alleles INT           If more than INT alt alleles are encountered, drop FMT/PL and output LAA+LPL instead; 0=unlimited [0]\n");
+    fprintf(stderr, "    -m, --merge STRING[*|**]          Allow multiallelic records for snps,indels,both,snp-ins-del,all,none,id,*,**; see man page for details [both]\n");
     fprintf(stderr, "    -M, --missing-rules TAG:METHOD    Rules for replacing missing values in numeric vectors (.,0,max) when unknown allele <*> is not present [.]\n");
     fprintf(stderr, "        --no-index                    Merge unindexed files, the same chromosomal order is required and -r/-R are not allowed\n");
     fprintf(stderr, "        --no-version                  Do not append version and command line to the header\n");
@@ -3427,8 +3487,8 @@ static void usage(void)
     fprintf(stderr, "    -r, --regions REGION              Restrict to comma-separated list of regions\n");
     fprintf(stderr, "    -R, --regions-file FILE           Restrict to regions listed in a file\n");
     fprintf(stderr, "        --regions-overlap 0|1|2       Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n");
-    fprintf(stderr, "        --threads INT                 Use multithreading with <int> worker threads [0]\n");
-    fprintf(stderr, "        --write-index                 Automatically index the output files [off]\n");
+    fprintf(stderr, "        --threads INT                 Use multithreading with INT worker threads [0]\n");
+    fprintf(stderr, "    -W, --write-index[=FMT]           Automatically index the output files [off]\n");
     fprintf(stderr, "\n");
     exit(1);
 }
@@ -3470,12 +3530,14 @@ int main_vcfmerge(int argc, char *argv[])
         {"missing-rules",required_argument,NULL,'M'},
         {"no-version",no_argument,NULL,8},
         {"no-index",no_argument,NULL,10},
+        {"force-no-index",no_argument,NULL,10},
+        {"force-single",no_argument,NULL,12},
         {"filter-logic",required_argument,NULL,'F'},
-        {"write-index",no_argument,NULL,11},
+        {"write-index",optional_argument,NULL,'W'},
         {NULL,0,NULL,0}
     };
     char *tmp;
-    while ((c = getopt_long(argc, argv, "hm:f:r:R:o:O:i:M:l:g:F:0L:",loptions,NULL)) >= 0) {
+    while ((c = getopt_long(argc, argv, "hm:f:r:R:o:O:i:M:l:g:F:0L:W::",loptions,NULL)) >= 0) {
         switch (c) {
             case 'L':
                 args->local_alleles = strtol(optarg,&tmp,10);
@@ -3520,17 +3582,23 @@ int main_vcfmerge(int argc, char *argv[])
                 }
                 break;
             case 'm':
+            {
+                int len = strlen(optarg);
+                if ( optarg[len-1]=='*' ) { args->trim_star_allele++; len--; }
+                if ( optarg[len-1]=='*' ) { args->trim_star_allele++; len--; }
+                if ( optarg[len-1]==',' ) len--;
                 args->collapse = COLLAPSE_NONE;
-                if ( !strcmp(optarg,"snps") ) args->collapse |= COLLAPSE_SNPS;
-                else if ( !strcmp(optarg,"indels") ) args->collapse |= COLLAPSE_INDELS;
-                else if ( !strcmp(optarg,"both") ) args->collapse |= COLLAPSE_BOTH;
-                else if ( !strcmp(optarg,"any") ) args->collapse |= COLLAPSE_ANY;
-                else if ( !strcmp(optarg,"all") ) args->collapse |= COLLAPSE_ANY;
-                else if ( !strcmp(optarg,"none") ) args->collapse = COLLAPSE_NONE;
-                else if ( !strcmp(optarg,"snp-ins-del") ) args->collapse = COLLAPSE_SNP_INS_DEL|COLLAPSE_SNPS;
-                else if ( !strcmp(optarg,"id") ) { args->collapse = COLLAPSE_NONE; args->merge_by_id = 1; }
+                if ( !strncmp(optarg,"snp-ins-del",len) ) args->collapse = COLLAPSE_SNP_INS_DEL|COLLAPSE_SNPS;
+                else if ( !strncmp(optarg,"snps",len) ) args->collapse |= COLLAPSE_SNPS;
+                else if ( !strncmp(optarg,"indels",len) ) args->collapse |= COLLAPSE_INDELS;
+                else if ( !strncmp(optarg,"id",len) ) { args->collapse = COLLAPSE_NONE; args->merge_by_id = 1; }
+                else if ( !strncmp(optarg,"any",len) ) args->collapse |= COLLAPSE_ANY;
+                else if ( !strncmp(optarg,"all",len) ) args->collapse |= COLLAPSE_ANY;
+                else if ( !strncmp(optarg,"both",len) ) args->collapse |= COLLAPSE_BOTH;
+                else if ( !strncmp(optarg,"none",len) ) args->collapse = COLLAPSE_NONE;
                 else error("The -m type \"%s\" is not recognised.\n", optarg);
                 break;
+            }
             case 'f': args->files->apply_filters = optarg; break;
             case 'r': args->regions_list = optarg; break;
             case 'R': args->regions_list = optarg; regions_is_file = 1; break;
@@ -3544,15 +3612,17 @@ int main_vcfmerge(int argc, char *argv[])
             case  9 : args->n_threads = strtol(optarg, 0, 0); break;
             case  8 : args->record_cmd_line = 0; break;
             case 10 : args->no_index = 1; break;
-            case 11 : args->write_index = 1; break;
+            case 'W':
+                if (!(args->write_index = write_index_parse(optarg)))
+                    error("Unsupported index format '%s'\n", optarg);
+                break;
+            case 12 : args->force_single = 1; break;
             case 'h':
             case '?': usage(); break;
             default: error("Unknown argument: %s\n", optarg);
         }
     }
     if ( argc==optind && !args->file_list ) usage();
-    if ( argc-optind<2 && !args->file_list ) usage();
-
     if ( args->no_index )
     {
         if ( args->regions_list ) error("Error: cannot combine --no-index with -r/-R\n");
@@ -3593,6 +3663,9 @@ int main_vcfmerge(int argc, char *argv[])
         for (i=0; i<nfiles; i++) free(files[i]);
         free(files);
     }
+    if ( !args->files->nreaders ) usage();
+    if ( args->files->nreaders==1 && !args->force_single ) error("Expected two or more files to merge, got only one. Use --force-single to proceed anyway\n");
+
     merge_vcf(args);
     bcf_sr_destroy(args->files);
     if ( args->regs ) regidx_destroy(args->regs);
diff --git a/bcftools/vcfmerge.c.pysam.c b/bcftools/vcfmerge.c.pysam.c
index 7ce5dfa8d..d0802d07a 100644
--- a/bcftools/vcfmerge.c.pysam.c
+++ b/bcftools/vcfmerge.c.pysam.c
@@ -2,7 +2,7 @@
 
 /*  vcfmerge.c -- Merge multiple VCF/BCF files to create one multi-sample file.
 
-    Copyright (C) 2012-2023 Genome Research Ltd.
+    Copyright (C) 2012-2024 Genome Research Ltd.
 
     Author: Petr Danecek <pd3@sanger.ac.uk>
 
@@ -36,6 +36,8 @@ THE SOFTWARE.  */
 #include <htslib/synced_bcf_reader.h>
 #include <htslib/vcfutils.h>
 #include <htslib/faidx.h>
+#include <htslib/kbitset.h>
+#include <htslib/hts_endian.h>
 #include <math.h>
 #include <ctype.h>
 #include <time.h>
@@ -174,7 +176,7 @@ typedef struct
     maux_t *maux;
     regidx_t *regs;    // apply regions only after the blocks are expanded
     regitr_t *regs_itr;
-    int header_only, collapse, output_type, force_samples, merge_by_id, do_gvcf, filter_logic, missing_to_ref, no_index;
+    int header_only, collapse, output_type, force_samples, force_single, merge_by_id, do_gvcf, filter_logic, missing_to_ref, no_index;
     char *header_fname, *output_fname, *regions_list, *info_rules, *file_list;
     faidx_t *gvcf_fai;
     info_rule_t *rules;
@@ -194,6 +196,7 @@ typedef struct
     int keep_AC_AN;
     char *index_fn;
     int write_index;
+    int trim_star_allele;   // 0=don't trim; 1=trim at variant sites; 2=trim at all sites
 }
 args_t;
 
@@ -439,6 +442,11 @@ static void info_rules_init(args_t *args)
             if ( str.l ) kputc(',',&str);
             kputs("QS:sum",&str);
         }
+        if ( args->do_gvcf && bcf_hdr_idinfo_exists(args->out_hdr,BCF_HL_INFO,bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, "MIN_DP")) )
+        {
+            if ( str.l ) kputc(',',&str);
+            kputs("MIN_DP:min",&str);
+        }
         if ( args->do_gvcf && bcf_hdr_idinfo_exists(args->out_hdr,BCF_HL_INFO,bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, "MinDP")) )
         {
             if ( str.l ) kputc(',',&str);
@@ -1274,32 +1282,32 @@ static void merge_AGR_info_tag(bcf_hdr_t *hdr, bcf1_t *line, bcf_info_t *info, i
         if ( len==BCF_VL_A || len==BCF_VL_R )
         {
             int ifrom = len==BCF_VL_A ? 1 : 0;
-            #define BRANCH(type_t, is_missing, is_vector_end, out_type_t) { \
-                type_t *src = (type_t *) info->vptr; \
+            #define BRANCH(type_t, convert, is_missing, is_vector_end, out_type_t) { \
+                uint8_t *src = info->vptr; \
                 out_type_t *tgt = (out_type_t *) agr->buf; \
                 int iori, inew; \
-                for (iori=ifrom; iori<line->n_allele; iori++) \
+                for (iori=ifrom; iori<line->n_allele; iori++, src += sizeof(type_t)) \
                 { \
+                    type_t val = convert(src); \
                     if ( is_vector_end ) break; \
                     if ( is_missing ) continue; \
                     inew = als->map[iori] - ifrom; \
-                    tgt[inew] = *src; \
-                    src++; \
+                    tgt[inew] = val; \
                 } \
             }
             switch (info->type) {
-                case BCF_BT_INT8:  BRANCH(int8_t,  *src==bcf_int8_missing,  *src==bcf_int8_vector_end,  int); break;
-                case BCF_BT_INT16: BRANCH(int16_t, *src==bcf_int16_missing, *src==bcf_int16_vector_end, int); break;
-                case BCF_BT_INT32: BRANCH(int32_t, *src==bcf_int32_missing, *src==bcf_int32_vector_end, int); break;
-                case BCF_BT_FLOAT: BRANCH(float,   bcf_float_is_missing(*src), bcf_float_is_vector_end(*src), float); break;
+                case BCF_BT_INT8:  BRANCH(int8_t,  le_to_i8,  val==bcf_int8_missing,  val==bcf_int8_vector_end,  int); break;
+                case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, val==bcf_int16_missing, val==bcf_int16_vector_end, int); break;
+                case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, val==bcf_int32_missing, val==bcf_int32_vector_end, int); break;
+                case BCF_BT_FLOAT: BRANCH(float,   le_to_float, bcf_float_is_missing(val), bcf_float_is_vector_end(val), float); break;
                 default: fprintf(bcftools_stderr,"TODO: %s:%d .. info->type=%d\n", __FILE__,__LINE__, info->type); bcftools_exit(1);
             }
             #undef BRANCH
         }
         else
         {
-            #define BRANCH(type_t, is_missing, is_vector_end, out_type_t) { \
-                type_t *src = (type_t *) info->vptr; \
+            #define BRANCH(type_t, convert, is_missing, is_vector_end, out_type_t) { \
+                uint8_t *src = info->vptr; \
                 out_type_t *tgt = (out_type_t *) agr->buf; \
                 int iori,jori, inew,jnew; \
                 for (iori=0; iori<line->n_allele; iori++) \
@@ -1309,19 +1317,20 @@ static void merge_AGR_info_tag(bcf_hdr_t *hdr, bcf1_t *line, bcf_info_t *info, i
                     { \
                         jnew = als->map[jori]; \
                         int kori = iori*(iori+1)/2 + jori; \
+                        type_t val = convert(&src[kori * sizeof(type_t)]); \
                         if ( is_vector_end ) break; \
                         if ( is_missing ) continue; \
                         int knew = inew>jnew ? inew*(inew+1)/2 + jnew : jnew*(jnew+1)/2 + inew; \
-                        tgt[knew] = src[kori]; \
+                        tgt[knew] = val; \
                     } \
                     if ( jori<=iori ) break; \
                 } \
             }
             switch (info->type) {
-                case BCF_BT_INT8:  BRANCH(int8_t,  src[kori]==bcf_int8_missing,  src[kori]==bcf_int8_vector_end,  int); break;
-                case BCF_BT_INT16: BRANCH(int16_t, src[kori]==bcf_int16_missing, src[kori]==bcf_int16_vector_end, int); break;
-                case BCF_BT_INT32: BRANCH(int32_t, src[kori]==bcf_int32_missing, src[kori]==bcf_int32_vector_end, int); break;
-                case BCF_BT_FLOAT: BRANCH(float,   bcf_float_is_missing(src[kori]), bcf_float_is_vector_end(src[kori]), float); break;
+                case BCF_BT_INT8:  BRANCH(int8_t,  le_to_i8,  val==bcf_int8_missing,  val==bcf_int8_vector_end,  int); break;
+                case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, val==bcf_int16_missing, val==bcf_int16_vector_end, int); break;
+                case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, val==bcf_int32_missing, val==bcf_int32_vector_end, int); break;
+                case BCF_BT_FLOAT: BRANCH(float,   le_to_float, bcf_float_is_missing(val), bcf_float_is_vector_end(val), float); break;
                 default: fprintf(bcftools_stderr,"TODO: %s:%d .. info->type=%d\n", __FILE__,__LINE__, info->type); bcftools_exit(1);
             }
             #undef BRANCH
@@ -1490,12 +1499,12 @@ static inline int max_used_gt_ploidy(bcf_fmt_t *fmt, int nsmpl)
 {
     int i,j, max_ploidy = 0;
 
-    #define BRANCH(type_t, vector_end) { \
-        type_t *ptr  = (type_t*) fmt->p; \
+    #define BRANCH(type_t, convert, vector_end) { \
+        uint8_t *ptr  = fmt->p; \
         for (i=0; i<nsmpl; i++) \
         { \
             for (j=0; j<fmt->n; j++) \
-                if ( ptr[j]==vector_end ) break; \
+                if ( convert(&ptr[j * sizeof(type_t)])==vector_end ) break; \
             if ( j==fmt->n ) \
             { \
                 /* all fields were used */ \
@@ -1503,14 +1512,14 @@ static inline int max_used_gt_ploidy(bcf_fmt_t *fmt, int nsmpl)
                 break; \
             } \
             if ( max_ploidy < j ) max_ploidy = j; \
-            ptr += fmt->n; \
+            ptr += fmt->n * sizeof(type_t); \
         } \
     }
     switch (fmt->type)
     {
-        case BCF_BT_INT8:  BRANCH(int8_t,   bcf_int8_vector_end); break;
-        case BCF_BT_INT16: BRANCH(int16_t, bcf_int16_vector_end); break;
-        case BCF_BT_INT32: BRANCH(int32_t, bcf_int32_vector_end); break;
+        case BCF_BT_INT8:  BRANCH(int8_t,  le_to_i8,  bcf_int8_vector_end); break;
+        case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, bcf_int16_vector_end); break;
+        case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, bcf_int32_vector_end); break;
         default: error("Unexpected case: %d\n", fmt->type);
     }
     #undef BRANCH
@@ -1600,19 +1609,22 @@ void init_local_alleles(args_t *args, bcf1_t *out, int ifmt_PL)
         int *map = ma->buf[i].rec[ma->buf[i].cur].map;
         double *allele_prob = ma->tmpd;
         int *idx = ma->tmpi;
-        #define BRANCH(src_type_t, src_is_missing, src_is_vector_end, pl2prob_idx) { \
-            src_type_t *src = (src_type_t*) fmt_ori->p; \
+        #define BRANCH(src_type_t, convert, src_is_missing, src_is_vector_end, pl2prob_idx) { \
+            uint8_t *src = fmt_ori->p; \
             for (j=0; j<nsmpl; j++) \
             { \
                 for (k=0; k<line->n_allele; k++) allele_prob[k] = 0; \
                 for (k=0; k<line->n_allele; k++) \
                     for (l=0; l<=k; l++) \
                     { \
-                        if ( src_is_missing || src_is_vector_end ) { src++; continue; } \
-                        double prob = ma->pl2prob[pl2prob_idx]; \
-                        allele_prob[k] += prob; \
-                        allele_prob[l] += prob; \
-                        src++; \
+                        src_type_t val = convert(src); \
+                        if ( !(src_is_missing) && !(src_is_vector_end) ) \
+                        { \
+                            double prob = ma->pl2prob[pl2prob_idx]; \
+                            allele_prob[k] += prob; \
+                            allele_prob[l] += prob; \
+                        } \
+                        src += sizeof(src_type_t); \
                     } \
                 /* insertion sort by allele probability, descending order, with the twist that REF (idx=0) always comes first */ \
                 allele_prob++; idx[0] = -1; idx++; /* keep REF first */ \
@@ -1639,9 +1651,9 @@ void init_local_alleles(args_t *args, bcf1_t *out, int ifmt_PL)
         }
         switch (fmt_ori->type)
         {
-            case BCF_BT_INT8:  BRANCH( int8_t, *src==bcf_int8_missing,  *src==bcf_int8_vector_end,  *src); break;
-            case BCF_BT_INT16: BRANCH(int16_t, *src==bcf_int16_missing, *src==bcf_int16_vector_end, *src>=0 && *src<PL2PROB_MAX ? *src : PL2PROB_MAX-1); break;
-            case BCF_BT_INT32: BRANCH(int32_t, *src==bcf_int32_missing, *src==bcf_int32_vector_end, *src>=0 && *src<PL2PROB_MAX ? *src : PL2PROB_MAX-1); break;
+            case BCF_BT_INT8:  BRANCH( int8_t, le_to_i8,  val==bcf_int8_missing,  val==bcf_int8_vector_end,  val); break;
+            case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, val==bcf_int16_missing, val==bcf_int16_vector_end, val>=0 && val<PL2PROB_MAX ? val : PL2PROB_MAX-1); break;
+            case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, val==bcf_int32_missing, val==bcf_int32_vector_end, val>=0 && val<PL2PROB_MAX ? val : PL2PROB_MAX-1); break;
             default: error("Unexpected case: %d, PL\n", fmt_ori->type);
         }
         #undef BRANCH
@@ -1737,8 +1749,8 @@ void merge_GT(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out)
             continue;
         }
 
-        #define BRANCH(type_t, vector_end) { \
-            type_t *p_ori  = (type_t*) fmt_ori->p; \
+        #define BRANCH(type_t, convert, vector_end) { \
+            uint8_t *p_ori = fmt_ori->p; \
             if ( !ma->buf[i].rec[irec].als_differ ) \
             { \
                 /* the allele numbering is unchanged */ \
@@ -1746,14 +1758,15 @@ void merge_GT(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out)
                 { \
                     for (k=0; k<fmt_ori->n; k++) \
                     { \
-                        if ( p_ori[k]==vector_end ) break; /* smaller ploidy */ \
+                        type_t val = convert(&p_ori[k * sizeof(type_t)]); \
+                        if ( val==vector_end ) break; /* smaller ploidy */ \
                         ma->smpl_ploidy[ismpl+j]++; \
-                        if ( bcf_gt_is_missing(p_ori[k]) ) tmp[k] = 0; /* missing allele */ \
-                        else tmp[k] = p_ori[k]; \
+                        if ( bcf_gt_is_missing(val) ) tmp[k] = 0; /* missing allele */ \
+                        else tmp[k] = val; \
                     } \
                     for (; k<nsize; k++) tmp[k] = bcf_int32_vector_end; \
                     tmp += nsize; \
-                    p_ori += fmt_ori->n; \
+                    p_ori += fmt_ori->n * sizeof(type_t); \
                 } \
                 ismpl += bcf_hdr_nsamples(hdr); \
                 continue; \
@@ -1763,27 +1776,28 @@ void merge_GT(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out)
             { \
                 for (k=0; k<fmt_ori->n; k++) \
                 { \
-                    if ( p_ori[k]==vector_end ) break; /* smaller ploidy */ \
+                    type_t val = convert(&p_ori[k * sizeof(type_t)]); \
+                    if ( val==vector_end ) break; /* smaller ploidy */ \
                     ma->smpl_ploidy[ismpl+j]++; \
-                    if ( bcf_gt_is_missing(p_ori[k]) ) tmp[k] = 0; /* missing allele */ \
+                    if ( bcf_gt_is_missing(val) ) tmp[k] = 0; /* missing allele */ \
                     else \
                     { \
-                        int al = (p_ori[k]>>1) - 1; \
+                        int al = (val>>1) - 1; \
                         al = al<=0 ? al + 1 : ma->buf[i].rec[irec].map[al] + 1; \
-                        tmp[k] = (al << 1) | ((p_ori[k])&1); \
+                        tmp[k] = (al << 1) | ((val)&1); \
                     } \
                 } \
                 for (; k<nsize; k++) tmp[k] = bcf_int32_vector_end; \
                 tmp += nsize; \
-                p_ori += fmt_ori->n; \
+                p_ori += fmt_ori->n * sizeof(type_t); \
             } \
             ismpl += bcf_hdr_nsamples(hdr); \
         }
         switch (fmt_ori->type)
         {
-            case BCF_BT_INT8: BRANCH(int8_t,   bcf_int8_vector_end); break;
-            case BCF_BT_INT16: BRANCH(int16_t, bcf_int16_vector_end); break;
-            case BCF_BT_INT32: BRANCH(int32_t, bcf_int32_vector_end); break;
+            case BCF_BT_INT8: BRANCH(int8_t,   le_to_i8,  bcf_int8_vector_end); break;
+            case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, bcf_int16_vector_end); break;
+            case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, bcf_int32_vector_end); break;
             default: error("Unexpected case: %d\n", fmt_ori->type);
         }
         #undef BRANCH
@@ -1961,10 +1975,10 @@ void merge_localized_numberG_format_field(args_t *args, bcf_fmt_t **fmt_map, bcf
         if ( 2*fmt_ori->n!=line->n_allele*(line->n_allele+1) ) error("Todo: localization of missing or haploid Number=G tags\n");
 
         // localize
-        #define BRANCH(tgt_type_t, src_type_t, src_is_missing, src_is_vector_end, tgt_set_missing, tgt_set_vector_end) { \
+        #define BRANCH(tgt_type_t, src_type_t, convert, src_is_missing, src_is_vector_end, tgt_set_missing, tgt_set_vector_end) { \
             for (j=0; j<nsmpl; j++) \
             { \
-                src_type_t *src = (src_type_t*) fmt_ori->p + j*fmt_ori->n; \
+                uint8_t *src = fmt_ori->p + sizeof(src_type_t)*j*fmt_ori->n; \
                 tgt_type_t *tgt = (tgt_type_t *) ma->tmp_arr + ismpl*nsize; \
                 int *laa = ma->laa + (1+args->local_alleles)*ismpl; \
                 int ii,ij,tgt_idx = 0; \
@@ -1974,9 +1988,10 @@ void merge_localized_numberG_format_field(args_t *args, bcf_fmt_t **fmt_map, bcf
                     for (ij=0; ij<=ii; ij++) \
                     { \
                         int src_idx = bcf_alleles2gt(laa[ii],laa[ij]); \
+                        src_type_t val = convert(&src[src_idx * sizeof(src_type_t)]); \
                         if ( src_is_missing ) tgt_set_missing; \
                         else if ( src_is_vector_end ) break; \
-                        else tgt[tgt_idx] = src[src_idx]; \
+                        else tgt[tgt_idx] = val; \
                         tgt_idx++; \
                     } \
                 } \
@@ -1987,10 +2002,10 @@ void merge_localized_numberG_format_field(args_t *args, bcf_fmt_t **fmt_map, bcf
         }
         switch (fmt_ori->type)
         {
-            case BCF_BT_INT8:  BRANCH(int32_t,  int8_t, src[src_idx]==bcf_int8_missing,  src[src_idx]==bcf_int8_vector_end,  tgt[tgt_idx]=bcf_int32_missing, tgt[tgt_idx]=bcf_int32_vector_end); break;
-            case BCF_BT_INT16: BRANCH(int32_t, int16_t, src[src_idx]==bcf_int16_missing, src[src_idx]==bcf_int16_vector_end, tgt[tgt_idx]=bcf_int32_missing, tgt[tgt_idx]=bcf_int32_vector_end); break;
-            case BCF_BT_INT32: BRANCH(int32_t, int32_t, src[src_idx]==bcf_int32_missing, src[src_idx]==bcf_int32_vector_end, tgt[tgt_idx]=bcf_int32_missing, tgt[tgt_idx]=bcf_int32_vector_end); break;
-            case BCF_BT_FLOAT: BRANCH(float, float, bcf_float_is_missing(src[src_idx]), bcf_float_is_vector_end(src[src_idx]), bcf_float_set_missing(tgt[tgt_idx]), bcf_float_set_vector_end(tgt[tgt_idx])); break;
+            case BCF_BT_INT8:  BRANCH(int32_t, int8_t,  le_to_i8,  val==bcf_int8_missing,  val==bcf_int8_vector_end,  tgt[tgt_idx]=bcf_int32_missing, tgt[tgt_idx]=bcf_int32_vector_end); break;
+            case BCF_BT_INT16: BRANCH(int32_t, int16_t, le_to_i16, val==bcf_int16_missing, val==bcf_int16_vector_end, tgt[tgt_idx]=bcf_int32_missing, tgt[tgt_idx]=bcf_int32_vector_end); break;
+            case BCF_BT_INT32: BRANCH(int32_t, int32_t, le_to_i16, val==bcf_int32_missing, val==bcf_int32_vector_end, tgt[tgt_idx]=bcf_int32_missing, tgt[tgt_idx]=bcf_int32_vector_end); break;
+            case BCF_BT_FLOAT: BRANCH(float, float, le_to_float, bcf_float_is_missing(val), bcf_float_is_vector_end(val), bcf_float_set_missing(tgt[tgt_idx]), bcf_float_set_vector_end(tgt[tgt_idx])); break;
             default: error("Unexpected case: %d, %s\n", fmt_ori->type, key);
         }
         #undef BRANCH
@@ -2060,10 +2075,10 @@ void merge_localized_numberAR_format_field(args_t *args, bcf_fmt_t **fmt_map, bc
         }
 
         // localize
-        #define BRANCH(tgt_type_t, src_type_t, src_is_missing, src_is_vector_end, tgt_set_missing, tgt_set_vector_end) { \
+        #define BRANCH(tgt_type_t, src_type_t, convert, src_is_missing, src_is_vector_end, tgt_set_missing, tgt_set_vector_end) { \
             for (j=0; j<nsmpl; j++) \
             { \
-                src_type_t *src = (src_type_t*) fmt_ori->p + j*fmt_ori->n; \
+                uint8_t *src = fmt_ori->p + sizeof(src_type_t)*j*fmt_ori->n; \
                 tgt_type_t *tgt = (tgt_type_t *) ma->tmp_arr + ismpl*nsize; \
                 int *laa = ma->laa + (1+args->local_alleles)*ismpl; \
                 int ii,tgt_idx = 0; \
@@ -2071,9 +2086,10 @@ void merge_localized_numberAR_format_field(args_t *args, bcf_fmt_t **fmt_map, bc
                 { \
                     if ( laa[ii]==bcf_int32_missing || laa[ii]==bcf_int32_vector_end ) break; \
                     int src_idx = laa[ii] - ibeg; \
+                    src_type_t val = convert(&src[src_idx * sizeof(src_type_t)]); \
                     if ( src_is_missing ) tgt_set_missing; \
                     else if ( src_is_vector_end ) break; \
-                    else tgt[tgt_idx] = src[src_idx]; \
+                    else tgt[tgt_idx] = val; \
                     tgt_idx++; \
                 } \
                 if ( !tgt_idx ) { tgt_set_missing; tgt_idx++; } \
@@ -2083,10 +2099,10 @@ void merge_localized_numberAR_format_field(args_t *args, bcf_fmt_t **fmt_map, bc
         }
         switch (fmt_ori->type)
         {
-            case BCF_BT_INT8:  BRANCH(int32_t,  int8_t, src[src_idx]==bcf_int8_missing,  src[src_idx]==bcf_int8_vector_end,  tgt[tgt_idx]=bcf_int32_missing, tgt[tgt_idx]=bcf_int32_vector_end); break;
-            case BCF_BT_INT16: BRANCH(int32_t, int16_t, src[src_idx]==bcf_int16_missing, src[src_idx]==bcf_int16_vector_end, tgt[tgt_idx]=bcf_int32_missing, tgt[tgt_idx]=bcf_int32_vector_end); break;
-            case BCF_BT_INT32: BRANCH(int32_t, int32_t, src[src_idx]==bcf_int32_missing, src[src_idx]==bcf_int32_vector_end, tgt[tgt_idx]=bcf_int32_missing, tgt[tgt_idx]=bcf_int32_vector_end); break;
-            case BCF_BT_FLOAT: BRANCH(float, float, bcf_float_is_missing(src[src_idx]), bcf_float_is_vector_end(src[src_idx]), bcf_float_set_missing(tgt[tgt_idx]), bcf_float_set_vector_end(tgt[tgt_idx])); break;
+            case BCF_BT_INT8:  BRANCH(int32_t, int8_t,  le_to_i8,  val==bcf_int8_missing,  val==bcf_int8_vector_end,  tgt[tgt_idx]=bcf_int32_missing, tgt[tgt_idx]=bcf_int32_vector_end); break;
+            case BCF_BT_INT16: BRANCH(int32_t, int16_t, le_to_i16, val==bcf_int16_missing, val==bcf_int16_vector_end, tgt[tgt_idx]=bcf_int32_missing, tgt[tgt_idx]=bcf_int32_vector_end); break;
+            case BCF_BT_INT32: BRANCH(int32_t, int32_t, le_to_i32, val==bcf_int32_missing, val==bcf_int32_vector_end, tgt[tgt_idx]=bcf_int32_missing, tgt[tgt_idx]=bcf_int32_vector_end); break;
+            case BCF_BT_FLOAT: BRANCH(float, float, le_to_float, bcf_float_is_missing(val), bcf_float_is_vector_end(val), bcf_float_set_missing(tgt[tgt_idx]), bcf_float_set_vector_end(tgt[tgt_idx])); break;
             default: error("Unexpected case: %d, %s\n", fmt_ori->type, key);
         }
         #undef BRANCH
@@ -2203,7 +2219,7 @@ void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, missing_rule_t *mrule
         }
 
         // set the values
-        #define BRANCH(tgt_type_t, src_type_t, src_is_missing, src_is_vector_end, tgt_set_missing, tgt_set_vector_end) { \
+        #define BRANCH(tgt_type_t, src_type_t, convert, src_is_missing, src_is_vector_end, tgt_set_missing, tgt_set_vector_end) { \
             int j, l, k; \
             tgt_type_t *tgt = (tgt_type_t *) ma->tmp_arr + ismpl*nsize; \
             if ( !fmt_ori ) \
@@ -2216,7 +2232,7 @@ void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, missing_rule_t *mrule
                 ismpl += bcf_hdr_nsamples(hdr); \
                 continue; \
             } \
-            src_type_t *src = (src_type_t*) fmt_ori->p; \
+            uint8_t *src = fmt_ori->p; \
             if ( (length!=BCF_VL_G && length!=BCF_VL_A && length!=BCF_VL_R) || (line->n_allele==out->n_allele && !ma->buf[i].rec[irec].als_differ) ) \
             { \
                 /* alleles unchanged, copy over */ \
@@ -2226,11 +2242,11 @@ void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, missing_rule_t *mrule
                     { \
                         if ( src_is_vector_end ) break; \
                         else if ( src_is_missing ) tgt_set_missing; \
-                        else *tgt = *src; \
-                        tgt++; src++; \
+                        else *tgt = convert(src); \
+                        tgt++; src += sizeof(src_type_t); \
                     } \
                     for (k=l; k<nsize; k++) { tgt_set_vector_end; tgt++; } \
-                    src += fmt_ori->n - l; \
+                    src += sizeof(src_type_t) * (fmt_ori->n - l); \
                 } \
                 ismpl += bcf_hdr_nsamples(hdr); \
                 continue; \
@@ -2242,8 +2258,13 @@ void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, missing_rule_t *mrule
                 for (j=0; j<bcf_hdr_nsamples(hdr); j++) \
                 { \
                     tgt = (tgt_type_t *) ma->tmp_arr + (ismpl+j)*nsize; \
-                    src = (src_type_t*) fmt_ori->p + j*fmt_ori->n; \
-                    if ( (src_is_missing && fmt_ori->n==1) || (++src && src_is_vector_end) ) \
+                    src = fmt_ori->p + sizeof(src_type_t) * j * fmt_ori->n; \
+                    int tag_missing = src_is_missing && fmt_ori->n==1;  \
+                    if ( src_is_missing && fmt_ori->n>1 ) { \
+                        src += sizeof(src_type_t); \
+                        tag_missing = src_is_vector_end ; \
+                    } \
+                    if ( tag_missing ) \
                     { \
                         /* tag with missing value "." */ \
                         tgt_set_missing; \
@@ -2254,9 +2275,10 @@ void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, missing_rule_t *mrule
                     int ngsize = haploid ? out->n_allele : out->n_allele*(out->n_allele + 1)/2; \
                     if ( ma->buf[i].unkn_allele )  /* Use value from the unknown allele when available */ \
                     {  \
-                        src = (src_type_t*) fmt_ori->p + j*fmt_ori->n; \
+                        src = fmt_ori->p + sizeof(src_type_t)*j*fmt_ori->n; \
                         int iunkn = haploid ? ma->buf[i].unkn_allele : (ma->buf[i].unkn_allele+1)*(ma->buf[i].unkn_allele + 2)/2 - 1; \
-                        for (l=0; l<ngsize; l++) { *tgt = src[iunkn]; tgt++; } \
+                        src_type_t val = convert(&src[iunkn * sizeof(src_type_t)]); \
+                        for (l=0; l<ngsize; l++) { *tgt = val; tgt++; } \
                     } \
                     else if ( mrule && mrule->type==MERGE_MISSING_CONST ) \
                     { \
@@ -2264,9 +2286,13 @@ void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, missing_rule_t *mrule
                     } \
                     else if ( mrule && mrule->type==MERGE_MISSING_MAX ) \
                     { \
-                        src = (src_type_t*) fmt_ori->p + j*fmt_ori->n; \
-                        src_type_t max = src[0]; \
-                        for (l=1; l<fmt_ori->n; l++) if ( max < src[l] ) max = src[l]; \
+                        src = fmt_ori->p + sizeof(src_type_t)*j*fmt_ori->n; \
+                        src_type_t max = convert(src); \
+                        for (l=1; l<fmt_ori->n; l++) \
+                        { \
+                            src_type_t val = convert(&src[l * sizeof(src_type_t)]); \
+                            if ( max < val ) max = val; \
+                        } \
                         for (l=0; l<ngsize; l++) { *tgt = max; tgt++; } \
                     } \
                     else \
@@ -2280,11 +2306,11 @@ void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, missing_rule_t *mrule
                         for (iori=0; iori<line->n_allele; iori++) \
                         { \
                             inew = ma->buf[i].rec[irec].map[iori]; \
-                            src = (src_type_t*) fmt_ori->p + j*fmt_ori->n + iori; \
+                            src = fmt_ori->p + (j*fmt_ori->n + iori) * sizeof(src_type_t); \
                             tgt = (tgt_type_t *) ma->tmp_arr + (ismpl+j)*nsize + inew; \
                             if ( src_is_vector_end ) break; \
                             if ( src_is_missing ) tgt_set_missing; \
-                            else *tgt = *src; \
+                            else *tgt = convert(src); \
                         } \
                     } \
                     else \
@@ -2299,7 +2325,7 @@ void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, missing_rule_t *mrule
                                 jnew = ma->buf[i].rec[irec].map[jori]; \
                                 int kori = iori*(iori+1)/2 + jori; \
                                 int knew = inew>jnew ? inew*(inew+1)/2 + jnew : jnew*(jnew+1)/2 + inew; \
-                                src = (src_type_t*) fmt_ori->p + j*fmt_ori->n + kori; \
+                                src = fmt_ori->p + (j*fmt_ori->n + kori) * sizeof(src_type_t); \
                                 tgt = (tgt_type_t *) ma->tmp_arr + (ismpl+j)*nsize + knew; \
                                 if ( src_is_vector_end ) \
                                 { \
@@ -2307,7 +2333,7 @@ void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, missing_rule_t *mrule
                                     break; \
                                 } \
                                 if ( src_is_missing ) tgt_set_missing; \
-                                else *tgt = *src; \
+                                else *tgt = convert(src); \
                             } \
                         } \
                     } \
@@ -2320,19 +2346,25 @@ void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, missing_rule_t *mrule
                 for (j=0; j<bcf_hdr_nsamples(hdr); j++) \
                 { \
                     tgt = (tgt_type_t *) ma->tmp_arr + (ismpl+j)*nsize; \
-                    src = (src_type_t*) (fmt_ori->p + j*fmt_ori->size); \
-                    if ( (src_is_missing && fmt_ori->n==1) || (++src && src_is_vector_end) ) \
+                    src = fmt_ori->p + j*fmt_ori->size; \
+                    int tag_missing = src_is_missing && fmt_ori->n==1;  \
+                    if ( src_is_missing && fmt_ori->n>1 ) { \
+                        src += sizeof(src_type_t); \
+                        tag_missing = src_is_vector_end ; \
+                    } \
+                    if ( tag_missing ) \
                     { \
                         /* tag with missing value "." */ \
                         tgt_set_missing; \
                         for (l=1; l<nsize; l++) { tgt++; tgt_set_vector_end; } \
                         continue; \
                     } \
-                    src = (src_type_t*) (fmt_ori->p + j*fmt_ori->size); \
+                    src = fmt_ori->p + j*fmt_ori->size; \
                     if ( ma->buf[i].unkn_allele )  /* Use value from the unknown allele when available */ \
                     { \
                         int iunkn = ma->buf[i].unkn_allele; \
-                        for (l=0; l<nsize; l++) { *tgt = src[iunkn]; tgt++; } \
+                        src_type_t val = convert(&src[iunkn * sizeof(src_type_t)]); \
+                        for (l=0; l<nsize; l++) { *tgt = val; tgt++; } \
                     } \
                     else if ( mrule && mrule->type==MERGE_MISSING_CONST ) \
                     { \
@@ -2340,9 +2372,13 @@ void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, missing_rule_t *mrule
                     } \
                     else if ( mrule && mrule->type==MERGE_MISSING_MAX ) \
                     { \
-                        src = (src_type_t*) fmt_ori->p + j*fmt_ori->n; \
-                        src_type_t max = src[0]; \
-                        for (l=1; l<fmt_ori->n; l++) if ( max < src[l] ) max = src[l]; \
+                        src = fmt_ori->p + sizeof(src_type_t)*j*fmt_ori->n; \
+                        src_type_t max = convert(src); \
+                        for (l=1; l<fmt_ori->n; l++) \
+                        { \
+                            src_type_t val = convert(&src[l * sizeof(src_type_t)]); \
+                            if ( max < val ) max = val; \
+                        } \
                         for (l=0; l<nsize; l++) { *tgt = max; tgt++; } \
                     } \
                     else \
@@ -2356,8 +2392,8 @@ void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, missing_rule_t *mrule
                         tgt = (tgt_type_t *) ma->tmp_arr + (ismpl+j)*nsize + inew; \
                         if ( src_is_vector_end ) break; \
                         if ( src_is_missing ) tgt_set_missing; \
-                        else *tgt = *src; \
-                        src++; \
+                        else *tgt = convert(src); \
+                        src += sizeof(src_type_t); \
                     } \
                 } \
             } \
@@ -2365,10 +2401,10 @@ void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, missing_rule_t *mrule
         }
         switch (type)
         {
-            case BCF_BT_INT8:  BRANCH(int32_t,  int8_t, *src==bcf_int8_missing,  *src==bcf_int8_vector_end,  *tgt=bcf_int32_missing, *tgt=bcf_int32_vector_end); break;
-            case BCF_BT_INT16: BRANCH(int32_t, int16_t, *src==bcf_int16_missing, *src==bcf_int16_vector_end, *tgt=bcf_int32_missing, *tgt=bcf_int32_vector_end); break;
-            case BCF_BT_INT32: BRANCH(int32_t, int32_t, *src==bcf_int32_missing, *src==bcf_int32_vector_end, *tgt=bcf_int32_missing, *tgt=bcf_int32_vector_end); break;
-            case BCF_BT_FLOAT: BRANCH(float, float, bcf_float_is_missing(*src), bcf_float_is_vector_end(*src), bcf_float_set_missing(*tgt), bcf_float_set_vector_end(*tgt)); break;
+            case BCF_BT_INT8:  BRANCH(int32_t, int8_t,  le_to_i8,  le_to_i8(src)==bcf_int8_missing,  le_to_i8(src)==bcf_int8_vector_end,  *tgt=bcf_int32_missing, *tgt=bcf_int32_vector_end); break;
+            case BCF_BT_INT16: BRANCH(int32_t, int16_t, le_to_i16, le_to_i16(src)==bcf_int16_missing, le_to_i16(src)==bcf_int16_vector_end, *tgt=bcf_int32_missing, *tgt=bcf_int32_vector_end); break;
+            case BCF_BT_INT32: BRANCH(int32_t, int32_t, le_to_i32, le_to_i32(src)==bcf_int32_missing, le_to_i32(src)==bcf_int32_vector_end, *tgt=bcf_int32_missing, *tgt=bcf_int32_vector_end); break;
+            case BCF_BT_FLOAT: BRANCH(float, float, le_to_float, bcf_float_is_missing(le_to_float(src)), bcf_float_is_vector_end(le_to_float(src)), bcf_float_set_missing(*tgt), bcf_float_set_vector_end(*tgt)); break;
             default: error("Unexpected case: %d, %s\n", type, key);
         }
         #undef BRANCH
@@ -2584,10 +2620,20 @@ void gvcf_write_block(args_t *args, int start, int end)
     }
     else
         bcf_update_info_int32(args->out_hdr, out, "END", NULL, 0);
+
+    int iunseen;
+    if ( args->trim_star_allele && (out->n_allele > 2 || args->trim_star_allele > 1) && (iunseen=get_unseen_allele(out)) && iunseen>0 )
+    {
+        // the unobserved star allele should be trimmed, either it is variant site or trimming of all sites was requested
+        kbitset_t *rm_set = kbs_init(out->n_allele);
+        kbs_insert(rm_set, iunseen);
+        if ( bcf_remove_allele_set(args->out_hdr,out,rm_set) )
+            error("[%s] Error: failed to trim the unobserved allele at %s:%"PRIhts_pos"\n",__func__,bcf_seqname(args->out_hdr,out),out->pos+1);
+        kbs_destroy(rm_set);
+    }
     if ( bcf_write1(args->out_fh, args->out_hdr, out)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
     bcf_clear1(out);
 
-
     // Inactivate blocks which do not extend beyond END and find new gvcf_min
     min = INT_MAX;
     for (i=0; i<args->files->nreaders; i++)
@@ -3217,6 +3263,16 @@ void merge_line(args_t *args)
     if ( args->do_gvcf )
         bcf_update_info_int32(args->out_hdr, out, "END", NULL, 0);
     merge_format(args, out);
+    int iunseen;
+    if ( args->trim_star_allele && (out->n_allele > 2 || args->trim_star_allele > 1) && (iunseen=get_unseen_allele(out)) && iunseen>0 )
+    {
+        // the unobserved star allele should be trimmed, either it is variant site or trimming of all sites was requested
+        kbitset_t *rm_set = kbs_init(out->n_allele);
+        kbs_insert(rm_set, iunseen);
+        if ( bcf_remove_allele_set(args->out_hdr,out,rm_set) )
+            error("[%s] Error: failed to trim the unobserved allele at %s:%"PRIhts_pos"\n",__func__,bcf_seqname(args->out_hdr,out),out->pos+1);
+        kbs_destroy(rm_set);
+    }
     if ( bcf_write1(args->out_fh, args->out_hdr, out)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
     bcf_clear1(out);
 }
@@ -3346,9 +3402,11 @@ void merge_vcf(args_t *args)
         if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname);
         return;
     }
-    else if ( args->write_index && init_index(args->out_fh,args->out_hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname);
+    else if ( init_index2(args->out_fh,args->out_hdr,args->output_fname,
+                          &args->index_fn, args->write_index)<0 )
+        error("Error: failed to initialise index for %s\n",args->output_fname);
 
-    if ( args->collapse==COLLAPSE_NONE ) args->vcmp = vcmp_init();
+    args->vcmp = vcmp_init();
     args->maux = maux_init(args);
     args->out_line = bcf_init1();
     args->tmph = kh_init(strdict);
@@ -3410,17 +3468,19 @@ static void usage(void)
     fprintf(bcftools_stderr, "Usage:   bcftools merge [options] <A.vcf.gz> <B.vcf.gz> [...]\n");
     fprintf(bcftools_stderr, "\n");
     fprintf(bcftools_stderr, "Options:\n");
+    fprintf(bcftools_stderr, "        --force-no-index              Merge unindexed files, synonymous to --no-index\n");
     fprintf(bcftools_stderr, "        --force-samples               Resolve duplicate sample names\n");
+    fprintf(bcftools_stderr, "        --force-single                Run even if there is only one file on input\n");
     fprintf(bcftools_stderr, "        --print-header                Print only the merged header and exit\n");
     fprintf(bcftools_stderr, "        --use-header FILE             Use the provided header\n");
     fprintf(bcftools_stderr, "    -0  --missing-to-ref              Assume genotypes at missing sites are 0/0\n");
     fprintf(bcftools_stderr, "    -f, --apply-filters LIST          Require at least one of the listed FILTER strings (e.g. \"PASS,.\")\n");
     fprintf(bcftools_stderr, "    -F, --filter-logic x|+            Remove filters if some input is PASS (\"x\"), or apply all filters (\"+\") [+]\n");
-    fprintf(bcftools_stderr, "    -g, --gvcf -|REF.FA               Merge gVCF blocks, INFO/END tag is expected. Implies -i QS:sum,MinDP:min,I16:sum,IDV:max,IMF:max -M PL:max,AD:0\n");
+    fprintf(bcftools_stderr, "    -g, --gvcf -|REF.FA               Merge gVCF blocks, INFO/END tag is expected. Implies -i QS:sum,MinDP:min,MIN_DP:min,I16:sum,IDV:max,IMF:max -M PL:max,AD:0\n");
     fprintf(bcftools_stderr, "    -i, --info-rules TAG:METHOD,..    Rules for merging INFO fields (method is one of sum,avg,min,max,join) or \"-\" to turn off the default [DP:sum,DP4:sum]\n");
     fprintf(bcftools_stderr, "    -l, --file-list FILE              Read file names from the file\n");
-    fprintf(bcftools_stderr, "    -L, --local-alleles INT           EXPERIMENTAL: if more than <int> ALT alleles are encountered, drop FMT/PL and output LAA+LPL instead; 0=unlimited [0]\n");
-    fprintf(bcftools_stderr, "    -m, --merge STRING                Allow multiallelic records for <snps|indels|both|snp-ins-del|all|none|id>, see man page for details [both]\n");
+    fprintf(bcftools_stderr, "    -L, --local-alleles INT           If more than INT alt alleles are encountered, drop FMT/PL and output LAA+LPL instead; 0=unlimited [0]\n");
+    fprintf(bcftools_stderr, "    -m, --merge STRING[*|**]          Allow multiallelic records for snps,indels,both,snp-ins-del,all,none,id,*,**; see man page for details [both]\n");
     fprintf(bcftools_stderr, "    -M, --missing-rules TAG:METHOD    Rules for replacing missing values in numeric vectors (.,0,max) when unknown allele <*> is not present [.]\n");
     fprintf(bcftools_stderr, "        --no-index                    Merge unindexed files, the same chromosomal order is required and -r/-R are not allowed\n");
     fprintf(bcftools_stderr, "        --no-version                  Do not append version and command line to the header\n");
@@ -3429,8 +3489,8 @@ static void usage(void)
     fprintf(bcftools_stderr, "    -r, --regions REGION              Restrict to comma-separated list of regions\n");
     fprintf(bcftools_stderr, "    -R, --regions-file FILE           Restrict to regions listed in a file\n");
     fprintf(bcftools_stderr, "        --regions-overlap 0|1|2       Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n");
-    fprintf(bcftools_stderr, "        --threads INT                 Use multithreading with <int> worker threads [0]\n");
-    fprintf(bcftools_stderr, "        --write-index                 Automatically index the output files [off]\n");
+    fprintf(bcftools_stderr, "        --threads INT                 Use multithreading with INT worker threads [0]\n");
+    fprintf(bcftools_stderr, "    -W, --write-index[=FMT]           Automatically index the output files [off]\n");
     fprintf(bcftools_stderr, "\n");
     bcftools_exit(1);
 }
@@ -3472,12 +3532,14 @@ int main_vcfmerge(int argc, char *argv[])
         {"missing-rules",required_argument,NULL,'M'},
         {"no-version",no_argument,NULL,8},
         {"no-index",no_argument,NULL,10},
+        {"force-no-index",no_argument,NULL,10},
+        {"force-single",no_argument,NULL,12},
         {"filter-logic",required_argument,NULL,'F'},
-        {"write-index",no_argument,NULL,11},
+        {"write-index",optional_argument,NULL,'W'},
         {NULL,0,NULL,0}
     };
     char *tmp;
-    while ((c = getopt_long(argc, argv, "hm:f:r:R:o:O:i:M:l:g:F:0L:",loptions,NULL)) >= 0) {
+    while ((c = getopt_long(argc, argv, "hm:f:r:R:o:O:i:M:l:g:F:0L:W::",loptions,NULL)) >= 0) {
         switch (c) {
             case 'L':
                 args->local_alleles = strtol(optarg,&tmp,10);
@@ -3522,17 +3584,23 @@ int main_vcfmerge(int argc, char *argv[])
                 }
                 break;
             case 'm':
+            {
+                int len = strlen(optarg);
+                if ( optarg[len-1]=='*' ) { args->trim_star_allele++; len--; }
+                if ( optarg[len-1]=='*' ) { args->trim_star_allele++; len--; }
+                if ( optarg[len-1]==',' ) len--;
                 args->collapse = COLLAPSE_NONE;
-                if ( !strcmp(optarg,"snps") ) args->collapse |= COLLAPSE_SNPS;
-                else if ( !strcmp(optarg,"indels") ) args->collapse |= COLLAPSE_INDELS;
-                else if ( !strcmp(optarg,"both") ) args->collapse |= COLLAPSE_BOTH;
-                else if ( !strcmp(optarg,"any") ) args->collapse |= COLLAPSE_ANY;
-                else if ( !strcmp(optarg,"all") ) args->collapse |= COLLAPSE_ANY;
-                else if ( !strcmp(optarg,"none") ) args->collapse = COLLAPSE_NONE;
-                else if ( !strcmp(optarg,"snp-ins-del") ) args->collapse = COLLAPSE_SNP_INS_DEL|COLLAPSE_SNPS;
-                else if ( !strcmp(optarg,"id") ) { args->collapse = COLLAPSE_NONE; args->merge_by_id = 1; }
+                if ( !strncmp(optarg,"snp-ins-del",len) ) args->collapse = COLLAPSE_SNP_INS_DEL|COLLAPSE_SNPS;
+                else if ( !strncmp(optarg,"snps",len) ) args->collapse |= COLLAPSE_SNPS;
+                else if ( !strncmp(optarg,"indels",len) ) args->collapse |= COLLAPSE_INDELS;
+                else if ( !strncmp(optarg,"id",len) ) { args->collapse = COLLAPSE_NONE; args->merge_by_id = 1; }
+                else if ( !strncmp(optarg,"any",len) ) args->collapse |= COLLAPSE_ANY;
+                else if ( !strncmp(optarg,"all",len) ) args->collapse |= COLLAPSE_ANY;
+                else if ( !strncmp(optarg,"both",len) ) args->collapse |= COLLAPSE_BOTH;
+                else if ( !strncmp(optarg,"none",len) ) args->collapse = COLLAPSE_NONE;
                 else error("The -m type \"%s\" is not recognised.\n", optarg);
                 break;
+            }
             case 'f': args->files->apply_filters = optarg; break;
             case 'r': args->regions_list = optarg; break;
             case 'R': args->regions_list = optarg; regions_is_file = 1; break;
@@ -3546,15 +3614,17 @@ int main_vcfmerge(int argc, char *argv[])
             case  9 : args->n_threads = strtol(optarg, 0, 0); break;
             case  8 : args->record_cmd_line = 0; break;
             case 10 : args->no_index = 1; break;
-            case 11 : args->write_index = 1; break;
+            case 'W':
+                if (!(args->write_index = write_index_parse(optarg)))
+                    error("Unsupported index format '%s'\n", optarg);
+                break;
+            case 12 : args->force_single = 1; break;
             case 'h':
             case '?': usage(); break;
             default: error("Unknown argument: %s\n", optarg);
         }
     }
     if ( argc==optind && !args->file_list ) usage();
-    if ( argc-optind<2 && !args->file_list ) usage();
-
     if ( args->no_index )
     {
         if ( args->regions_list ) error("Error: cannot combine --no-index with -r/-R\n");
@@ -3595,6 +3665,9 @@ int main_vcfmerge(int argc, char *argv[])
         for (i=0; i<nfiles; i++) free(files[i]);
         free(files);
     }
+    if ( !args->files->nreaders ) usage();
+    if ( args->files->nreaders==1 && !args->force_single ) error("Expected two or more files to merge, got only one. Use --force-single to proceed anyway\n");
+
     merge_vcf(args);
     bcf_sr_destroy(args->files);
     if ( args->regs ) regidx_destroy(args->regs);
diff --git a/bcftools/vcfnorm.c b/bcftools/vcfnorm.c
index 02ad322d1..f47253385 100644
--- a/bcftools/vcfnorm.c
+++ b/bcftools/vcfnorm.c
@@ -1,6 +1,6 @@
 /*  vcfnorm.c -- Left-align and normalize indels.
 
-    Copyright (C) 2013-2023 Genome Research Ltd.
+    Copyright (C) 2013-2024 Genome Research Ltd.
 
     Author: Petr Danecek <pd3@sanger.ac.uk>
 
@@ -42,6 +42,7 @@ THE SOFTWARE.  */
 #include "abuf.h"
 #include "gff.h"
 #include "regidx.h"
+#include "filter.h"
 
 #define CHECK_REF_EXIT 1
 #define CHECK_REF_WARN 2
@@ -51,6 +52,10 @@ THE SOFTWARE.  */
 #define MROWS_SPLIT 1
 #define MROWS_MERGE  2
 
+// Logic of the filters: include or exclude sites which match the filters?
+#define FLT_INCLUDE 1
+#define FLT_EXCLUDE 2
+
 // for -m+, mapping from allele indexes of a single input record
 // to allele indexes of output record
 typedef struct
@@ -64,7 +69,7 @@ typedef struct
 {
     int n;  // number of alleles
     char *ref, *alt;
-    void *hash;
+    void *hash; // str2int hash
 }
 cmpals1_t;
 
@@ -79,8 +84,8 @@ typedef struct
 {
     char *tseq, *seq;
     int mseq;
-    bcf1_t **lines, **tmp_lines, **alines, **blines, *mrow_out;
-    int ntmp_lines, mtmp_lines, nalines, malines, nblines, mblines;
+    bcf1_t **lines, **tmp_lines, **mrows, *mrow_out;
+    int ntmp_lines, mtmp_lines, nmrows, mmrows, mrows_first;
     map_t *maps;     // mrow map for each buffered record
     char **als;
     int mmaps, nals, mals;
@@ -88,8 +93,8 @@ typedef struct
     int32_t *int32_arr;
     int ntmp_arr1, ntmp_arr2, nint32_arr;
     kstring_t *tmp_str;
-    kstring_t *tmp_als, *tmp_del, tmp_kstr;
-    int ntmp_als, ntmp_del;
+    kstring_t *tmp_als, *tmp_sym, tmp_kstr;
+    int ntmp_als, ntmp_sym;
     rbuf_t rbuf;
     int buf_win;            // maximum distance between two records to consider
     int aln_win;            // the realignment window size (maximum repeat size)
@@ -100,7 +105,7 @@ typedef struct
     struct { int tot, set, swap; } nref;
     char **argv, *output_fname, *ref_fname, *vcf_fname, *region, *targets;
     int argc, rmdup, output_type, n_threads, check_ref, strict_filter, do_indels, clevel;
-    int nchanged, nskipped, nsplit, ntotal, mrows_op, mrows_collapse, parsimonious;
+    int nchanged, nskipped, nsplit, njoined, ntotal, nfilter, mrows_op, mrows_collapse, parsimonious;
     int record_cmd_line, force, force_warned, keep_sum_ad;
     abuf_t *abuf;
     abuf_opt_t atomize;
@@ -108,12 +113,17 @@ typedef struct
     char *old_rec_tag;
     htsFile *out;
     char *index_fn;
-    int write_index;
+    int write_index, gff_verbosity;
     int right_align;
     char *gff_fname;
     gff_t *gff;
     regidx_t *idx_tscript;
     regitr_t *itr_tscript;
+    int (*cmp_func)(const void *aptr, const void *bptr);
+    char *filter_str;
+    int filter_logic;   // include or exclude sites which match the filters? One of FLT_INCLUDE/FLT_EXCLUDE
+    int filter_pass;
+    filter_t *filter;
 }
 args_t;
 
@@ -555,33 +565,57 @@ static int realign(args_t *args, bcf1_t *line)
     if ( bcf_get_variant_types(line)==VCF_BND ) return ERR_SYMBOLIC;   // breakend, not an error
 
     // make a copy of each allele for trimming
-    hts_expand0(kstring_t,line->n_allele,args->ntmp_als,args->tmp_als);
-    hts_expand0(kstring_t,line->n_allele,args->ntmp_del,args->tmp_del);
+    hts_expand0(kstring_t,line->n_allele,args->ntmp_als,args->tmp_als); // the actual sequence to realign
+    hts_expand0(kstring_t,line->n_allele,args->ntmp_sym,args->tmp_sym); // the original symbolic allele strings to output
     kstring_t *als = args->tmp_als;
-    kstring_t *del = args->tmp_del;
+    kstring_t *sym = args->tmp_sym;
+    int symbolic_alts = 1;
     for (i=0; i<line->n_allele; i++)
     {
-        del[i].l = 0;
+        sym[i].l = 0;
         if ( line->d.allele[i][0]=='<' )
         {
-            // symbolic allele, only <DEL.*> will be realigned
-            if ( strncmp("<DEL",line->d.allele[i],4) ) return ERR_SYMBOLIC;
-            if ( nref < line->rlen )
+            // symbolic allele, only <DEL.*> and <DUP.*> will be realigned
+            // TODO: there should be check for symbolic allele length. If too big, perhaps should not attempt realignment
+            int32_t sv_len = 0;
+            if ( !strncmp("<DEL",line->d.allele[i],4) ) sv_len = -line->rlen;
+            else if ( !strncmp("<DUP",line->d.allele[i],4) )
+            {
+                if ( bcf_get_info_int32(args->hdr,line,"SVLEN",&args->int32_arr,&args->nint32_arr)==1 ) sv_len = args->int32_arr[0];
+            }
+            if ( !sv_len ) return ERR_SYMBOLIC;
+
+            als[i].l = 0;
+            if ( sv_len<0 )
+            {
+                // del, expand REF and replace ALT, for example, replace "REF=C ALT=<DEL>" with "REF=CAT ALT=C"
+                if ( nref < line->rlen )
+                {
+                    free(ref);
+                    reflen = line->rlen;
+                    ref = faidx_fetch_seq(args->fai, (char*)args->hdr->id[BCF_DT_CTG][line->rid].key, line->pos, line->pos+reflen-1, &nref);
+                    if ( !ref ) error("faidx_fetch_seq failed at %s:%"PRId64"\n", args->hdr->id[BCF_DT_CTG][line->rid].key, (int64_t) line->pos+1);
+                    seq_to_upper(ref,0);
+                    replace_iupac_codes(ref,nref);  // any non-ACGT character in fasta ref is replaced with N
+                    als[0].l = 0;
+                    kputs(ref, &als[0]);
+                }
+                kputsn(als[0].s,1,&als[i]);
+            }
+            else // sv_len>0
             {
+                // dup, replace "REF=C ALT=<DUP>" with "REF=C ALT=CAT"
                 free(ref);
-                reflen = line->rlen;
-                ref = faidx_fetch_seq(args->fai, (char*)args->hdr->id[BCF_DT_CTG][line->rid].key, line->pos, line->pos+reflen-1, &nref);
+                ref = faidx_fetch_seq(args->fai, (char*)args->hdr->id[BCF_DT_CTG][line->rid].key, line->pos, line->pos+sv_len, &nref);
                 if ( !ref ) error("faidx_fetch_seq failed at %s:%"PRId64"\n", args->hdr->id[BCF_DT_CTG][line->rid].key, (int64_t) line->pos+1);
                 seq_to_upper(ref,0);
                 replace_iupac_codes(ref,nref);  // any non-ACGT character in fasta ref is replaced with N
-                als[0].l = 0;
-                kputs(ref, &als[0]);
-                als[i].l = 0;
-                kputsn(ref,1,&als[i]);
-                kputs(line->d.allele[i],&del[i]);
-                continue;
+                kputs(ref,&als[i]);
             }
+            kputs(line->d.allele[i],&sym[i]);   // preserve the symbolic allele string
+            continue;
         }
+        if ( i>0 ) symbolic_alts = 0;
         if ( line->d.allele[i][0]=='*' ) return ERR_SPANNING_DELETION;  // spanning deletion
         if ( has_non_acgtn(line->d.allele[i],line->shared.l) )
         {
@@ -610,8 +644,15 @@ static int realign(args_t *args, bcf1_t *line)
     else
         new_pos = realign_right(args, line);
 
-    // Have the alleles changed?
-    als[0].s[ als[0].l ] = 0;  // in order for strcmp to work
+    // Have the alleles changed? Consider <DEL> could have expanded the REF allele. In that
+    // case it must be trimmed, however the new REF length must reflect the entire length.
+    als[0].s[ als[0].l ] = 0;   // for strcmp to work
+    int new_reflen = strlen(als[0].s);
+    if ( symbolic_alts )
+    {
+        als[0].l = 1;
+        als[0].s[ als[0].l ] = 0;
+    }
     if ( new_pos==line->pos && !strcasecmp(line->d.allele[0],als[0].s) ) return ERR_OK;
 
     set_old_rec_tag(args, line, line, 0);
@@ -621,7 +662,7 @@ static int realign(args_t *args, bcf1_t *line)
     for (i=0; i<line->n_allele; i++)
     {
         if (i>0) kputc(',',&args->tmp_kstr);
-        if ( del[i].l ) kputs(del[i].s,&args->tmp_kstr);
+        if ( sym[i].l ) kputs(sym[i].s,&args->tmp_kstr);
         else kputsn(als[i].s,als[i].l,&args->tmp_kstr);
     }
     args->tmp_kstr.s[ args->tmp_kstr.l ] = 0;
@@ -629,7 +670,6 @@ static int realign(args_t *args, bcf1_t *line)
     args->nchanged++;
 
     // Update INFO/END if necessary
-    int new_reflen = strlen(line->d.allele[0]);
     if ( (new_pos!=line->pos || reflen!=new_reflen) && bcf_get_info_int32(args->hdr, line, "END", &args->int32_arr, &args->nint32_arr)==1 )
     {
         // bcf_update_alleles_str() messed up rlen because line->pos changed. This will be fixed by bcf_update_info_int32()
@@ -670,7 +710,7 @@ static void split_info_numeric(args_t *args, bcf1_t *src, bcf_info_t *info, int
                     bcf_update_info_##type(args->out_hdr,dst,tag,NULL,0); \
                     return; \
                 } \
-                error("Error: wrong number of fields in INFO/%s at %s:%"PRId64", expected %d, found %d\n", \
+                error("Error: wrong number of fields in INFO/%s at %s:%"PRId64", expected %d, found %d. Use --force to proceed anyway.\n", \
                         tag,bcf_seqname(args->hdr,src),(int64_t) src->pos+1,src->n_allele-1,ret); \
             } \
             bcf_update_info_##type(args->out_hdr,dst,tag,vals+ialt,1); \
@@ -692,7 +732,7 @@ static void split_info_numeric(args_t *args, bcf1_t *src, bcf_info_t *info, int
                     bcf_update_info_##type(args->out_hdr,dst,tag,NULL,0); \
                     return; \
                 } \
-                error("Error: wrong number of fields in INFO/%s at %s:%"PRId64", expected %d, found %d\n", \
+                error("Error: wrong number of fields in INFO/%s at %s:%"PRId64", expected %d, found %d. Use --force to proceed anyway.\n", \
                         tag,bcf_seqname(args->hdr,src),(int64_t) src->pos+1,src->n_allele,ret); \
             } \
             if ( args->keep_sum_ad >= 0 && args->keep_sum_ad==info->key ) \
@@ -725,7 +765,7 @@ static void split_info_numeric(args_t *args, bcf1_t *src, bcf_info_t *info, int
                     bcf_update_info_##type(args->out_hdr,dst,tag,NULL,0); \
                     return; \
                 } \
-                error("Error: wrong number of fields in INFO/%s at %s:%"PRId64", expected %d, found %d\n", \
+                error("Error: wrong number of fields in INFO/%s at %s:%"PRId64", expected %d, found %d. Use --force to proceed anyway.\n", \
                         tag,bcf_seqname(args->hdr,src),(int64_t) src->pos+1,src->n_allele*(src->n_allele+1)/2,ret); \
             } \
             if ( ialt!=0 ) \
@@ -745,31 +785,36 @@ static void split_info_numeric(args_t *args, bcf1_t *src, bcf_info_t *info, int
     }
     #undef BRANCH_NUMERIC
 }
-// Find n-th field in a comma-separated list and move it to dst.
-// The memory areas may overlap.
-#define STR_MOVE_NTH(dst,src,end,nth,len) \
-{ \
-    char *ss = src, *se = src; \
-    int j = 0; \
-    while ( *se && se<(end) ) \
-    { \
-        if ( *se==',' ) \
-        { \
-            if ( j==nth ) break; \
-            j++; \
-            ss = se+1; \
-        } \
-        se++; \
-    } \
-    if ( j==nth ) \
-    { \
-        int n = se - ss; \
-        memmove((dst),ss,n); \
-        src = se; \
-        len += n; \
-    } \
-    else len = -1; \
+// Find nth field in a comma-separated list in src and move it to dst.
+// The dst and src memory areas may overlap, end points just after the last valid src
+// character.
+// On success returns pointer to the end of the parsed field and increments ndst by the
+// number of memmoved characters. Returns NULL if the field was not found.
+static inline char *string_move_nth(char *dst, char *src, char *end, int nth, size_t *ndst)
+{
+    if ( src>=end ) return NULL;
+    char *ss = src, *se = src;
+    int j = 0;
+    while ( *se && se<(end) )
+    {
+        if ( *se==',' )
+        {
+            if ( j==nth ) break;
+            j++;
+            ss = se+1;
+        }
+        se++;
+    }
+    if ( j!=nth ) return NULL;
+    if ( ss>=end ) return NULL;
+    if ( !*ss ) return NULL;
+
+    int n = se - ss;
+    memmove((dst),ss,n);
+    *ndst += n;
+    return se;
 }
+
 static void split_info_string(args_t *args, bcf1_t *src, bcf_info_t *info, int ialt, bcf1_t *dst)
 {
     const char *tag = bcf_hdr_int2id(args->hdr,BCF_DT_ID,info->key);
@@ -784,41 +829,57 @@ static void split_info_string(args_t *args, bcf1_t *src, bcf_info_t *info, int i
     int len = bcf_hdr_id2length(args->hdr,BCF_HL_INFO,info->key);
     if ( len==BCF_VL_A )
     {
+        char *end = str.s + str.l;
         char *tmp = str.s;
-        int len = 0;
-        STR_MOVE_NTH(str.s,tmp,str.s+str.l,ialt,len);
-        if ( len<0 ) return;   // wrong number of fields: skip
-        str.s[len] = 0;
+        str.l = 0;
+        tmp = string_move_nth(str.s,tmp,end,ialt,&str.l);
+        if ( !tmp ) str.l = 1, str.s[0] = '.';
+        kputc_(0,&str);
         bcf_update_info_string(args->out_hdr,dst,tag,str.s);
     }
     else if ( len==BCF_VL_R )
     {
+        char *end = str.s + str.l;
         char *tmp = str.s;
-        int len = 0;
-        STR_MOVE_NTH(str.s,tmp,str.s+str.l,0,len);
-        str.s[len]=','; tmp++; len++;
-        STR_MOVE_NTH(&str.s[len],tmp,str.s+str.l,ialt,len);
-        if ( len<0 ) return;   // wrong number of fields: skip
-        str.s[len] = 0;
+        str.l = 0;
+        tmp = string_move_nth(str.s,tmp,end,0,&str.l);
+        if ( tmp )
+        {
+            kputc_(',',&str);
+            tmp = string_move_nth(str.s+str.l,tmp+1,end,ialt,&str.l); // ialt is 0-based index to ALT
+        }
+        if ( !tmp ) str.l = 1, str.s[0] = '.';
+        kputc_(0,&str);
         bcf_update_info_string(args->out_hdr,dst,tag,str.s);
     }
     else if ( len==BCF_VL_G )
     {
         int i0a = bcf_alleles2gt(0,ialt+1), iaa = bcf_alleles2gt(ialt+1,ialt+1);
+        char *end = str.s + str.l;
         char *tmp = str.s;
-        int len = 0;
-        STR_MOVE_NTH(str.s,tmp,str.s+str.l,0,len);
-        str.s[len]=','; tmp++; len++;
-        STR_MOVE_NTH(&str.s[len],tmp,str.s+str.l,i0a-1,len);
-        if ( len<0 ) return;   // wrong number of fields: skip
-        str.s[len]=','; tmp++; len++;
-        STR_MOVE_NTH(&str.s[len],tmp,str.s+str.l,iaa-i0a-1,len);
-        if ( len<0 ) return;   // wrong number of fields: skip
-        str.s[len] = 0;
+        str.l = 0;
+        tmp = string_move_nth(str.s,tmp,end,0,&str.l);
+        if ( tmp )
+        {
+            kputc_(',',&str);
+            tmp = string_move_nth(str.s+str.l,tmp+1,end,i0a-1,&str.l);
+        }
+        if ( tmp )
+        {
+            kputc_(',',&str);
+            tmp = string_move_nth(str.s+str.l,tmp+1,end,iaa-i0a-1,&str.l);
+        }
+        if ( !tmp ) str.l = 1, str.s[0] = '.';
+        kputc_(0,&str);
         bcf_update_info_string(args->out_hdr,dst,tag,str.s);
     }
     else
         bcf_update_info_string(args->out_hdr,dst,tag,str.s);
+    if ( args->ntmp_arr1 < str.m )
+    {
+        args->ntmp_arr1 = str.m;
+        args->tmp_arr1 = (uint8_t*)str.s;
+    }
 }
 static void split_info_flag(args_t *args, bcf1_t *src, bcf_info_t *info, int ialt, bcf1_t *dst)
 {
@@ -843,7 +904,7 @@ static void split_format_genotype(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int
         {
             if ( gt[j]==bcf_int32_vector_end ) break;
             if ( bcf_gt_is_missing(gt[j]) ) continue; // missing allele: leave as is
-            if ( (ialt==0 || args->ma_use_ref_allele) && bcf_gt_allele(gt[j])==0 ) continue; // ref && `--multi-overlaps 0`: leave as is
+            if ( bcf_gt_allele(gt[j])==0 ) continue; // ref && `--multi-overlaps 0`: leave as is
             if ( bcf_gt_allele(gt[j])==ialt+1 )
                 gt[j] = bcf_gt_unphased(1) | bcf_gt_is_phased(gt[j]); // set to first ALT
             else if ( args->ma_use_ref_allele )
@@ -889,7 +950,7 @@ static void split_format_numeric(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int
                     bcf_update_format_##type(args->out_hdr,dst,tag,NULL,0); \
                     return; \
                 } \
-                error("Error: wrong number of fields in FMT/%s at %s:%"PRId64", expected %d, found %d\n", \
+                error("Error: wrong number of fields in FMT/%s at %s:%"PRId64", expected %d, found %d. Use --force to proceed anyway.\n", \
                     tag,bcf_seqname(args->hdr,src),(int64_t) src->pos+1,(src->n_allele-1)*nsmpl,nvals); \
             } \
             nvals /= nsmpl; \
@@ -922,7 +983,7 @@ static void split_format_numeric(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int
                     bcf_update_format_##type(args->out_hdr,dst,tag,NULL,0); \
                     return; \
                 } \
-                error("Error: wrong number of fields in FMT/%s at %s:%"PRId64", expected %d, found %d\n", \
+                error("Error: wrong number of fields in FMT/%s at %s:%"PRId64", expected %d, found %d. Use --force to proceed anyway.\n", \
                     tag,bcf_seqname(args->hdr,src),(int64_t) src->pos+1,src->n_allele*nsmpl,nvals); \
             } \
             nvals /= nsmpl; \
@@ -977,7 +1038,8 @@ static void split_format_numeric(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int
                     bcf_update_format_##type(args->out_hdr,dst,tag,NULL,0); \
                     return; \
                 } \
-                error("Error at %s:%"PRId64", the tag %s has wrong number of fields\n", bcf_seqname(args->hdr,src),(int64_t) src->pos+1,bcf_hdr_int2id(args->hdr,BCF_DT_ID,fmt->id)); \
+                error("Error at %s:%"PRId64", the tag %s has wrong number of fields. Use --force to proceed anyway.\n", \
+                    bcf_seqname(args->hdr,src),(int64_t) src->pos+1,bcf_hdr_int2id(args->hdr,BCF_DT_ID,fmt->id)); \
             } \
             nvals /= nsmpl; \
             int all_haploid = nvals==src->n_allele ? 1 : 0; \
@@ -1031,6 +1093,7 @@ static void squeeze_format_char(char *str, int src_blen, int dst_blen, int n)
         isrc += src_blen;
     }
 }
+// ialt is 0-based index to ALT
 static void split_format_string(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int ialt, bcf1_t *dst)
 {
     const char *tag = bcf_hdr_int2id(args->hdr,BCF_DT_ID,fmt->id);
@@ -1038,50 +1101,60 @@ static void split_format_string(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int i
     if ( !ret ) return; // all values can be empty, leave out the tag, no need to panic
     assert( ret>0 );
 
+    int nsmpl = bcf_hdr_nsamples(args->hdr);
+    int blen = ret/nsmpl;   // per-sample field length
+    assert( blen>0 );
+
     kstring_t str;
     str.m = args->ntmp_arr1;
-    str.l = ret;
     str.s = (char*) args->tmp_arr1;
+    str.l = ret;
 
-    int nsmpl = bcf_hdr_nsamples(args->hdr);
-    int len = bcf_hdr_id2length(args->hdr,BCF_HL_FMT,fmt->id);
-    if ( len==BCF_VL_A )
+    int tag_len = bcf_hdr_id2length(args->hdr,BCF_HL_FMT,fmt->id);
+    if ( tag_len==BCF_VL_A )
     {
-        int i, blen = ret/nsmpl, maxlen = 0;
+        int i, maxlen = 0;
         char *ptr = str.s;
         for (i=0; i<nsmpl; i++)
         {
             char *tmp = ptr;
-            int len = 0;
-            STR_MOVE_NTH(tmp,tmp,ptr+blen,ialt,len);
-            if ( len<0 ) return;   // wrong number of fields: skip
+            char *end = ptr + blen;
+            size_t len = 0;
+            tmp = string_move_nth(ptr,tmp,end,ialt,&len);
+            if ( !tmp ) ptr[0] = '.', len = 1;
             if ( maxlen < len ) maxlen = len;
+            while (len<blen) ptr[len++] = 0;
             ptr += blen;
         }
         if ( maxlen<blen ) squeeze_format_char(str.s,blen,maxlen,nsmpl);
         bcf_update_format_char(args->out_hdr,dst,tag,str.s,nsmpl*maxlen);
     }
-    else if ( len==BCF_VL_R )
+    else if ( tag_len==BCF_VL_R )
     {
-        int i, blen = ret/nsmpl, maxlen = 0;
+        int i, maxlen = 0;
         char *ptr = str.s;
         for (i=0; i<nsmpl; i++)
         {
             char *tmp = ptr;
-            int len = 0;
-            STR_MOVE_NTH(ptr,tmp,ptr+blen,0,len);
-            ptr[len]=','; tmp++; len++;
-            STR_MOVE_NTH(&ptr[len],tmp,ptr+blen,ialt,len);
-            if ( len<0 ) return;   // wrong number of fields: skip
+            char *end = ptr + blen;
+            size_t len = 0;
+            tmp = string_move_nth(ptr,tmp,end,0,&len);
+            if ( tmp )
+            {
+                ptr[len++] = ',';
+                tmp = string_move_nth(ptr+len,tmp+1,end,ialt,&len);
+            }
+            if ( !tmp ) ptr[0] = '.', len = 1;
             if ( maxlen < len ) maxlen = len;
+            while (len<blen) ptr[len++] = 0;
             ptr += blen;
         }
         if ( maxlen<blen ) squeeze_format_char(str.s,blen,maxlen,nsmpl);
         bcf_update_format_char(args->out_hdr,dst,tag,str.s,nsmpl*maxlen);
     }
-    else if ( len==BCF_VL_G )
+    else if ( tag_len==BCF_VL_G )
     {
-        int i, blen = ret/nsmpl, maxlen = 0, i0a = bcf_alleles2gt(0,ialt+1), iaa = bcf_alleles2gt(ialt+1,ialt+1);
+        int i, maxlen = 0, i0a = bcf_alleles2gt(0,ialt+1), iaa = bcf_alleles2gt(ialt+1,ialt+1);
         char *ptr = str.s;
         for (i=0; i<nsmpl; i++)
         {
@@ -1108,31 +1181,38 @@ static void split_format_string(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int i
                     bcf_update_format_char(args->out_hdr,dst,tag,NULL,0);
                     return;
                 }
-                error("Error: wrong number of fields in FMT/%s at %s:%"PRId64", expected %d or %d, found %d\n",
+                error("Error: wrong number of fields in FMT/%s at %s:%"PRId64", expected %d or %d, found %d. Use --force to proceed anyway.\n",
                         tag,bcf_seqname(args->hdr,src),(int64_t) src->pos+1,src->n_allele*(src->n_allele+1)/2,src->n_allele,nfields);
             }
 
-            int len = 0;
+            char *tmp = ptr;
+            char *end = ptr + blen;
+            size_t len = 0;
+            tmp = string_move_nth(ptr,tmp,end,0,&len);
             if ( nfields==src->n_allele )   // haploid
             {
-                char *tmp = ptr;
-                STR_MOVE_NTH(&ptr[len],tmp,ptr+blen,0,len);
-                ptr[len]=','; tmp++; len++;
-                STR_MOVE_NTH(&ptr[len],tmp,ptr+blen,ialt,len);
-                if ( len<0 ) return;   // wrong number of fields: skip
+                if ( tmp )
+                {
+                    ptr[len++] = ',';
+                    tmp = string_move_nth(ptr+len,tmp+1,end,ialt,&len);
+                }
             }
             else    // diploid
             {
-                char *tmp = ptr;
-                STR_MOVE_NTH(&ptr[len],tmp,ptr+blen,0,len);
-                ptr[len]=','; tmp++; len++;
-                STR_MOVE_NTH(&ptr[len],tmp,ptr+blen,i0a-1,len);
-                if ( len<0 ) return;   // wrong number of fields: skip
-                ptr[len]=','; tmp++; len++;
-                STR_MOVE_NTH(&ptr[len],tmp,ptr+blen,iaa-i0a-1,len);
-                if ( len<0 ) return;   // wrong number of fields: skip
+                if ( tmp )
+                {
+                    ptr[len++] = ',';
+                    tmp = string_move_nth(ptr+len,tmp+1,end,i0a-1,&len);
+                }
+                if ( tmp )
+                {
+                    ptr[len++] = ',';
+                    tmp = string_move_nth(ptr+len,tmp+1,end,iaa-i0a-1,&len);
+                }
             }
+            if ( !tmp ) ptr[0] = '.', len = 1;
             if ( maxlen < len ) maxlen = len;
+            while (len<blen) ptr[len++] = 0;
             ptr += blen;
         }
         if ( maxlen<blen ) squeeze_format_char(str.s,blen,maxlen,nsmpl);
@@ -1392,6 +1472,23 @@ static void merge_info_string(args_t *args, bcf1_t **lines, int nlines, bcf_info
         bcf_update_info_string(args->out_hdr,dst,tag,args->tmp_arr1);
     }
 }
+static int gt_array_grow_ploidy(args_t *args, uint8_t **tmp_arr, int *ntmp_arr, int ngt_ori, int ngt_new, int nsmpl)
+{
+    *ntmp_arr = 4*ngt_new*nsmpl;
+    int32_t *ptr = (int32_t*)realloc(*tmp_arr,*ntmp_arr);
+    if ( !ptr ) error("Error: failed to allocate %d bytes\n",*ntmp_arr);
+    *tmp_arr = (uint8_t*) ptr;
+
+    int i,j;
+    for (i=nsmpl-1; i>=0; i--)
+    {
+        int32_t *src = ptr + i*ngt_ori;
+        int32_t *dst = ptr + i*ngt_new;
+        for (j=ngt_new; j>ngt_ori; j--) dst[j-1] = bcf_int32_vector_end;
+        for (j=ngt_ori; j>0; j--) dst[j-1] = src[j-1];
+    }
+    return ngt_new;
+}
 static void merge_format_genotype(args_t *args, bcf1_t **lines, int nlines, bcf_fmt_t *fmt, bcf1_t *dst)
 {
     // reusing int8_t arrays as int32_t arrays
@@ -1410,7 +1507,9 @@ static void merge_format_genotype(args_t *args, bcf1_t **lines, int nlines, bcf_
         int ngts2 = bcf_get_genotypes(args->hdr,lines[i],&args->tmp_arr2,&ntmp2);
         args->ntmp_arr2 = ntmp2 * 4;
         ngts2 /= nsmpl;
-        if ( ngts!=ngts2 ) error("Error at %s:%"PRId64": cannot combine diploid with haploid genotype\n", bcf_seqname(args->hdr,lines[i]),(int64_t) lines[i]->pos+1);
+        int ploidy_changed = ngts - ngts2;
+        if ( ngts < ngts2 ) ngts = gt_array_grow_ploidy(args,&args->tmp_arr1,&args->ntmp_arr1,ngts,ngts2,nsmpl);
+        if ( ngts > ngts2 ) ngts2 = gt_array_grow_ploidy(args,&args->tmp_arr2,&args->ntmp_arr2,ngts2,ngts,nsmpl);
 
         int32_t *gt  = (int32_t*) args->tmp_arr1;       // the first, destination line
         int32_t *gt2 = (int32_t*) args->tmp_arr2;       // one of the subsequent lines, i.e. the source line
@@ -1420,16 +1519,22 @@ static void merge_format_genotype(args_t *args, bcf1_t **lines, int nlines, bcf_
             // never overwrite with ref allele
             for (k2=0; k2<ngts2; k2++)
             {
-                if ( gt2[k2]==bcf_int32_vector_end ) break;
-                if ( bcf_gt_is_missing(gt2[k2]) ) continue;
+                if ( gt2[k2]==bcf_int32_vector_end )
+                {
+                    if ( ploidy_changed && bcf_gt_is_missing(gt[k2]) ) gt[k2] = bcf_int32_vector_end;
+                    break;
+                }
+                if ( bcf_gt_is_missing(gt2[k2]) ) continue;     // don't overwrite with missing
+
+                // don't overwrite with ref, unless the destination is missing, e.g. "./. + 0/1"
                 int ial2 = bcf_gt_allele(gt2[k2]);
-                if ( ial2==0 ) continue;    // never overwrite with ref
+                if ( ial2==0 && !bcf_gt_is_missing(gt[k2]) && gt[k2]!=bcf_int32_vector_end ) continue;
                 if ( ial2>=args->maps[i].nals ) error("Error at %s:%"PRId64": incorrect allele index %d\n",bcf_seqname(args->hdr,lines[i]),(int64_t) lines[i]->pos+1,ial2);
 
                 // The destination allele
                 int ial = args->maps[i].map[ial2];
                 if ( gt[k2]==bcf_int32_vector_end || bcf_gt_is_missing(gt[k2]) || !bcf_gt_allele(gt[k2]) )
-                    gt[k2] = bcf_gt_is_phased(gt[k2]) ? bcf_gt_phased(ial) : bcf_gt_unphased(ial);
+                    gt[k2] = (gt[k2]!=bcf_int32_vector_end && bcf_gt_is_phased(gt[k2])) ? bcf_gt_phased(ial) : bcf_gt_unphased(ial);
                 else
                 {
                     // conflict, the first line has non-zero allele, use the old way, possibly disrupt the phasing
@@ -1822,77 +1927,129 @@ static void merge_biallelics_to_multiallelic(args_t *args, bcf1_t *dst, bcf1_t *
         else if ( type==BCF_HT_INT || type==BCF_HT_REAL ) merge_format_numeric(args, lines, nlines, fmt, dst);
         else merge_format_string(args, lines, nlines, fmt, dst);
     }
+    args->njoined++;
 }
 
 #define SWAP(type_t, a, b) { type_t t = a; a = b; b = t; }
-static void mrows_schedule(args_t *args, bcf1_t **line)
+static void mrows_push(args_t *args, bcf1_t **line)
 {
     int i,m;
-    if ( args->mrows_collapse==COLLAPSE_ANY         // merge all record types together
-        || bcf_get_variant_types(*line)&VCF_SNP     // SNP, put into alines
-        || bcf_get_variant_types(*line)==VCF_REF )  // ref
-    {
-        args->nalines++;
-        m = args->malines;
-        hts_expand(bcf1_t*,args->nalines,args->malines,args->alines);
-        for (i=m; i<args->malines; i++) args->alines[i] = bcf_init1();
-        SWAP(bcf1_t*, args->alines[args->nalines-1], *line);
-    }
-    else
-    {
-        args->nblines++;
-        m = args->mblines;
-        hts_expand(bcf1_t*,args->nblines,args->mblines,args->blines);
-        for (i=m; i<args->mblines; i++) args->blines[i] = bcf_init1();
-        SWAP(bcf1_t*, args->blines[args->nblines-1], *line);
+    if ( !args->nmrows ) args->mrows_first = 0;
+    args->nmrows++;
+    m = args->mmrows;
+    hts_expand(bcf1_t*,args->nmrows,args->mmrows,args->mrows);
+    for (i=m; i<args->mmrows; i++) args->mrows[i] = bcf_init1();
+    SWAP(bcf1_t*, args->mrows[args->nmrows-1], *line);
+
+    if ( args->mrows_collapse==COLLAPSE_ANY ) return;
+
+    // move the line up the sorted list so that the same variant types end up together
+    int cur_type = bcf_get_variant_types(args->mrows[args->nmrows-1]);
+    i = args->mrows_first + args->nmrows - 1;
+    while (i>0)
+    {
+        int prev_type = bcf_get_variant_types(args->mrows[i-1]);
+        if ( prev_type <= cur_type ) break;
+        bcf1_t *tmp = args->mrows[i-1];
+        args->mrows[i-1] = args->mrows[i];
+        args->mrows[i] = tmp;
+        i--;
     }
 }
-static int mrows_ready_to_flush(args_t *args, bcf1_t *line)
+static int mrows_can_flush(args_t *args, bcf1_t *line)
 {
-    if ( args->nalines && (args->alines[0]->rid!=line->rid || args->alines[0]->pos!=line->pos) ) return 1;
-    if ( args->nblines && (args->blines[0]->rid!=line->rid || args->blines[0]->pos!=line->pos) ) return 1;
+    if ( !args->nmrows ) return 0;
+    int ibeg = args->mrows_first;
+    if ( args->mrows[ibeg]->rid != line->rid ) return 1;
+    if ( args->mrows[ibeg]->pos != line->pos ) return 1;
     return 0;
 }
 static bcf1_t *mrows_flush(args_t *args)
 {
-    if ( args->nblines && args->nalines==1 && bcf_get_variant_types(args->alines[0])==VCF_REF )
+    if ( !args->nmrows ) return NULL;
+
+    int ibeg = args->mrows_first;
+
+    //fprintf(stderr,"flush: ibeg=%d n=%d\n",ibeg,args->nmrows);
+    //int i;
+    //for (i=ibeg; i<ibeg+args->nmrows; i++)
+    //  fprintf(stderr,"\ti=%d type=%d %s %s\n",i,bcf_get_variant_types(args->mrows[i]),args->mrows[i]->d.allele[0],args->mrows[i]->d.allele[1]);
+
+    if ( args->nmrows==1 )
     {
-        // By default, REF lines are merged with SNPs if SNPs and indels are to be kept separately.
-        // However, if there are indels only and a single REF line, merge it with indels.
-        args->nblines++;
-        int i,m = args->mblines;
-        hts_expand(bcf1_t*,args->nblines,args->mblines,args->blines);
-        for (i=m; i<args->mblines; i++) args->blines[i] = bcf_init1();
-        SWAP(bcf1_t*, args->blines[args->nblines-1], args->alines[0]);
-        args->nalines--;
+        args->nmrows = 0;
+        return args->mrows[ibeg];
     }
-    if ( args->nalines )
+
+    if ( args->mrows_collapse==COLLAPSE_ANY )
     {
-        if ( args->nalines==1 )
-        {
-            args->nalines = 0;
-            return args->alines[0];
-        }
+        // merge everything with anything
         bcf_clear(args->mrow_out);
-        merge_biallelics_to_multiallelic(args, args->mrow_out, args->alines, args->nalines);
-        args->nalines = 0;
+        merge_biallelics_to_multiallelic(args, args->mrow_out, &args->mrows[ibeg], args->nmrows - ibeg);
+        args->nmrows = 0;
         return args->mrow_out;
     }
-    else if ( args->nblines )
+
+    int j;
+    int types[] = { VCF_SNP, VCF_MNP, VCF_INDEL, VCF_OTHER, -1 };       // merge everything within the same category
+    if ( args->mrows_collapse==COLLAPSE_SNPS ) types[1] = -1;           // merge SNPs only
+    else if ( args->mrows_collapse==COLLAPSE_INDELS ) types[0] = VCF_INDEL, types[1] = -1;    // merge indels only
+    for (j=0; types[j]!=-1; j++)
     {
-        if ( args->nblines==1 )
+        int i, type = types[j]; // to keep the compiler happy
+        for (i=ibeg; i<ibeg+args->nmrows; i++)
         {
-            args->nblines = 0;
-            return args->blines[0];
+            type = bcf_get_variant_types(args->mrows[i]);
+            if ( type!=types[j] && type!=VCF_REF ) break;
+        }
+        if ( i==ibeg+1 && type!=VCF_REF )
+        {
+            // just one line of this type, no merging, but multiple lines of different type follow
+            args->nmrows--;
+            args->mrows_first++;
+            return args->mrows[ibeg];
+        }
+        if ( i>ibeg )
+        {
+            // more than one line, merging is needed
+            int nflush = i - ibeg;
+            bcf_clear(args->mrow_out);
+            merge_biallelics_to_multiallelic(args, args->mrow_out, &args->mrows[ibeg], nflush);
+            args->nmrows -= nflush;
+            args->mrows_first += nflush;
+            return args->mrow_out;
         }
-        bcf_clear(args->mrow_out);
-        merge_biallelics_to_multiallelic(args, args->mrow_out, args->blines, args->nblines);
-        args->nblines = 0;
-        return args->mrow_out;
     }
-    return NULL;
+    args->nmrows--;
+    args->mrows_first++;
+    return args->mrows[ibeg];
 }
-static void cmpals_add(cmpals_t *ca, bcf1_t *rec)
+static char *strdup_alt_svlen(args_t *args, bcf1_t *rec, int ial)
+{
+    if ( rec->d.allele[ial][0]!='<' ) return strdup(rec->d.allele[ial]);
+
+    int ntmp = args->ntmp_arr1 / sizeof(int32_t);
+    int n = bcf_get_info_int32(args->hdr, rec, "SVLEN", &args->tmp_arr1, &ntmp);
+    args->ntmp_arr1 = ntmp * sizeof(int32_t);
+    int32_t *svlen = (int32_t *) args->tmp_arr1;
+    if ( n<=0 ) return strdup(rec->d.allele[ial]);
+
+    if ( n+1 != rec->n_allele )
+    {
+        // there should be as many SVLEN numbers as there are ALT alleles
+        static int warned = 0;
+        if ( !warned )
+        {
+            fprintf(stderr,"TODO: different number of ALT alleles and SVLEN fields %s:%"PRIhts_pos"\n",bcf_seqname(args->hdr,rec),rec->pos+1);
+            warned = 1;
+        }
+    }
+
+    kstring_t str = {0,0,0};
+    ksprintf(&str,"%s.%d",rec->d.allele[ial],svlen[ial-1]);
+    return str.s;
+}
+static void cmpals_add(args_t *args, cmpals_t *ca, bcf1_t *rec)
 {
     ca->ncmpals++;
     hts_expand0(cmpals1_t, ca->ncmpals, ca->mcmpals, ca->cmpals);
@@ -1900,10 +2057,11 @@ static void cmpals_add(cmpals_t *ca, bcf1_t *rec)
     free(cmpals->ref);
     cmpals->ref = strdup(rec->d.allele[0]);
     cmpals->n   = rec->n_allele;
+
     if ( rec->n_allele==2 )
     {
         free(cmpals->alt);
-        cmpals->alt = strdup(rec->d.allele[1]);
+        cmpals->alt = strdup_alt_svlen(args,rec,1);
     }
     else
     {
@@ -1914,9 +2072,10 @@ static void cmpals_add(cmpals_t *ca, bcf1_t *rec)
             khash_str2int_inc(cmpals->hash, strdup(rec->d.allele[i]));
     }
 }
-static int cmpals_match(cmpals_t *ca, bcf1_t *rec)
+static int cmpals_match(args_t *args, cmpals_t *ca, bcf1_t *rec)
 {
     int i, j;
+    char *alt_svlen = rec->n_allele==2 ? strdup_alt_svlen(args,rec,1) : NULL;
     for (i=0; i<ca->ncmpals; i++)
     {
         cmpals1_t *cmpals = ca->cmpals + i;
@@ -1928,7 +2087,8 @@ static int cmpals_match(cmpals_t *ca, bcf1_t *rec)
         // the most frequent case
         if ( rec->n_allele==2 )
         {
-            if ( strcasecmp(rec->d.allele[1], cmpals->alt) ) continue;
+            if ( strcasecmp(alt_svlen, cmpals->alt) ) continue;
+            free(alt_svlen);
             return 1;
         }
 
@@ -1938,6 +2098,7 @@ static int cmpals_match(cmpals_t *ca, bcf1_t *rec)
         if ( j<rec->n_allele ) continue;
         return 1;
     }
+    free(alt_svlen);
     return 0;
 }
 static void cmpals_reset(cmpals_t *ca) { ca->ncmpals = 0; }
@@ -1964,21 +2125,13 @@ static void flush_buffer(args_t *args, htsFile *file, int n)
         k = rbuf_shift(&args->rbuf);
         if ( args->mrows_op==MROWS_MERGE )
         {
-            if ( mrows_ready_to_flush(args, args->lines[k]) )
+            if ( mrows_can_flush(args, args->lines[k]) )
             {
                 while ( (line=mrows_flush(args)) )
                     if ( bcf_write1(file, args->out_hdr, line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
             }
-            int merge = 1;
-            if ( args->mrows_collapse!=COLLAPSE_BOTH && args->mrows_collapse!=COLLAPSE_ANY )
-            {
-                if ( !(bcf_get_variant_types(args->lines[k]) & args->mrows_collapse) ) merge = 0;
-            }
-            if ( merge )
-            {
-                mrows_schedule(args, &args->lines[k]);
-                continue;
-            }
+            mrows_push(args, &args->lines[k]);
+            continue;
         }
         else if ( args->rmdup )
         {
@@ -1988,7 +2141,7 @@ static void flush_buffer(args_t *args, htsFile *file, int n)
                 if ( args->rmdup & BCF_SR_PAIR_ANY ) continue;    // rmdup by position only
                 if ( args->rmdup & BCF_SR_PAIR_SNPS && line_type&(VCF_SNP|VCF_MNP) && prev_type&(VCF_SNP|VCF_MNP) ) continue;
                 if ( args->rmdup & BCF_SR_PAIR_INDELS && line_type&(VCF_INDEL) && prev_type&(VCF_INDEL) ) continue;
-                if ( args->rmdup & BCF_SR_PAIR_EXACT && cmpals_match(&args->cmpals_out, args->lines[k]) ) continue;
+                if ( args->rmdup & BCF_SR_PAIR_EXACT && cmpals_match(args, &args->cmpals_out, args->lines[k]) ) continue;
             }
             else
             {
@@ -1998,7 +2151,7 @@ static void flush_buffer(args_t *args, htsFile *file, int n)
                 if ( args->rmdup & BCF_SR_PAIR_EXACT ) cmpals_reset(&args->cmpals_out);
             }
             prev_type |= line_type;
-            if ( args->rmdup & BCF_SR_PAIR_EXACT ) cmpals_add(&args->cmpals_out, args->lines[k]);
+            if ( args->rmdup & BCF_SR_PAIR_EXACT ) cmpals_add(args,&args->cmpals_out, args->lines[k]);
         }
         if ( bcf_write1(file, args->out_hdr, args->lines[k])!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
     }
@@ -2020,7 +2173,7 @@ static void init_data(args_t *args)
     else
         args->keep_sum_ad = -1;
 
-    args->out_hdr = bcf_hdr_dup(args->hdr);
+    args->out_hdr = args->hdr;
     if ( args->old_rec_tag )
         bcf_hdr_printf(args->out_hdr,"##INFO=<ID=%s,Number=1,Type=String,Description=\"Original variant. Format: CHR|POS|REF|ALT|USED_ALT_IDX\">",args->old_rec_tag);
 
@@ -2042,22 +2195,31 @@ static void init_data(args_t *args)
         args->abuf = abuf_init(args->hdr, SPLIT);
         abuf_set_opt(args->abuf, bcf_hdr_t*, BCF_HDR, args->out_hdr);
         if ( args->old_rec_tag )
+        {
             abuf_set_opt(args->abuf, const char*, INFO_TAG, args->old_rec_tag);
+            if ( bcf_hdr_sync(args->out_hdr)!=0 ) error("bcf_hdr_sync failed\n");
+        }
         abuf_set_opt(args->abuf, int, STAR_ALLELE, args->use_star_allele);
     }
     if ( args->gff_fname )
     {
         args->gff = gff_init(args->gff_fname);
-        gff_set(args->gff,verbosity,1);
+        gff_set(args->gff,verbosity,args->gff_verbosity);
         gff_set(args->gff,strip_chr_names,1);
         gff_parse(args->gff);
         args->idx_tscript = gff_get(args->gff,idx_tscript);
         args->itr_tscript = regitr_init(NULL);
     }
+    if ( args->filter_str )
+        args->filter = filter_init(args->hdr, args->filter_str);
+    args->filter_pass = 1;
+
+    args->out_hdr = bcf_hdr_dup(args->out_hdr);
 }
 
 static void destroy_data(args_t *args)
 {
+    if ( args->filter ) filter_destroy(args->filter);
     if ( args->gff )
     {
         gff_destroy(args->gff);
@@ -2072,20 +2234,17 @@ static void destroy_data(args_t *args)
     for (i=0; i<args->mtmp_lines; i++)
         if ( args->tmp_lines[i] ) bcf_destroy1(args->tmp_lines[i]);
     free(args->tmp_lines);
-    for (i=0; i<args->malines; i++)
-        bcf_destroy1(args->alines[i]);
-    free(args->alines);
-    for (i=0; i<args->mblines; i++)
-        bcf_destroy1(args->blines[i]);
-    free(args->blines);
+    for (i=0; i<args->mmrows; i++)
+        bcf_destroy1(args->mrows[i]);
+    free(args->mrows);
     for (i=0; i<args->mmaps; i++)
         free(args->maps[i].map);
     for (i=0; i<args->ntmp_als; i++)
         free(args->tmp_als[i].s);
-    for (i=0; i<args->ntmp_del; i++)
-        free(args->tmp_del[i].s);
+    for (i=0; i<args->ntmp_sym; i++)
+        free(args->tmp_sym[i].s);
     free(args->tmp_als);
-    free(args->tmp_del);
+    free(args->tmp_sym);
     free(args->tmp_kstr.s);
     if ( args->tmp_str )
     {
@@ -2110,10 +2269,10 @@ static void normalize_line(args_t *args, bcf1_t *line)
 {
     if ( args->fai )
     {
-        if ( args->check_ref & CHECK_REF_FIX ) fix_ref(args, line);
+        if ( args->filter_pass && (args->check_ref & CHECK_REF_FIX) ) fix_ref(args, line);
         if ( args->do_indels )
         {
-            int ret = realign(args, line);
+            int ret = args->filter_pass ? realign(args, line) : ERR_OK;
 
             // exclude broken VCF lines
             if ( ret==ERR_REF_MISMATCH && args->check_ref & CHECK_REF_SKIP )
@@ -2133,37 +2292,78 @@ static void normalize_line(args_t *args, bcf1_t *line)
         }
     }
 
-    // insert into sorted buffer
-    rbuf_expand0(&args->rbuf,bcf1_t*,args->rbuf.n+1,args->lines);
-    int i,j;
-    i = j = rbuf_append(&args->rbuf);
-    if ( args->lines[i] ) bcf_destroy(args->lines[i]);
-    args->lines[i] = bcf_dup(line);
-    while ( rbuf_prev(&args->rbuf,&i) )
+    if ( args->filter_pass && args->atomize==SPLIT ) abuf_push(args->abuf,line);
+    while (1)
     {
-        if ( args->lines[i]->pos > args->lines[j]->pos ) SWAP(bcf1_t*, args->lines[i], args->lines[j]);
-        j = i;
+        if ( args->filter_pass && args->atomize==SPLIT )
+        {
+            line = abuf_flush(args->abuf, 0);
+            if ( !line ) break;
+        }
+
+        // insert into sorted buffer
+        rbuf_expand0(&args->rbuf,bcf1_t*,args->rbuf.n+1,args->lines);
+        int i,j;
+        i = j = rbuf_append(&args->rbuf);
+        if ( args->lines[i] ) bcf_destroy(args->lines[i]);
+        args->lines[i] = bcf_dup(line);
+        while ( rbuf_prev(&args->rbuf,&i) )
+        {
+            if ( args->lines[i]->rid==args->lines[j]->rid )
+            {
+                bcf_unpack(args->lines[i], BCF_UN_STR);
+                bcf_unpack(args->lines[j], BCF_UN_STR);
+                if ( args->cmp_func(&args->lines[i], &args->lines[j]) > 0) SWAP(bcf1_t*, args->lines[i], args->lines[j]);
+            }
+            j = i;
+        }
+        if ( !args->filter_pass || args->atomize!=SPLIT ) break;
     }
 }
 
-static bcf1_t *next_atomized_line(args_t *args)
+// return 0 on success, 1 when done
+static int split_and_normalize(args_t *args)
 {
-    bcf1_t *rec = NULL;
-    if ( args->atomize==SPLIT )
+    if ( !bcf_sr_next_line(args->files) ) return 1;
+
+    bcf1_t *line = bcf_sr_get_line(args->files,0);
+    args->ntotal++;
+
+    if ( args->filter )
     {
-        rec = abuf_flush(args->abuf, 0);
-        if ( rec ) return rec;
+        args->filter_pass = filter_test(args->filter,line,NULL);
+        if ( args->filter_logic==FLT_EXCLUDE ) args->filter_pass = args->filter_pass ? 0 : 1;
+        if ( !args->filter_pass ) args->nfilter++;
     }
 
-    if ( !bcf_sr_next_line(args->files) ) return NULL;
+    if ( args->mrows_op!=MROWS_SPLIT || line->n_allele<=2 || !args->filter_pass )
+    {
+        // normal operation, no splitting
+        normalize_line(args, line);
+        return 0;
+    }
 
-    if ( args->atomize==SPLIT )
+    // any restrictions on variant types to split?
+    if ( args->mrows_collapse!=COLLAPSE_BOTH && args->mrows_collapse!=COLLAPSE_ANY )
     {
-        abuf_push(args->abuf,bcf_sr_get_line(args->files,0));
-        return abuf_flush(args->abuf, 0);
+        int type = args->mrows_collapse==COLLAPSE_SNPS ? VCF_SNP : VCF_INDEL;
+        if ( !(bcf_get_variant_types(line) & type) )
+        {
+            normalize_line(args, line);
+            return 0;
+        }
     }
-    return bcf_sr_get_line(args->files,0);
+
+    args->nsplit++;
+    split_multiallelic_to_biallelics(args, line);
+
+    int j;
+    for (j=0; j<args->ntmp_lines; j++)
+        normalize_line(args, args->tmp_lines[j]);
+
+    return 0;
 }
+
 static void normalize_vcf(args_t *args)
 {
     char wmode[8];
@@ -2174,64 +2374,40 @@ static void normalize_vcf(args_t *args)
         hts_set_opt(args->out, HTS_OPT_THREAD_POOL, args->files->p);
     if (args->record_cmd_line) bcf_hdr_append_version(args->out_hdr, args->argc, args->argv, "bcftools_norm");
     if ( bcf_hdr_write(args->out, args->out_hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
-    if ( args->write_index && init_index(args->out,args->out_hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname);
+    if ( init_index2(args->out,args->out_hdr,args->output_fname,
+                     &args->index_fn, args->write_index)<0 )
+        error("Error: failed to initialise index for %s\n",args->output_fname);
 
-    bcf1_t *line;
-    int prev_rid = -1, prev_pos = -1, prev_type = 0;
-    while ( (line = next_atomized_line(args)) )
+    while (1)
     {
-        args->ntotal++;
-        if ( args->rmdup )
-        {
-            int line_type = bcf_get_variant_types(line);
-            if ( prev_rid>=0 && prev_rid==line->rid && prev_pos==line->pos )
-            {
-                if ( args->rmdup & BCF_SR_PAIR_ANY ) continue;    // rmdup by position only
-                if ( args->rmdup & BCF_SR_PAIR_SNPS && line_type&(VCF_SNP|VCF_MNP) && prev_type&(VCF_SNP|VCF_MNP) ) continue;
-                if ( args->rmdup & BCF_SR_PAIR_INDELS && line_type&(VCF_INDEL) && prev_type&(VCF_INDEL) ) continue;
-                if ( args->rmdup & BCF_SR_PAIR_EXACT && cmpals_match(&args->cmpals_in, line) ) continue;
-            }
-            else
-            {
-                prev_rid  = line->rid;
-                prev_pos  = line->pos;
-                prev_type = 0;
-                if ( args->rmdup & BCF_SR_PAIR_EXACT ) cmpals_reset(&args->cmpals_in);
-            }
-            prev_type |= line_type;
-            if ( args->rmdup & BCF_SR_PAIR_EXACT ) cmpals_add(&args->cmpals_in, line);
-        }
-
-        // still on the same chromosome?
-        int i,j,ilast = rbuf_last(&args->rbuf);
-        if ( ilast>=0 && line->rid != args->lines[ilast]->rid ) flush_buffer(args, args->out, args->rbuf.n); // new chromosome
-
-        int split = 0;
-        if ( args->mrows_op==MROWS_SPLIT )
+        // buffer lines until a different position is reached
+        int done = 0;
+        while (1)
         {
-            split = 1;
-            if ( args->mrows_collapse!=COLLAPSE_BOTH && args->mrows_collapse!=COLLAPSE_ANY )
-            {
-                if ( !(bcf_get_variant_types(line) & args->mrows_collapse) ) split = 0;
-            }
-            if ( split && line->n_allele>2 )
-            {
-                args->nsplit++;
-                split_multiallelic_to_biallelics(args, line);
-                for (j=0; j<args->ntmp_lines; j++)
-                    normalize_line(args, args->tmp_lines[j]);
-            }
-            else
-                split = 0;
+            done = split_and_normalize(args);
+            if ( done ) break;      // no more lines available
+            int i = args->rbuf.f;
+            int j = rbuf_last(&args->rbuf);
+            if ( args->lines[i]->rid != args->lines[j]->rid ) break;
+            if ( args->lines[i]->pos != args->lines[j]->pos ) break;
         }
-        if ( !split )
-            normalize_line(args, line);
+        if ( done ) break;
 
         // find out how many sites to flush
-        ilast = rbuf_last(&args->rbuf);
-        j = 0;
+        int ifst  = args->rbuf.f;
+        int ilast = rbuf_last(&args->rbuf);
+        int i, j = 0;
         for (i=-1; rbuf_next(&args->rbuf,&i); )
         {
+            if ( args->lines[ifst]->rid != args->lines[ilast]->rid )
+            {
+                // there are two chromosomes in the buffer, count how many are on the first chromosome
+                if ( args->lines[ifst]->rid != args->lines[i]->rid ) break;
+                j++;
+                continue;
+            }
+            // there is just one chromosome, flush only lines that are unlikely to change order on
+            // realigning (the buf_win constant)
             if ( args->lines[ilast]->pos - args->lines[i]->pos < args->buf_win ) break;
             j++;
         }
@@ -2249,7 +2425,8 @@ static void normalize_vcf(args_t *args)
     }
     if ( hts_close(args->out)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname);
 
-    fprintf(stderr,"Lines   total/split/realigned/skipped:\t%d/%d/%d/%d\n", args->ntotal,args->nsplit,args->nchanged,args->nskipped);
+    fprintf(stderr,"Lines   total/split/joined/realigned/removed/skipped:\t%d/%d/%d/%d/%d/%d\n",
+        args->ntotal,args->nsplit,args->njoined,args->nchanged,args->nskipped,args->nfilter);
     if ( args->check_ref & CHECK_REF_FIX )
         fprintf(stderr,"REF/ALT total/modified/added:  \t%d/%d/%d\n", args->nref.tot,args->nref.swap,args->nref.set);
 }
@@ -2268,9 +2445,11 @@ static void usage(void)
     fprintf(stderr, "    -c, --check-ref e|w|x|s         Check REF alleles and exit (e), warn (w), exclude (x), or set (s) bad sites [e]\n");
     fprintf(stderr, "    -D, --remove-duplicates         Remove duplicate lines of the same type.\n");
     fprintf(stderr, "    -d, --rm-dup TYPE               Remove duplicate snps|indels|both|all|exact\n");
+    fprintf(stderr, "    -e, --exclude EXPR              Do not normalize records for which the expression is true (see man page for details)\n");
     fprintf(stderr, "    -f, --fasta-ref FILE            Reference sequence\n");
     fprintf(stderr, "        --force                     Try to proceed even if malformed tags are encountered. Experimental, use at your own risk\n");
     fprintf(stderr, "    -g, --gff-annot FILE            Follow HGVS 3'rule and right-align variants in transcripts on the forward strand\n");
+    fprintf(stderr, "    -i, --include EXPR              Normalize only records for which the expression is true (see man page for details)\n");
     fprintf(stderr, "        --keep-sum TAG,..           Keep vector sum constant when splitting multiallelics (see github issue #360)\n");
     fprintf(stderr, "    -m, --multiallelics -|+TYPE     Split multiallelics (-) or join biallelics (+), type: snps|indels|both|any [both]\n");
     fprintf(stderr, "        --multi-overlaps 0|.        Fill in the reference (0) or missing (.) allele when splitting multiallelics [0]\n");
@@ -2283,12 +2462,14 @@ static void usage(void)
     fprintf(stderr, "    -R, --regions-file FILE         Restrict to regions listed in a file\n");
     fprintf(stderr, "        --regions-overlap 0|1|2     Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n");
     fprintf(stderr, "    -s, --strict-filter             When merging (-m+), merged site is PASS only if all sites being merged PASS\n");
+    fprintf(stderr, "    -S, --sort METHOD               Sort order: chr_pos,lex [chr_pos]\n");
     fprintf(stderr, "    -t, --targets REGION            Similar to -r but streams rather than index-jumps\n");
     fprintf(stderr, "    -T, --targets-file FILE         Similar to -R but streams rather than index-jumps\n");
     fprintf(stderr, "        --targets-overlap 0|1|2     Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n");
-    fprintf(stderr, "        --threads INT               Use multithreading with <int> worker threads [0]\n");
+    fprintf(stderr, "        --threads INT               Use multithreading with INT worker threads [0]\n");
+    fprintf(stderr, "    -v, --verbose INT               Verbosity level (0-2) of GFF parsing [1]\n");
     fprintf(stderr, "    -w, --site-win INT              Buffer for sorting lines which changed position during realignment [1000]\n");
-    fprintf(stderr, "        --write-index               Automatically index the output files [off]\n");
+    fprintf(stderr, "    -W, --write-index[=FMT]         Automatically index the output files [off]\n");
     fprintf(stderr, "\n");
     fprintf(stderr, "Examples:\n");
     fprintf(stderr, "   # normalize and left-align indels\n");
@@ -2319,8 +2500,10 @@ int main_vcfnorm(int argc, char *argv[])
     int region_is_file  = 0;
     int targets_is_file = 0;
     args->use_star_allele = 1;
+    args->gff_verbosity = 1;
     int regions_overlap = 1;
     int targets_overlap = 0;
+    args->cmp_func = cmp_bcf_pos;
 
     static struct option loptions[] =
     {
@@ -2328,9 +2511,12 @@ int main_vcfnorm(int argc, char *argv[])
         {"force",no_argument,NULL,7},
         {"atomize",no_argument,NULL,'a'},
         {"atom-overlaps",required_argument,NULL,11},
+        {"include",required_argument,NULL,'i'},
+        {"exclude",required_argument,NULL,'e'},
         {"old-rec-tag",required_argument,NULL,12},
         {"keep-sum",required_argument,NULL,10},
         {"fasta-ref",required_argument,NULL,'f'},
+        {"sort",required_argument,NULL,'S'},
         {"gff-annot",required_argument,NULL,'g'},
         {"right-align",no_argument,NULL,15},            // undocumented, only for debugging
         {"do-not-normalize",no_argument,NULL,'N'},
@@ -2351,11 +2537,12 @@ int main_vcfnorm(int argc, char *argv[])
         {"check-ref",required_argument,NULL,'c'},
         {"strict-filter",no_argument,NULL,'s'},
         {"no-version",no_argument,NULL,8},
-        {"write-index",no_argument,NULL,14},
+        {"write-index",optional_argument,NULL,'W'},
+        {"verbose",required_argument,NULL,'v'},
         {NULL,0,NULL,0}
     };
     char *tmp;
-    while ((c = getopt_long(argc, argv, "hr:R:f:w:Dd:o:O:c:m:t:T:sNag:",loptions,NULL)) >= 0) {
+    while ((c = getopt_long(argc, argv, "hr:R:f:w:Dd:o:O:c:m:t:T:sNag:W::v:S:i:e:",loptions,NULL)) >= 0) {
         switch (c) {
             case  10:
                 // possibly generalize this also to INFO/AD and other tags
@@ -2364,7 +2551,22 @@ int main_vcfnorm(int argc, char *argv[])
                 args->keep_sum_ad = 1;  // this will be set to the header id or -1 in init_data
                 break;
             case 'g': args->gff_fname = optarg; break;
+            case 'v':
+                args->gff_verbosity = atoi(optarg);
+                if ( args->gff_verbosity<0 || args->gff_verbosity>2 ) error("Error: expected integer 0-2 with -v, --verbose\n");
+                break;
             case 'a': args->atomize = SPLIT; break;
+            case 'e':
+                if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
+                args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
+            case 'i':
+                if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
+                args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
+            case 'S':
+                if ( !strcasecmp(optarg,"pos") ) args->cmp_func = cmp_bcf_pos;
+                else if ( !strcasecmp(optarg,"lex") ) args->cmp_func = cmp_bcf_pos_ref_alt;
+                else error("Error: the sort order --sort %s is not recognised\n",optarg);
+                break;
             case 11 :
                 if ( optarg[0]=='*' ) args->use_star_allele = 1;
                 else if ( optarg[0]=='.' ) args->use_star_allele = 0;
@@ -2376,7 +2578,10 @@ int main_vcfnorm(int argc, char *argv[])
                 else if ( optarg[0]=='.' ) args->ma_use_ref_allele = 0;
                 else error("Invalid argument to --multi-overlaps\n");
                 break;
-            case 14 : args->write_index = 1; break;
+            case 'W':
+                if (!(args->write_index = write_index_parse(optarg)))
+                    error("Unsupported index format '%s'\n", optarg);
+                break;
             case 15 : args->right_align = 1; break;
             case 'N': args->do_indels = 0; break;
             case 'd':
@@ -2485,7 +2690,6 @@ int main_vcfnorm(int argc, char *argv[])
 
     if ( bcf_sr_set_threads(args->files, args->n_threads)<0 ) error("Failed to create threads\n");
     if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to read from %s: %s\n", !strcmp("-",fname)?"standard input":fname,bcf_sr_strerror(args->files->errnum));
-    if ( args->mrows_op&MROWS_SPLIT && args->rmdup ) error("Cannot combine -D and -m-\n");
     init_data(args);
     normalize_vcf(args);
     destroy_data(args);
diff --git a/bcftools/vcfnorm.c.pysam.c b/bcftools/vcfnorm.c.pysam.c
index de9c2857b..4fe92ec9a 100644
--- a/bcftools/vcfnorm.c.pysam.c
+++ b/bcftools/vcfnorm.c.pysam.c
@@ -2,7 +2,7 @@
 
 /*  vcfnorm.c -- Left-align and normalize indels.
 
-    Copyright (C) 2013-2023 Genome Research Ltd.
+    Copyright (C) 2013-2024 Genome Research Ltd.
 
     Author: Petr Danecek <pd3@sanger.ac.uk>
 
@@ -44,6 +44,7 @@ THE SOFTWARE.  */
 #include "abuf.h"
 #include "gff.h"
 #include "regidx.h"
+#include "filter.h"
 
 #define CHECK_REF_EXIT 1
 #define CHECK_REF_WARN 2
@@ -53,6 +54,10 @@ THE SOFTWARE.  */
 #define MROWS_SPLIT 1
 #define MROWS_MERGE  2
 
+// Logic of the filters: include or exclude sites which match the filters?
+#define FLT_INCLUDE 1
+#define FLT_EXCLUDE 2
+
 // for -m+, mapping from allele indexes of a single input record
 // to allele indexes of output record
 typedef struct
@@ -66,7 +71,7 @@ typedef struct
 {
     int n;  // number of alleles
     char *ref, *alt;
-    void *hash;
+    void *hash; // str2int hash
 }
 cmpals1_t;
 
@@ -81,8 +86,8 @@ typedef struct
 {
     char *tseq, *seq;
     int mseq;
-    bcf1_t **lines, **tmp_lines, **alines, **blines, *mrow_out;
-    int ntmp_lines, mtmp_lines, nalines, malines, nblines, mblines;
+    bcf1_t **lines, **tmp_lines, **mrows, *mrow_out;
+    int ntmp_lines, mtmp_lines, nmrows, mmrows, mrows_first;
     map_t *maps;     // mrow map for each buffered record
     char **als;
     int mmaps, nals, mals;
@@ -90,8 +95,8 @@ typedef struct
     int32_t *int32_arr;
     int ntmp_arr1, ntmp_arr2, nint32_arr;
     kstring_t *tmp_str;
-    kstring_t *tmp_als, *tmp_del, tmp_kstr;
-    int ntmp_als, ntmp_del;
+    kstring_t *tmp_als, *tmp_sym, tmp_kstr;
+    int ntmp_als, ntmp_sym;
     rbuf_t rbuf;
     int buf_win;            // maximum distance between two records to consider
     int aln_win;            // the realignment window size (maximum repeat size)
@@ -102,7 +107,7 @@ typedef struct
     struct { int tot, set, swap; } nref;
     char **argv, *output_fname, *ref_fname, *vcf_fname, *region, *targets;
     int argc, rmdup, output_type, n_threads, check_ref, strict_filter, do_indels, clevel;
-    int nchanged, nskipped, nsplit, ntotal, mrows_op, mrows_collapse, parsimonious;
+    int nchanged, nskipped, nsplit, njoined, ntotal, nfilter, mrows_op, mrows_collapse, parsimonious;
     int record_cmd_line, force, force_warned, keep_sum_ad;
     abuf_t *abuf;
     abuf_opt_t atomize;
@@ -110,12 +115,17 @@ typedef struct
     char *old_rec_tag;
     htsFile *out;
     char *index_fn;
-    int write_index;
+    int write_index, gff_verbosity;
     int right_align;
     char *gff_fname;
     gff_t *gff;
     regidx_t *idx_tscript;
     regitr_t *itr_tscript;
+    int (*cmp_func)(const void *aptr, const void *bptr);
+    char *filter_str;
+    int filter_logic;   // include or exclude sites which match the filters? One of FLT_INCLUDE/FLT_EXCLUDE
+    int filter_pass;
+    filter_t *filter;
 }
 args_t;
 
@@ -557,33 +567,57 @@ static int realign(args_t *args, bcf1_t *line)
     if ( bcf_get_variant_types(line)==VCF_BND ) return ERR_SYMBOLIC;   // breakend, not an error
 
     // make a copy of each allele for trimming
-    hts_expand0(kstring_t,line->n_allele,args->ntmp_als,args->tmp_als);
-    hts_expand0(kstring_t,line->n_allele,args->ntmp_del,args->tmp_del);
+    hts_expand0(kstring_t,line->n_allele,args->ntmp_als,args->tmp_als); // the actual sequence to realign
+    hts_expand0(kstring_t,line->n_allele,args->ntmp_sym,args->tmp_sym); // the original symbolic allele strings to output
     kstring_t *als = args->tmp_als;
-    kstring_t *del = args->tmp_del;
+    kstring_t *sym = args->tmp_sym;
+    int symbolic_alts = 1;
     for (i=0; i<line->n_allele; i++)
     {
-        del[i].l = 0;
+        sym[i].l = 0;
         if ( line->d.allele[i][0]=='<' )
         {
-            // symbolic allele, only <DEL.*> will be realigned
-            if ( strncmp("<DEL",line->d.allele[i],4) ) return ERR_SYMBOLIC;
-            if ( nref < line->rlen )
+            // symbolic allele, only <DEL.*> and <DUP.*> will be realigned
+            // TODO: there should be check for symbolic allele length. If too big, perhaps should not attempt realignment
+            int32_t sv_len = 0;
+            if ( !strncmp("<DEL",line->d.allele[i],4) ) sv_len = -line->rlen;
+            else if ( !strncmp("<DUP",line->d.allele[i],4) )
+            {
+                if ( bcf_get_info_int32(args->hdr,line,"SVLEN",&args->int32_arr,&args->nint32_arr)==1 ) sv_len = args->int32_arr[0];
+            }
+            if ( !sv_len ) return ERR_SYMBOLIC;
+
+            als[i].l = 0;
+            if ( sv_len<0 )
+            {
+                // del, expand REF and replace ALT, for example, replace "REF=C ALT=<DEL>" with "REF=CAT ALT=C"
+                if ( nref < line->rlen )
+                {
+                    free(ref);
+                    reflen = line->rlen;
+                    ref = faidx_fetch_seq(args->fai, (char*)args->hdr->id[BCF_DT_CTG][line->rid].key, line->pos, line->pos+reflen-1, &nref);
+                    if ( !ref ) error("faidx_fetch_seq failed at %s:%"PRId64"\n", args->hdr->id[BCF_DT_CTG][line->rid].key, (int64_t) line->pos+1);
+                    seq_to_upper(ref,0);
+                    replace_iupac_codes(ref,nref);  // any non-ACGT character in fasta ref is replaced with N
+                    als[0].l = 0;
+                    kputs(ref, &als[0]);
+                }
+                kputsn(als[0].s,1,&als[i]);
+            }
+            else // sv_len>0
             {
+                // dup, replace "REF=C ALT=<DUP>" with "REF=C ALT=CAT"
                 free(ref);
-                reflen = line->rlen;
-                ref = faidx_fetch_seq(args->fai, (char*)args->hdr->id[BCF_DT_CTG][line->rid].key, line->pos, line->pos+reflen-1, &nref);
+                ref = faidx_fetch_seq(args->fai, (char*)args->hdr->id[BCF_DT_CTG][line->rid].key, line->pos, line->pos+sv_len, &nref);
                 if ( !ref ) error("faidx_fetch_seq failed at %s:%"PRId64"\n", args->hdr->id[BCF_DT_CTG][line->rid].key, (int64_t) line->pos+1);
                 seq_to_upper(ref,0);
                 replace_iupac_codes(ref,nref);  // any non-ACGT character in fasta ref is replaced with N
-                als[0].l = 0;
-                kputs(ref, &als[0]);
-                als[i].l = 0;
-                kputsn(ref,1,&als[i]);
-                kputs(line->d.allele[i],&del[i]);
-                continue;
+                kputs(ref,&als[i]);
             }
+            kputs(line->d.allele[i],&sym[i]);   // preserve the symbolic allele string
+            continue;
         }
+        if ( i>0 ) symbolic_alts = 0;
         if ( line->d.allele[i][0]=='*' ) return ERR_SPANNING_DELETION;  // spanning deletion
         if ( has_non_acgtn(line->d.allele[i],line->shared.l) )
         {
@@ -612,8 +646,15 @@ static int realign(args_t *args, bcf1_t *line)
     else
         new_pos = realign_right(args, line);
 
-    // Have the alleles changed?
-    als[0].s[ als[0].l ] = 0;  // in order for strcmp to work
+    // Have the alleles changed? Consider <DEL> could have expanded the REF allele. In that
+    // case it must be trimmed, however the new REF length must reflect the entire length.
+    als[0].s[ als[0].l ] = 0;   // for strcmp to work
+    int new_reflen = strlen(als[0].s);
+    if ( symbolic_alts )
+    {
+        als[0].l = 1;
+        als[0].s[ als[0].l ] = 0;
+    }
     if ( new_pos==line->pos && !strcasecmp(line->d.allele[0],als[0].s) ) return ERR_OK;
 
     set_old_rec_tag(args, line, line, 0);
@@ -623,7 +664,7 @@ static int realign(args_t *args, bcf1_t *line)
     for (i=0; i<line->n_allele; i++)
     {
         if (i>0) kputc(',',&args->tmp_kstr);
-        if ( del[i].l ) kputs(del[i].s,&args->tmp_kstr);
+        if ( sym[i].l ) kputs(sym[i].s,&args->tmp_kstr);
         else kputsn(als[i].s,als[i].l,&args->tmp_kstr);
     }
     args->tmp_kstr.s[ args->tmp_kstr.l ] = 0;
@@ -631,7 +672,6 @@ static int realign(args_t *args, bcf1_t *line)
     args->nchanged++;
 
     // Update INFO/END if necessary
-    int new_reflen = strlen(line->d.allele[0]);
     if ( (new_pos!=line->pos || reflen!=new_reflen) && bcf_get_info_int32(args->hdr, line, "END", &args->int32_arr, &args->nint32_arr)==1 )
     {
         // bcf_update_alleles_str() messed up rlen because line->pos changed. This will be fixed by bcf_update_info_int32()
@@ -672,7 +712,7 @@ static void split_info_numeric(args_t *args, bcf1_t *src, bcf_info_t *info, int
                     bcf_update_info_##type(args->out_hdr,dst,tag,NULL,0); \
                     return; \
                 } \
-                error("Error: wrong number of fields in INFO/%s at %s:%"PRId64", expected %d, found %d\n", \
+                error("Error: wrong number of fields in INFO/%s at %s:%"PRId64", expected %d, found %d. Use --force to proceed anyway.\n", \
                         tag,bcf_seqname(args->hdr,src),(int64_t) src->pos+1,src->n_allele-1,ret); \
             } \
             bcf_update_info_##type(args->out_hdr,dst,tag,vals+ialt,1); \
@@ -694,7 +734,7 @@ static void split_info_numeric(args_t *args, bcf1_t *src, bcf_info_t *info, int
                     bcf_update_info_##type(args->out_hdr,dst,tag,NULL,0); \
                     return; \
                 } \
-                error("Error: wrong number of fields in INFO/%s at %s:%"PRId64", expected %d, found %d\n", \
+                error("Error: wrong number of fields in INFO/%s at %s:%"PRId64", expected %d, found %d. Use --force to proceed anyway.\n", \
                         tag,bcf_seqname(args->hdr,src),(int64_t) src->pos+1,src->n_allele,ret); \
             } \
             if ( args->keep_sum_ad >= 0 && args->keep_sum_ad==info->key ) \
@@ -727,7 +767,7 @@ static void split_info_numeric(args_t *args, bcf1_t *src, bcf_info_t *info, int
                     bcf_update_info_##type(args->out_hdr,dst,tag,NULL,0); \
                     return; \
                 } \
-                error("Error: wrong number of fields in INFO/%s at %s:%"PRId64", expected %d, found %d\n", \
+                error("Error: wrong number of fields in INFO/%s at %s:%"PRId64", expected %d, found %d. Use --force to proceed anyway.\n", \
                         tag,bcf_seqname(args->hdr,src),(int64_t) src->pos+1,src->n_allele*(src->n_allele+1)/2,ret); \
             } \
             if ( ialt!=0 ) \
@@ -747,31 +787,36 @@ static void split_info_numeric(args_t *args, bcf1_t *src, bcf_info_t *info, int
     }
     #undef BRANCH_NUMERIC
 }
-// Find n-th field in a comma-separated list and move it to dst.
-// The memory areas may overlap.
-#define STR_MOVE_NTH(dst,src,end,nth,len) \
-{ \
-    char *ss = src, *se = src; \
-    int j = 0; \
-    while ( *se && se<(end) ) \
-    { \
-        if ( *se==',' ) \
-        { \
-            if ( j==nth ) break; \
-            j++; \
-            ss = se+1; \
-        } \
-        se++; \
-    } \
-    if ( j==nth ) \
-    { \
-        int n = se - ss; \
-        memmove((dst),ss,n); \
-        src = se; \
-        len += n; \
-    } \
-    else len = -1; \
+// Find nth field in a comma-separated list in src and move it to dst.
+// The dst and src memory areas may overlap, end points just after the last valid src
+// character.
+// On success returns pointer to the end of the parsed field and increments ndst by the
+// number of memmoved characters. Returns NULL if the field was not found.
+static inline char *string_move_nth(char *dst, char *src, char *end, int nth, size_t *ndst)
+{
+    if ( src>=end ) return NULL;
+    char *ss = src, *se = src;
+    int j = 0;
+    while ( *se && se<(end) )
+    {
+        if ( *se==',' )
+        {
+            if ( j==nth ) break;
+            j++;
+            ss = se+1;
+        }
+        se++;
+    }
+    if ( j!=nth ) return NULL;
+    if ( ss>=end ) return NULL;
+    if ( !*ss ) return NULL;
+
+    int n = se - ss;
+    memmove((dst),ss,n);
+    *ndst += n;
+    return se;
 }
+
 static void split_info_string(args_t *args, bcf1_t *src, bcf_info_t *info, int ialt, bcf1_t *dst)
 {
     const char *tag = bcf_hdr_int2id(args->hdr,BCF_DT_ID,info->key);
@@ -786,41 +831,57 @@ static void split_info_string(args_t *args, bcf1_t *src, bcf_info_t *info, int i
     int len = bcf_hdr_id2length(args->hdr,BCF_HL_INFO,info->key);
     if ( len==BCF_VL_A )
     {
+        char *end = str.s + str.l;
         char *tmp = str.s;
-        int len = 0;
-        STR_MOVE_NTH(str.s,tmp,str.s+str.l,ialt,len);
-        if ( len<0 ) return;   // wrong number of fields: skip
-        str.s[len] = 0;
+        str.l = 0;
+        tmp = string_move_nth(str.s,tmp,end,ialt,&str.l);
+        if ( !tmp ) str.l = 1, str.s[0] = '.';
+        kputc_(0,&str);
         bcf_update_info_string(args->out_hdr,dst,tag,str.s);
     }
     else if ( len==BCF_VL_R )
     {
+        char *end = str.s + str.l;
         char *tmp = str.s;
-        int len = 0;
-        STR_MOVE_NTH(str.s,tmp,str.s+str.l,0,len);
-        str.s[len]=','; tmp++; len++;
-        STR_MOVE_NTH(&str.s[len],tmp,str.s+str.l,ialt,len);
-        if ( len<0 ) return;   // wrong number of fields: skip
-        str.s[len] = 0;
+        str.l = 0;
+        tmp = string_move_nth(str.s,tmp,end,0,&str.l);
+        if ( tmp )
+        {
+            kputc_(',',&str);
+            tmp = string_move_nth(str.s+str.l,tmp+1,end,ialt,&str.l); // ialt is 0-based index to ALT
+        }
+        if ( !tmp ) str.l = 1, str.s[0] = '.';
+        kputc_(0,&str);
         bcf_update_info_string(args->out_hdr,dst,tag,str.s);
     }
     else if ( len==BCF_VL_G )
     {
         int i0a = bcf_alleles2gt(0,ialt+1), iaa = bcf_alleles2gt(ialt+1,ialt+1);
+        char *end = str.s + str.l;
         char *tmp = str.s;
-        int len = 0;
-        STR_MOVE_NTH(str.s,tmp,str.s+str.l,0,len);
-        str.s[len]=','; tmp++; len++;
-        STR_MOVE_NTH(&str.s[len],tmp,str.s+str.l,i0a-1,len);
-        if ( len<0 ) return;   // wrong number of fields: skip
-        str.s[len]=','; tmp++; len++;
-        STR_MOVE_NTH(&str.s[len],tmp,str.s+str.l,iaa-i0a-1,len);
-        if ( len<0 ) return;   // wrong number of fields: skip
-        str.s[len] = 0;
+        str.l = 0;
+        tmp = string_move_nth(str.s,tmp,end,0,&str.l);
+        if ( tmp )
+        {
+            kputc_(',',&str);
+            tmp = string_move_nth(str.s+str.l,tmp+1,end,i0a-1,&str.l);
+        }
+        if ( tmp )
+        {
+            kputc_(',',&str);
+            tmp = string_move_nth(str.s+str.l,tmp+1,end,iaa-i0a-1,&str.l);
+        }
+        if ( !tmp ) str.l = 1, str.s[0] = '.';
+        kputc_(0,&str);
         bcf_update_info_string(args->out_hdr,dst,tag,str.s);
     }
     else
         bcf_update_info_string(args->out_hdr,dst,tag,str.s);
+    if ( args->ntmp_arr1 < str.m )
+    {
+        args->ntmp_arr1 = str.m;
+        args->tmp_arr1 = (uint8_t*)str.s;
+    }
 }
 static void split_info_flag(args_t *args, bcf1_t *src, bcf_info_t *info, int ialt, bcf1_t *dst)
 {
@@ -845,7 +906,7 @@ static void split_format_genotype(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int
         {
             if ( gt[j]==bcf_int32_vector_end ) break;
             if ( bcf_gt_is_missing(gt[j]) ) continue; // missing allele: leave as is
-            if ( (ialt==0 || args->ma_use_ref_allele) && bcf_gt_allele(gt[j])==0 ) continue; // ref && `--multi-overlaps 0`: leave as is
+            if ( bcf_gt_allele(gt[j])==0 ) continue; // ref && `--multi-overlaps 0`: leave as is
             if ( bcf_gt_allele(gt[j])==ialt+1 )
                 gt[j] = bcf_gt_unphased(1) | bcf_gt_is_phased(gt[j]); // set to first ALT
             else if ( args->ma_use_ref_allele )
@@ -891,7 +952,7 @@ static void split_format_numeric(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int
                     bcf_update_format_##type(args->out_hdr,dst,tag,NULL,0); \
                     return; \
                 } \
-                error("Error: wrong number of fields in FMT/%s at %s:%"PRId64", expected %d, found %d\n", \
+                error("Error: wrong number of fields in FMT/%s at %s:%"PRId64", expected %d, found %d. Use --force to proceed anyway.\n", \
                     tag,bcf_seqname(args->hdr,src),(int64_t) src->pos+1,(src->n_allele-1)*nsmpl,nvals); \
             } \
             nvals /= nsmpl; \
@@ -924,7 +985,7 @@ static void split_format_numeric(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int
                     bcf_update_format_##type(args->out_hdr,dst,tag,NULL,0); \
                     return; \
                 } \
-                error("Error: wrong number of fields in FMT/%s at %s:%"PRId64", expected %d, found %d\n", \
+                error("Error: wrong number of fields in FMT/%s at %s:%"PRId64", expected %d, found %d. Use --force to proceed anyway.\n", \
                     tag,bcf_seqname(args->hdr,src),(int64_t) src->pos+1,src->n_allele*nsmpl,nvals); \
             } \
             nvals /= nsmpl; \
@@ -979,7 +1040,8 @@ static void split_format_numeric(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int
                     bcf_update_format_##type(args->out_hdr,dst,tag,NULL,0); \
                     return; \
                 } \
-                error("Error at %s:%"PRId64", the tag %s has wrong number of fields\n", bcf_seqname(args->hdr,src),(int64_t) src->pos+1,bcf_hdr_int2id(args->hdr,BCF_DT_ID,fmt->id)); \
+                error("Error at %s:%"PRId64", the tag %s has wrong number of fields. Use --force to proceed anyway.\n", \
+                    bcf_seqname(args->hdr,src),(int64_t) src->pos+1,bcf_hdr_int2id(args->hdr,BCF_DT_ID,fmt->id)); \
             } \
             nvals /= nsmpl; \
             int all_haploid = nvals==src->n_allele ? 1 : 0; \
@@ -1033,6 +1095,7 @@ static void squeeze_format_char(char *str, int src_blen, int dst_blen, int n)
         isrc += src_blen;
     }
 }
+// ialt is 0-based index to ALT
 static void split_format_string(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int ialt, bcf1_t *dst)
 {
     const char *tag = bcf_hdr_int2id(args->hdr,BCF_DT_ID,fmt->id);
@@ -1040,50 +1103,60 @@ static void split_format_string(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int i
     if ( !ret ) return; // all values can be empty, leave out the tag, no need to panic
     assert( ret>0 );
 
+    int nsmpl = bcf_hdr_nsamples(args->hdr);
+    int blen = ret/nsmpl;   // per-sample field length
+    assert( blen>0 );
+
     kstring_t str;
     str.m = args->ntmp_arr1;
-    str.l = ret;
     str.s = (char*) args->tmp_arr1;
+    str.l = ret;
 
-    int nsmpl = bcf_hdr_nsamples(args->hdr);
-    int len = bcf_hdr_id2length(args->hdr,BCF_HL_FMT,fmt->id);
-    if ( len==BCF_VL_A )
+    int tag_len = bcf_hdr_id2length(args->hdr,BCF_HL_FMT,fmt->id);
+    if ( tag_len==BCF_VL_A )
     {
-        int i, blen = ret/nsmpl, maxlen = 0;
+        int i, maxlen = 0;
         char *ptr = str.s;
         for (i=0; i<nsmpl; i++)
         {
             char *tmp = ptr;
-            int len = 0;
-            STR_MOVE_NTH(tmp,tmp,ptr+blen,ialt,len);
-            if ( len<0 ) return;   // wrong number of fields: skip
+            char *end = ptr + blen;
+            size_t len = 0;
+            tmp = string_move_nth(ptr,tmp,end,ialt,&len);
+            if ( !tmp ) ptr[0] = '.', len = 1;
             if ( maxlen < len ) maxlen = len;
+            while (len<blen) ptr[len++] = 0;
             ptr += blen;
         }
         if ( maxlen<blen ) squeeze_format_char(str.s,blen,maxlen,nsmpl);
         bcf_update_format_char(args->out_hdr,dst,tag,str.s,nsmpl*maxlen);
     }
-    else if ( len==BCF_VL_R )
+    else if ( tag_len==BCF_VL_R )
     {
-        int i, blen = ret/nsmpl, maxlen = 0;
+        int i, maxlen = 0;
         char *ptr = str.s;
         for (i=0; i<nsmpl; i++)
         {
             char *tmp = ptr;
-            int len = 0;
-            STR_MOVE_NTH(ptr,tmp,ptr+blen,0,len);
-            ptr[len]=','; tmp++; len++;
-            STR_MOVE_NTH(&ptr[len],tmp,ptr+blen,ialt,len);
-            if ( len<0 ) return;   // wrong number of fields: skip
+            char *end = ptr + blen;
+            size_t len = 0;
+            tmp = string_move_nth(ptr,tmp,end,0,&len);
+            if ( tmp )
+            {
+                ptr[len++] = ',';
+                tmp = string_move_nth(ptr+len,tmp+1,end,ialt,&len);
+            }
+            if ( !tmp ) ptr[0] = '.', len = 1;
             if ( maxlen < len ) maxlen = len;
+            while (len<blen) ptr[len++] = 0;
             ptr += blen;
         }
         if ( maxlen<blen ) squeeze_format_char(str.s,blen,maxlen,nsmpl);
         bcf_update_format_char(args->out_hdr,dst,tag,str.s,nsmpl*maxlen);
     }
-    else if ( len==BCF_VL_G )
+    else if ( tag_len==BCF_VL_G )
     {
-        int i, blen = ret/nsmpl, maxlen = 0, i0a = bcf_alleles2gt(0,ialt+1), iaa = bcf_alleles2gt(ialt+1,ialt+1);
+        int i, maxlen = 0, i0a = bcf_alleles2gt(0,ialt+1), iaa = bcf_alleles2gt(ialt+1,ialt+1);
         char *ptr = str.s;
         for (i=0; i<nsmpl; i++)
         {
@@ -1110,31 +1183,38 @@ static void split_format_string(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int i
                     bcf_update_format_char(args->out_hdr,dst,tag,NULL,0);
                     return;
                 }
-                error("Error: wrong number of fields in FMT/%s at %s:%"PRId64", expected %d or %d, found %d\n",
+                error("Error: wrong number of fields in FMT/%s at %s:%"PRId64", expected %d or %d, found %d. Use --force to proceed anyway.\n",
                         tag,bcf_seqname(args->hdr,src),(int64_t) src->pos+1,src->n_allele*(src->n_allele+1)/2,src->n_allele,nfields);
             }
 
-            int len = 0;
+            char *tmp = ptr;
+            char *end = ptr + blen;
+            size_t len = 0;
+            tmp = string_move_nth(ptr,tmp,end,0,&len);
             if ( nfields==src->n_allele )   // haploid
             {
-                char *tmp = ptr;
-                STR_MOVE_NTH(&ptr[len],tmp,ptr+blen,0,len);
-                ptr[len]=','; tmp++; len++;
-                STR_MOVE_NTH(&ptr[len],tmp,ptr+blen,ialt,len);
-                if ( len<0 ) return;   // wrong number of fields: skip
+                if ( tmp )
+                {
+                    ptr[len++] = ',';
+                    tmp = string_move_nth(ptr+len,tmp+1,end,ialt,&len);
+                }
             }
             else    // diploid
             {
-                char *tmp = ptr;
-                STR_MOVE_NTH(&ptr[len],tmp,ptr+blen,0,len);
-                ptr[len]=','; tmp++; len++;
-                STR_MOVE_NTH(&ptr[len],tmp,ptr+blen,i0a-1,len);
-                if ( len<0 ) return;   // wrong number of fields: skip
-                ptr[len]=','; tmp++; len++;
-                STR_MOVE_NTH(&ptr[len],tmp,ptr+blen,iaa-i0a-1,len);
-                if ( len<0 ) return;   // wrong number of fields: skip
+                if ( tmp )
+                {
+                    ptr[len++] = ',';
+                    tmp = string_move_nth(ptr+len,tmp+1,end,i0a-1,&len);
+                }
+                if ( tmp )
+                {
+                    ptr[len++] = ',';
+                    tmp = string_move_nth(ptr+len,tmp+1,end,iaa-i0a-1,&len);
+                }
             }
+            if ( !tmp ) ptr[0] = '.', len = 1;
             if ( maxlen < len ) maxlen = len;
+            while (len<blen) ptr[len++] = 0;
             ptr += blen;
         }
         if ( maxlen<blen ) squeeze_format_char(str.s,blen,maxlen,nsmpl);
@@ -1394,6 +1474,23 @@ static void merge_info_string(args_t *args, bcf1_t **lines, int nlines, bcf_info
         bcf_update_info_string(args->out_hdr,dst,tag,args->tmp_arr1);
     }
 }
+static int gt_array_grow_ploidy(args_t *args, uint8_t **tmp_arr, int *ntmp_arr, int ngt_ori, int ngt_new, int nsmpl)
+{
+    *ntmp_arr = 4*ngt_new*nsmpl;
+    int32_t *ptr = (int32_t*)realloc(*tmp_arr,*ntmp_arr);
+    if ( !ptr ) error("Error: failed to allocate %d bytes\n",*ntmp_arr);
+    *tmp_arr = (uint8_t*) ptr;
+
+    int i,j;
+    for (i=nsmpl-1; i>=0; i--)
+    {
+        int32_t *src = ptr + i*ngt_ori;
+        int32_t *dst = ptr + i*ngt_new;
+        for (j=ngt_new; j>ngt_ori; j--) dst[j-1] = bcf_int32_vector_end;
+        for (j=ngt_ori; j>0; j--) dst[j-1] = src[j-1];
+    }
+    return ngt_new;
+}
 static void merge_format_genotype(args_t *args, bcf1_t **lines, int nlines, bcf_fmt_t *fmt, bcf1_t *dst)
 {
     // reusing int8_t arrays as int32_t arrays
@@ -1412,7 +1509,9 @@ static void merge_format_genotype(args_t *args, bcf1_t **lines, int nlines, bcf_
         int ngts2 = bcf_get_genotypes(args->hdr,lines[i],&args->tmp_arr2,&ntmp2);
         args->ntmp_arr2 = ntmp2 * 4;
         ngts2 /= nsmpl;
-        if ( ngts!=ngts2 ) error("Error at %s:%"PRId64": cannot combine diploid with haploid genotype\n", bcf_seqname(args->hdr,lines[i]),(int64_t) lines[i]->pos+1);
+        int ploidy_changed = ngts - ngts2;
+        if ( ngts < ngts2 ) ngts = gt_array_grow_ploidy(args,&args->tmp_arr1,&args->ntmp_arr1,ngts,ngts2,nsmpl);
+        if ( ngts > ngts2 ) ngts2 = gt_array_grow_ploidy(args,&args->tmp_arr2,&args->ntmp_arr2,ngts2,ngts,nsmpl);
 
         int32_t *gt  = (int32_t*) args->tmp_arr1;       // the first, destination line
         int32_t *gt2 = (int32_t*) args->tmp_arr2;       // one of the subsequent lines, i.e. the source line
@@ -1422,16 +1521,22 @@ static void merge_format_genotype(args_t *args, bcf1_t **lines, int nlines, bcf_
             // never overwrite with ref allele
             for (k2=0; k2<ngts2; k2++)
             {
-                if ( gt2[k2]==bcf_int32_vector_end ) break;
-                if ( bcf_gt_is_missing(gt2[k2]) ) continue;
+                if ( gt2[k2]==bcf_int32_vector_end )
+                {
+                    if ( ploidy_changed && bcf_gt_is_missing(gt[k2]) ) gt[k2] = bcf_int32_vector_end;
+                    break;
+                }
+                if ( bcf_gt_is_missing(gt2[k2]) ) continue;     // don't overwrite with missing
+
+                // don't overwrite with ref, unless the destination is missing, e.g. "./. + 0/1"
                 int ial2 = bcf_gt_allele(gt2[k2]);
-                if ( ial2==0 ) continue;    // never overwrite with ref
+                if ( ial2==0 && !bcf_gt_is_missing(gt[k2]) && gt[k2]!=bcf_int32_vector_end ) continue;
                 if ( ial2>=args->maps[i].nals ) error("Error at %s:%"PRId64": incorrect allele index %d\n",bcf_seqname(args->hdr,lines[i]),(int64_t) lines[i]->pos+1,ial2);
 
                 // The destination allele
                 int ial = args->maps[i].map[ial2];
                 if ( gt[k2]==bcf_int32_vector_end || bcf_gt_is_missing(gt[k2]) || !bcf_gt_allele(gt[k2]) )
-                    gt[k2] = bcf_gt_is_phased(gt[k2]) ? bcf_gt_phased(ial) : bcf_gt_unphased(ial);
+                    gt[k2] = (gt[k2]!=bcf_int32_vector_end && bcf_gt_is_phased(gt[k2])) ? bcf_gt_phased(ial) : bcf_gt_unphased(ial);
                 else
                 {
                     // conflict, the first line has non-zero allele, use the old way, possibly disrupt the phasing
@@ -1824,77 +1929,129 @@ static void merge_biallelics_to_multiallelic(args_t *args, bcf1_t *dst, bcf1_t *
         else if ( type==BCF_HT_INT || type==BCF_HT_REAL ) merge_format_numeric(args, lines, nlines, fmt, dst);
         else merge_format_string(args, lines, nlines, fmt, dst);
     }
+    args->njoined++;
 }
 
 #define SWAP(type_t, a, b) { type_t t = a; a = b; b = t; }
-static void mrows_schedule(args_t *args, bcf1_t **line)
+static void mrows_push(args_t *args, bcf1_t **line)
 {
     int i,m;
-    if ( args->mrows_collapse==COLLAPSE_ANY         // merge all record types together
-        || bcf_get_variant_types(*line)&VCF_SNP     // SNP, put into alines
-        || bcf_get_variant_types(*line)==VCF_REF )  // ref
-    {
-        args->nalines++;
-        m = args->malines;
-        hts_expand(bcf1_t*,args->nalines,args->malines,args->alines);
-        for (i=m; i<args->malines; i++) args->alines[i] = bcf_init1();
-        SWAP(bcf1_t*, args->alines[args->nalines-1], *line);
-    }
-    else
-    {
-        args->nblines++;
-        m = args->mblines;
-        hts_expand(bcf1_t*,args->nblines,args->mblines,args->blines);
-        for (i=m; i<args->mblines; i++) args->blines[i] = bcf_init1();
-        SWAP(bcf1_t*, args->blines[args->nblines-1], *line);
+    if ( !args->nmrows ) args->mrows_first = 0;
+    args->nmrows++;
+    m = args->mmrows;
+    hts_expand(bcf1_t*,args->nmrows,args->mmrows,args->mrows);
+    for (i=m; i<args->mmrows; i++) args->mrows[i] = bcf_init1();
+    SWAP(bcf1_t*, args->mrows[args->nmrows-1], *line);
+
+    if ( args->mrows_collapse==COLLAPSE_ANY ) return;
+
+    // move the line up the sorted list so that the same variant types end up together
+    int cur_type = bcf_get_variant_types(args->mrows[args->nmrows-1]);
+    i = args->mrows_first + args->nmrows - 1;
+    while (i>0)
+    {
+        int prev_type = bcf_get_variant_types(args->mrows[i-1]);
+        if ( prev_type <= cur_type ) break;
+        bcf1_t *tmp = args->mrows[i-1];
+        args->mrows[i-1] = args->mrows[i];
+        args->mrows[i] = tmp;
+        i--;
     }
 }
-static int mrows_ready_to_flush(args_t *args, bcf1_t *line)
+static int mrows_can_flush(args_t *args, bcf1_t *line)
 {
-    if ( args->nalines && (args->alines[0]->rid!=line->rid || args->alines[0]->pos!=line->pos) ) return 1;
-    if ( args->nblines && (args->blines[0]->rid!=line->rid || args->blines[0]->pos!=line->pos) ) return 1;
+    if ( !args->nmrows ) return 0;
+    int ibeg = args->mrows_first;
+    if ( args->mrows[ibeg]->rid != line->rid ) return 1;
+    if ( args->mrows[ibeg]->pos != line->pos ) return 1;
     return 0;
 }
 static bcf1_t *mrows_flush(args_t *args)
 {
-    if ( args->nblines && args->nalines==1 && bcf_get_variant_types(args->alines[0])==VCF_REF )
+    if ( !args->nmrows ) return NULL;
+
+    int ibeg = args->mrows_first;
+
+    //fprintf(bcftools_stderr,"flush: ibeg=%d n=%d\n",ibeg,args->nmrows);
+    //int i;
+    //for (i=ibeg; i<ibeg+args->nmrows; i++)
+    //  fprintf(bcftools_stderr,"\ti=%d type=%d %s %s\n",i,bcf_get_variant_types(args->mrows[i]),args->mrows[i]->d.allele[0],args->mrows[i]->d.allele[1]);
+
+    if ( args->nmrows==1 )
     {
-        // By default, REF lines are merged with SNPs if SNPs and indels are to be kept separately.
-        // However, if there are indels only and a single REF line, merge it with indels.
-        args->nblines++;
-        int i,m = args->mblines;
-        hts_expand(bcf1_t*,args->nblines,args->mblines,args->blines);
-        for (i=m; i<args->mblines; i++) args->blines[i] = bcf_init1();
-        SWAP(bcf1_t*, args->blines[args->nblines-1], args->alines[0]);
-        args->nalines--;
+        args->nmrows = 0;
+        return args->mrows[ibeg];
     }
-    if ( args->nalines )
+
+    if ( args->mrows_collapse==COLLAPSE_ANY )
     {
-        if ( args->nalines==1 )
-        {
-            args->nalines = 0;
-            return args->alines[0];
-        }
+        // merge everything with anything
         bcf_clear(args->mrow_out);
-        merge_biallelics_to_multiallelic(args, args->mrow_out, args->alines, args->nalines);
-        args->nalines = 0;
+        merge_biallelics_to_multiallelic(args, args->mrow_out, &args->mrows[ibeg], args->nmrows - ibeg);
+        args->nmrows = 0;
         return args->mrow_out;
     }
-    else if ( args->nblines )
+
+    int j;
+    int types[] = { VCF_SNP, VCF_MNP, VCF_INDEL, VCF_OTHER, -1 };       // merge everything within the same category
+    if ( args->mrows_collapse==COLLAPSE_SNPS ) types[1] = -1;           // merge SNPs only
+    else if ( args->mrows_collapse==COLLAPSE_INDELS ) types[0] = VCF_INDEL, types[1] = -1;    // merge indels only
+    for (j=0; types[j]!=-1; j++)
     {
-        if ( args->nblines==1 )
+        int i, type = types[j]; // to keep the compiler happy
+        for (i=ibeg; i<ibeg+args->nmrows; i++)
         {
-            args->nblines = 0;
-            return args->blines[0];
+            type = bcf_get_variant_types(args->mrows[i]);
+            if ( type!=types[j] && type!=VCF_REF ) break;
+        }
+        if ( i==ibeg+1 && type!=VCF_REF )
+        {
+            // just one line of this type, no merging, but multiple lines of different type follow
+            args->nmrows--;
+            args->mrows_first++;
+            return args->mrows[ibeg];
+        }
+        if ( i>ibeg )
+        {
+            // more than one line, merging is needed
+            int nflush = i - ibeg;
+            bcf_clear(args->mrow_out);
+            merge_biallelics_to_multiallelic(args, args->mrow_out, &args->mrows[ibeg], nflush);
+            args->nmrows -= nflush;
+            args->mrows_first += nflush;
+            return args->mrow_out;
         }
-        bcf_clear(args->mrow_out);
-        merge_biallelics_to_multiallelic(args, args->mrow_out, args->blines, args->nblines);
-        args->nblines = 0;
-        return args->mrow_out;
     }
-    return NULL;
+    args->nmrows--;
+    args->mrows_first++;
+    return args->mrows[ibeg];
 }
-static void cmpals_add(cmpals_t *ca, bcf1_t *rec)
+static char *strdup_alt_svlen(args_t *args, bcf1_t *rec, int ial)
+{
+    if ( rec->d.allele[ial][0]!='<' ) return strdup(rec->d.allele[ial]);
+
+    int ntmp = args->ntmp_arr1 / sizeof(int32_t);
+    int n = bcf_get_info_int32(args->hdr, rec, "SVLEN", &args->tmp_arr1, &ntmp);
+    args->ntmp_arr1 = ntmp * sizeof(int32_t);
+    int32_t *svlen = (int32_t *) args->tmp_arr1;
+    if ( n<=0 ) return strdup(rec->d.allele[ial]);
+
+    if ( n+1 != rec->n_allele )
+    {
+        // there should be as many SVLEN numbers as there are ALT alleles
+        static int warned = 0;
+        if ( !warned )
+        {
+            fprintf(bcftools_stderr,"TODO: different number of ALT alleles and SVLEN fields %s:%"PRIhts_pos"\n",bcf_seqname(args->hdr,rec),rec->pos+1);
+            warned = 1;
+        }
+    }
+
+    kstring_t str = {0,0,0};
+    ksprintf(&str,"%s.%d",rec->d.allele[ial],svlen[ial-1]);
+    return str.s;
+}
+static void cmpals_add(args_t *args, cmpals_t *ca, bcf1_t *rec)
 {
     ca->ncmpals++;
     hts_expand0(cmpals1_t, ca->ncmpals, ca->mcmpals, ca->cmpals);
@@ -1902,10 +2059,11 @@ static void cmpals_add(cmpals_t *ca, bcf1_t *rec)
     free(cmpals->ref);
     cmpals->ref = strdup(rec->d.allele[0]);
     cmpals->n   = rec->n_allele;
+
     if ( rec->n_allele==2 )
     {
         free(cmpals->alt);
-        cmpals->alt = strdup(rec->d.allele[1]);
+        cmpals->alt = strdup_alt_svlen(args,rec,1);
     }
     else
     {
@@ -1916,9 +2074,10 @@ static void cmpals_add(cmpals_t *ca, bcf1_t *rec)
             khash_str2int_inc(cmpals->hash, strdup(rec->d.allele[i]));
     }
 }
-static int cmpals_match(cmpals_t *ca, bcf1_t *rec)
+static int cmpals_match(args_t *args, cmpals_t *ca, bcf1_t *rec)
 {
     int i, j;
+    char *alt_svlen = rec->n_allele==2 ? strdup_alt_svlen(args,rec,1) : NULL;
     for (i=0; i<ca->ncmpals; i++)
     {
         cmpals1_t *cmpals = ca->cmpals + i;
@@ -1930,7 +2089,8 @@ static int cmpals_match(cmpals_t *ca, bcf1_t *rec)
         // the most frequent case
         if ( rec->n_allele==2 )
         {
-            if ( strcasecmp(rec->d.allele[1], cmpals->alt) ) continue;
+            if ( strcasecmp(alt_svlen, cmpals->alt) ) continue;
+            free(alt_svlen);
             return 1;
         }
 
@@ -1940,6 +2100,7 @@ static int cmpals_match(cmpals_t *ca, bcf1_t *rec)
         if ( j<rec->n_allele ) continue;
         return 1;
     }
+    free(alt_svlen);
     return 0;
 }
 static void cmpals_reset(cmpals_t *ca) { ca->ncmpals = 0; }
@@ -1966,21 +2127,13 @@ static void flush_buffer(args_t *args, htsFile *file, int n)
         k = rbuf_shift(&args->rbuf);
         if ( args->mrows_op==MROWS_MERGE )
         {
-            if ( mrows_ready_to_flush(args, args->lines[k]) )
+            if ( mrows_can_flush(args, args->lines[k]) )
             {
                 while ( (line=mrows_flush(args)) )
                     if ( bcf_write1(file, args->out_hdr, line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
             }
-            int merge = 1;
-            if ( args->mrows_collapse!=COLLAPSE_BOTH && args->mrows_collapse!=COLLAPSE_ANY )
-            {
-                if ( !(bcf_get_variant_types(args->lines[k]) & args->mrows_collapse) ) merge = 0;
-            }
-            if ( merge )
-            {
-                mrows_schedule(args, &args->lines[k]);
-                continue;
-            }
+            mrows_push(args, &args->lines[k]);
+            continue;
         }
         else if ( args->rmdup )
         {
@@ -1990,7 +2143,7 @@ static void flush_buffer(args_t *args, htsFile *file, int n)
                 if ( args->rmdup & BCF_SR_PAIR_ANY ) continue;    // rmdup by position only
                 if ( args->rmdup & BCF_SR_PAIR_SNPS && line_type&(VCF_SNP|VCF_MNP) && prev_type&(VCF_SNP|VCF_MNP) ) continue;
                 if ( args->rmdup & BCF_SR_PAIR_INDELS && line_type&(VCF_INDEL) && prev_type&(VCF_INDEL) ) continue;
-                if ( args->rmdup & BCF_SR_PAIR_EXACT && cmpals_match(&args->cmpals_out, args->lines[k]) ) continue;
+                if ( args->rmdup & BCF_SR_PAIR_EXACT && cmpals_match(args, &args->cmpals_out, args->lines[k]) ) continue;
             }
             else
             {
@@ -2000,7 +2153,7 @@ static void flush_buffer(args_t *args, htsFile *file, int n)
                 if ( args->rmdup & BCF_SR_PAIR_EXACT ) cmpals_reset(&args->cmpals_out);
             }
             prev_type |= line_type;
-            if ( args->rmdup & BCF_SR_PAIR_EXACT ) cmpals_add(&args->cmpals_out, args->lines[k]);
+            if ( args->rmdup & BCF_SR_PAIR_EXACT ) cmpals_add(args,&args->cmpals_out, args->lines[k]);
         }
         if ( bcf_write1(file, args->out_hdr, args->lines[k])!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
     }
@@ -2022,7 +2175,7 @@ static void init_data(args_t *args)
     else
         args->keep_sum_ad = -1;
 
-    args->out_hdr = bcf_hdr_dup(args->hdr);
+    args->out_hdr = args->hdr;
     if ( args->old_rec_tag )
         bcf_hdr_printf(args->out_hdr,"##INFO=<ID=%s,Number=1,Type=String,Description=\"Original variant. Format: CHR|POS|REF|ALT|USED_ALT_IDX\">",args->old_rec_tag);
 
@@ -2044,22 +2197,31 @@ static void init_data(args_t *args)
         args->abuf = abuf_init(args->hdr, SPLIT);
         abuf_set_opt(args->abuf, bcf_hdr_t*, BCF_HDR, args->out_hdr);
         if ( args->old_rec_tag )
+        {
             abuf_set_opt(args->abuf, const char*, INFO_TAG, args->old_rec_tag);
+            if ( bcf_hdr_sync(args->out_hdr)!=0 ) error("bcf_hdr_sync failed\n");
+        }
         abuf_set_opt(args->abuf, int, STAR_ALLELE, args->use_star_allele);
     }
     if ( args->gff_fname )
     {
         args->gff = gff_init(args->gff_fname);
-        gff_set(args->gff,verbosity,1);
+        gff_set(args->gff,verbosity,args->gff_verbosity);
         gff_set(args->gff,strip_chr_names,1);
         gff_parse(args->gff);
         args->idx_tscript = gff_get(args->gff,idx_tscript);
         args->itr_tscript = regitr_init(NULL);
     }
+    if ( args->filter_str )
+        args->filter = filter_init(args->hdr, args->filter_str);
+    args->filter_pass = 1;
+
+    args->out_hdr = bcf_hdr_dup(args->out_hdr);
 }
 
 static void destroy_data(args_t *args)
 {
+    if ( args->filter ) filter_destroy(args->filter);
     if ( args->gff )
     {
         gff_destroy(args->gff);
@@ -2074,20 +2236,17 @@ static void destroy_data(args_t *args)
     for (i=0; i<args->mtmp_lines; i++)
         if ( args->tmp_lines[i] ) bcf_destroy1(args->tmp_lines[i]);
     free(args->tmp_lines);
-    for (i=0; i<args->malines; i++)
-        bcf_destroy1(args->alines[i]);
-    free(args->alines);
-    for (i=0; i<args->mblines; i++)
-        bcf_destroy1(args->blines[i]);
-    free(args->blines);
+    for (i=0; i<args->mmrows; i++)
+        bcf_destroy1(args->mrows[i]);
+    free(args->mrows);
     for (i=0; i<args->mmaps; i++)
         free(args->maps[i].map);
     for (i=0; i<args->ntmp_als; i++)
         free(args->tmp_als[i].s);
-    for (i=0; i<args->ntmp_del; i++)
-        free(args->tmp_del[i].s);
+    for (i=0; i<args->ntmp_sym; i++)
+        free(args->tmp_sym[i].s);
     free(args->tmp_als);
-    free(args->tmp_del);
+    free(args->tmp_sym);
     free(args->tmp_kstr.s);
     if ( args->tmp_str )
     {
@@ -2112,10 +2271,10 @@ static void normalize_line(args_t *args, bcf1_t *line)
 {
     if ( args->fai )
     {
-        if ( args->check_ref & CHECK_REF_FIX ) fix_ref(args, line);
+        if ( args->filter_pass && (args->check_ref & CHECK_REF_FIX) ) fix_ref(args, line);
         if ( args->do_indels )
         {
-            int ret = realign(args, line);
+            int ret = args->filter_pass ? realign(args, line) : ERR_OK;
 
             // exclude broken VCF lines
             if ( ret==ERR_REF_MISMATCH && args->check_ref & CHECK_REF_SKIP )
@@ -2135,37 +2294,78 @@ static void normalize_line(args_t *args, bcf1_t *line)
         }
     }
 
-    // insert into sorted buffer
-    rbuf_expand0(&args->rbuf,bcf1_t*,args->rbuf.n+1,args->lines);
-    int i,j;
-    i = j = rbuf_append(&args->rbuf);
-    if ( args->lines[i] ) bcf_destroy(args->lines[i]);
-    args->lines[i] = bcf_dup(line);
-    while ( rbuf_prev(&args->rbuf,&i) )
+    if ( args->filter_pass && args->atomize==SPLIT ) abuf_push(args->abuf,line);
+    while (1)
     {
-        if ( args->lines[i]->pos > args->lines[j]->pos ) SWAP(bcf1_t*, args->lines[i], args->lines[j]);
-        j = i;
+        if ( args->filter_pass && args->atomize==SPLIT )
+        {
+            line = abuf_flush(args->abuf, 0);
+            if ( !line ) break;
+        }
+
+        // insert into sorted buffer
+        rbuf_expand0(&args->rbuf,bcf1_t*,args->rbuf.n+1,args->lines);
+        int i,j;
+        i = j = rbuf_append(&args->rbuf);
+        if ( args->lines[i] ) bcf_destroy(args->lines[i]);
+        args->lines[i] = bcf_dup(line);
+        while ( rbuf_prev(&args->rbuf,&i) )
+        {
+            if ( args->lines[i]->rid==args->lines[j]->rid )
+            {
+                bcf_unpack(args->lines[i], BCF_UN_STR);
+                bcf_unpack(args->lines[j], BCF_UN_STR);
+                if ( args->cmp_func(&args->lines[i], &args->lines[j]) > 0) SWAP(bcf1_t*, args->lines[i], args->lines[j]);
+            }
+            j = i;
+        }
+        if ( !args->filter_pass || args->atomize!=SPLIT ) break;
     }
 }
 
-static bcf1_t *next_atomized_line(args_t *args)
+// return 0 on success, 1 when done
+static int split_and_normalize(args_t *args)
 {
-    bcf1_t *rec = NULL;
-    if ( args->atomize==SPLIT )
+    if ( !bcf_sr_next_line(args->files) ) return 1;
+
+    bcf1_t *line = bcf_sr_get_line(args->files,0);
+    args->ntotal++;
+
+    if ( args->filter )
     {
-        rec = abuf_flush(args->abuf, 0);
-        if ( rec ) return rec;
+        args->filter_pass = filter_test(args->filter,line,NULL);
+        if ( args->filter_logic==FLT_EXCLUDE ) args->filter_pass = args->filter_pass ? 0 : 1;
+        if ( !args->filter_pass ) args->nfilter++;
     }
 
-    if ( !bcf_sr_next_line(args->files) ) return NULL;
+    if ( args->mrows_op!=MROWS_SPLIT || line->n_allele<=2 || !args->filter_pass )
+    {
+        // normal operation, no splitting
+        normalize_line(args, line);
+        return 0;
+    }
 
-    if ( args->atomize==SPLIT )
+    // any restrictions on variant types to split?
+    if ( args->mrows_collapse!=COLLAPSE_BOTH && args->mrows_collapse!=COLLAPSE_ANY )
     {
-        abuf_push(args->abuf,bcf_sr_get_line(args->files,0));
-        return abuf_flush(args->abuf, 0);
+        int type = args->mrows_collapse==COLLAPSE_SNPS ? VCF_SNP : VCF_INDEL;
+        if ( !(bcf_get_variant_types(line) & type) )
+        {
+            normalize_line(args, line);
+            return 0;
+        }
     }
-    return bcf_sr_get_line(args->files,0);
+
+    args->nsplit++;
+    split_multiallelic_to_biallelics(args, line);
+
+    int j;
+    for (j=0; j<args->ntmp_lines; j++)
+        normalize_line(args, args->tmp_lines[j]);
+
+    return 0;
 }
+
 static void normalize_vcf(args_t *args)
 {
     char wmode[8];
@@ -2176,64 +2376,40 @@ static void normalize_vcf(args_t *args)
         hts_set_opt(args->out, HTS_OPT_THREAD_POOL, args->files->p);
     if (args->record_cmd_line) bcf_hdr_append_version(args->out_hdr, args->argc, args->argv, "bcftools_norm");
     if ( bcf_hdr_write(args->out, args->out_hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
-    if ( args->write_index && init_index(args->out,args->out_hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname);
+    if ( init_index2(args->out,args->out_hdr,args->output_fname,
+                     &args->index_fn, args->write_index)<0 )
+        error("Error: failed to initialise index for %s\n",args->output_fname);
 
-    bcf1_t *line;
-    int prev_rid = -1, prev_pos = -1, prev_type = 0;
-    while ( (line = next_atomized_line(args)) )
+    while (1)
     {
-        args->ntotal++;
-        if ( args->rmdup )
-        {
-            int line_type = bcf_get_variant_types(line);
-            if ( prev_rid>=0 && prev_rid==line->rid && prev_pos==line->pos )
-            {
-                if ( args->rmdup & BCF_SR_PAIR_ANY ) continue;    // rmdup by position only
-                if ( args->rmdup & BCF_SR_PAIR_SNPS && line_type&(VCF_SNP|VCF_MNP) && prev_type&(VCF_SNP|VCF_MNP) ) continue;
-                if ( args->rmdup & BCF_SR_PAIR_INDELS && line_type&(VCF_INDEL) && prev_type&(VCF_INDEL) ) continue;
-                if ( args->rmdup & BCF_SR_PAIR_EXACT && cmpals_match(&args->cmpals_in, line) ) continue;
-            }
-            else
-            {
-                prev_rid  = line->rid;
-                prev_pos  = line->pos;
-                prev_type = 0;
-                if ( args->rmdup & BCF_SR_PAIR_EXACT ) cmpals_reset(&args->cmpals_in);
-            }
-            prev_type |= line_type;
-            if ( args->rmdup & BCF_SR_PAIR_EXACT ) cmpals_add(&args->cmpals_in, line);
-        }
-
-        // still on the same chromosome?
-        int i,j,ilast = rbuf_last(&args->rbuf);
-        if ( ilast>=0 && line->rid != args->lines[ilast]->rid ) flush_buffer(args, args->out, args->rbuf.n); // new chromosome
-
-        int split = 0;
-        if ( args->mrows_op==MROWS_SPLIT )
+        // buffer lines until a different position is reached
+        int done = 0;
+        while (1)
         {
-            split = 1;
-            if ( args->mrows_collapse!=COLLAPSE_BOTH && args->mrows_collapse!=COLLAPSE_ANY )
-            {
-                if ( !(bcf_get_variant_types(line) & args->mrows_collapse) ) split = 0;
-            }
-            if ( split && line->n_allele>2 )
-            {
-                args->nsplit++;
-                split_multiallelic_to_biallelics(args, line);
-                for (j=0; j<args->ntmp_lines; j++)
-                    normalize_line(args, args->tmp_lines[j]);
-            }
-            else
-                split = 0;
+            done = split_and_normalize(args);
+            if ( done ) break;      // no more lines available
+            int i = args->rbuf.f;
+            int j = rbuf_last(&args->rbuf);
+            if ( args->lines[i]->rid != args->lines[j]->rid ) break;
+            if ( args->lines[i]->pos != args->lines[j]->pos ) break;
         }
-        if ( !split )
-            normalize_line(args, line);
+        if ( done ) break;
 
         // find out how many sites to flush
-        ilast = rbuf_last(&args->rbuf);
-        j = 0;
+        int ifst  = args->rbuf.f;
+        int ilast = rbuf_last(&args->rbuf);
+        int i, j = 0;
         for (i=-1; rbuf_next(&args->rbuf,&i); )
         {
+            if ( args->lines[ifst]->rid != args->lines[ilast]->rid )
+            {
+                // there are two chromosomes in the buffer, count how many are on the first chromosome
+                if ( args->lines[ifst]->rid != args->lines[i]->rid ) break;
+                j++;
+                continue;
+            }
+            // there is just one chromosome, flush only lines that are unlikely to change order on
+            // realigning (the buf_win constant)
             if ( args->lines[ilast]->pos - args->lines[i]->pos < args->buf_win ) break;
             j++;
         }
@@ -2251,7 +2427,8 @@ static void normalize_vcf(args_t *args)
     }
     if ( hts_close(args->out)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname);
 
-    fprintf(bcftools_stderr,"Lines   total/split/realigned/skipped:\t%d/%d/%d/%d\n", args->ntotal,args->nsplit,args->nchanged,args->nskipped);
+    fprintf(bcftools_stderr,"Lines   total/split/joined/realigned/removed/skipped:\t%d/%d/%d/%d/%d/%d\n",
+        args->ntotal,args->nsplit,args->njoined,args->nchanged,args->nskipped,args->nfilter);
     if ( args->check_ref & CHECK_REF_FIX )
         fprintf(bcftools_stderr,"REF/ALT total/modified/added:  \t%d/%d/%d\n", args->nref.tot,args->nref.swap,args->nref.set);
 }
@@ -2270,9 +2447,11 @@ static void usage(void)
     fprintf(bcftools_stderr, "    -c, --check-ref e|w|x|s         Check REF alleles and exit (e), warn (w), exclude (x), or set (s) bad sites [e]\n");
     fprintf(bcftools_stderr, "    -D, --remove-duplicates         Remove duplicate lines of the same type.\n");
     fprintf(bcftools_stderr, "    -d, --rm-dup TYPE               Remove duplicate snps|indels|both|all|exact\n");
+    fprintf(bcftools_stderr, "    -e, --exclude EXPR              Do not normalize records for which the expression is true (see man page for details)\n");
     fprintf(bcftools_stderr, "    -f, --fasta-ref FILE            Reference sequence\n");
     fprintf(bcftools_stderr, "        --force                     Try to proceed even if malformed tags are encountered. Experimental, use at your own risk\n");
     fprintf(bcftools_stderr, "    -g, --gff-annot FILE            Follow HGVS 3'rule and right-align variants in transcripts on the forward strand\n");
+    fprintf(bcftools_stderr, "    -i, --include EXPR              Normalize only records for which the expression is true (see man page for details)\n");
     fprintf(bcftools_stderr, "        --keep-sum TAG,..           Keep vector sum constant when splitting multiallelics (see github issue #360)\n");
     fprintf(bcftools_stderr, "    -m, --multiallelics -|+TYPE     Split multiallelics (-) or join biallelics (+), type: snps|indels|both|any [both]\n");
     fprintf(bcftools_stderr, "        --multi-overlaps 0|.        Fill in the reference (0) or missing (.) allele when splitting multiallelics [0]\n");
@@ -2285,12 +2464,14 @@ static void usage(void)
     fprintf(bcftools_stderr, "    -R, --regions-file FILE         Restrict to regions listed in a file\n");
     fprintf(bcftools_stderr, "        --regions-overlap 0|1|2     Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n");
     fprintf(bcftools_stderr, "    -s, --strict-filter             When merging (-m+), merged site is PASS only if all sites being merged PASS\n");
+    fprintf(bcftools_stderr, "    -S, --sort METHOD               Sort order: chr_pos,lex [chr_pos]\n");
     fprintf(bcftools_stderr, "    -t, --targets REGION            Similar to -r but streams rather than index-jumps\n");
     fprintf(bcftools_stderr, "    -T, --targets-file FILE         Similar to -R but streams rather than index-jumps\n");
     fprintf(bcftools_stderr, "        --targets-overlap 0|1|2     Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n");
-    fprintf(bcftools_stderr, "        --threads INT               Use multithreading with <int> worker threads [0]\n");
+    fprintf(bcftools_stderr, "        --threads INT               Use multithreading with INT worker threads [0]\n");
+    fprintf(bcftools_stderr, "    -v, --verbose INT               Verbosity level (0-2) of GFF parsing [1]\n");
     fprintf(bcftools_stderr, "    -w, --site-win INT              Buffer for sorting lines which changed position during realignment [1000]\n");
-    fprintf(bcftools_stderr, "        --write-index               Automatically index the output files [off]\n");
+    fprintf(bcftools_stderr, "    -W, --write-index[=FMT]         Automatically index the output files [off]\n");
     fprintf(bcftools_stderr, "\n");
     fprintf(bcftools_stderr, "Examples:\n");
     fprintf(bcftools_stderr, "   # normalize and left-align indels\n");
@@ -2321,8 +2502,10 @@ int main_vcfnorm(int argc, char *argv[])
     int region_is_file  = 0;
     int targets_is_file = 0;
     args->use_star_allele = 1;
+    args->gff_verbosity = 1;
     int regions_overlap = 1;
     int targets_overlap = 0;
+    args->cmp_func = cmp_bcf_pos;
 
     static struct option loptions[] =
     {
@@ -2330,9 +2513,12 @@ int main_vcfnorm(int argc, char *argv[])
         {"force",no_argument,NULL,7},
         {"atomize",no_argument,NULL,'a'},
         {"atom-overlaps",required_argument,NULL,11},
+        {"include",required_argument,NULL,'i'},
+        {"exclude",required_argument,NULL,'e'},
         {"old-rec-tag",required_argument,NULL,12},
         {"keep-sum",required_argument,NULL,10},
         {"fasta-ref",required_argument,NULL,'f'},
+        {"sort",required_argument,NULL,'S'},
         {"gff-annot",required_argument,NULL,'g'},
         {"right-align",no_argument,NULL,15},            // undocumented, only for debugging
         {"do-not-normalize",no_argument,NULL,'N'},
@@ -2353,11 +2539,12 @@ int main_vcfnorm(int argc, char *argv[])
         {"check-ref",required_argument,NULL,'c'},
         {"strict-filter",no_argument,NULL,'s'},
         {"no-version",no_argument,NULL,8},
-        {"write-index",no_argument,NULL,14},
+        {"write-index",optional_argument,NULL,'W'},
+        {"verbose",required_argument,NULL,'v'},
         {NULL,0,NULL,0}
     };
     char *tmp;
-    while ((c = getopt_long(argc, argv, "hr:R:f:w:Dd:o:O:c:m:t:T:sNag:",loptions,NULL)) >= 0) {
+    while ((c = getopt_long(argc, argv, "hr:R:f:w:Dd:o:O:c:m:t:T:sNag:W::v:S:i:e:",loptions,NULL)) >= 0) {
         switch (c) {
             case  10:
                 // possibly generalize this also to INFO/AD and other tags
@@ -2366,7 +2553,22 @@ int main_vcfnorm(int argc, char *argv[])
                 args->keep_sum_ad = 1;  // this will be set to the header id or -1 in init_data
                 break;
             case 'g': args->gff_fname = optarg; break;
+            case 'v':
+                args->gff_verbosity = atoi(optarg);
+                if ( args->gff_verbosity<0 || args->gff_verbosity>2 ) error("Error: expected integer 0-2 with -v, --verbose\n");
+                break;
             case 'a': args->atomize = SPLIT; break;
+            case 'e':
+                if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
+                args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
+            case 'i':
+                if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
+                args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
+            case 'S':
+                if ( !strcasecmp(optarg,"pos") ) args->cmp_func = cmp_bcf_pos;
+                else if ( !strcasecmp(optarg,"lex") ) args->cmp_func = cmp_bcf_pos_ref_alt;
+                else error("Error: the sort order --sort %s is not recognised\n",optarg);
+                break;
             case 11 :
                 if ( optarg[0]=='*' ) args->use_star_allele = 1;
                 else if ( optarg[0]=='.' ) args->use_star_allele = 0;
@@ -2378,7 +2580,10 @@ int main_vcfnorm(int argc, char *argv[])
                 else if ( optarg[0]=='.' ) args->ma_use_ref_allele = 0;
                 else error("Invalid argument to --multi-overlaps\n");
                 break;
-            case 14 : args->write_index = 1; break;
+            case 'W':
+                if (!(args->write_index = write_index_parse(optarg)))
+                    error("Unsupported index format '%s'\n", optarg);
+                break;
             case 15 : args->right_align = 1; break;
             case 'N': args->do_indels = 0; break;
             case 'd':
@@ -2487,7 +2692,6 @@ int main_vcfnorm(int argc, char *argv[])
 
     if ( bcf_sr_set_threads(args->files, args->n_threads)<0 ) error("Failed to create threads\n");
     if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to read from %s: %s\n", !strcmp("-",fname)?"standard input":fname,bcf_sr_strerror(args->files->errnum));
-    if ( args->mrows_op&MROWS_SPLIT && args->rmdup ) error("Cannot combine -D and -m-\n");
     init_data(args);
     normalize_vcf(args);
     destroy_data(args);
diff --git a/bcftools/vcfplugin.c b/bcftools/vcfplugin.c
index 687751961..4ee99ee13 100644
--- a/bcftools/vcfplugin.c
+++ b/bcftools/vcfplugin.c
@@ -66,7 +66,7 @@ typedef struct _plugin_t plugin_t;
  *      success or non-zero value on error.
  *
  *   int init(int argc, char **argv, bcf_hdr_t *in_hdr, bcf_hdr_t *out_hdr)
- *      - called once at startup, allows to initialize local variables.
+ *      - called once at startup, it initializes local variables.
  *      Return 1 to suppress normal VCF/BCF header output, -1 on critical
  *      errors, 0 otherwise.
  *
@@ -550,7 +550,9 @@ static void init_data(args_t *args)
         if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
         if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads);
         if ( bcf_hdr_write(args->out_fh, args->hdr_out)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
-        if ( args->write_index && init_index(args->out_fh,args->hdr_out,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname);
+        if ( init_index2(args->out_fh,args->hdr_out,args->output_fname,
+                         &args->index_fn, args->write_index)<0 )
+            error("Error: failed to initialise index for %s\n",args->output_fname);
     }
 }
 
@@ -613,7 +615,7 @@ static void usage(args_t *args)
     fprintf(stderr, "   -l, --list-plugins             List available plugins. See BCFTOOLS_PLUGINS environment variable and man page for details\n");
     fprintf(stderr, "   -v, --verbose                  Print verbose information, -vv increases verbosity\n");
     fprintf(stderr, "   -V, --version                  Print version string and exit\n");
-    fprintf(stderr, "       --write-index              Automatically index the output files [off]\n");
+    fprintf(stderr, "   -W, --write-index[=FMT]        Automatically index the output files [off]\n");
     fprintf(stderr, "\n");
     exit(1);
 }
@@ -691,11 +693,11 @@ int main_plugin(int argc, char *argv[])
         {"targets-file",required_argument,NULL,'T'},
         {"targets-overlap",required_argument,NULL,2},
         {"no-version",no_argument,NULL,8},
-        {"write-index",no_argument,NULL,10},
+        {"write-index",optional_argument,NULL,'W'},
         {NULL,0,NULL,0}
     };
     char *tmp;
-    while ((c = getopt_long(argc, argv, "h?o:O:r:R:t:T:li:e:vV",loptions,NULL)) >= 0)
+    while ((c = getopt_long(argc, argv, "h?o:O:r:R:t:T:li:e:vVW::",loptions,NULL)) >= 0)
     {
         switch (c) {
             case 'V': version_only = 1; break;
@@ -740,7 +742,10 @@ int main_plugin(int argc, char *argv[])
                 break;
             case  9 : args->n_threads = strtol(optarg, 0, 0); break;
             case  8 : args->record_cmd_line = 0; break;
-            case 10 : args->write_index = 1; break;
+            case 'W':
+                if (!(args->write_index = write_index_parse(optarg)))
+                    error("Unsupported index format '%s'\n", optarg);
+                break;
             case '?':
             case 'h': usage_only = 1; break;
             default: error("Unknown argument: %s\n", optarg);
diff --git a/bcftools/vcfplugin.c.pysam.c b/bcftools/vcfplugin.c.pysam.c
index ad04eb44f..f19bc9635 100644
--- a/bcftools/vcfplugin.c.pysam.c
+++ b/bcftools/vcfplugin.c.pysam.c
@@ -68,7 +68,7 @@ typedef struct _plugin_t plugin_t;
  *      success or non-zero value on error.
  *
  *   int init(int argc, char **argv, bcf_hdr_t *in_hdr, bcf_hdr_t *out_hdr)
- *      - called once at startup, allows to initialize local variables.
+ *      - called once at startup, it initializes local variables.
  *      Return 1 to suppress normal VCF/BCF header output, -1 on critical
  *      errors, 0 otherwise.
  *
@@ -552,7 +552,9 @@ static void init_data(args_t *args)
         if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
         if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads);
         if ( bcf_hdr_write(args->out_fh, args->hdr_out)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
-        if ( args->write_index && init_index(args->out_fh,args->hdr_out,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname);
+        if ( init_index2(args->out_fh,args->hdr_out,args->output_fname,
+                         &args->index_fn, args->write_index)<0 )
+            error("Error: failed to initialise index for %s\n",args->output_fname);
     }
 }
 
@@ -615,7 +617,7 @@ static void usage(args_t *args)
     fprintf(bcftools_stderr, "   -l, --list-plugins             List available plugins. See BCFTOOLS_PLUGINS environment variable and man page for details\n");
     fprintf(bcftools_stderr, "   -v, --verbose                  Print verbose information, -vv increases verbosity\n");
     fprintf(bcftools_stderr, "   -V, --version                  Print version string and exit\n");
-    fprintf(bcftools_stderr, "       --write-index              Automatically index the output files [off]\n");
+    fprintf(bcftools_stderr, "   -W, --write-index[=FMT]        Automatically index the output files [off]\n");
     fprintf(bcftools_stderr, "\n");
     bcftools_exit(1);
 }
@@ -693,11 +695,11 @@ int main_plugin(int argc, char *argv[])
         {"targets-file",required_argument,NULL,'T'},
         {"targets-overlap",required_argument,NULL,2},
         {"no-version",no_argument,NULL,8},
-        {"write-index",no_argument,NULL,10},
+        {"write-index",optional_argument,NULL,'W'},
         {NULL,0,NULL,0}
     };
     char *tmp;
-    while ((c = getopt_long(argc, argv, "h?o:O:r:R:t:T:li:e:vV",loptions,NULL)) >= 0)
+    while ((c = getopt_long(argc, argv, "h?o:O:r:R:t:T:li:e:vVW::",loptions,NULL)) >= 0)
     {
         switch (c) {
             case 'V': version_only = 1; break;
@@ -742,7 +744,10 @@ int main_plugin(int argc, char *argv[])
                 break;
             case  9 : args->n_threads = strtol(optarg, 0, 0); break;
             case  8 : args->record_cmd_line = 0; break;
-            case 10 : args->write_index = 1; break;
+            case 'W':
+                if (!(args->write_index = write_index_parse(optarg)))
+                    error("Unsupported index format '%s'\n", optarg);
+                break;
             case '?':
             case 'h': usage_only = 1; break;
             default: error("Unknown argument: %s\n", optarg);
diff --git a/bcftools/vcfquery.c b/bcftools/vcfquery.c
index 5f4eb07c6..7b1dd4391 100644
--- a/bcftools/vcfquery.c
+++ b/bcftools/vcfquery.c
@@ -55,7 +55,8 @@ typedef struct
     bcf_hdr_t *header;
     int sample_is_file;
     char **argv, *format_str, *sample_list, *targets_list, *regions_list, *vcf_list, *fn_out;
-    int argc, list_columns, print_header, allow_undef_tags, force_samples;
+    char *print_filtered;
+    int argc, list_columns, print_header, allow_undef_tags, force_samples, force_newline;
     FILE *out;
 }
 args_t;
@@ -94,9 +95,10 @@ static void init_data(args_t *args)
         smpl_ilist_destroy(ilist);
     }
     args->convert = convert_init(args->header, samples, nsamples, args->format_str);
-    convert_set_option(args->convert, force_newline, 1);
+    if ( args->force_newline ) convert_set_option(args->convert, force_newline, 1);
     convert_set_option(args->convert, subset_samples, &args->smpl_pass);
     if ( args->allow_undef_tags ) convert_set_option(args->convert, allow_undef_tags, 1);
+    if ( args->print_header>1 ) convert_set_option(args->convert, no_hdr_indices, 1);
     free(samples);
 
     int max_unpack = convert_max_unpack(args->convert);
@@ -106,6 +108,9 @@ static void init_data(args_t *args)
         max_unpack |= filter_max_unpack(args->filter);
     }
     args->files->max_unpack = max_unpack;
+    if ( !args->filter || args->print_filtered || !(filter_max_unpack(args->filter) & BCF_UN_FMT) )
+        convert_set_option(args->convert, header_samples, 1);
+    if ( args->print_filtered ) convert_set_option(args->convert, print_filtered, args->print_filtered);
 }
 
 static void destroy_data(args_t *args)
@@ -232,10 +237,12 @@ static void usage(void)
     fprintf(stderr, "Options:\n");
     fprintf(stderr, "    -e, --exclude EXPR                Exclude sites for which the expression is true (see man page for details)\n");
     fprintf(stderr, "        --force-samples               Only warn about unknown subset samples\n");
+    fprintf(stderr, "    -F, --print-filtered STR          Output STR for samples failing the -i/-e filtering expression\n");
     fprintf(stderr, "    -f, --format STRING               See man page for details\n");
-    fprintf(stderr, "    -H, --print-header                Print header\n");
+    fprintf(stderr, "    -H, --print-header                Print header, -HH to omit column indices\n");
     fprintf(stderr, "    -i, --include EXPR                Select sites for which the expression is true (see man page for details)\n");
     fprintf(stderr, "    -l, --list-samples                Print the list of samples and exit\n");
+    fprintf(stderr, "    -N, --disable-automatic-newline   Disable automatic addition of newline character when not present\n");
     fprintf(stderr, "    -o, --output FILE                 Output file name [stdout]\n");
     fprintf(stderr, "    -r, --regions REGION              Restrict to comma-separated list of regions\n");
     fprintf(stderr, "    -R, --regions-file FILE           Restrict to regions listed in a file\n");
@@ -250,6 +257,7 @@ static void usage(void)
     fprintf(stderr, "\n");
     fprintf(stderr, "Examples:\n");
     fprintf(stderr, "\tbcftools query -f '%%CHROM\\t%%POS\\t%%REF\\t%%ALT[\\t%%SAMPLE=%%GT]\\n' file.vcf.gz\n");
+    fprintf(stderr, "\t# For more examples see http://samtools.github.io/bcftools/bcftools.html#query\n");
     fprintf(stderr, "\n");
     exit(1);
 }
@@ -259,6 +267,7 @@ int main_vcfquery(int argc, char *argv[])
     int c, collapse = 0;
     args_t *args = (args_t*) calloc(1,sizeof(args_t));
     args->argc   = argc; args->argv = argv;
+    args->force_newline = 1;
     int regions_is_file = 0, targets_is_file = 0;
     int regions_overlap = 1;
     int targets_overlap = 0;
@@ -267,8 +276,10 @@ int main_vcfquery(int argc, char *argv[])
     {
         {"help",0,0,'h'},
         {"list-samples",0,0,'l'},
+        {"disable-automatic-newline",required_argument,NULL,'N'},
         {"include",1,0,'i'},
         {"exclude",1,0,'e'},
+        {"print-filtered",1,0,'F'},
         {"format",1,0,'f'},
         {"force-samples",0,0,3},
         {"output-file",1,0,'o'},
@@ -288,11 +299,13 @@ int main_vcfquery(int argc, char *argv[])
         {"allow-undef-tags",0,0,'u'},
         {0,0,0,0}
     };
-    while ((c = getopt_long(argc, argv, "hlr:R:f:a:s:S:Ht:T:c:v:i:e:o:u",loptions,NULL)) >= 0) {
+    while ((c = getopt_long(argc, argv, "hlr:R:F:f:a:s:S:Ht:T:c:v:i:e:o:uN",loptions,NULL)) >= 0) {
         switch (c) {
             case 'o': args->fn_out = optarg; break;
+            case 'F': args->print_filtered = optarg; break;
             case 'f': args->format_str = strdup(optarg); break;
-            case 'H': args->print_header = 1; break;
+            case 'N': args->force_newline = 0; break;
+            case 'H': args->print_header++; break;
             case 'v': args->vcf_list = optarg; break;
             case 'c':
                 error("The --collapse option is obsolete, pipe through `bcftools norm -c` instead.\n");
diff --git a/bcftools/vcfquery.c.pysam.c b/bcftools/vcfquery.c.pysam.c
index e4f252033..407d2562e 100644
--- a/bcftools/vcfquery.c.pysam.c
+++ b/bcftools/vcfquery.c.pysam.c
@@ -57,7 +57,8 @@ typedef struct
     bcf_hdr_t *header;
     int sample_is_file;
     char **argv, *format_str, *sample_list, *targets_list, *regions_list, *vcf_list, *fn_out;
-    int argc, list_columns, print_header, allow_undef_tags, force_samples;
+    char *print_filtered;
+    int argc, list_columns, print_header, allow_undef_tags, force_samples, force_newline;
     FILE *out;
 }
 args_t;
@@ -96,9 +97,10 @@ static void init_data(args_t *args)
         smpl_ilist_destroy(ilist);
     }
     args->convert = convert_init(args->header, samples, nsamples, args->format_str);
-    convert_set_option(args->convert, force_newline, 1);
+    if ( args->force_newline ) convert_set_option(args->convert, force_newline, 1);
     convert_set_option(args->convert, subset_samples, &args->smpl_pass);
     if ( args->allow_undef_tags ) convert_set_option(args->convert, allow_undef_tags, 1);
+    if ( args->print_header>1 ) convert_set_option(args->convert, no_hdr_indices, 1);
     free(samples);
 
     int max_unpack = convert_max_unpack(args->convert);
@@ -108,6 +110,9 @@ static void init_data(args_t *args)
         max_unpack |= filter_max_unpack(args->filter);
     }
     args->files->max_unpack = max_unpack;
+    if ( !args->filter || args->print_filtered || !(filter_max_unpack(args->filter) & BCF_UN_FMT) )
+        convert_set_option(args->convert, header_samples, 1);
+    if ( args->print_filtered ) convert_set_option(args->convert, print_filtered, args->print_filtered);
 }
 
 static void destroy_data(args_t *args)
@@ -234,10 +239,12 @@ static void usage(void)
     fprintf(bcftools_stderr, "Options:\n");
     fprintf(bcftools_stderr, "    -e, --exclude EXPR                Exclude sites for which the expression is true (see man page for details)\n");
     fprintf(bcftools_stderr, "        --force-samples               Only warn about unknown subset samples\n");
+    fprintf(bcftools_stderr, "    -F, --print-filtered STR          Output STR for samples failing the -i/-e filtering expression\n");
     fprintf(bcftools_stderr, "    -f, --format STRING               See man page for details\n");
-    fprintf(bcftools_stderr, "    -H, --print-header                Print header\n");
+    fprintf(bcftools_stderr, "    -H, --print-header                Print header, -HH to omit column indices\n");
     fprintf(bcftools_stderr, "    -i, --include EXPR                Select sites for which the expression is true (see man page for details)\n");
     fprintf(bcftools_stderr, "    -l, --list-samples                Print the list of samples and exit\n");
+    fprintf(bcftools_stderr, "    -N, --disable-automatic-newline   Disable automatic addition of newline character when not present\n");
     fprintf(bcftools_stderr, "    -o, --output FILE                 Output file name [bcftools_stdout]\n");
     fprintf(bcftools_stderr, "    -r, --regions REGION              Restrict to comma-separated list of regions\n");
     fprintf(bcftools_stderr, "    -R, --regions-file FILE           Restrict to regions listed in a file\n");
@@ -252,6 +259,7 @@ static void usage(void)
     fprintf(bcftools_stderr, "\n");
     fprintf(bcftools_stderr, "Examples:\n");
     fprintf(bcftools_stderr, "\tbcftools query -f '%%CHROM\\t%%POS\\t%%REF\\t%%ALT[\\t%%SAMPLE=%%GT]\\n' file.vcf.gz\n");
+    fprintf(bcftools_stderr, "\t# For more examples see http://samtools.github.io/bcftools/bcftools.html#query\n");
     fprintf(bcftools_stderr, "\n");
     bcftools_exit(1);
 }
@@ -261,6 +269,7 @@ int main_vcfquery(int argc, char *argv[])
     int c, collapse = 0;
     args_t *args = (args_t*) calloc(1,sizeof(args_t));
     args->argc   = argc; args->argv = argv;
+    args->force_newline = 1;
     int regions_is_file = 0, targets_is_file = 0;
     int regions_overlap = 1;
     int targets_overlap = 0;
@@ -269,8 +278,10 @@ int main_vcfquery(int argc, char *argv[])
     {
         {"help",0,0,'h'},
         {"list-samples",0,0,'l'},
+        {"disable-automatic-newline",required_argument,NULL,'N'},
         {"include",1,0,'i'},
         {"exclude",1,0,'e'},
+        {"print-filtered",1,0,'F'},
         {"format",1,0,'f'},
         {"force-samples",0,0,3},
         {"output-file",1,0,'o'},
@@ -290,11 +301,13 @@ int main_vcfquery(int argc, char *argv[])
         {"allow-undef-tags",0,0,'u'},
         {0,0,0,0}
     };
-    while ((c = getopt_long(argc, argv, "hlr:R:f:a:s:S:Ht:T:c:v:i:e:o:u",loptions,NULL)) >= 0) {
+    while ((c = getopt_long(argc, argv, "hlr:R:F:f:a:s:S:Ht:T:c:v:i:e:o:uN",loptions,NULL)) >= 0) {
         switch (c) {
             case 'o': args->fn_out = optarg; break;
+            case 'F': args->print_filtered = optarg; break;
             case 'f': args->format_str = strdup(optarg); break;
-            case 'H': args->print_header = 1; break;
+            case 'N': args->force_newline = 0; break;
+            case 'H': args->print_header++; break;
             case 'v': args->vcf_list = optarg; break;
             case 'c':
                 error("The --collapse option is obsolete, pipe through `bcftools norm -c` instead.\n");
diff --git a/bcftools/vcfroh.c b/bcftools/vcfroh.c
index a0802db7e..f1d1c86e9 100644
--- a/bcftools/vcfroh.c
+++ b/bcftools/vcfroh.c
@@ -254,7 +254,7 @@ static void init_data(args_t *args)
         {
             if ( *end!=',') error("Could not parse: --buffer-size %s\n", args->buffer_size);
             args->nbuf_olap = strtol(end+1,&end,10);
-            if ( *end || args->nbuf_olap<0 ) error("Could not parse: --bufer-size %s\n", args->buffer_size);
+            if ( *end || args->nbuf_olap<0 ) error("Could not parse: --buffer-size %s\n", args->buffer_size);
         }
         if ( tmp<0 )
             args->nbuf_max = fabs(tmp)*1e6/(4+8*2)/args->roh_smpl->n;
diff --git a/bcftools/vcfroh.c.pysam.c b/bcftools/vcfroh.c.pysam.c
index f9b8aab8a..7519c6e1c 100644
--- a/bcftools/vcfroh.c.pysam.c
+++ b/bcftools/vcfroh.c.pysam.c
@@ -256,7 +256,7 @@ static void init_data(args_t *args)
         {
             if ( *end!=',') error("Could not parse: --buffer-size %s\n", args->buffer_size);
             args->nbuf_olap = strtol(end+1,&end,10);
-            if ( *end || args->nbuf_olap<0 ) error("Could not parse: --bufer-size %s\n", args->buffer_size);
+            if ( *end || args->nbuf_olap<0 ) error("Could not parse: --buffer-size %s\n", args->buffer_size);
         }
         if ( tmp<0 )
             args->nbuf_max = fabs(tmp)*1e6/(4+8*2)/args->roh_smpl->n;
diff --git a/bcftools/vcfsom.c b/bcftools/vcfsom.c
index db01d24fd..f7a5dbe6e 100644
--- a/bcftools/vcfsom.c
+++ b/bcftools/vcfsom.c
@@ -37,6 +37,7 @@ THE SOFTWARE.  */
 #include <htslib/synced_bcf_reader.h>
 #include <htslib/vcfutils.h>
 #include <htslib/hts_os.h>
+#include <htslib/hts_defs.h>
 #include <inttypes.h>
 #include "bcftools.h"
 
@@ -83,10 +84,9 @@ typedef struct
 args_t;
 
 static void usage(void);
-FILE *open_file(char **fname, const char *mode, const char *fmt, ...);
-void mkdir_p(const char *fmt, ...);
+FILE *open_file(char **fname, const char *mode, const char *fmt, ...) HTS_FORMAT(HTS_PRINTF_FMT, 3, 4);
 
-char *msprintf(const char *fmt, ...)
+char * HTS_FORMAT(HTS_PRINTF_FMT, 1, 2) msprintf(const char *fmt, ...)
 {
     va_list ap;
     va_start(ap, fmt);
diff --git a/bcftools/vcfsom.c.pysam.c b/bcftools/vcfsom.c.pysam.c
index effd35210..4e56158cc 100644
--- a/bcftools/vcfsom.c.pysam.c
+++ b/bcftools/vcfsom.c.pysam.c
@@ -39,6 +39,7 @@ THE SOFTWARE.  */
 #include <htslib/synced_bcf_reader.h>
 #include <htslib/vcfutils.h>
 #include <htslib/hts_os.h>
+#include <htslib/hts_defs.h>
 #include <inttypes.h>
 #include "bcftools.h"
 
@@ -85,10 +86,9 @@ typedef struct
 args_t;
 
 static void usage(void);
-FILE *open_file(char **fname, const char *mode, const char *fmt, ...);
-void mkdir_p(const char *fmt, ...);
+FILE *open_file(char **fname, const char *mode, const char *fmt, ...) HTS_FORMAT(HTS_PRINTF_FMT, 3, 4);
 
-char *msprintf(const char *fmt, ...)
+char * HTS_FORMAT(HTS_PRINTF_FMT, 1, 2) msprintf(const char *fmt, ...)
 {
     va_list ap;
     va_start(ap, fmt);
diff --git a/bcftools/vcfsort.c b/bcftools/vcfsort.c
index 3b208a0d3..6e21f85be 100644
--- a/bcftools/vcfsort.c
+++ b/bcftools/vcfsort.c
@@ -1,6 +1,6 @@
 /*  vcfsort.c -- sort subcommand
 
-   Copyright (C) 2017-2023 Genome Research Ltd.
+   Copyright (C) 2017-2024 Genome Research Ltd.
 
    Author: Petr Danecek <pd3@sanger.ac.uk>
 
@@ -41,28 +41,48 @@
 #include <htslib/vcf.h>
 #include <htslib/kstring.h>
 #include <htslib/hts_os.h>
+#include <htslib/hts_defs.h>
+#include <htslib/bgzf.h>
 #include "kheap.h"
 #include "bcftools.h"
 
+#define MAX_TMP_FILES_PER_LAYER 32
+#define MERGE_LAYERS 12
+#define MAX_TMP_FILES (MAX_TMP_FILES_PER_LAYER * MERGE_LAYERS)
+
 typedef struct
 {
     char *fname;
     htsFile *fh;
+    BGZF *bgz;
+    size_t idx;
     bcf1_t *rec;
+    int is_merged;
 }
 blk_t;
 
+typedef struct
+{
+    size_t len;
+    hts_pos_t pos;
+    int rid;
+    float qual;
+    uint8_t data[];
+}
+packed_bcf_t;
+
 typedef struct _args_t
 {
     bcf_hdr_t *hdr;
     char **argv, *fname, *output_fname, *tmp_dir;
     int argc, output_type, clevel;
     size_t max_mem, mem;
-    bcf1_t **buf;
+    packed_bcf_t **buf;
     uint8_t *mem_block;
-    size_t nbuf, mbuf, nblk;
-    blk_t *blk;
-    char *index_fn;
+
+    size_t nbuf, mbuf, nblk, tmp_count;
+    blk_t blk[MAX_TMP_FILES];
+    uint32_t tmp_layers[MERGE_LAYERS];
     int write_index;
 }
 args_t;
@@ -71,9 +91,9 @@ void clean_files(args_t *args)
 {
     int i;
     fprintf(stderr,"Cleaning\n");
-    for (i=0; i<args->nblk; i++)
+    for (i=0; i<MAX_TMP_FILES; i++)
     {
-        blk_t *blk = args->blk + i;
+        blk_t *blk = &args->blk[i];
         if ( blk->fname )
         {
             unlink(blk->fname);
@@ -84,7 +104,8 @@ void clean_files(args_t *args)
     }
     rmdir(args->tmp_dir);
 }
-void clean_files_and_throw(args_t *args, const char *format, ...)
+void HTS_FORMAT(HTS_PRINTF_FMT, 2, 3) HTS_NORETURN
+clean_files_and_throw(args_t *args, const char *format, ...)
 {
     va_list ap;
     va_start(ap, format);
@@ -95,6 +116,16 @@ void clean_files_and_throw(args_t *args, const char *format, ...)
 }
 
 int cmp_bcf_pos(const void *aptr, const void *bptr)
+{
+    bcf1_t *a = *((bcf1_t**)aptr);
+    bcf1_t *b = *((bcf1_t**)bptr);
+    if ( a->rid < b->rid ) return -1;
+    if ( a->rid > b->rid ) return 1;
+    if ( a->pos < b->pos ) return -1;
+    if ( a->pos > b->pos ) return 1;
+    return 0;
+}
+int cmp_bcf_pos_ref_alt(const void *aptr, const void *bptr)
 {
     bcf1_t *a = *((bcf1_t**)aptr);
     bcf1_t *b = *((bcf1_t**)bptr);
@@ -118,33 +149,279 @@ int cmp_bcf_pos(const void *aptr, const void *bptr)
     return 0;
 }
 
-void buf_flush(args_t *args)
+static int cmp_packed_bcf_pos_ref_alt(const void *aptr, const void *bptr)
 {
-    if ( !args->nbuf ) return;
+    packed_bcf_t *a = *(packed_bcf_t **) aptr;
+    packed_bcf_t *b = *(packed_bcf_t **) bptr;
 
-    qsort(args->buf, args->nbuf, sizeof(*args->buf), cmp_bcf_pos);
+    if ( a->rid < b->rid ) return -1;
+    if ( a->rid > b->rid ) return 1;
+    if ( a->pos < b->pos ) return -1;
+    if ( a->pos > b->pos ) return 1;
+    
+    // Sort lexicographically by ref,alt.  These are stored tab-separated
+    // as the first item in packed_bcf_t::data
+    return strcmp((char *) a->data, (char *) b->data);
+}
 
-    args->nblk++;
-    args->blk = (blk_t*) realloc(args->blk, sizeof(blk_t)*args->nblk);
-    if ( !args->blk ) error("Error: could not allocate %zu bytes of memory, try reducing --max-mem\n",sizeof(blk_t)*args->nblk);
-    blk_t *blk = args->blk + args->nblk - 1;
+static int cmp_packed_bcf_pos_ref_alt_stable(const void *aptr, const void *bptr)
+{
+    // cmp_bcf_pos_ref_alt() with tie-breaker to make qsort stable
+    int res = cmp_packed_bcf_pos_ref_alt(aptr, bptr);
+    if (res != 0) return res;
+
+    // Got a tie - use the position in the original input to break it.
+    // As everything is read into a big memory buffer, for most records
+    // we can just compare the pointers directly.  The exception is
+    // any record that didn't quite fit in the memory buffer, causing it to be
+    // flushed.  Those are flagged by setting packed_bcf_t::len = SIZE_MAX, and
+    // as they were the last record in the segment, they should always sort
+    // after unflagged records.
+
+    packed_bcf_t *a = *(packed_bcf_t **) aptr;
+    packed_bcf_t *b = *(packed_bcf_t **) bptr;
+
+    if (a->len == SIZE_MAX) return 1;
+    if (b->len == SIZE_MAX) return -1;
+
+    return a < b ? -1 : 1;
+}
+
+static uint8_t *pack_unsigned(uint8_t *data, uint64_t val)
+{
+    do {
+        *data++ = (val & 0x7f) | ((val > 0x7f) ? 0x80 : 0);
+        val >>= 7;
+    } while (val > 0);
+    return data;    
+}
+
+static uint8_t *pack_hts_pos(uint8_t *data, hts_pos_t val)
+{
+    uint64_t sign = val < 0;
+    uint64_t v = val < 0 ? -(val + 1) : val;
+    v = v << 1 | sign;
+    return pack_unsigned(data, v);
+}
+
+static uint8_t *pack_bcf_data(packed_bcf_t *dest, const bcf1_t *src,
+                              int outside_buffer)
+{
+    uint32_t i;
+    uint8_t *data = dest->data;
+    uint8_t *start = dest->data;
+    dest->pos = src->pos;
+    dest->rid = src->rid;
+    dest->qual = src->qual;
+
+    // Copy in alleles, for the comparison function
+    for (i = 0; i < src->n_allele; i++)
+    {
+        size_t l = strlen(src->d.allele[i]);
+        if (i > 0) *data++ = '\t';
+        memcpy(data, src->d.allele[i], l);
+        data += l;
+    }
+    *data++ = '\0';
+
+    if (outside_buffer)
+    {
+        dest->len = SIZE_MAX;
+        memcpy(data, &src, sizeof(src));
+        data += sizeof(src);
+        return data;
+    }
+
+    data = pack_hts_pos(data, src->rlen);
+    data = pack_unsigned(data, src->n_info);
+    data = pack_unsigned(data, src->n_allele);
+    data = pack_unsigned(data, src->n_fmt);
+    data = pack_unsigned(data, src->n_sample);
+    data = pack_unsigned(data, src->shared.l);
+    data = pack_unsigned(data, src->indiv.l);
+    if (src->shared.l)
+        memcpy(data, src->shared.s, src->shared.l);
+    data += src->shared.l;
+    if (src->indiv.l)
+        memcpy(data, src->indiv.s, src->indiv.l);
+    data += src->indiv.l;
+    dest->len = data - start;
+    return data;
+}
+
+static int write_packed_bcf(BGZF *fp, packed_bcf_t *src)
+{
+    // Write pos, rid, qual
+    size_t len = src->data - (uint8_t *) &src->pos;
+    if (bgzf_write_small(fp, &src->pos, len) < len)
+        return -1;
+
+    // Skip the copy of the alleles
+    size_t skip = strlen((char *) src->data) + 1;
+ 
+    // Write everything else
+    if (src->len < SIZE_MAX)
+    {
+        // In main memory block
+        len = src->len - skip;
+        if (bgzf_write_small(fp, src->data + skip, len) < len)
+            return -1;
+    }
+    else
+    {
+        // Record didn't fit in the main block.  To minimize the
+        // overflow, its packed_bcf_t data will be imcomplete.  A pointer to
+        // its bcf1_t struct will have been placed after the allele data
+        // so we can finish the packing job and write it in the same format
+        // the rest of the data
+        bcf1_t *rec;
+        uint8_t tmp[100], *data = tmp;
+        memcpy(&rec, src->data + skip, sizeof(rec));
+
+        data = pack_hts_pos(data, rec->rlen);
+        data = pack_unsigned(data, rec->n_info);
+        data = pack_unsigned(data, rec->n_allele);
+        data = pack_unsigned(data, rec->n_fmt);
+        data = pack_unsigned(data, rec->n_sample);
+        data = pack_unsigned(data, rec->shared.l);
+        data = pack_unsigned(data, rec->indiv.l);
+        if (bgzf_write_small(fp, tmp, data - tmp) < data - tmp)
+            return -1;
+        if (rec->shared.l > 0 &&
+            bgzf_write_small(fp, rec->shared.s, rec->shared.l) < rec->shared.l)
+            return -1;
+        if (rec->indiv.l > 0 &&
+            bgzf_write_small(fp, rec->indiv.s, rec->indiv.l) < rec->indiv.l)
+            return -1;
+    }
+
+    return 0;
+}
+
+static uint64_t unpack_unsigned(BGZF *fp, int *err)
+{
+    uint8_t data;
+    uint64_t val = 0;
+    uint32_t i = 0;
+
+    if (bgzf_read_small(fp, &data, sizeof(data)) <= 0)
+        goto short_read;
+
+    while (data & 0x80)
+    {
+        val |= (uint64_t)(data & 0x7f) << i;
+        i += 7;
+        if (bgzf_read_small(fp, &data, sizeof(data)) <= 0)
+            goto short_read;
+    }
+    val |= (uint64_t)data << i;
+    return val;
+
+ short_read:
+    *err = 1;
+    return 0;
+}
+
+static hts_pos_t unpack_hts_pos(BGZF *fp, int *err)
+{
+    uint64_t v = unpack_unsigned(fp, err);
+
+    if ((v & 1) == 0)
+        return (hts_pos_t)(v >> 1);
+    else
+        return -(hts_pos_t)(v >> 1) - 1;
+}
+
+static int read_packed_bcf(BGZF *fp, bcf1_t *dest)
+{
+    int err = 0;
+    packed_bcf_t tmp;
+    size_t len = tmp.data - (uint8_t *) &tmp.pos;
+    
+    bcf_clear(dest);
+    ssize_t got = bgzf_read_small(fp, &tmp.pos, len);
+    if (got == 0)
+        return -1;  // EOF
+    if (got < len)
+        return -2;  // Error or short read
+    dest->pos = tmp.pos;
+    dest->rid = tmp.rid;
+    dest->qual = tmp.qual;
+    dest->rlen = unpack_hts_pos(fp, &err);
+    dest->n_info = unpack_unsigned(fp, &err);
+    dest->n_allele = unpack_unsigned(fp, &err);
+    dest->n_fmt = unpack_unsigned(fp, &err);
+    dest->n_sample = unpack_unsigned(fp, &err);
+    len = unpack_unsigned(fp, &err);
+    if (ks_resize(&dest->shared, len) != 0)
+        return -2;
+    dest->shared.l = len;
+    len = unpack_unsigned(fp, &err);
+    if (ks_resize(&dest->indiv, len) != 0)
+        return -2;
+    dest->indiv.l = len;
+    err |= bgzf_read_small(fp, dest->shared.s, dest->shared.l) < dest->shared.l;
+    err |= bgzf_read_small(fp, dest->indiv.s, dest->indiv.l) < dest->indiv.l;
+    return err == 0 ? 0 : -2;
+}
 
+void open_tmp_file(args_t *args, blk_t *blk, int is_merged)
+{
     kstring_t str = {0,0,0};
-    ksprintf(&str, "%s/%05d.bcf", args->tmp_dir, (int)args->nblk);
-    blk->fname = str.s;
-    blk->rec   = NULL;
-    blk->fh    = NULL;
+    int tries = 1000;
 
-    htsFile *fh = hts_open(blk->fname, "wbu");
-    if ( fh == NULL ) clean_files_and_throw(args, "Cannot write %s: %s\n", blk->fname, strerror(errno));
-    if ( bcf_hdr_write(fh, args->hdr)!=0 ) clean_files_and_throw(args, "[%s] Error: cannot write to %s\n", __func__,blk->fname);
+    blk->fh = NULL;
+    blk->bgz = NULL;
+
+    do {
+        if (ksprintf(ks_clear(&str), "%s/%05zd%s",
+                     args->tmp_dir, args->tmp_count++,
+                     is_merged ? ".bcf" : "") < 0) {
+            clean_files_and_throw(args, "%s", strerror(errno));
+        }
+
+        if (is_merged)
+            blk->fh = hts_open(str.s, "wbx1");
+        else
+            blk->bgz = bgzf_open(str.s, "wx1");
+        if ( blk->fh == NULL && blk->bgz == NULL && (errno != EEXIST || --tries <= 0)) {
+            clean_files_and_throw(args, "Cannot write %s: %s\n",
+                                  str.s, strerror(errno));
+        }
+    } while (blk->fh == NULL && blk->bgz == NULL);
+
+    blk->fname = ks_release(&str);
+    blk->idx = args->tmp_count - 1;
+}
+
+void do_partial_merge(args_t *args);
+
+void buf_flush(args_t *args, bcf1_t *last_rec)
+{
+    if ( !args->nbuf ) return;
+
+    qsort(args->buf, args->nbuf, sizeof(*args->buf), cmp_packed_bcf_pos_ref_alt_stable);
+
+    if (args->tmp_layers[0] >= MAX_TMP_FILES_PER_LAYER)
+        do_partial_merge(args);
+
+    assert(args->nblk < MAX_TMP_FILES);
+    blk_t *blk = &args->blk[args->nblk];
+    blk->is_merged = 0;
+    args->nblk++;
+    args->tmp_layers[0]++;
 
+    assert(blk->fname == NULL && blk->fh == NULL && blk->bgz == NULL);
+
+    open_tmp_file(args, blk, 0);
     int i;
     for (i=0; i<args->nbuf; i++)
     {
-        if ( bcf_write(fh, args->hdr, args->buf[i])!=0 ) clean_files_and_throw(args, "[%s] Error: cannot write to %s\n", __func__,blk->fname);
+        if ( write_packed_bcf(blk->bgz, args->buf[i])!=0 ) clean_files_and_throw(args, "[%s] Error: cannot write to %s\n", __func__,blk->fname);
     }
-    if ( hts_close(fh)!=0 ) clean_files_and_throw(args, "[%s] Error: close failed .. %s\n", __func__,blk->fname);
+
+    if ( bgzf_close(blk->bgz)!=0 ) clean_files_and_throw(args, "[%s] Error: close failed .. %s\n", __func__,blk->fname);
+    blk->bgz = NULL;
 
     args->nbuf = 0;
     args->mem  = 0;
@@ -156,19 +433,38 @@ static inline uint8_t *_align_up(uint8_t *ptr)
     return (uint8_t*)(((size_t)ptr + 8 - 1) & ~((size_t)(8 - 1)));
 }
 
+#define varint_size(X) ((sizeof(X) * 8 + 7) / 7) // worst case
+
 void buf_push(args_t *args, bcf1_t *rec)
 {
-    size_t delta = sizeof(bcf1_t) + rec->shared.l + rec->indiv.l + rec->unpack_size[0] + rec->unpack_size[1]
-        + sizeof(*rec->d.allele)*rec->d.m_allele
-        + sizeof(bcf1_t*)       // args->buf
+    size_t delta = sizeof(rec->pos)
+        + sizeof(rec->rid)
+        + sizeof(rec->qual)
+        + varint_size(rec->rlen)
+        + varint_size(2) // n_info
+        + varint_size(2) // n_allele
+        + varint_size(1) // n_fmt
+        + varint_size(3) // n_sample
+        + varint_size(rec->shared.l)
+        + varint_size(rec->indiv.l)
+        + rec->shared.l + rec->indiv.l
+        + rec->unpack_size[1]   // Alleles
         + 8;                    // the number of _align_up() calls
 
     if ( delta > args->max_mem - args->mem )
     {
+        packed_bcf_t *tmp = malloc(sizeof(*tmp) + rec->unpack_size[1] * sizeof(bcf1_t *));
+        if (!tmp)
+            clean_files_and_throw(args, "[%s] Out of memory\n", __func__);
+        pack_bcf_data(tmp, rec, 1);
+
         args->nbuf++;
         hts_expand(bcf1_t*, args->nbuf, args->mbuf, args->buf);
-        args->buf[args->nbuf-1] = rec;
-        buf_flush(args);
+        args->buf[args->nbuf-1] = tmp;
+
+        buf_flush(args, rec);
+
+        free(tmp);
         bcf_destroy(rec);
         return;
     }
@@ -178,48 +474,13 @@ void buf_push(args_t *args, bcf1_t *rec)
 
     uint8_t *ptr_beg = args->mem_block + args->mem;
     uint8_t *ptr = _align_up(ptr_beg);
-    bcf1_t *new_rec = (bcf1_t*)ptr;
-    memcpy(new_rec,rec,sizeof(*rec));
-    ptr += sizeof(*rec);
-
-    // The array of allele pointers does not need alignment as bcf1_t is already padded to the biggest
-    // data type in the structure
-    char **allele = (char**)ptr;
-    ptr += rec->n_allele*sizeof(*allele);
-
-    // This is just to prevent valgrind from complaining about memcpy, unpack_size is a high-water mark
-    // and the end may be uninitialized
-    delta = rec->d.allele[rec->n_allele-1] - rec->d.allele[0];
-    while ( delta < rec->unpack_size[1] ) if ( !rec->d.als[delta++] ) break;
-    memcpy(ptr,rec->d.als,delta);
-    new_rec->d.als = (char*)ptr;
-    ptr = ptr + delta;
+    packed_bcf_t *packed_rec = (packed_bcf_t *) ptr;
 
-    int i;
-    for (i=0; i<rec->n_allele; i++) allele[i] = new_rec->d.als + (ptrdiff_t)(rec->d.allele[i] - rec->d.allele[0]);
-    new_rec->d.allele = allele;
-
-    memcpy(ptr,rec->shared.s,rec->shared.l);
-    new_rec->shared.s = (char*)ptr;
-    new_rec->shared.m = rec->shared.l;
-    ptr += rec->shared.l;
-
-    memcpy(ptr,rec->indiv.s,rec->indiv.l);
-    new_rec->indiv.s = (char*)ptr;
-    new_rec->indiv.m = rec->indiv.l;
-    ptr += rec->indiv.l;
-
-    // This is just to prevent valgrind from complaining about memcpy, unpack_size is a high-water mark
-    // and the end may be uninitialized
-    i = 0;
-    while ( i < rec->unpack_size[0] ) if ( !rec->d.id[i++] ) break;
-    memcpy(ptr,rec->d.id,i);
-    new_rec->d.id = (char*)ptr;
-    ptr += i;
+    ptr = pack_bcf_data(packed_rec, rec, 0);
 
     args->nbuf++;
     hts_expand(bcf1_t*, args->nbuf, args->mbuf, args->buf);
-    args->buf[args->nbuf-1] = new_rec;
+    args->buf[args->nbuf-1] = packed_rec;
 
     delta = ptr - ptr_beg;
     args->mem += delta;
@@ -246,11 +507,11 @@ void sort_blocks(args_t *args)
             bcf_destroy(rec);
             break;
         }
-        if ( rec->errcode ) clean_files_and_throw(args,"Error encountered while parsing the input at %s:%d\n",bcf_seqname(args->hdr,rec),rec->pos+1);
+        if ( rec->errcode ) clean_files_and_throw(args,"Error encountered while parsing the input at %s:%"PRIhts_pos"\n",bcf_seqname(args->hdr,rec),rec->pos+1);
         bcf_unpack(rec, BCF_UN_STR);
         buf_push(args, rec);
     }
-    buf_flush(args);
+    buf_flush(args, NULL);
     free(args->buf);
 
     if ( hts_close(in)!=0 ) clean_files_and_throw(args,"Close failed: %s\n", args->fname);
@@ -260,49 +521,80 @@ static inline int blk_is_smaller(blk_t **aptr, blk_t **bptr)
 {
     blk_t *a = *aptr;
     blk_t *b = *bptr;
-    int ret = cmp_bcf_pos(&a->rec, &b->rec);
+    int ret = cmp_bcf_pos_ref_alt(&a->rec, &b->rec);
     if ( ret < 0 ) return 1;
+    if (ret == 0 && a->idx < b->idx) return 1;
     return 0;
 }
 KHEAP_INIT(blk, blk_t*, blk_is_smaller)
 
 void blk_read(args_t *args, khp_blk_t *bhp, bcf_hdr_t *hdr, blk_t *blk)
 {
-    if ( !blk->fh ) return;
-    int ret = bcf_read(blk->fh, hdr, blk->rec);
+    int ret;
+    if (blk->is_merged)
+    {
+        if ( !blk->fh ) return;
+        ret = bcf_read(blk->fh, hdr, blk->rec);
+    }
+    else
+    {
+        if ( !blk->bgz ) return;
+        ret = read_packed_bcf(blk->bgz, blk->rec);
+    }
     if ( ret < -1 ) clean_files_and_throw(args, "Error reading %s\n", blk->fname);
     if ( ret == -1 )
     {
-        if ( hts_close(blk->fh)!=0 ) clean_files_and_throw(args, "Close failed: %s\n", blk->fname);
-        blk->fh = 0;
-        return;
+        if (blk->is_merged)
+        {
+            if ( hts_close(blk->fh)!=0 ) clean_files_and_throw(args, "Close failed: %s\n", blk->fname);
+            blk->fh = NULL;
+            return;
+        }
+        else
+        {
+            if ( bgzf_close(blk->bgz) != 0)
+                clean_files_and_throw(args, "Close failed: %s\n", blk->fname);
+            blk->bgz = NULL;
+            return;
+        }
     }
     bcf_unpack(blk->rec, BCF_UN_STR);
     khp_insert(blk, bhp, &blk);
 }
 
-void merge_blocks(args_t *args)
+void merge_blocks(args_t *args, htsFile *out, const char *output_fname,
+                  int idx_fmt, size_t from)
 {
-    fprintf(stderr,"Merging %d temporary files\n", (int)args->nblk);
     khp_blk_t *bhp = khp_init(blk);
+    char *index_fn = NULL;
+    size_t i;
 
-    int i;
-    for (i=0; i<args->nblk; i++)
+    for (i=from; i<args->nblk; i++)
     {
-        blk_t *blk = args->blk + i;
-        blk->fh = hts_open(blk->fname, "r");
-        if ( !blk->fh ) clean_files_and_throw(args, "Could not read %s: %s\n", blk->fname, strerror(errno));
-        bcf_hdr_t *hdr = bcf_hdr_read(blk->fh);
-        bcf_hdr_destroy(hdr);
-        blk->rec = bcf_init();
+        blk_t *blk = &args->blk[i];
+        if (blk->is_merged)
+        {
+            blk->fh = hts_open(blk->fname, "r");
+            if ( !blk->fh ) clean_files_and_throw(args, "Could not read %s: %s\n", blk->fname, strerror(errno));
+            bcf_hdr_t *hdr = bcf_hdr_read(blk->fh);
+            bcf_hdr_destroy(hdr);
+        }
+        else
+        {
+            blk->bgz = bgzf_open(blk->fname, "r");
+            if (!blk->bgz)
+                clean_files_and_throw(args, "Could not read %s: %s\n", blk->fname, strerror(errno));
+        }
         blk_read(args, bhp, args->hdr, blk);
     }
 
-    char wmode[8];
-    set_wmode(wmode,args->output_type,args->output_fname,args->clevel);
-    htsFile *out = hts_open(args->output_fname ? args->output_fname : "-", wmode);
-    if ( bcf_hdr_write(out, args->hdr)!=0 ) clean_files_and_throw(args, "[%s] Error: cannot write to %s\n", __func__,args->output_fname);
-    if ( args->write_index && init_index(out,args->hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname);
+    if ( bcf_hdr_write(out, args->hdr)!=0 ) clean_files_and_throw(args, "[%s] Error: cannot write to %s\n", __func__, output_fname);
+
+    if (idx_fmt) {
+        if ( init_index2(out,args->hdr,output_fname,&index_fn,idx_fmt)<0 )
+            error("Error: failed to initialise index for %s\n",output_fname);
+    }
+
     while ( bhp->ndat )
     {
         blk_t *blk = bhp->dat[0];
@@ -310,22 +602,93 @@ void merge_blocks(args_t *args)
         khp_delete(blk, bhp);
         blk_read(args, bhp, args->hdr, blk);
     }
-    if ( args->write_index )
+    if ( idx_fmt )
     {
         if ( bcf_idx_save(out)<0 )
         {
-            if ( hts_close(out)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout");
-            error("Error: cannot write to index %s\n", args->index_fn);
+            if ( hts_close(out)!=0 ) error("Error: close failed .. %s\n", output_fname);
+            error("Error: cannot write to index %s\n", index_fn);
         }
-        free(args->index_fn);
+        free(index_fn);
     }
-    if ( hts_close(out)!=0 ) clean_files_and_throw(args, "Close failed: %s\n", args->output_fname);
 
-    clean_files(args);
+    for (i = from; i < args->nblk; i++)
+    {
+        blk_t *blk = &args->blk[i];
+        if (unlink(blk->fname) != 0)
+            clean_files_and_throw(args, "Couldn't remove temporary file %s\n", blk->fname);
+        free(blk->fname);
+        blk->fname = NULL;
+    }
 
-    free(args->blk);
     khp_destroy(blk, bhp);
+}
+
+void do_partial_merge(args_t *args)
+{
+    uint32_t to_layer = 0;
+    size_t to_merge = 0;
+
+    // Temp. files are arranged in layers of at most MAX_TMP_FILES_PER_LAYER.
+    // When a layer is full, it is merged into the next layer up.  Each
+    // layer will therefore contain files with exponentially more records
+    // then the previous one, but will be merged exponentially less frequently.
+    // The result is that the overall complexity will remain O(n*log(n))
+    // even if we need to do lots of partial merges.
+
+    while (to_layer < MERGE_LAYERS
+           && args->tmp_layers[to_layer] >= MAX_TMP_FILES_PER_LAYER)
+    {
+        to_merge += args->tmp_layers[to_layer];
+        args->tmp_layers[to_layer] = 0;
+        to_layer++;
+    }
+
+    assert(to_merge > 0 && to_merge <= args->nblk);
+
+    if (to_layer == MERGE_LAYERS) {
+        // Edge case - if we've got here, we've completely used the
+        // temp file allocation, so merge absolutely everything and
+        // leave one file at the highest level.  Strictly this breaks
+        // the O(n*log(n)) complexity, but unless MERGE_LAYERS and
+        // MAX_TMP_FILES_PER_LAYER are too small it would take so long
+        // to get here it should never actually happen...
+        assert(to_merge == MAX_TMP_FILES_PER_LAYER * MERGE_LAYERS);
+        to_layer = MERGE_LAYERS - 1;
+    }
+
+    blk_t tmp = { NULL };
+    open_tmp_file(args, &tmp, 1);
+    merge_blocks(args, tmp.fh, tmp.fname, 0, args->nblk - to_merge);
+    if (hts_close(tmp.fh) != 0)
+        clean_files_and_throw(args, "Close failed: %s\n", tmp.fname);
+
+    args->nblk -= to_merge;
+    assert(args->blk[args->nblk].fh == NULL);
+    assert(args->blk[args->nblk].fname == NULL);
+    args->blk[args->nblk].is_merged = 1;
+    args->blk[args->nblk].idx = tmp.idx;
+    args->blk[args->nblk++].fname = tmp.fname;
+    args->tmp_layers[to_layer]++;
+}
+
+void merge_to_output(args_t *args)
+{
+    char wmode[8] = { 0 };
+    set_wmode(wmode,args->output_type,args->output_fname,args->clevel);
+    const char *output_fname = args->output_fname ? args->output_fname : "-";
+
+    htsFile *out = hts_open(output_fname, wmode);
+    if (!out) clean_files_and_throw(args, "[%s] Error: cannot open %s\n", __func__, output_fname);
+
+    fprintf(stderr,"Merging %zd temporary files\n", args->nblk);
+    merge_blocks(args, out, output_fname, args->write_index, 0);
     fprintf(stderr,"Done\n");
+
+    if ( hts_close(out)!=0 )
+        clean_files_and_throw(args, "Close failed: %s\n", output_fname);
+
+    clean_files(args);
 }
 
 static void usage(args_t *args)
@@ -337,7 +700,6 @@ static void usage(args_t *args)
     fprintf(stderr, "Options:\n");
     fprintf(stderr, "    -m, --max-mem FLOAT[kMG]       maximum memory to use [768M]\n");    // using metric units, 1M=1e6
     fprintf(stderr, "    -o, --output FILE              output file name [stdout]\n");
-    fprintf(stderr, "    -O, --output-type b|u|z|v      b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n");
     fprintf(stderr, "    -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n");
 
 #ifdef _WIN32
@@ -345,7 +707,7 @@ static void usage(args_t *args)
 #else
     fprintf(stderr, "    -T, --temp-dir DIR             temporary files [/tmp/bcftools.XXXXXX]\n");
 #endif
-    fprintf(stderr, "        --write-index              Automatically index the output files [off]\n");
+    fprintf(stderr, "    -W, --write-index[=FMT]        Automatically index the output files [off]\n");
     fprintf(stderr, "\n");
     exit(1);
 }
@@ -361,14 +723,23 @@ size_t parse_mem_string(const char *str)
     return mem;
 }
 
-void mkdir_p(const char *fmt, ...);
 static void init(args_t *args)
 {
+    size_t i;
     args->max_mem *= 0.9;
     args->mem_block = malloc(args->max_mem);
     if ( !args->mem_block ) error("Error: could not allocate %zu bytes of memory, try reducing --max-mem\n",args->max_mem);
     args->mem = 0;
 
+    for (i = 0; i < MAX_TMP_FILES; i++)
+    {
+        args->blk[i].fname = NULL;
+        args->blk[i].rec = bcf_init();
+        if (!args->blk[i].rec)
+            clean_files_and_throw(args,"Couldn't allocate bcf record\n");
+    }
+
+
     args->tmp_dir = init_tmp_prefix(args->tmp_dir);
 
 #ifdef _WIN32
@@ -408,11 +779,11 @@ int main_sort(int argc, char *argv[])
         {"output-file",required_argument,NULL,'o'},
         {"output",required_argument,NULL,'o'},
         {"help",no_argument,NULL,'h'},
-        {"write-index",no_argument,NULL,1},
+        {"write-index",optional_argument,NULL,'W'},
         {0,0,0,0}
     };
     char *tmp;
-    while ((c = getopt_long(argc, argv, "m:T:O:o:h?",loptions,NULL)) >= 0)
+    while ((c = getopt_long(argc, argv, "m:T:O:o:W::h?",loptions,NULL)) >= 0)
     {
         switch (c)
         {
@@ -437,7 +808,10 @@ int main_sort(int argc, char *argv[])
                           if ( *tmp || args->clevel<0 || args->clevel>9 ) error("Could not parse argument: --compression-level %s\n", optarg+1);
                       }
                       break;
-            case  1 : args->write_index = 1; break;
+            case 'W':
+                if (!(args->write_index = write_index_parse(optarg)))
+                    error("Unsupported index format '%s'\n", optarg);
+                break;
             case 'h':
             case '?': usage(args); break;
             default: error("Unknown argument: %s\n", optarg);
@@ -453,7 +827,7 @@ int main_sort(int argc, char *argv[])
 
     init(args);
     sort_blocks(args);
-    merge_blocks(args);
+    merge_to_output(args);
     destroy(args);
 
     return 0;
diff --git a/bcftools/vcfsort.c.pysam.c b/bcftools/vcfsort.c.pysam.c
index 948d60b77..da899ae76 100644
--- a/bcftools/vcfsort.c.pysam.c
+++ b/bcftools/vcfsort.c.pysam.c
@@ -2,7 +2,7 @@
 
 /*  vcfsort.c -- sort subcommand
 
-   Copyright (C) 2017-2023 Genome Research Ltd.
+   Copyright (C) 2017-2024 Genome Research Ltd.
 
    Author: Petr Danecek <pd3@sanger.ac.uk>
 
@@ -43,28 +43,48 @@
 #include <htslib/vcf.h>
 #include <htslib/kstring.h>
 #include <htslib/hts_os.h>
+#include <htslib/hts_defs.h>
+#include <htslib/bgzf.h>
 #include "kheap.h"
 #include "bcftools.h"
 
+#define MAX_TMP_FILES_PER_LAYER 32
+#define MERGE_LAYERS 12
+#define MAX_TMP_FILES (MAX_TMP_FILES_PER_LAYER * MERGE_LAYERS)
+
 typedef struct
 {
     char *fname;
     htsFile *fh;
+    BGZF *bgz;
+    size_t idx;
     bcf1_t *rec;
+    int is_merged;
 }
 blk_t;
 
+typedef struct
+{
+    size_t len;
+    hts_pos_t pos;
+    int rid;
+    float qual;
+    uint8_t data[];
+}
+packed_bcf_t;
+
 typedef struct _args_t
 {
     bcf_hdr_t *hdr;
     char **argv, *fname, *output_fname, *tmp_dir;
     int argc, output_type, clevel;
     size_t max_mem, mem;
-    bcf1_t **buf;
+    packed_bcf_t **buf;
     uint8_t *mem_block;
-    size_t nbuf, mbuf, nblk;
-    blk_t *blk;
-    char *index_fn;
+
+    size_t nbuf, mbuf, nblk, tmp_count;
+    blk_t blk[MAX_TMP_FILES];
+    uint32_t tmp_layers[MERGE_LAYERS];
     int write_index;
 }
 args_t;
@@ -73,9 +93,9 @@ void clean_files(args_t *args)
 {
     int i;
     fprintf(bcftools_stderr,"Cleaning\n");
-    for (i=0; i<args->nblk; i++)
+    for (i=0; i<MAX_TMP_FILES; i++)
     {
-        blk_t *blk = args->blk + i;
+        blk_t *blk = &args->blk[i];
         if ( blk->fname )
         {
             unlink(blk->fname);
@@ -86,7 +106,8 @@ void clean_files(args_t *args)
     }
     rmdir(args->tmp_dir);
 }
-void clean_files_and_throw(args_t *args, const char *format, ...)
+void HTS_FORMAT(HTS_PRINTF_FMT, 2, 3) HTS_NORETURN
+clean_files_and_throw(args_t *args, const char *format, ...)
 {
     va_list ap;
     va_start(ap, format);
@@ -97,6 +118,16 @@ void clean_files_and_throw(args_t *args, const char *format, ...)
 }
 
 int cmp_bcf_pos(const void *aptr, const void *bptr)
+{
+    bcf1_t *a = *((bcf1_t**)aptr);
+    bcf1_t *b = *((bcf1_t**)bptr);
+    if ( a->rid < b->rid ) return -1;
+    if ( a->rid > b->rid ) return 1;
+    if ( a->pos < b->pos ) return -1;
+    if ( a->pos > b->pos ) return 1;
+    return 0;
+}
+int cmp_bcf_pos_ref_alt(const void *aptr, const void *bptr)
 {
     bcf1_t *a = *((bcf1_t**)aptr);
     bcf1_t *b = *((bcf1_t**)bptr);
@@ -120,33 +151,279 @@ int cmp_bcf_pos(const void *aptr, const void *bptr)
     return 0;
 }
 
-void buf_flush(args_t *args)
+static int cmp_packed_bcf_pos_ref_alt(const void *aptr, const void *bptr)
 {
-    if ( !args->nbuf ) return;
+    packed_bcf_t *a = *(packed_bcf_t **) aptr;
+    packed_bcf_t *b = *(packed_bcf_t **) bptr;
 
-    qsort(args->buf, args->nbuf, sizeof(*args->buf), cmp_bcf_pos);
+    if ( a->rid < b->rid ) return -1;
+    if ( a->rid > b->rid ) return 1;
+    if ( a->pos < b->pos ) return -1;
+    if ( a->pos > b->pos ) return 1;
+    
+    // Sort lexicographically by ref,alt.  These are stored tab-separated
+    // as the first item in packed_bcf_t::data
+    return strcmp((char *) a->data, (char *) b->data);
+}
 
-    args->nblk++;
-    args->blk = (blk_t*) realloc(args->blk, sizeof(blk_t)*args->nblk);
-    if ( !args->blk ) error("Error: could not allocate %zu bytes of memory, try reducing --max-mem\n",sizeof(blk_t)*args->nblk);
-    blk_t *blk = args->blk + args->nblk - 1;
+static int cmp_packed_bcf_pos_ref_alt_stable(const void *aptr, const void *bptr)
+{
+    // cmp_bcf_pos_ref_alt() with tie-breaker to make qsort stable
+    int res = cmp_packed_bcf_pos_ref_alt(aptr, bptr);
+    if (res != 0) return res;
+
+    // Got a tie - use the position in the original input to break it.
+    // As everything is read into a big memory buffer, for most records
+    // we can just compare the pointers directly.  The exception is
+    // any record that didn't quite fit in the memory buffer, causing it to be
+    // flushed.  Those are flagged by setting packed_bcf_t::len = SIZE_MAX, and
+    // as they were the last record in the segment, they should always sort
+    // after unflagged records.
+
+    packed_bcf_t *a = *(packed_bcf_t **) aptr;
+    packed_bcf_t *b = *(packed_bcf_t **) bptr;
+
+    if (a->len == SIZE_MAX) return 1;
+    if (b->len == SIZE_MAX) return -1;
+
+    return a < b ? -1 : 1;
+}
+
+static uint8_t *pack_unsigned(uint8_t *data, uint64_t val)
+{
+    do {
+        *data++ = (val & 0x7f) | ((val > 0x7f) ? 0x80 : 0);
+        val >>= 7;
+    } while (val > 0);
+    return data;    
+}
+
+static uint8_t *pack_hts_pos(uint8_t *data, hts_pos_t val)
+{
+    uint64_t sign = val < 0;
+    uint64_t v = val < 0 ? -(val + 1) : val;
+    v = v << 1 | sign;
+    return pack_unsigned(data, v);
+}
+
+static uint8_t *pack_bcf_data(packed_bcf_t *dest, const bcf1_t *src,
+                              int outside_buffer)
+{
+    uint32_t i;
+    uint8_t *data = dest->data;
+    uint8_t *start = dest->data;
+    dest->pos = src->pos;
+    dest->rid = src->rid;
+    dest->qual = src->qual;
+
+    // Copy in alleles, for the comparison function
+    for (i = 0; i < src->n_allele; i++)
+    {
+        size_t l = strlen(src->d.allele[i]);
+        if (i > 0) *data++ = '\t';
+        memcpy(data, src->d.allele[i], l);
+        data += l;
+    }
+    *data++ = '\0';
+
+    if (outside_buffer)
+    {
+        dest->len = SIZE_MAX;
+        memcpy(data, &src, sizeof(src));
+        data += sizeof(src);
+        return data;
+    }
+
+    data = pack_hts_pos(data, src->rlen);
+    data = pack_unsigned(data, src->n_info);
+    data = pack_unsigned(data, src->n_allele);
+    data = pack_unsigned(data, src->n_fmt);
+    data = pack_unsigned(data, src->n_sample);
+    data = pack_unsigned(data, src->shared.l);
+    data = pack_unsigned(data, src->indiv.l);
+    if (src->shared.l)
+        memcpy(data, src->shared.s, src->shared.l);
+    data += src->shared.l;
+    if (src->indiv.l)
+        memcpy(data, src->indiv.s, src->indiv.l);
+    data += src->indiv.l;
+    dest->len = data - start;
+    return data;
+}
+
+static int write_packed_bcf(BGZF *fp, packed_bcf_t *src)
+{
+    // Write pos, rid, qual
+    size_t len = src->data - (uint8_t *) &src->pos;
+    if (bgzf_write_small(fp, &src->pos, len) < len)
+        return -1;
+
+    // Skip the copy of the alleles
+    size_t skip = strlen((char *) src->data) + 1;
+ 
+    // Write everything else
+    if (src->len < SIZE_MAX)
+    {
+        // In main memory block
+        len = src->len - skip;
+        if (bgzf_write_small(fp, src->data + skip, len) < len)
+            return -1;
+    }
+    else
+    {
+        // Record didn't fit in the main block.  To minimize the
+        // overflow, its packed_bcf_t data will be imcomplete.  A pointer to
+        // its bcf1_t struct will have been placed after the allele data
+        // so we can finish the packing job and write it in the same format
+        // the rest of the data
+        bcf1_t *rec;
+        uint8_t tmp[100], *data = tmp;
+        memcpy(&rec, src->data + skip, sizeof(rec));
+
+        data = pack_hts_pos(data, rec->rlen);
+        data = pack_unsigned(data, rec->n_info);
+        data = pack_unsigned(data, rec->n_allele);
+        data = pack_unsigned(data, rec->n_fmt);
+        data = pack_unsigned(data, rec->n_sample);
+        data = pack_unsigned(data, rec->shared.l);
+        data = pack_unsigned(data, rec->indiv.l);
+        if (bgzf_write_small(fp, tmp, data - tmp) < data - tmp)
+            return -1;
+        if (rec->shared.l > 0 &&
+            bgzf_write_small(fp, rec->shared.s, rec->shared.l) < rec->shared.l)
+            return -1;
+        if (rec->indiv.l > 0 &&
+            bgzf_write_small(fp, rec->indiv.s, rec->indiv.l) < rec->indiv.l)
+            return -1;
+    }
+
+    return 0;
+}
+
+static uint64_t unpack_unsigned(BGZF *fp, int *err)
+{
+    uint8_t data;
+    uint64_t val = 0;
+    uint32_t i = 0;
+
+    if (bgzf_read_small(fp, &data, sizeof(data)) <= 0)
+        goto short_read;
+
+    while (data & 0x80)
+    {
+        val |= (uint64_t)(data & 0x7f) << i;
+        i += 7;
+        if (bgzf_read_small(fp, &data, sizeof(data)) <= 0)
+            goto short_read;
+    }
+    val |= (uint64_t)data << i;
+    return val;
+
+ short_read:
+    *err = 1;
+    return 0;
+}
+
+static hts_pos_t unpack_hts_pos(BGZF *fp, int *err)
+{
+    uint64_t v = unpack_unsigned(fp, err);
+
+    if ((v & 1) == 0)
+        return (hts_pos_t)(v >> 1);
+    else
+        return -(hts_pos_t)(v >> 1) - 1;
+}
+
+static int read_packed_bcf(BGZF *fp, bcf1_t *dest)
+{
+    int err = 0;
+    packed_bcf_t tmp;
+    size_t len = tmp.data - (uint8_t *) &tmp.pos;
+    
+    bcf_clear(dest);
+    ssize_t got = bgzf_read_small(fp, &tmp.pos, len);
+    if (got == 0)
+        return -1;  // EOF
+    if (got < len)
+        return -2;  // Error or short read
+    dest->pos = tmp.pos;
+    dest->rid = tmp.rid;
+    dest->qual = tmp.qual;
+    dest->rlen = unpack_hts_pos(fp, &err);
+    dest->n_info = unpack_unsigned(fp, &err);
+    dest->n_allele = unpack_unsigned(fp, &err);
+    dest->n_fmt = unpack_unsigned(fp, &err);
+    dest->n_sample = unpack_unsigned(fp, &err);
+    len = unpack_unsigned(fp, &err);
+    if (ks_resize(&dest->shared, len) != 0)
+        return -2;
+    dest->shared.l = len;
+    len = unpack_unsigned(fp, &err);
+    if (ks_resize(&dest->indiv, len) != 0)
+        return -2;
+    dest->indiv.l = len;
+    err |= bgzf_read_small(fp, dest->shared.s, dest->shared.l) < dest->shared.l;
+    err |= bgzf_read_small(fp, dest->indiv.s, dest->indiv.l) < dest->indiv.l;
+    return err == 0 ? 0 : -2;
+}
 
+void open_tmp_file(args_t *args, blk_t *blk, int is_merged)
+{
     kstring_t str = {0,0,0};
-    ksprintf(&str, "%s/%05d.bcf", args->tmp_dir, (int)args->nblk);
-    blk->fname = str.s;
-    blk->rec   = NULL;
-    blk->fh    = NULL;
+    int tries = 1000;
 
-    htsFile *fh = hts_open(blk->fname, "wbu");
-    if ( fh == NULL ) clean_files_and_throw(args, "Cannot write %s: %s\n", blk->fname, strerror(errno));
-    if ( bcf_hdr_write(fh, args->hdr)!=0 ) clean_files_and_throw(args, "[%s] Error: cannot write to %s\n", __func__,blk->fname);
+    blk->fh = NULL;
+    blk->bgz = NULL;
+
+    do {
+        if (ksprintf(ks_clear(&str), "%s/%05zd%s",
+                     args->tmp_dir, args->tmp_count++,
+                     is_merged ? ".bcf" : "") < 0) {
+            clean_files_and_throw(args, "%s", strerror(errno));
+        }
+
+        if (is_merged)
+            blk->fh = hts_open(str.s, "wbx1");
+        else
+            blk->bgz = bgzf_open(str.s, "wx1");
+        if ( blk->fh == NULL && blk->bgz == NULL && (errno != EEXIST || --tries <= 0)) {
+            clean_files_and_throw(args, "Cannot write %s: %s\n",
+                                  str.s, strerror(errno));
+        }
+    } while (blk->fh == NULL && blk->bgz == NULL);
+
+    blk->fname = ks_release(&str);
+    blk->idx = args->tmp_count - 1;
+}
+
+void do_partial_merge(args_t *args);
+
+void buf_flush(args_t *args, bcf1_t *last_rec)
+{
+    if ( !args->nbuf ) return;
+
+    qsort(args->buf, args->nbuf, sizeof(*args->buf), cmp_packed_bcf_pos_ref_alt_stable);
+
+    if (args->tmp_layers[0] >= MAX_TMP_FILES_PER_LAYER)
+        do_partial_merge(args);
+
+    assert(args->nblk < MAX_TMP_FILES);
+    blk_t *blk = &args->blk[args->nblk];
+    blk->is_merged = 0;
+    args->nblk++;
+    args->tmp_layers[0]++;
 
+    assert(blk->fname == NULL && blk->fh == NULL && blk->bgz == NULL);
+
+    open_tmp_file(args, blk, 0);
     int i;
     for (i=0; i<args->nbuf; i++)
     {
-        if ( bcf_write(fh, args->hdr, args->buf[i])!=0 ) clean_files_and_throw(args, "[%s] Error: cannot write to %s\n", __func__,blk->fname);
+        if ( write_packed_bcf(blk->bgz, args->buf[i])!=0 ) clean_files_and_throw(args, "[%s] Error: cannot write to %s\n", __func__,blk->fname);
     }
-    if ( hts_close(fh)!=0 ) clean_files_and_throw(args, "[%s] Error: close failed .. %s\n", __func__,blk->fname);
+
+    if ( bgzf_close(blk->bgz)!=0 ) clean_files_and_throw(args, "[%s] Error: close failed .. %s\n", __func__,blk->fname);
+    blk->bgz = NULL;
 
     args->nbuf = 0;
     args->mem  = 0;
@@ -158,19 +435,38 @@ static inline uint8_t *_align_up(uint8_t *ptr)
     return (uint8_t*)(((size_t)ptr + 8 - 1) & ~((size_t)(8 - 1)));
 }
 
+#define varint_size(X) ((sizeof(X) * 8 + 7) / 7) // worst case
+
 void buf_push(args_t *args, bcf1_t *rec)
 {
-    size_t delta = sizeof(bcf1_t) + rec->shared.l + rec->indiv.l + rec->unpack_size[0] + rec->unpack_size[1]
-        + sizeof(*rec->d.allele)*rec->d.m_allele
-        + sizeof(bcf1_t*)       // args->buf
+    size_t delta = sizeof(rec->pos)
+        + sizeof(rec->rid)
+        + sizeof(rec->qual)
+        + varint_size(rec->rlen)
+        + varint_size(2) // n_info
+        + varint_size(2) // n_allele
+        + varint_size(1) // n_fmt
+        + varint_size(3) // n_sample
+        + varint_size(rec->shared.l)
+        + varint_size(rec->indiv.l)
+        + rec->shared.l + rec->indiv.l
+        + rec->unpack_size[1]   // Alleles
         + 8;                    // the number of _align_up() calls
 
     if ( delta > args->max_mem - args->mem )
     {
+        packed_bcf_t *tmp = malloc(sizeof(*tmp) + rec->unpack_size[1] * sizeof(bcf1_t *));
+        if (!tmp)
+            clean_files_and_throw(args, "[%s] Out of memory\n", __func__);
+        pack_bcf_data(tmp, rec, 1);
+
         args->nbuf++;
         hts_expand(bcf1_t*, args->nbuf, args->mbuf, args->buf);
-        args->buf[args->nbuf-1] = rec;
-        buf_flush(args);
+        args->buf[args->nbuf-1] = tmp;
+
+        buf_flush(args, rec);
+
+        free(tmp);
         bcf_destroy(rec);
         return;
     }
@@ -180,48 +476,13 @@ void buf_push(args_t *args, bcf1_t *rec)
 
     uint8_t *ptr_beg = args->mem_block + args->mem;
     uint8_t *ptr = _align_up(ptr_beg);
-    bcf1_t *new_rec = (bcf1_t*)ptr;
-    memcpy(new_rec,rec,sizeof(*rec));
-    ptr += sizeof(*rec);
-
-    // The array of allele pointers does not need alignment as bcf1_t is already padded to the biggest
-    // data type in the structure
-    char **allele = (char**)ptr;
-    ptr += rec->n_allele*sizeof(*allele);
-
-    // This is just to prevent valgrind from complaining about memcpy, unpack_size is a high-water mark
-    // and the end may be uninitialized
-    delta = rec->d.allele[rec->n_allele-1] - rec->d.allele[0];
-    while ( delta < rec->unpack_size[1] ) if ( !rec->d.als[delta++] ) break;
-    memcpy(ptr,rec->d.als,delta);
-    new_rec->d.als = (char*)ptr;
-    ptr = ptr + delta;
+    packed_bcf_t *packed_rec = (packed_bcf_t *) ptr;
 
-    int i;
-    for (i=0; i<rec->n_allele; i++) allele[i] = new_rec->d.als + (ptrdiff_t)(rec->d.allele[i] - rec->d.allele[0]);
-    new_rec->d.allele = allele;
-
-    memcpy(ptr,rec->shared.s,rec->shared.l);
-    new_rec->shared.s = (char*)ptr;
-    new_rec->shared.m = rec->shared.l;
-    ptr += rec->shared.l;
-
-    memcpy(ptr,rec->indiv.s,rec->indiv.l);
-    new_rec->indiv.s = (char*)ptr;
-    new_rec->indiv.m = rec->indiv.l;
-    ptr += rec->indiv.l;
-
-    // This is just to prevent valgrind from complaining about memcpy, unpack_size is a high-water mark
-    // and the end may be uninitialized
-    i = 0;
-    while ( i < rec->unpack_size[0] ) if ( !rec->d.id[i++] ) break;
-    memcpy(ptr,rec->d.id,i);
-    new_rec->d.id = (char*)ptr;
-    ptr += i;
+    ptr = pack_bcf_data(packed_rec, rec, 0);
 
     args->nbuf++;
     hts_expand(bcf1_t*, args->nbuf, args->mbuf, args->buf);
-    args->buf[args->nbuf-1] = new_rec;
+    args->buf[args->nbuf-1] = packed_rec;
 
     delta = ptr - ptr_beg;
     args->mem += delta;
@@ -248,11 +509,11 @@ void sort_blocks(args_t *args)
             bcf_destroy(rec);
             break;
         }
-        if ( rec->errcode ) clean_files_and_throw(args,"Error encountered while parsing the input at %s:%d\n",bcf_seqname(args->hdr,rec),rec->pos+1);
+        if ( rec->errcode ) clean_files_and_throw(args,"Error encountered while parsing the input at %s:%"PRIhts_pos"\n",bcf_seqname(args->hdr,rec),rec->pos+1);
         bcf_unpack(rec, BCF_UN_STR);
         buf_push(args, rec);
     }
-    buf_flush(args);
+    buf_flush(args, NULL);
     free(args->buf);
 
     if ( hts_close(in)!=0 ) clean_files_and_throw(args,"Close failed: %s\n", args->fname);
@@ -262,49 +523,80 @@ static inline int blk_is_smaller(blk_t **aptr, blk_t **bptr)
 {
     blk_t *a = *aptr;
     blk_t *b = *bptr;
-    int ret = cmp_bcf_pos(&a->rec, &b->rec);
+    int ret = cmp_bcf_pos_ref_alt(&a->rec, &b->rec);
     if ( ret < 0 ) return 1;
+    if (ret == 0 && a->idx < b->idx) return 1;
     return 0;
 }
 KHEAP_INIT(blk, blk_t*, blk_is_smaller)
 
 void blk_read(args_t *args, khp_blk_t *bhp, bcf_hdr_t *hdr, blk_t *blk)
 {
-    if ( !blk->fh ) return;
-    int ret = bcf_read(blk->fh, hdr, blk->rec);
+    int ret;
+    if (blk->is_merged)
+    {
+        if ( !blk->fh ) return;
+        ret = bcf_read(blk->fh, hdr, blk->rec);
+    }
+    else
+    {
+        if ( !blk->bgz ) return;
+        ret = read_packed_bcf(blk->bgz, blk->rec);
+    }
     if ( ret < -1 ) clean_files_and_throw(args, "Error reading %s\n", blk->fname);
     if ( ret == -1 )
     {
-        if ( hts_close(blk->fh)!=0 ) clean_files_and_throw(args, "Close failed: %s\n", blk->fname);
-        blk->fh = 0;
-        return;
+        if (blk->is_merged)
+        {
+            if ( hts_close(blk->fh)!=0 ) clean_files_and_throw(args, "Close failed: %s\n", blk->fname);
+            blk->fh = NULL;
+            return;
+        }
+        else
+        {
+            if ( bgzf_close(blk->bgz) != 0)
+                clean_files_and_throw(args, "Close failed: %s\n", blk->fname);
+            blk->bgz = NULL;
+            return;
+        }
     }
     bcf_unpack(blk->rec, BCF_UN_STR);
     khp_insert(blk, bhp, &blk);
 }
 
-void merge_blocks(args_t *args)
+void merge_blocks(args_t *args, htsFile *out, const char *output_fname,
+                  int idx_fmt, size_t from)
 {
-    fprintf(bcftools_stderr,"Merging %d temporary files\n", (int)args->nblk);
     khp_blk_t *bhp = khp_init(blk);
+    char *index_fn = NULL;
+    size_t i;
 
-    int i;
-    for (i=0; i<args->nblk; i++)
+    for (i=from; i<args->nblk; i++)
     {
-        blk_t *blk = args->blk + i;
-        blk->fh = hts_open(blk->fname, "r");
-        if ( !blk->fh ) clean_files_and_throw(args, "Could not read %s: %s\n", blk->fname, strerror(errno));
-        bcf_hdr_t *hdr = bcf_hdr_read(blk->fh);
-        bcf_hdr_destroy(hdr);
-        blk->rec = bcf_init();
+        blk_t *blk = &args->blk[i];
+        if (blk->is_merged)
+        {
+            blk->fh = hts_open(blk->fname, "r");
+            if ( !blk->fh ) clean_files_and_throw(args, "Could not read %s: %s\n", blk->fname, strerror(errno));
+            bcf_hdr_t *hdr = bcf_hdr_read(blk->fh);
+            bcf_hdr_destroy(hdr);
+        }
+        else
+        {
+            blk->bgz = bgzf_open(blk->fname, "r");
+            if (!blk->bgz)
+                clean_files_and_throw(args, "Could not read %s: %s\n", blk->fname, strerror(errno));
+        }
         blk_read(args, bhp, args->hdr, blk);
     }
 
-    char wmode[8];
-    set_wmode(wmode,args->output_type,args->output_fname,args->clevel);
-    htsFile *out = hts_open(args->output_fname ? args->output_fname : "-", wmode);
-    if ( bcf_hdr_write(out, args->hdr)!=0 ) clean_files_and_throw(args, "[%s] Error: cannot write to %s\n", __func__,args->output_fname);
-    if ( args->write_index && init_index(out,args->hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname);
+    if ( bcf_hdr_write(out, args->hdr)!=0 ) clean_files_and_throw(args, "[%s] Error: cannot write to %s\n", __func__, output_fname);
+
+    if (idx_fmt) {
+        if ( init_index2(out,args->hdr,output_fname,&index_fn,idx_fmt)<0 )
+            error("Error: failed to initialise index for %s\n",output_fname);
+    }
+
     while ( bhp->ndat )
     {
         blk_t *blk = bhp->dat[0];
@@ -312,22 +604,93 @@ void merge_blocks(args_t *args)
         khp_delete(blk, bhp);
         blk_read(args, bhp, args->hdr, blk);
     }
-    if ( args->write_index )
+    if ( idx_fmt )
     {
         if ( bcf_idx_save(out)<0 )
         {
-            if ( hts_close(out)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"bcftools_stdout");
-            error("Error: cannot write to index %s\n", args->index_fn);
+            if ( hts_close(out)!=0 ) error("Error: close failed .. %s\n", output_fname);
+            error("Error: cannot write to index %s\n", index_fn);
         }
-        free(args->index_fn);
+        free(index_fn);
     }
-    if ( hts_close(out)!=0 ) clean_files_and_throw(args, "Close failed: %s\n", args->output_fname);
 
-    clean_files(args);
+    for (i = from; i < args->nblk; i++)
+    {
+        blk_t *blk = &args->blk[i];
+        if (unlink(blk->fname) != 0)
+            clean_files_and_throw(args, "Couldn't remove temporary file %s\n", blk->fname);
+        free(blk->fname);
+        blk->fname = NULL;
+    }
 
-    free(args->blk);
     khp_destroy(blk, bhp);
+}
+
+void do_partial_merge(args_t *args)
+{
+    uint32_t to_layer = 0;
+    size_t to_merge = 0;
+
+    // Temp. files are arranged in layers of at most MAX_TMP_FILES_PER_LAYER.
+    // When a layer is full, it is merged into the next layer up.  Each
+    // layer will therefore contain files with exponentially more records
+    // then the previous one, but will be merged exponentially less frequently.
+    // The result is that the overall complexity will remain O(n*log(n))
+    // even if we need to do lots of partial merges.
+
+    while (to_layer < MERGE_LAYERS
+           && args->tmp_layers[to_layer] >= MAX_TMP_FILES_PER_LAYER)
+    {
+        to_merge += args->tmp_layers[to_layer];
+        args->tmp_layers[to_layer] = 0;
+        to_layer++;
+    }
+
+    assert(to_merge > 0 && to_merge <= args->nblk);
+
+    if (to_layer == MERGE_LAYERS) {
+        // Edge case - if we've got here, we've completely used the
+        // temp file allocation, so merge absolutely everything and
+        // leave one file at the highest level.  Strictly this breaks
+        // the O(n*log(n)) complexity, but unless MERGE_LAYERS and
+        // MAX_TMP_FILES_PER_LAYER are too small it would take so long
+        // to get here it should never actually happen...
+        assert(to_merge == MAX_TMP_FILES_PER_LAYER * MERGE_LAYERS);
+        to_layer = MERGE_LAYERS - 1;
+    }
+
+    blk_t tmp = { NULL };
+    open_tmp_file(args, &tmp, 1);
+    merge_blocks(args, tmp.fh, tmp.fname, 0, args->nblk - to_merge);
+    if (hts_close(tmp.fh) != 0)
+        clean_files_and_throw(args, "Close failed: %s\n", tmp.fname);
+
+    args->nblk -= to_merge;
+    assert(args->blk[args->nblk].fh == NULL);
+    assert(args->blk[args->nblk].fname == NULL);
+    args->blk[args->nblk].is_merged = 1;
+    args->blk[args->nblk].idx = tmp.idx;
+    args->blk[args->nblk++].fname = tmp.fname;
+    args->tmp_layers[to_layer]++;
+}
+
+void merge_to_output(args_t *args)
+{
+    char wmode[8] = { 0 };
+    set_wmode(wmode,args->output_type,args->output_fname,args->clevel);
+    const char *output_fname = args->output_fname ? args->output_fname : "-";
+
+    htsFile *out = hts_open(output_fname, wmode);
+    if (!out) clean_files_and_throw(args, "[%s] Error: cannot open %s\n", __func__, output_fname);
+
+    fprintf(bcftools_stderr,"Merging %zd temporary files\n", args->nblk);
+    merge_blocks(args, out, output_fname, args->write_index, 0);
     fprintf(bcftools_stderr,"Done\n");
+
+    if ( hts_close(out)!=0 )
+        clean_files_and_throw(args, "Close failed: %s\n", output_fname);
+
+    clean_files(args);
 }
 
 static void usage(args_t *args)
@@ -339,7 +702,6 @@ static void usage(args_t *args)
     fprintf(bcftools_stderr, "Options:\n");
     fprintf(bcftools_stderr, "    -m, --max-mem FLOAT[kMG]       maximum memory to use [768M]\n");    // using metric units, 1M=1e6
     fprintf(bcftools_stderr, "    -o, --output FILE              output file name [bcftools_stdout]\n");
-    fprintf(bcftools_stderr, "    -O, --output-type b|u|z|v      b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n");
     fprintf(bcftools_stderr, "    -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n");
 
 #ifdef _WIN32
@@ -347,7 +709,7 @@ static void usage(args_t *args)
 #else
     fprintf(bcftools_stderr, "    -T, --temp-dir DIR             temporary files [/tmp/bcftools.XXXXXX]\n");
 #endif
-    fprintf(bcftools_stderr, "        --write-index              Automatically index the output files [off]\n");
+    fprintf(bcftools_stderr, "    -W, --write-index[=FMT]        Automatically index the output files [off]\n");
     fprintf(bcftools_stderr, "\n");
     bcftools_exit(1);
 }
@@ -363,14 +725,23 @@ size_t parse_mem_string(const char *str)
     return mem;
 }
 
-void mkdir_p(const char *fmt, ...);
 static void init(args_t *args)
 {
+    size_t i;
     args->max_mem *= 0.9;
     args->mem_block = malloc(args->max_mem);
     if ( !args->mem_block ) error("Error: could not allocate %zu bytes of memory, try reducing --max-mem\n",args->max_mem);
     args->mem = 0;
 
+    for (i = 0; i < MAX_TMP_FILES; i++)
+    {
+        args->blk[i].fname = NULL;
+        args->blk[i].rec = bcf_init();
+        if (!args->blk[i].rec)
+            clean_files_and_throw(args,"Couldn't allocate bcf record\n");
+    }
+
+
     args->tmp_dir = init_tmp_prefix(args->tmp_dir);
 
 #ifdef _WIN32
@@ -410,11 +781,11 @@ int main_sort(int argc, char *argv[])
         {"output-file",required_argument,NULL,'o'},
         {"output",required_argument,NULL,'o'},
         {"help",no_argument,NULL,'h'},
-        {"write-index",no_argument,NULL,1},
+        {"write-index",optional_argument,NULL,'W'},
         {0,0,0,0}
     };
     char *tmp;
-    while ((c = getopt_long(argc, argv, "m:T:O:o:h?",loptions,NULL)) >= 0)
+    while ((c = getopt_long(argc, argv, "m:T:O:o:W::h?",loptions,NULL)) >= 0)
     {
         switch (c)
         {
@@ -439,7 +810,10 @@ int main_sort(int argc, char *argv[])
                           if ( *tmp || args->clevel<0 || args->clevel>9 ) error("Could not parse argument: --compression-level %s\n", optarg+1);
                       }
                       break;
-            case  1 : args->write_index = 1; break;
+            case 'W':
+                if (!(args->write_index = write_index_parse(optarg)))
+                    error("Unsupported index format '%s'\n", optarg);
+                break;
             case 'h':
             case '?': usage(args); break;
             default: error("Unknown argument: %s\n", optarg);
@@ -455,7 +829,7 @@ int main_sort(int argc, char *argv[])
 
     init(args);
     sort_blocks(args);
-    merge_blocks(args);
+    merge_to_output(args);
     destroy(args);
 
     return 0;
diff --git a/bcftools/vcfstats.c b/bcftools/vcfstats.c
index e2744ab3c..38b4caf51 100644
--- a/bcftools/vcfstats.c
+++ b/bcftools/vcfstats.c
@@ -1,6 +1,6 @@
 /*  vcfstats.c -- Produces stats which can be plotted using plot-vcfstats.
 
-    Copyright (C) 2012-2023 Genome Research Ltd.
+    Copyright (C) 2012-2024 Genome Research Ltd.
 
     Author: Petr Danecek <pd3@sanger.ac.uk>
 
@@ -408,7 +408,8 @@ static void init_user_stats(args_t *args, bcf_hdr_t *hdr, stats_t *stats)
 {
     stats->nusr = args->nusr;
     stats->usr = (user_stats_t*)malloc(sizeof(user_stats_t)*args->nusr);
-    memcpy(stats->usr,args->usr,args->nusr*sizeof(user_stats_t));
+    if (args->nusr)
+        memcpy(stats->usr,args->usr,args->nusr*sizeof(user_stats_t));
     int i;
     for (i=0; i<stats->nusr; i++)
     {
@@ -894,7 +895,7 @@ static inline int get_ad(bcf1_t *line, bcf_fmt_t *ad_fmt_ptr, int ismpl, int *ia
     *ial = 0;
     #define BRANCH_INT(type_t,missing,vector_end) { \
         type_t *ptr = (type_t *) (ad_fmt_ptr->p + ad_fmt_ptr->size*ismpl); \
-        for (iv=1; iv<ad_fmt_ptr->n; iv++) \
+        for (iv=1; iv<ad_fmt_ptr->n && iv<line->n_allele; iv++) \
         { \
             if ( ptr[iv]==vector_end ) break; \
             if ( ptr[iv]==missing ) continue; \
@@ -938,9 +939,12 @@ static inline void update_dvaf(stats_t *stats, bcf1_t *line, int ial, float vaf)
 #define vaf2bin(vaf) ((int)nearbyintf((vaf)/0.05))
 static inline void update_vaf(vaf_t *smpl_vaf, bcf1_t *line, int ial, float vaf)
 {
-    int idx = vaf2bin(vaf);
-    if ( bcf_get_variant_type(line,ial)==VCF_SNP ) smpl_vaf->snv[idx]++;
-    else smpl_vaf->indel[idx]++;
+    if ( vaf>=0 && vaf<=1 )
+    {
+        int idx = vaf2bin(vaf);
+        if ( bcf_get_variant_type(line,ial)==VCF_SNP ) smpl_vaf->snv[idx]++;
+        else smpl_vaf->indel[idx]++;
+    }
 }
 
 static inline int calc_sample_depth(args_t *args, int ismpl, bcf_fmt_t *ad_fmt_ptr, bcf_fmt_t *dp_fmt_ptr)
@@ -1373,7 +1377,10 @@ static void print_stats(args_t *args)
         printf("SN\t%d\tnumber of multiallelic sites:\t%"PRIu64"\n", id, stats->n_mals);
         printf("SN\t%d\tnumber of multiallelic SNP sites:\t%"PRIu64"\n", id, stats->n_snp_mals);
     }
-    printf("# TSTV, transitions/transversions:\n# TSTV\t[2]id\t[3]ts\t[4]tv\t[5]ts/tv\t[6]ts (1st ALT)\t[7]tv (1st ALT)\t[8]ts/tv (1st ALT)\n");
+    printf("# TSTV, transitions/transversions\n"
+           "#   - transitions, see https://en.wikipedia.org/wiki/Transition_(genetics)\n"
+           "#   - transversions, see https://en.wikipedia.org/wiki/Transversion\n");
+    printf("# TSTV\t[2]id\t[3]ts\t[4]tv\t[5]ts/tv\t[6]ts (1st ALT)\t[7]tv (1st ALT)\t[8]ts/tv (1st ALT)\n");
     for (id=0; id<args->nstats; id++)
     {
         stats_t *stats = &args->stats[id];
@@ -1393,7 +1400,9 @@ static void print_stats(args_t *args)
     }
     if ( args->indel_ctx )
     {
-        printf("# ICS, Indel context summary:\n# ICS\t[2]id\t[3]repeat-consistent\t[4]repeat-inconsistent\t[5]not applicable\t[6]c/(c+i) ratio\n");
+        printf("# ICS, Indel context:\n"
+               "#   - repeat-consistent, inconsistent and n/a: experimental and useless stats [DEPRECATED]\n");
+        printf("# ICS\t[2]id\t[3]repeat-consistent\t[4]repeat-inconsistent\t[5]not applicable\t[6]c/(c+i) ratio\n");
         for (id=0; id<args->nstats; id++)
         {
             int nc = 0, ni = 0, na = args->stats[id].n_repeat_na;
@@ -1404,7 +1413,9 @@ static void print_stats(args_t *args)
             }
             printf("ICS\t%d\t%d\t%d\t%d\t%.4f\n", id, nc,ni,na,nc+ni ? (float)nc/(nc+ni) : 0.0);
         }
-        printf("# ICL, Indel context by length:\n# ICL\t[2]id\t[3]length of repeat element\t[4]repeat-consistent deletions)\t[5]repeat-inconsistent deletions\t[6]consistent insertions\t[7]inconsistent insertions\t[8]c/(c+i) ratio\n");
+        printf("# ICL, Indel context by length:\n"
+               "#   - repeat-consistent, inconsistent and n/a: experimental and useless stats [DEPRECATED]\n");
+        printf("# ICL\t[2]id\t[3]length of repeat element\t[4]repeat-consistent deletions)\t[5]repeat-inconsistent deletions\t[6]consistent insertions\t[7]inconsistent insertions\t[8]c/(c+i) ratio\n");
         for (id=0; id<args->nstats; id++)
         {
             for (i=1; i<IRC_RLEN; i++)
@@ -1416,7 +1427,12 @@ static void print_stats(args_t *args)
             }
         }
     }
-    printf("# SiS, Singleton stats:\n# SiS\t[2]id\t[3]allele count\t[4]number of SNPs\t[5]number of transitions\t[6]number of transversions\t[7]number of indels\t[8]repeat-consistent\t[9]repeat-inconsistent\t[10]not applicable\n");
+    printf("# SiS, Singleton stats:\n"
+           "#   - allele count, i.e. the number of singleton genotypes (AC=1)\n"
+           "#   - number of transitions, see above\n"
+           "#   - number of transversions, see above\n"
+           "#   - repeat-consistent, inconsistent and n/a: experimental and useless stats [DEPRECATED]\n");
+    printf("# SiS\t[2]id\t[3]allele count\t[4]number of SNPs\t[5]number of transitions\t[6]number of transversions\t[7]number of indels\t[8]repeat-consistent\t[9]repeat-inconsistent\t[10]not applicable\n");
     for (id=0; id<args->nstats; id++)
     {
         stats_t *stats = &args->stats[id];
@@ -1513,7 +1529,7 @@ static void print_stats(args_t *args)
             {
                 if ( usr->vals_ts[j]+usr->vals_tv[j] == 0 ) continue;   // skip empty bins
                 float val = usr->min + (usr->max - usr->min)*j/(usr->nbins-1);
-                const char *fmt = usr->type==BCF_HT_REAL ? "USR:%s/%d\t%d\t%e\t%"PRIu64"\t%"PRIu64"\t%"PRIu64"\n" : "USR:%s/%d\t%d\t%.0f\t%"PRIu64"\t%"PRIu64"\t%"PRIu64"\n";
+                const char * const fmt = usr->type==BCF_HT_REAL ? "USR:%s/%d\t%d\t%e\t%"PRIu64"\t%"PRIu64"\t%"PRIu64"\n" : "USR:%s/%d\t%d\t%.0f\t%"PRIu64"\t%"PRIu64"\t%"PRIu64"\n";
                 printf(fmt,usr->tag,usr->idx,id,val,usr->vals_ts[j]+usr->vals_tv[j],usr->vals_ts[j],usr->vals_tv[j]);
             }
         }
@@ -1731,6 +1747,13 @@ static void print_stats(args_t *args)
         }
     }
 
+    printf("# DP, depth:\n"
+           "#   - set id, see above\n"
+           "#   - the depth bin, corresponds to the depth (unless --depth was given)\n"
+           "#   - number of genotypes with this depth (zero unless -s/-S was given)\n"
+           "#   - fraction of genotypes with this depth (zero unless -s/-S was given)\n"
+           "#   - number of sites with this depth\n"
+           "#   - fraction of sites with this depth\n");
     printf("# DP, Depth distribution\n# DP\t[2]id\t[3]bin\t[4]number of genotypes\t[5]fraction of genotypes (%%)\t[6]number of sites\t[7]fraction of sites (%%)\n");
     for (id=0; id<args->nstats; id++)
     {
diff --git a/bcftools/vcfstats.c.pysam.c b/bcftools/vcfstats.c.pysam.c
index 11db1d1cc..946032ed5 100644
--- a/bcftools/vcfstats.c.pysam.c
+++ b/bcftools/vcfstats.c.pysam.c
@@ -2,7 +2,7 @@
 
 /*  vcfstats.c -- Produces stats which can be plotted using plot-vcfstats.
 
-    Copyright (C) 2012-2023 Genome Research Ltd.
+    Copyright (C) 2012-2024 Genome Research Ltd.
 
     Author: Petr Danecek <pd3@sanger.ac.uk>
 
@@ -410,7 +410,8 @@ static void init_user_stats(args_t *args, bcf_hdr_t *hdr, stats_t *stats)
 {
     stats->nusr = args->nusr;
     stats->usr = (user_stats_t*)malloc(sizeof(user_stats_t)*args->nusr);
-    memcpy(stats->usr,args->usr,args->nusr*sizeof(user_stats_t));
+    if (args->nusr)
+        memcpy(stats->usr,args->usr,args->nusr*sizeof(user_stats_t));
     int i;
     for (i=0; i<stats->nusr; i++)
     {
@@ -896,7 +897,7 @@ static inline int get_ad(bcf1_t *line, bcf_fmt_t *ad_fmt_ptr, int ismpl, int *ia
     *ial = 0;
     #define BRANCH_INT(type_t,missing,vector_end) { \
         type_t *ptr = (type_t *) (ad_fmt_ptr->p + ad_fmt_ptr->size*ismpl); \
-        for (iv=1; iv<ad_fmt_ptr->n; iv++) \
+        for (iv=1; iv<ad_fmt_ptr->n && iv<line->n_allele; iv++) \
         { \
             if ( ptr[iv]==vector_end ) break; \
             if ( ptr[iv]==missing ) continue; \
@@ -940,9 +941,12 @@ static inline void update_dvaf(stats_t *stats, bcf1_t *line, int ial, float vaf)
 #define vaf2bin(vaf) ((int)nearbyintf((vaf)/0.05))
 static inline void update_vaf(vaf_t *smpl_vaf, bcf1_t *line, int ial, float vaf)
 {
-    int idx = vaf2bin(vaf);
-    if ( bcf_get_variant_type(line,ial)==VCF_SNP ) smpl_vaf->snv[idx]++;
-    else smpl_vaf->indel[idx]++;
+    if ( vaf>=0 && vaf<=1 )
+    {
+        int idx = vaf2bin(vaf);
+        if ( bcf_get_variant_type(line,ial)==VCF_SNP ) smpl_vaf->snv[idx]++;
+        else smpl_vaf->indel[idx]++;
+    }
 }
 
 static inline int calc_sample_depth(args_t *args, int ismpl, bcf_fmt_t *ad_fmt_ptr, bcf_fmt_t *dp_fmt_ptr)
@@ -1375,7 +1379,10 @@ static void print_stats(args_t *args)
         fprintf(bcftools_stdout, "SN\t%d\tnumber of multiallelic sites:\t%"PRIu64"\n", id, stats->n_mals);
         fprintf(bcftools_stdout, "SN\t%d\tnumber of multiallelic SNP sites:\t%"PRIu64"\n", id, stats->n_snp_mals);
     }
-    fprintf(bcftools_stdout, "# TSTV, transitions/transversions:\n# TSTV\t[2]id\t[3]ts\t[4]tv\t[5]ts/tv\t[6]ts (1st ALT)\t[7]tv (1st ALT)\t[8]ts/tv (1st ALT)\n");
+    fprintf(bcftools_stdout, "# TSTV, transitions/transversions\n"
+           "#   - transitions, see https://en.wikipedia.org/wiki/Transition_(genetics)\n"
+           "#   - transversions, see https://en.wikipedia.org/wiki/Transversion\n");
+    fprintf(bcftools_stdout, "# TSTV\t[2]id\t[3]ts\t[4]tv\t[5]ts/tv\t[6]ts (1st ALT)\t[7]tv (1st ALT)\t[8]ts/tv (1st ALT)\n");
     for (id=0; id<args->nstats; id++)
     {
         stats_t *stats = &args->stats[id];
@@ -1395,7 +1402,9 @@ static void print_stats(args_t *args)
     }
     if ( args->indel_ctx )
     {
-        fprintf(bcftools_stdout, "# ICS, Indel context summary:\n# ICS\t[2]id\t[3]repeat-consistent\t[4]repeat-inconsistent\t[5]not applicable\t[6]c/(c+i) ratio\n");
+        fprintf(bcftools_stdout, "# ICS, Indel context:\n"
+               "#   - repeat-consistent, inconsistent and n/a: experimental and useless stats [DEPRECATED]\n");
+        fprintf(bcftools_stdout, "# ICS\t[2]id\t[3]repeat-consistent\t[4]repeat-inconsistent\t[5]not applicable\t[6]c/(c+i) ratio\n");
         for (id=0; id<args->nstats; id++)
         {
             int nc = 0, ni = 0, na = args->stats[id].n_repeat_na;
@@ -1406,7 +1415,9 @@ static void print_stats(args_t *args)
             }
             fprintf(bcftools_stdout, "ICS\t%d\t%d\t%d\t%d\t%.4f\n", id, nc,ni,na,nc+ni ? (float)nc/(nc+ni) : 0.0);
         }
-        fprintf(bcftools_stdout, "# ICL, Indel context by length:\n# ICL\t[2]id\t[3]length of repeat element\t[4]repeat-consistent deletions)\t[5]repeat-inconsistent deletions\t[6]consistent insertions\t[7]inconsistent insertions\t[8]c/(c+i) ratio\n");
+        fprintf(bcftools_stdout, "# ICL, Indel context by length:\n"
+               "#   - repeat-consistent, inconsistent and n/a: experimental and useless stats [DEPRECATED]\n");
+        fprintf(bcftools_stdout, "# ICL\t[2]id\t[3]length of repeat element\t[4]repeat-consistent deletions)\t[5]repeat-inconsistent deletions\t[6]consistent insertions\t[7]inconsistent insertions\t[8]c/(c+i) ratio\n");
         for (id=0; id<args->nstats; id++)
         {
             for (i=1; i<IRC_RLEN; i++)
@@ -1418,7 +1429,12 @@ static void print_stats(args_t *args)
             }
         }
     }
-    fprintf(bcftools_stdout, "# SiS, Singleton stats:\n# SiS\t[2]id\t[3]allele count\t[4]number of SNPs\t[5]number of transitions\t[6]number of transversions\t[7]number of indels\t[8]repeat-consistent\t[9]repeat-inconsistent\t[10]not applicable\n");
+    fprintf(bcftools_stdout, "# SiS, Singleton stats:\n"
+           "#   - allele count, i.e. the number of singleton genotypes (AC=1)\n"
+           "#   - number of transitions, see above\n"
+           "#   - number of transversions, see above\n"
+           "#   - repeat-consistent, inconsistent and n/a: experimental and useless stats [DEPRECATED]\n");
+    fprintf(bcftools_stdout, "# SiS\t[2]id\t[3]allele count\t[4]number of SNPs\t[5]number of transitions\t[6]number of transversions\t[7]number of indels\t[8]repeat-consistent\t[9]repeat-inconsistent\t[10]not applicable\n");
     for (id=0; id<args->nstats; id++)
     {
         stats_t *stats = &args->stats[id];
@@ -1515,7 +1531,7 @@ static void print_stats(args_t *args)
             {
                 if ( usr->vals_ts[j]+usr->vals_tv[j] == 0 ) continue;   // skip empty bins
                 float val = usr->min + (usr->max - usr->min)*j/(usr->nbins-1);
-                const char *fmt = usr->type==BCF_HT_REAL ? "USR:%s/%d\t%d\t%e\t%"PRIu64"\t%"PRIu64"\t%"PRIu64"\n" : "USR:%s/%d\t%d\t%.0f\t%"PRIu64"\t%"PRIu64"\t%"PRIu64"\n";
+                const char * const fmt = usr->type==BCF_HT_REAL ? "USR:%s/%d\t%d\t%e\t%"PRIu64"\t%"PRIu64"\t%"PRIu64"\n" : "USR:%s/%d\t%d\t%.0f\t%"PRIu64"\t%"PRIu64"\t%"PRIu64"\n";
                 fprintf(bcftools_stdout, fmt,usr->tag,usr->idx,id,val,usr->vals_ts[j]+usr->vals_tv[j],usr->vals_ts[j],usr->vals_tv[j]);
             }
         }
@@ -1733,6 +1749,13 @@ static void print_stats(args_t *args)
         }
     }
 
+    fprintf(bcftools_stdout, "# DP, depth:\n"
+           "#   - set id, see above\n"
+           "#   - the depth bin, corresponds to the depth (unless --depth was given)\n"
+           "#   - number of genotypes with this depth (zero unless -s/-S was given)\n"
+           "#   - fraction of genotypes with this depth (zero unless -s/-S was given)\n"
+           "#   - number of sites with this depth\n"
+           "#   - fraction of sites with this depth\n");
     fprintf(bcftools_stdout, "# DP, Depth distribution\n# DP\t[2]id\t[3]bin\t[4]number of genotypes\t[5]fraction of genotypes (%%)\t[6]number of sites\t[7]fraction of sites (%%)\n");
     for (id=0; id<args->nstats; id++)
     {
diff --git a/bcftools/vcfview.c b/bcftools/vcfview.c
index e09efa0bc..58063ebb4 100644
--- a/bcftools/vcfview.c
+++ b/bcftools/vcfview.c
@@ -36,6 +36,7 @@ THE SOFTWARE.  */
 #include <htslib/vcf.h>
 #include <htslib/synced_bcf_reader.h>
 #include <htslib/vcfutils.h>
+#include <htslib/kbitset.h>
 #include "bcftools.h"
 #include "filter.h"
 #include "htslib/khash_str2int.h"
@@ -78,6 +79,7 @@ typedef struct _args_t
     int record_cmd_line;
     char *index_fn;
     int write_index;
+    int trim_star_allele;
     htsFile *out;
 }
 args_t;
@@ -456,6 +458,19 @@ int subset_vcf(args_t *args, bcf1_t *line)
         int ret = bcf_trim_alleles(args->hsub ? args->hsub : args->hdr, line);
         if ( ret<0 ) error("Error: Could not trim alleles at %s:%"PRId64"\n", bcf_seqname(args->hsub ? args->hsub : args->hdr, line), (int64_t) line->pos+1);
     }
+    if (args->trim_star_allele)
+    {
+        int iunseen;
+        if ( args->trim_star_allele && (line->n_allele > 2 || args->trim_star_allele > 1) && (iunseen=get_unseen_allele(line)) && iunseen>0 )
+        {
+            // the unobserved star allele should be trimmed, either it is variant site or trimming of all sites was requested
+            kbitset_t *rm_set = kbs_init(line->n_allele);
+            kbs_insert(rm_set, iunseen);
+            if ( bcf_remove_allele_set(args->hdr,line,rm_set) )
+                error("[%s] Error: failed to trim the unobserved allele at %s:%"PRIhts_pos"\n",__func__,bcf_seqname(args->hdr,line),line->pos+1);
+            kbs_destroy(rm_set);
+        }
+    }
     if (args->phased) {
         int phased = bcf_all_phased(args->hdr, line);
         if (args->phased == FLT_INCLUDE && !phased) { return 0; } // skip unphased
@@ -512,6 +527,7 @@ static void usage(args_t *args)
     fprintf(stderr, "        --threads INT                 Use multithreading with INT worker threads [0]\n");
     fprintf(stderr, "\n");
     fprintf(stderr, "Subset options:\n");
+    fprintf(stderr, "    -A, --trim-unseen-allele          Remove '<*>' or '<NON_REF>' at variant (-A) or at all (-AA) sites\n");
     fprintf(stderr, "    -a, --trim-alt-alleles            Trim ALT alleles not seen in the genotype fields (or their subset with -s/-S)\n");
     fprintf(stderr, "    -I, --no-update                   Do not (re)calculate INFO fields for the subset (currently INFO/AC and INFO/AN)\n");
     fprintf(stderr, "    -s, --samples [^]LIST             Comma separated list of samples to include (or exclude with \"^\" prefix). Be careful\n");
@@ -534,7 +550,7 @@ static void usage(args_t *args)
     fprintf(stderr, "    -u/U, --uncalled/--exclude-uncalled    Select/exclude sites without a called genotype\n");
     fprintf(stderr, "    -v/V, --types/--exclude-types LIST     Select/exclude comma-separated list of variant types: snps,indels,mnps,ref,bnd,other [null]\n");
     fprintf(stderr, "    -x/X, --private/--exclude-private      Select/exclude sites where the non-reference alleles are exclusive (private) to the subset samples\n");
-    fprintf(stderr, "          --write-index                    Automatically index the output files [off]\n");
+    fprintf(stderr, "    -W,   --write-index[=FMT]              Automatically index the output files [off]\n");
     fprintf(stderr, "\n");
     exit(1);
 }
@@ -568,6 +584,7 @@ int main_vcfview(int argc, char *argv[])
         {"exclude",required_argument,NULL,'e'},
         {"include",required_argument,NULL,'i'},
         {"trim-alt-alleles",no_argument,NULL,'a'},
+        {"trim-unseen-allele",no_argument,NULL,'A'},
         {"no-update",no_argument,NULL,'I'},
         {"drop-genotypes",no_argument,NULL,'G'},
         {"private",no_argument,NULL,'x'},
@@ -600,11 +617,11 @@ int main_vcfview(int argc, char *argv[])
         {"phased",no_argument,NULL,'p'},
         {"exclude-phased",no_argument,NULL,'P'},
         {"no-version",no_argument,NULL,8},
-        {"write-index",no_argument,NULL,10},
+        {"write-index",optional_argument,NULL,'W'},
         {NULL,0,NULL,0}
     };
     char *tmp;
-    while ((c = getopt_long(argc, argv, "l:t:T:r:R:o:O:s:S:Gf:knv:V:m:M:auUhHc:C:Ii:e:xXpPq:Q:g:",loptions,NULL)) >= 0)
+    while ((c = getopt_long(argc, argv, "l:t:T:r:R:o:O:s:S:Gf:knv:V:m:M:aAuUhHc:C:Ii:e:xXpPq:Q:g:W::",loptions,NULL)) >= 0)
     {
         char allele_type[9] = "nref";
         switch (c)
@@ -646,6 +663,7 @@ int main_vcfview(int argc, char *argv[])
             case 'S': args->sample_names = optarg; args->sample_is_file = 1; break;
             case  1 : args->force_samples = 1; break;
             case 'a': args->trim_alts = 1; args->calc_ac = 1; break;
+            case 'A': args->trim_star_allele++; break;
             case 'I': args->update_info = 0; break;
             case 'G': args->sites_only = 1; break;
 
@@ -732,7 +750,10 @@ int main_vcfview(int argc, char *argv[])
                 break;
             case  9 : args->n_threads = strtol(optarg, 0, 0); break;
             case  8 : args->record_cmd_line = 0; break;
-            case 10 : args->write_index = 1; break;
+            case 'W':
+                if (!(args->write_index = write_index_parse(optarg)))
+                    error("Unsupported index format '%s'\n", optarg);
+                break;
             case '?': usage(args); break;
             default: error("Unknown argument: %s\n", optarg);
         }
@@ -789,7 +810,9 @@ int main_vcfview(int argc, char *argv[])
     else if ( args->output_type & FT_BCF )
         error("BCF output requires header, cannot proceed with -H\n");
 
-    if ( args->write_index && init_index(args->out,out_hdr,args->fn_out,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->fn_out);
+    if ( init_index2(args->out,out_hdr,args->fn_out, &args->index_fn,
+                     args->write_index) < 0 )
+        error("Error: failed to initialise index for %s\n",args->fn_out);
 
     int ret = 0;
     if (!args->header_only)
diff --git a/bcftools/vcfview.c.pysam.c b/bcftools/vcfview.c.pysam.c
index 1485b1e65..d430bca0b 100644
--- a/bcftools/vcfview.c.pysam.c
+++ b/bcftools/vcfview.c.pysam.c
@@ -38,6 +38,7 @@ THE SOFTWARE.  */
 #include <htslib/vcf.h>
 #include <htslib/synced_bcf_reader.h>
 #include <htslib/vcfutils.h>
+#include <htslib/kbitset.h>
 #include "bcftools.h"
 #include "filter.h"
 #include "htslib/khash_str2int.h"
@@ -80,6 +81,7 @@ typedef struct _args_t
     int record_cmd_line;
     char *index_fn;
     int write_index;
+    int trim_star_allele;
     htsFile *out;
 }
 args_t;
@@ -458,6 +460,19 @@ int subset_vcf(args_t *args, bcf1_t *line)
         int ret = bcf_trim_alleles(args->hsub ? args->hsub : args->hdr, line);
         if ( ret<0 ) error("Error: Could not trim alleles at %s:%"PRId64"\n", bcf_seqname(args->hsub ? args->hsub : args->hdr, line), (int64_t) line->pos+1);
     }
+    if (args->trim_star_allele)
+    {
+        int iunseen;
+        if ( args->trim_star_allele && (line->n_allele > 2 || args->trim_star_allele > 1) && (iunseen=get_unseen_allele(line)) && iunseen>0 )
+        {
+            // the unobserved star allele should be trimmed, either it is variant site or trimming of all sites was requested
+            kbitset_t *rm_set = kbs_init(line->n_allele);
+            kbs_insert(rm_set, iunseen);
+            if ( bcf_remove_allele_set(args->hdr,line,rm_set) )
+                error("[%s] Error: failed to trim the unobserved allele at %s:%"PRIhts_pos"\n",__func__,bcf_seqname(args->hdr,line),line->pos+1);
+            kbs_destroy(rm_set);
+        }
+    }
     if (args->phased) {
         int phased = bcf_all_phased(args->hdr, line);
         if (args->phased == FLT_INCLUDE && !phased) { return 0; } // skip unphased
@@ -514,6 +529,7 @@ static void usage(args_t *args)
     fprintf(bcftools_stderr, "        --threads INT                 Use multithreading with INT worker threads [0]\n");
     fprintf(bcftools_stderr, "\n");
     fprintf(bcftools_stderr, "Subset options:\n");
+    fprintf(bcftools_stderr, "    -A, --trim-unseen-allele          Remove '<*>' or '<NON_REF>' at variant (-A) or at all (-AA) sites\n");
     fprintf(bcftools_stderr, "    -a, --trim-alt-alleles            Trim ALT alleles not seen in the genotype fields (or their subset with -s/-S)\n");
     fprintf(bcftools_stderr, "    -I, --no-update                   Do not (re)calculate INFO fields for the subset (currently INFO/AC and INFO/AN)\n");
     fprintf(bcftools_stderr, "    -s, --samples [^]LIST             Comma separated list of samples to include (or exclude with \"^\" prefix). Be careful\n");
@@ -536,7 +552,7 @@ static void usage(args_t *args)
     fprintf(bcftools_stderr, "    -u/U, --uncalled/--exclude-uncalled    Select/exclude sites without a called genotype\n");
     fprintf(bcftools_stderr, "    -v/V, --types/--exclude-types LIST     Select/exclude comma-separated list of variant types: snps,indels,mnps,ref,bnd,other [null]\n");
     fprintf(bcftools_stderr, "    -x/X, --private/--exclude-private      Select/exclude sites where the non-reference alleles are exclusive (private) to the subset samples\n");
-    fprintf(bcftools_stderr, "          --write-index                    Automatically index the output files [off]\n");
+    fprintf(bcftools_stderr, "    -W,   --write-index[=FMT]              Automatically index the output files [off]\n");
     fprintf(bcftools_stderr, "\n");
     bcftools_exit(1);
 }
@@ -570,6 +586,7 @@ int main_vcfview(int argc, char *argv[])
         {"exclude",required_argument,NULL,'e'},
         {"include",required_argument,NULL,'i'},
         {"trim-alt-alleles",no_argument,NULL,'a'},
+        {"trim-unseen-allele",no_argument,NULL,'A'},
         {"no-update",no_argument,NULL,'I'},
         {"drop-genotypes",no_argument,NULL,'G'},
         {"private",no_argument,NULL,'x'},
@@ -602,11 +619,11 @@ int main_vcfview(int argc, char *argv[])
         {"phased",no_argument,NULL,'p'},
         {"exclude-phased",no_argument,NULL,'P'},
         {"no-version",no_argument,NULL,8},
-        {"write-index",no_argument,NULL,10},
+        {"write-index",optional_argument,NULL,'W'},
         {NULL,0,NULL,0}
     };
     char *tmp;
-    while ((c = getopt_long(argc, argv, "l:t:T:r:R:o:O:s:S:Gf:knv:V:m:M:auUhHc:C:Ii:e:xXpPq:Q:g:",loptions,NULL)) >= 0)
+    while ((c = getopt_long(argc, argv, "l:t:T:r:R:o:O:s:S:Gf:knv:V:m:M:aAuUhHc:C:Ii:e:xXpPq:Q:g:W::",loptions,NULL)) >= 0)
     {
         char allele_type[9] = "nref";
         switch (c)
@@ -648,6 +665,7 @@ int main_vcfview(int argc, char *argv[])
             case 'S': args->sample_names = optarg; args->sample_is_file = 1; break;
             case  1 : args->force_samples = 1; break;
             case 'a': args->trim_alts = 1; args->calc_ac = 1; break;
+            case 'A': args->trim_star_allele++; break;
             case 'I': args->update_info = 0; break;
             case 'G': args->sites_only = 1; break;
 
@@ -734,7 +752,10 @@ int main_vcfview(int argc, char *argv[])
                 break;
             case  9 : args->n_threads = strtol(optarg, 0, 0); break;
             case  8 : args->record_cmd_line = 0; break;
-            case 10 : args->write_index = 1; break;
+            case 'W':
+                if (!(args->write_index = write_index_parse(optarg)))
+                    error("Unsupported index format '%s'\n", optarg);
+                break;
             case '?': usage(args); break;
             default: error("Unknown argument: %s\n", optarg);
         }
@@ -791,7 +812,9 @@ int main_vcfview(int argc, char *argv[])
     else if ( args->output_type & FT_BCF )
         error("BCF output requires header, cannot proceed with -H\n");
 
-    if ( args->write_index && init_index(args->out,out_hdr,args->fn_out,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->fn_out);
+    if ( init_index2(args->out,out_hdr,args->fn_out, &args->index_fn,
+                     args->write_index) < 0 )
+        error("Error: failed to initialise index for %s\n",args->fn_out);
 
     int ret = 0;
     if (!args->header_only)
diff --git a/bcftools/version.c b/bcftools/version.c
index 38417a78b..2defb4fbb 100644
--- a/bcftools/version.c
+++ b/bcftools/version.c
@@ -1,6 +1,6 @@
 /*  version.c -- report version numbers for plugins.
 
-    Copyright (C) 2014-2023 Genome Research Ltd.
+    Copyright (C) 2014-2024 Genome Research Ltd.
 
     Author: Petr Danecek <pd3@sanger.ac.uk>
 
@@ -88,7 +88,8 @@ void set_wmode(char dst[8], int file_type, const char *fname, int clevel)
     const char *end = fname ? strstr(fname, HTS_IDX_DELIM) : NULL;
     if ( !end ) end = fname ? fname + strlen(fname) : fname;
     int len = end - fname;
-    if ( len >= 4 && !strncasecmp(".bcf",fname+len-4,4) ) ret = hts_bcf_wmode(FT_BCF|FT_GZ);
+    if ( len >= 4 && !strncasecmp(".bcf",fname+len-4,4) )
+        ret = hts_bcf_wmode(file_type & FT_BCF ? file_type : FT_BCF|FT_GZ);
     else if ( len >= 4 && !strncasecmp(".vcf",fname+len-4,4) ) ret = hts_bcf_wmode(FT_VCF);
     else if ( len >= 7 && !strncasecmp(".vcf.gz",fname+len-7,7) ) ret = hts_bcf_wmode(FT_VCF|FT_GZ);
     else if ( len >= 8 && !strncasecmp(".vcf.bgz",fname+len-8,8) ) ret = hts_bcf_wmode(FT_VCF|FT_GZ);
@@ -112,27 +113,55 @@ int parse_overlap_option(const char *arg)
     else return -1;
 }
 
+// Used to set args->write_index in CLI.
+// It will be true if set correctly.
+// Note due to HTS_FMT_CSI being zero we have to use an additional bit.
+int write_index_parse(char *arg) {
+    int fmt = HTS_FMT_CSI;
+
+    if (arg) {
+        if (strcmp(arg, "csi") == 0 || strcmp(arg, "=csi") == 0)
+            fmt = HTS_FMT_CSI;
+        else if (strcmp(arg, "tbi") == 0 || strcmp(arg, "=tbi") == 0)
+            fmt = HTS_FMT_TBI;
+        else
+            return 0;
+    }
+
+    return 128 | fmt;
+}
+
 // See also samtools/sam_utils.c auto_index()
-int init_index(htsFile *fh, bcf_hdr_t *hdr, char *fname, char **idx_fname)
-{
-    int min_shift = 14; // CSI
+int init_index2(htsFile *fh, bcf_hdr_t *hdr, const char *fname,
+                char **idx_fname, int idx_fmt) {
+    // Nothing to do == success.  This simplifies the main code simpler.
+    if (!idx_fmt)
+        return 0;
+
+    int min_shift;
+    char *idx_suffix;
+
+    if (idx_fmt && (idx_fmt&127) == HTS_FMT_TBI && fh->format.format == vcf) {
+        min_shift = 0;  // TBI
+        idx_suffix = "tbi";
+    } else {
+        min_shift = 14; // CSI
+        idx_suffix = "csi";
+    }
 
     if ( !fname || !*fname || !strcmp(fname, "-") ) return -1;
 
     char *delim = strstr(fname, HTS_IDX_DELIM);
-    if (delim)
-    {
+    if (delim) {
         delim += strlen(HTS_IDX_DELIM);
         *idx_fname = strdup(delim);
         if ( !*idx_fname ) return -1;
 
         size_t l = strlen(*idx_fname);
         if ( l >= 4 && strcmp(*idx_fname + l - 4, ".tbi")==0 ) min_shift = 0;
-    }
-    else
-    {
+    } else {
         if ( !(*idx_fname = malloc(strlen(fname)+6)) ) return -1;
-        sprintf(*idx_fname, "%s.csi", fname);
+        sprintf(*idx_fname, "%s.%s", fname, idx_suffix);
     }
 
     if ( bcf_idx_init(fh, hdr, min_shift, *idx_fname) < 0 ) return -1;
@@ -140,4 +169,7 @@ int init_index(htsFile *fh, bcf_hdr_t *hdr, char *fname, char **idx_fname)
     return 0;
 }
 
-
+int init_index(htsFile *fh, bcf_hdr_t *hdr, const char *fname, char **idx_fname)
+{
+    return init_index2(fh,hdr, fname, idx_fname, HTS_FMT_CSI);
+}
diff --git a/bcftools/version.c.pysam.c b/bcftools/version.c.pysam.c
index 23949bf02..4944b57e9 100644
--- a/bcftools/version.c.pysam.c
+++ b/bcftools/version.c.pysam.c
@@ -2,7 +2,7 @@
 
 /*  version.c -- report version numbers for plugins.
 
-    Copyright (C) 2014-2023 Genome Research Ltd.
+    Copyright (C) 2014-2024 Genome Research Ltd.
 
     Author: Petr Danecek <pd3@sanger.ac.uk>
 
@@ -90,7 +90,8 @@ void set_wmode(char dst[8], int file_type, const char *fname, int clevel)
     const char *end = fname ? strstr(fname, HTS_IDX_DELIM) : NULL;
     if ( !end ) end = fname ? fname + strlen(fname) : fname;
     int len = end - fname;
-    if ( len >= 4 && !strncasecmp(".bcf",fname+len-4,4) ) ret = hts_bcf_wmode(FT_BCF|FT_GZ);
+    if ( len >= 4 && !strncasecmp(".bcf",fname+len-4,4) )
+        ret = hts_bcf_wmode(file_type & FT_BCF ? file_type : FT_BCF|FT_GZ);
     else if ( len >= 4 && !strncasecmp(".vcf",fname+len-4,4) ) ret = hts_bcf_wmode(FT_VCF);
     else if ( len >= 7 && !strncasecmp(".vcf.gz",fname+len-7,7) ) ret = hts_bcf_wmode(FT_VCF|FT_GZ);
     else if ( len >= 8 && !strncasecmp(".vcf.bgz",fname+len-8,8) ) ret = hts_bcf_wmode(FT_VCF|FT_GZ);
@@ -114,27 +115,55 @@ int parse_overlap_option(const char *arg)
     else return -1;
 }
 
+// Used to set args->write_index in CLI.
+// It will be true if set correctly.
+// Note due to HTS_FMT_CSI being zero we have to use an additional bit.
+int write_index_parse(char *arg) {
+    int fmt = HTS_FMT_CSI;
+
+    if (arg) {
+        if (strcmp(arg, "csi") == 0 || strcmp(arg, "=csi") == 0)
+            fmt = HTS_FMT_CSI;
+        else if (strcmp(arg, "tbi") == 0 || strcmp(arg, "=tbi") == 0)
+            fmt = HTS_FMT_TBI;
+        else
+            return 0;
+    }
+
+    return 128 | fmt;
+}
+
 // See also samtools/sam_utils.c auto_index()
-int init_index(htsFile *fh, bcf_hdr_t *hdr, char *fname, char **idx_fname)
-{
-    int min_shift = 14; // CSI
+int init_index2(htsFile *fh, bcf_hdr_t *hdr, const char *fname,
+                char **idx_fname, int idx_fmt) {
+    // Nothing to do == success.  This simplifies the main code simpler.
+    if (!idx_fmt)
+        return 0;
+
+    int min_shift;
+    char *idx_suffix;
+
+    if (idx_fmt && (idx_fmt&127) == HTS_FMT_TBI && fh->format.format == vcf) {
+        min_shift = 0;  // TBI
+        idx_suffix = "tbi";
+    } else {
+        min_shift = 14; // CSI
+        idx_suffix = "csi";
+    }
 
     if ( !fname || !*fname || !strcmp(fname, "-") ) return -1;
 
     char *delim = strstr(fname, HTS_IDX_DELIM);
-    if (delim)
-    {
+    if (delim) {
         delim += strlen(HTS_IDX_DELIM);
         *idx_fname = strdup(delim);
         if ( !*idx_fname ) return -1;
 
         size_t l = strlen(*idx_fname);
         if ( l >= 4 && strcmp(*idx_fname + l - 4, ".tbi")==0 ) min_shift = 0;
-    }
-    else
-    {
+    } else {
         if ( !(*idx_fname = malloc(strlen(fname)+6)) ) return -1;
-        sprintf(*idx_fname, "%s.csi", fname);
+        sprintf(*idx_fname, "%s.%s", fname, idx_suffix);
     }
 
     if ( bcf_idx_init(fh, hdr, min_shift, *idx_fname) < 0 ) return -1;
@@ -142,4 +171,7 @@ int init_index(htsFile *fh, bcf_hdr_t *hdr, char *fname, char **idx_fname)
     return 0;
 }
 
-
+int init_index(htsFile *fh, bcf_hdr_t *hdr, const char *fname, char **idx_fname)
+{
+    return init_index2(fh,hdr, fname, idx_fname, HTS_FMT_CSI);
+}
diff --git a/bcftools/version.sh b/bcftools/version.sh
index 69bf963de..007c916a2 100755
--- a/bcftools/version.sh
+++ b/bcftools/version.sh
@@ -24,7 +24,7 @@
 # DEALINGS IN THE SOFTWARE.
 
 # Master version, for use in tarballs or non-git source copies
-VERSION=1.18
+VERSION=1.21
 
 # If we have a git clone, then check against the current tag
 if [ -e .git ]
diff --git a/devtools/buildwheels.sh b/devtools/buildwheels.sh
deleted file mode 100755
index 4d12566da..000000000
--- a/devtools/buildwheels.sh
+++ /dev/null
@@ -1,73 +0,0 @@
-#!/bin/bash
-#
-# Build manylinux1 wheels for pysam. Based on the example at
-# <https://github.com/pypa/python-manylinux-demo>
-#
-# It is best to run this in a fresh clone of the repository!
-#
-# Before running, make sure to update image:
-#
-#   docker pull quay.io/pypa/manylinux1_x86_64
-#
-# Run this within the repository root:
-#   docker run --rm -v $(pwd):/io quay.io/pypa/manylinux1_x86_64 /io/buildwheels.sh
-#
-# The wheels will be put into the wheelhouse/ subdirectory.
-#
-# For interactive tests:
-#   docker run -it -v $(pwd):/io quay.io/pypa/manylinux1_x86_64 /bin/bash
-
-set -xeuo pipefail
-
-# For convenience, if this script is called from outside of a docker container,
-# it starts a container and runs itself inside of it.
-if ! grep -q docker /proc/1/cgroup; then
-  # We are not inside a container
-  exec docker run --rm -v $(pwd):/io quay.io/pypa/manylinux1_x86_64 /io/$0
-fi
-
-yum install -y zlib-devel bzip2-devel xz-devel
-
-# Without libcurl support, htslib can open files from HTTP and FTP URLs.
-# With libcurl support, it also supports HTTPS and S3 URLs, but libcurl needs a
-# current version of OpenSSL, and we do not want to be responsible for
-# updating the wheels as soon as there are any security issues. So disable
-# libcurl for now.
-# See also <https://github.com/pypa/manylinux/issues/74>.
-#
-export HTSLIB_CONFIGURE_OPTIONS="--disable-libcurl"
-
-PYBINS="/opt/python/*/bin"
-echo $PYBINS
-
-for PYBIN in ${PYBINS}; do
-    ${PYBIN}/pip install -r /io/requirements.txt
-    ${PYBIN}/pip wheel /io/ -w wheelhouse/
-done
-
-# Bundle external shared libraries into the wheels
-#
-# The '-L ""' option is a workaround. By default, auditwheel puts all external
-# libraries (.so files) into a .libs directory and sets the RUNPATH to $ORIGIN/.libs.
-# When HTSLIB_MODE is 'shared' (now the default), then all so libraries part of
-# pysam require that RUNPATH is set to $ORIGIN (without the .libs). It seems
-# auditwheel overwrites $ORIGIN with $ORIGIN/.libs. This workaround makes
-# auditwheel keeps the RUNPATH at "$ORIGIN" and put all the external libraries into
-# the pysam directory.
-for whl in wheelhouse/*.whl; do
-    auditwheel repair -L "" $whl -w /io/wheelhouse/
-done
-
-# mkdir -p /io/wheelhouse
-# cp wheelhouse/*.whl /io/wheelhouse
-
-# Created files are owned by root, so fix permissions.
-chown -R --reference=/io/setup.py /io/wheelhouse/
-
-# TODO Install packages and test them
-for PYBIN in ${PYBINS}; do
-   ${PYBIN}/pip install pysam --no-index -f /io/wheelhouse
-   # smoketest
-   (cd $HOME; ${PYBIN}/python -c 'import pysam')
-   # todo: add more tests
-done
diff --git a/devtools/check-platform.sh b/devtools/check-platform.sh
new file mode 100755
index 000000000..1203133a7
--- /dev/null
+++ b/devtools/check-platform.sh
@@ -0,0 +1,32 @@
+#!/bin/sh -e
+
+case $1 in
+    ubuntu-*-arm)  expected=arm ;;
+    macos-13)      expected=x86_64 ;;
+    ubuntu-*)      expected=x86_64 ;;
+    macos-*)       expected=arm ;;
+    windows-*)     expected=x86_64 ;;
+    *)
+	echo Unknown platform $1 >&2
+	exit 2
+	;;
+esac
+
+arch=$(uname -m)
+case $arch in
+    arm*|aarch*)  actual=arm ;;
+    x86*)         actual=x86_64 ;;
+    *)
+	echo Unrecognized uname result $arch >&2
+	exit 2
+	;;
+esac
+
+if test $actual = $expected
+then
+    echo Running on $arch as expected
+    exit 0
+else
+    echo Platform $arch is not the expected $expected >&2
+    exit 1
+fi
diff --git a/devtools/conda-recipe/build.sh b/devtools/conda-recipe/build.sh
deleted file mode 100644
index 32b67dbf2..000000000
--- a/devtools/conda-recipe/build.sh
+++ /dev/null
@@ -1,8 +0,0 @@
-#!/bin/bash
-
-# Use internal htslib
-chmod a+x ./htslib/configure
-export CFLAGS="-I${PREFIX}/include/curl/ -I${PREFIX}/include -L${PREFIX}/lib"
-export HTSLIB_CONFIGURE_OPTIONS="--disable-libcurl"
-
-$PYTHON setup.py install
diff --git a/devtools/conda-recipe/meta.yaml b/devtools/conda-recipe/meta.yaml
deleted file mode 100644
index 4e57895bf..000000000
--- a/devtools/conda-recipe/meta.yaml
+++ /dev/null
@@ -1,29 +0,0 @@
-package:
-  name: pysam
-  version: 0.8.5
-
-source:
-  path: ../../
-
-build:
-  number: 0
-
-requirements:
-  build:
-    - python
-    - setuptools
-    - zlib
-    - cython
-
-  run:
-    - python
-    - zlib
-
-test:
-  imports:
-    - pysam
-
-about:
-  home: https://github.com/pysam-developers/pysam
-  license: MIT
-  summary: Pysam is a python module for reading and manipulating Samfiles. It's a lightweight wrapper of the samtools C-API. Pysam also includes an interface for tabix.
diff --git a/devtools/emulate-tools.py b/devtools/emulate-tools.py
new file mode 100755
index 000000000..30dd75ab1
--- /dev/null
+++ b/devtools/emulate-tools.py
@@ -0,0 +1,59 @@
+#!/usr/bin/env python3
+"""
+This script can be symlinked to samtools, bcftools, bgzip, or tabix.
+When invoked under one of those names, it will emulate that tool's
+behaviour by using pysam's facilities.
+"""
+
+import argparse
+import gzip
+import os
+import sys
+import tempfile
+
+import pysam
+
+command = os.path.basename(sys.argv[0])
+
+if command in ("samtools", "bcftools"):
+    if len(sys.argv) > 1:
+        try:
+            tool = pysam.utils.PysamDispatcher(command, sys.argv[1])
+            tool(*sys.argv[2:], catch_stdout=None)
+            print(tool.stderr, end="", file=sys.stderr)
+        except pysam.utils.SamtoolsError as e:
+            sys.exit(f"emulate-tools.py: {e}")
+
+    else:
+        version = getattr(pysam.version, f"__{command}_version__")
+        print(f"Program: {command}\nVersion: {version}", file=sys.stderr)
+
+else:
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-c", "--stdout", action="store_true")
+    parser.add_argument("-d", "--decompress", action="store_true")
+    parser.add_argument("-f", "--force", action="store_true")
+    parser.add_argument("-p", "--preset")
+    parser.add_argument("input_file", nargs="?")
+    opt = parser.parse_args()
+
+    if command == "bgzip":
+        if opt.decompress:
+            with gzip.open(sys.stdin.buffer, "rb") as f:
+                sys.stdout.buffer.write(f.read())
+
+        elif opt.input_file and opt.stdout:
+            pysam.tabix_compress(opt.input_file, "-", force=True)
+
+        else:
+            f = tempfile.NamedTemporaryFile(delete=False)
+            f.write(sys.stdin.buffer.read())
+            f.close()
+            pysam.tabix_compress(f.name, "-", force=True)
+            os.remove(f.name)
+
+    elif command == "tabix":
+        pysam.tabix_index(opt.input_file, preset=opt.preset, force=opt.force)
+
+    else:
+        sys.exit(f"emulate-tools.py: unknown command {command!r}")
diff --git a/devtools/environment-dev.yaml b/devtools/environment-dev.yaml
index c98cc4736..224ed084f 100644
--- a/devtools/environment-dev.yaml
+++ b/devtools/environment-dev.yaml
@@ -2,5 +2,7 @@ channels:
   - conda-forge
   - bioconda
 dependencies:
+  - bzip2
   - cython
   - setuptools
+  - xz
diff --git a/devtools/import.py b/devtools/import.py
index a4652f44f..2d955dea2 100644
--- a/devtools/import.py
+++ b/devtools/import.py
@@ -8,6 +8,7 @@
 
 import fnmatch
 import os
+from pathlib import Path
 import re
 import itertools
 import shutil
@@ -35,8 +36,8 @@
         "peakfit.h",
         "polysomy.c"),
     "htslib": (
-        'htslib/tabix.c', 'htslib/bgzip.c',
-        'htslib/htsfile.c',
+        'annot-tsv.c', 'bgzip.c', 'htsfile.c', 'tabix.c',
+        'hts_probe_cc.sh',
         "samples", "test", "tests"),
 }
 
@@ -119,6 +120,7 @@ def _update_pysam_files(cf, destdir):
                 if fn == "bamtk.c":
                     lines = re.sub(r'(#include "version.h")', r'\1\n#include "samtools_config_vars.h"', lines)
                     lines = re.sub(r'(else if.*"tview")', r'//\1', lines)
+                    lines = re.sub(r'(if[ (]*fclose)', r'if (0) { //\1', lines)
 
                 outfile.write(lines)
 
@@ -152,9 +154,10 @@ def _update_pysam_files(cf, destdir):
                              locate("version.sh", srcdir, exclude_htslib=True))
 
     if dest == "htslib":
-        # Add build files, including *.ac *.in *.mk *.m4 *.sh
+        # Add build files, including *.ac config.{guess,sub} *.in *.mk *.m4 *.sh
         mfiles = itertools.chain(mfiles, locate("Makefile", srcdir),
                                  locate("configure", srcdir),
+                                 locate("config.*", srcdir),
                                  locate("*.[aims][cnk4h]", srcdir, exclude))
 
     ncopied = 0
@@ -243,6 +246,9 @@ def _update_version_doc_file(dest, value, filename):
     _update_version_file(C_VERSION[dest], version + " (pysam)", "pysam/version.h")
     _update_version_doc_file(dest, version, "README.rst")
     _update_version_doc_file(dest, version, "doc/index.rst")
+    if dest in MAIN:
+        path = Path("doc/conf.py")
+        path.write_text(re.sub(rf"doc/[\d.]*/{dest}", rf"doc/{version}/{dest}", path.read_text()))
 
     sys.exit(0)
 
diff --git a/devtools/install-CGAT-tools.sh b/devtools/install-CGAT-tools.sh
deleted file mode 100755
index e45d39159..000000000
--- a/devtools/install-CGAT-tools.sh
+++ /dev/null
@@ -1,282 +0,0 @@
-#!/usr/bin/env bash
-
-# function to detect the Operating System
-detect_os(){
-
-if [ -f /etc/os-release ]; then
-
-   OS=$(cat /etc/os-release | awk '/VERSION_ID/ {sub("="," "); print $2;}' | sed 's/\"//g' | awk '{sub("\\."," "); print $1;}')
-   if [ "$OS" != "12" ] ; then
-
-      echo       
-      echo " Ubuntu version not supported "
-      echo
-      echo " Only Ubuntu 12.x has been tested so far "
-      echo 
-      exit 1;
-
-   fi
-
-   OS="ubuntu"
-
-elif [ -f /etc/system-release ]; then
-
-   OS=$(cat /etc/system-release | awk ' {print $4;}' | awk '{sub("\\."," "); print $1;}')
-   if [ "$OS" != "6" ] ; then
-      echo
-      echo " Scientific Linux version not supported "
-      echo
-      echo " Only 6.x Scientific Linux has been tested so far "
-      echo
-      exit 1;
-   fi
-
-   OS="sl"
-
-else
-
-   echo
-   echo " Operating system not supported "
-   echo
-   echo " Exiting installation "
-   echo
-   exit 1;
-
-fi
-} # detect_os
-
-# message to display when the OS is not correct
-sanity_check_os() {
-   echo
-   echo " Unsupported operating system: $OS "
-   echo " Installation aborted "
-   echo
-   exit 1;
-} # sanity_check_os
-
-# function to install operating system dependencies
-install_os_packages() {
-
-if [ "$OS" == "ubuntu" -o "$OS" == "travis" ] ; then
-
-   echo
-   echo " Installing packages for Ubuntu "
-   echo
-
-   apt-get install -y gcc g++
-
-elif [ "$OS" == "sl" ] ; then
-
-   echo 
-   echo " Installing packages for Scientific Linux "
-   echo
-
-   yum -y install gcc zlib-devel gcc-c++
-
-else
-
-   sanity_check_os
-
-fi # if-OS
-} # install_os_packages
-
-# function to install Python dependencies
-install_python_deps() {
-
-if [ "$OS" == "ubuntu" -o "$OS" == "sl" ] ; then
-
-   echo
-   echo " Installing Python dependencies for $1 "
-   echo
-
-   # Create virtual environment
-   cd
-   mkdir CGAT
-   cd CGAT
-   wget --no-check-certificate https://pypi.python.org/packages/source/v/virtualenv/virtualenv-1.10.1.tar.gz
-   tar xvfz virtualenv-1.10.1.tar.gz
-   rm virtualenv-1.10.1.tar.gz
-   cd virtualenv-1.10.1
-   python virtualenv.py cgat-venv
-   source cgat-venv/bin/activate
-
-   # Install Python prerequisites
-   pip install cython
-
-elif [ "$OS" == "travis" ] ; then
-   # Travis-CI provides a virtualenv with Python 2.7
-   echo 
-   echo " Installing Python dependencies in travis "
-   echo
-
-   # Install Python prerequisites
-   pip install cython
-   pip install nose
-
-else
-
-   sanity_check_os
-
-fi # if-OS
-} # install_python_deps
-
-# common set of tasks to prepare external dependencies
-nosetests_external_deps() {
-echo
-echo " Running nosetests for $1 "
-echo
-
-pushd .
-
-# create a new folder to store external tools
-mkdir -p $HOME/CGAT/external-tools
-
-# install samtools
-cd $HOME/CGAT/external-tools
-curl -L http://downloads.sourceforge.net/project/samtools/samtools/1.3/samtools-1.3.tar.bz2 > samtools-1.3.tar.bz2
-tar xjf samtools-1.3.tar.bz2 
-cd samtools-1.3
-make
-PATH=$PATH:$HOME/CGAT/external-tools/samtools-1.3
-
-echo "installed samtools"
-samtools --version
-
-if [ $? != 0 ]; then
-    exit 1
-fi
-
-# install bcftools
-cd $HOME/CGAT/external-tools
-curl -L https://github.com/samtools/bcftools/releases/download/1.3/bcftools-1.3.tar.bz2 > bcftools-1.3.tar.bz2
-tar xjf bcftools-1.3.tar.bz2
-cd bcftools-1.3
-make
-PATH=$PATH:$HOME/CGAT/external-tools/bcftools-1.3
-
-echo "installed bcftools"
-bcftools --version
-
-if [ $? != 0 ]; then
-    exit 1
-fi
-
-popd
-
-} # nosetests_external_deps
-
-
-# function to run nosetests
-run_nosetests() {
-
-echo
-echo " Running nosetests for $1 "
-echo
-
-# prepare external dependencies
-nosetests_external_deps $OS
-
-# install code
-python setup.py install
-
-# change into tests directory. Otherwise,
-# 'import pysam' will import the repository,
-# not the installed version. This causes
-# problems in the compilation test.
-cd tests
-
-# create auxiliary data
-echo
-echo 'building test data'
-echo 
-make -C pysam_data all
-make -C cbcf_data all
-make -C tabix_data all
-
-# run nosetests
-# -s: do not capture stdout, conflicts with pysam.dispatch
-# -v: verbose output
-nosetests -s -v 
-
-} # run_nosetests
-
-# function to display help message
-help_message() {
-echo
-echo " Use this script as follows: "
-echo
-echo " 1) Become root and install the operating system* packages: "
-echo " ./install-CGAT-tools.sh --install-os-packages"
-echo
-echo " 2) Now, as a normal user (non root), install the Python dependencies**: "
-echo " ./install-CGAT-tools.sh --install-python-deps"
-echo
-echo " At this stage the CGAT Code Collection is ready to go and you do not need further steps. Please type the following for more information:"
-echo " source $HOME/CGAT/virtualenv-1.10.1/cgat-venv/bin/activate"
-echo " cgat --help "
-echo
-echo " The CGAT Code Collection tests the software with nosetests. If you are interested in running those, please continue with the following steps:"
-echo
-echo " 3) Become root to install external tools and set up the environment: "
-echo " ./install-CGAT-tools.sh --install-nosetests-deps"
-echo
-echo " 4) Then, back as a normal user (non root), run nosetests as follows:"
-echo " ./install-CGAT-tools.sh --run-nosetests"
-echo 
-echo " NOTES: "
-echo " * Supported operating systems: Ubuntu 12.x and Scientific Linux 6.x "
-echo " ** An isolated virtual environment will be created to install Python dependencies "
-echo
-exit 1;
-}
-
-
-# the main script starts here
-
-if [ $# -eq 0 -o $# -gt 1 ] ; then
-
-   help_message
-
-else
-
-   if [ "$1" == "--help" ] ; then
-
-      help_message
-
-   elif [ "$1" == "--travis" ] ; then
-
-      OS="travis"
-      install_os_packages
-      install_python_deps
-      run_nosetests
-
-   elif [ "$1" == "--install-os-packages" ] ; then
-
-      detect_os
-      install_os_packages
-
-   elif [ "$1" == "--install-python-deps" ] ; then
-
-      detect_os
-      install_python_deps
-
-   elif [ "$1" == "--install-nosetests-deps" ] ; then
-
-      detect_os
-      install_nosetests_deps
-
-   elif [ "$1" == "--run-nosetests" ] ; then
-
-      detect_os
-      run_nosetests
-
-   else 
-
-      echo 
-      echo " Incorrect input parameter: $1 "
-      help_message
-
-   fi # if argument 1
-
-fi # if number of input parameters
-
diff --git a/devtools/install-prerequisites.sh b/devtools/install-prerequisites.sh
index 83be6aa1f..e78ef596b 100755
--- a/devtools/install-prerequisites.sh
+++ b/devtools/install-prerequisites.sh
@@ -12,6 +12,7 @@ elif test -x /usr/bin/yum; then
     else
         echo Installing non-test prerequisites via yum...
         yum -y install zlib-devel bzip2-devel xz-devel curl-devel openssl-devel
+        emulate=yes
     fi
 
 elif test -d /etc/dpkg; then
@@ -23,6 +24,7 @@ elif test -x /sbin/apk; then
     echo Installing non-test prerequisites via apk...
     apk update
     apk add zlib-dev bzip2-dev xz-dev curl-dev openssl-dev
+    emulate=yes
 
 elif test -x ${HOMEBREW_PREFIX-/usr/local}/bin/brew; then
     echo Installing prerequisites via brew...
@@ -32,3 +34,14 @@ elif test -x ${HOMEBREW_PREFIX-/usr/local}/bin/brew; then
 else
     echo No package manager detected
 fi
+
+if test -n "$emulate" && test $# -ge 2; then
+    emulator=$1
+    bindir=$2
+    echo Creating symlinks to $emulator in $bindir...
+    mkdir -p $bindir
+    ln -s $emulator $bindir/samtools
+    ln -s $emulator $bindir/bcftools
+    ln -s $emulator $bindir/bgzip
+    ln -s $emulator $bindir/tabix
+fi
diff --git a/devtools/run_tests_travis.sh b/devtools/run_tests_travis.sh
deleted file mode 100755
index 1f14fc34d..000000000
--- a/devtools/run_tests_travis.sh
+++ /dev/null
@@ -1,126 +0,0 @@
-#!/usr/bin/env bash
-
-# test script for pysam.
-# The script performs the following tasks:
-# 1. Setup a conda environment and install dependencies via conda
-# 2. Build pysam via the conda recipe
-# 3. Build pysam via setup.py from repository
-# 4. Run tests on the setup.py version
-# 5. Additional build tests
-# 5.1 pip install with cython
-# 5.2 pip install without cython
-# 5.3 pip install without cython and without configure options
-
-pushd .
-
-WORKDIR=`pwd`
-
-#Install miniconda python
-if [ $TRAVIS_OS_NAME == "osx" ]; then
-	wget -q https://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -O Miniconda3.sh
-else
-	wget -q https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O Miniconda3.sh --no-check-certificate  # Default OS versions are old and have SSL / CERT issues
-fi
-
-bash Miniconda3.sh -b
-
-# Create a new conda environment with the target python version
-~/miniconda3/bin/conda install conda-build -y
-~/miniconda3/bin/conda create -q -y --name testenv python=$CONDA_PY cython numpy pytest psutil pip
-
-# activate testenv environment
-source ~/miniconda3/bin/activate testenv
-
-conda config --add channels defaults
-conda config --add channels bioconda
-conda config --add channels conda-forge
-
-# pin versions, so that tests do not fail when pysam/htslib out of step
-# add htslib dependencies
-# NB: force conda-forge:blas due to conda/conda#7548
-conda install -y "samtools>=1.11" "bcftools>=1.11" "htslib>=1.11" xz curl bzip2 "conda-forge::blas=*=openblas"
-
-# As HTSLIB_MODE is (defaulted to) 'shared', ensure we don't pick up
-# the external headers from the Conda-installed htslib package.
-mv $CONDA_PREFIX/include/htslib $CONDA_PREFIX/include/htslib-disable
-
-export HTSLIB_CONFIGURE_OPTIONS="--disable-libcurl"
-
-echo "show samtools, htslib, and bcftools versions"
-samtools --version
-htsfile --version
-bcftools --version
-
-# Try building conda recipe first
-~/miniconda3/bin/conda-build devtools/conda-recipe/ --python=$CONDA_PY
-
-# install code from the repository via setup.py
-echo
-echo "============ installing via setup.py from repository ============"
-echo
-python setup.py install || exit
-
-# create auxiliary data
-echo
-echo 'building test data'
-echo
-make -C tests/pysam_data
-make -C tests/cbcf_data
-make -C tests/tabix_data
-
-# echo any limits that are in place
-ulimit -a
-
-# run tests
-pytest
-
-if [ $? != 0 ]; then
-    exit 1
-fi
-
-# build source tar-ball. Make sure to run 'build' target so that .pyx
-# files are cythonized.
-python setup.py build sdist
-
-if [ $? != 0 ]; then
-    exit 1
-fi
-
-# check for presence of config.h files
-echo "checking for presence of config.h files in tar-ball"
-tar -tvzf dist/pysam-*.tar.gz | grep "config.h$"
-
-if [ $? != 1 ]; then
-    echo "ERROR: found config.h in tar-ball"
-    tar -tvzf dist/pysam-*.tar.gz | grep "config.h%"
-    exit 1
-fi
-
-# test pip installation from tar-ball with cython
-echo "pip installing with cython"
-pip install --verbose --no-deps --no-binary=:all: dist/pysam-*.tar.gz
-
-if [ $? != 0 ]; then
-    exit 1
-fi
-
-# attempt pip installation without cython
-echo "pip installing without cython"
-~/miniconda3/bin/conda remove -y cython
-~/miniconda3/bin/conda list
-echo "python is" `which python`
-pip install --verbose --no-deps --no-binary=:all: --force-reinstall --upgrade dist/pysam-*.tar.gz
-
-if [ $? != 0 ]; then
-    exit 1
-fi
-
-# attempt pip installation without cython and without
-# command line options
-echo "pip installing without cython and no configure options"
-export HTSLIB_CONFIGURE_OPTIONS=""
-pip install --verbose --no-deps --no-binary=:all: --force-reinstall --upgrade dist/pysam-*.tar.gz
-
-if [ $? != 0 ]; then
-    exit 1
-fi
diff --git a/doc/api.rst b/doc/api.rst
index 47fe314b7..fc88f1729 100644
--- a/doc/api.rst
+++ b/doc/api.rst
@@ -15,7 +15,7 @@ To use the module to read a file in BAM format, create a
    import pysam
    samfile = pysam.AlignmentFile("ex1.bam", "rb")
 
-Once a file is opened you can iterate over all of the read mapping to
+Once a file is opened you can iterate over all of the reads mapping to
 a specified region using :meth:`~pysam.AlignmentFile.fetch`.  Each
 iteration returns a :class:`~pysam.AlignedSegment` object which
 represents a single read along with its fields and optional tags::
@@ -103,7 +103,7 @@ tabix indexed tab-separated file formats with genomic data::
 :class:`~pysam.TabixFile` implements lazy parsing in order to iterate
 over large tables efficiently.
 
-More detailed usage instructions is at :ref:`usage`.
+More detailed usage instructions are available at :ref:`usage`.
 
 .. note::
 
@@ -200,7 +200,6 @@ FASTQ files
 .. autoclass:: pysam.FastxFile
    :members:
 
-
 .. autoclass:: pysam.FastqProxy
    :members:
 
@@ -214,10 +213,10 @@ VCF/BCF files
 .. autoclass:: pysam.VariantHeader
    :members:
 
-.. autoclass:: pysam.VariantRecord
+.. autoclass:: pysam.VariantHeaderRecord
    :members:
 
-.. autoclass:: pysam.VariantHeaderRecord
+.. autoclass:: pysam.VariantRecord
    :members:
 
 HTSFile
diff --git a/doc/conf.py b/doc/conf.py
index 40081eb4c..27c389ce8 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -51,20 +51,19 @@
 
 # General information about the project.
 project = u'pysam'
-copyright = '2009–2024 Andreas Heger, John Marshall, Kevin Jacobs, et al'
+copyright = '2009–2025 Andreas Heger, John Marshall, Kevin Jacobs, et al'
 
 # Included at the end of each rst file
 rst_epilog = '''
-.. _CGAT Training Programme: http://www.cgat.org
 .. _pysam: https://github.com/pysam-developers/pysam
-.. _samtools: http://samtools.sourceforge.net/
-.. _bcftools: https://samtools.github.io/bcftools/bcftools.html
-.. _htslib: http://www.htslib.org/
-.. _tabix: http://www.htslib.org/doc/tabix.html
-.. _Galaxy: https://main.g2.bx.psu.edu/
+.. _samtools: https://www.htslib.org/doc/1.21/samtools.html
+.. _bcftools: https://www.htslib.org/doc/1.21/bcftools.html
+.. _htslib: https://www.htslib.org/
+.. _tabix: https://www.htslib.org/doc/tabix.html
+.. _Galaxy: https://usegalaxy.org/
 .. _cython: https://cython.org/
 .. _python: https://www.python.org/
-.. _pypi: https://pypi.org/
+.. _PyPI: https://pypi.org/
 .. _pip: https://pip.pypa.io/
 .. _pyximport: https://github.com/cython/cython/tree/master/pyximport
 .. _conda: https://conda.io/docs/
diff --git a/doc/developer.rst b/doc/developer.rst
index 5bc306603..8d4248470 100644
--- a/doc/developer.rst
+++ b/doc/developer.rst
@@ -33,6 +33,26 @@ directories:
    :file:`import.py` about importing.
 
 
+Python language level
+=====================
+
+Pysam currently requires Python 3.8 as a minimum language level.
+For example, this means that the following comparatively recent
+language features and library functions are available for use:
+
+* f-strings
+* ``raise ... from None``
+* :meth:`str.startswith`, :meth:`str.endswith`, :meth:`str.rstrip`, etc
+* walrus ``:=`` operator in Python code
+
+However in particular the following should not be used in
+pysam source code or infrastructure scripts:
+
+* :meth:`str.removeprefix`, :meth:`str.removesuffix` (new in 3.9)
+* walrus ``:=`` operator in Cython code (requires Cython 3)
+* ``Optional[type]`` type hints written as ``type | None`` etc (new in 3.10)
+
+
 Importing new versions of htslib and samtools
 =============================================
 
diff --git a/doc/faq.rst b/doc/faq.rst
index e2352ebf0..37cd7d431 100644
--- a/doc/faq.rst
+++ b/doc/faq.rst
@@ -121,20 +121,24 @@ Thus, ``multiple_iterators`` is set to ``False`` by default.
 AlignmentFile.fetch does not show unmapped reads
 ================================================
 
-:meth:`~pysam.AlignmentFile.fetch` will only iterate over alignments
-in the SAM/BAM file. The following thus always works::
+By default, :meth:`~pysam.AlignmentFile.fetch` will only iterate over
+placed alignments in the SAM/BAM/CRAM file. Thus the following always
+works::
 
-    bf = pysam.AlignmentFile(fname, "rb")
-    for r in bf.fetch():
-        assert not r.is_unmapped
+    f = pysam.AlignmentFile(fname, "r")
+    for r in f.fetch():
+        assert r.reference_name is not None
 
-If the SAM/BAM file contains unaligned reads, they can be included
+If the file contains unaligned reads, they can be included
 in the iteration by adding the ``until_eof=True`` flag::
 
-    bf = pysam.AlignmentFile(fname, "rb")
-    for r in bf.fetch(until_eof=True):
+    f = pysam.AlignmentFile(fname, "r")
+    for r in f.fetch(until_eof=True):
         if r.is_unmapped:
-	    print("read is unmapped")
+            print("read is unmapped")
+
+See also :meth:`fetch("*") <pysam.AlignmentFile.fetch>` which iterates
+only over the unplaced unmapped reads at the end of the file.
 
 I can't call AlignmentFile.fetch on a file without an index
 ===========================================================
diff --git a/doc/index.rst b/doc/index.rst
index 0b4485ce8..17d3d3089 100644
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -18,7 +18,7 @@ This module provides a low-level wrapper around the htslib_ C-API as
 using cython and a high-level, pythonic API for convenient access to
 the data within genomic file formats. 
 
-The current version wraps *htslib-1.18*, *samtools-1.18*, and *bcftools-1.18*.
+The current version wraps *htslib-1.21*, *samtools-1.21*, and *bcftools-1.21*.
 
 To install the latest release, type::
 
diff --git a/doc/installation.rst b/doc/installation.rst
index a659f9da6..9cce3f01f 100644
--- a/doc/installation.rst
+++ b/doc/installation.rst
@@ -4,7 +4,7 @@
 Installing pysam
 ================
 
-Pysam can be installed through conda_, pypi_ and from the repository.
+Pysam can be installed through conda_, PyPI_ and from the repository.
 The recommended way to install pysam is through conda/bioconda.
 
 Conda installation
@@ -12,8 +12,8 @@ Conda installation
 
 To install pysam in your current conda_ environment, type::
 
-   conda config --add channels r
    conda config --add channels bioconda
+   conda config --add channels conda-forge
    conda install pysam
 
 This will install pysam from the bioconda_ channel and automatically
@@ -21,7 +21,7 @@ makes sure that dependencies are installed. Also, compilation flags
 will be set automatically, which will potentially save a lot of
 trouble on OS X.
 
-Pypi installation
+PyPI installation
 =================
 
 Pysam provides a python interface to the functionality contained
@@ -31,7 +31,7 @@ can be combined, ``builtin`` and ``external``.
 Builtin
 -------
 
-The typical installation will be through pypi_::
+The typical installation will be through PyPI_::
 
    pip install pysam
 
@@ -89,7 +89,7 @@ To install from repository, type::
 
     python setup.py install
 
-For compilation options, see the section on Pypi installation above.
+For compilation options, see the section on PyPI installation above.
 
 Requirements
 ============
diff --git a/doc/usage.rst b/doc/usage.rst
index 3c8ab04e1..df4a2255e 100644
--- a/doc/usage.rst
+++ b/doc/usage.rst
@@ -119,23 +119,33 @@ formatted file on stdout::
 
 Note that the file open mode needs to changed from ``r`` to ``rb``.
 
-=====================================
-Using samtools commands within python
-=====================================
+==================================================
+Using samtools and bcftools commands within Python
+==================================================
 
-Commands available in `samtools`_ are available as simple
-function calls. Command line options are provided as arguments. For
+Commands available in `samtools`_ and `bcftools`_ are available as simple
+function calls, with command line options provided as arguments. For
 example::
 
-   pysam.sort("-o", "output.bam", "ex1.bam")
+   import pysam.samtools
+   pysam.samtools.sort("-o", "output.bam", "ex1.bam", catch_stdout=False)
 
-corresponds to the command line::
+   import pysam.bcftools
+   pysam.bcftools.index("--csi", "ex2.vcf.gz")
+
+corresponds to the command lines::
 
    samtools sort -o output.bam ex1.bam
+   bcftools index --csi ex2.vcf.gz
+
+Samtools commands are also imported into the main ``pysam`` namespace.
+For example::
 
-Or for example::
+   pysam.sort("-m", "1000000", "-o", "output.bam", "ex1.bam", catch_stdout=False)
 
-   pysam.sort("-m", "1000000", "-o", "output.bam", "ex1.bam")
+To make them valid Python identifiers, the functions :func:`!cram_size`
+and :func:`!fqimport` are spelt thus, differently from their
+corresponding commands.
 
 In order to get usage information, try::
 
@@ -153,24 +163,63 @@ Argument errors raise a :class:`pysam.SamtoolsError`::
    pysam.SamtoolsError: 'Usage: samtools sort [-n] [-m <maxMem>] <in.bam> <out.prefix>\n'
 
 Messages from `samtools`_ on stderr are captured and are
-available using the :meth:`getMessages` method::
+available using the :meth:`~PysamDispatcher.get_messages` method::
+
+   pysam.sort.get_messages()
+
+By default, pysam captures the samtools command's standard output and returns it
+as the function's return value. To redirect stdout to a file instead, either use
+the ``save_stdout`` keyword argument, or use ``"-o", "filename"`` in the arguments
+and also use ``catch_stdout=False`` to prevent pysam's capturing from overriding
+your redirection. Finally, ``catch_stdout=False`` by itself discards standard output,
+which may help resolve problems in environments such as IPython notebooks::
+
+   # Return value
+   pileup_text = pysam.samtools.mpileup("in.bam")
+
+   # Save to file
+   pysam.samtools.mpileup("in.bam", save_stdout=pileup_filename)
+   pysam.samtools.mpileup("-o", pileup_filename, "in.bam", catch_stdout=False)
+
+   # Discard standard output
+   pysam.samtools.mpileup("in.bam", catch_stdout=False)  # Returns None
+
+For each :obj:`!command` available as a `samtools`_ subcommand,
+the following functions are provided:
+
+.. py:function:: pysam.samtools.command(args, *, catch_stdout=True, save_stdout=None, split_lines=False)
+
+   :param args: Arguments to be passed to the samtools subcommand.
+   :param bool catch_stdout: Whether to return stdout as the function's value.
+   :param str save_stdout: Filename to which stdout should be written.
+   :param bool split_lines: Whether to split the return value into a list of lines.
+   :returns: Standard output if it was caught, otherwise None.
+
+   If `save_stdout` is not None, the command's standard ouput is written to the
+   file specified and the function returns None.
+
+   Otherwise, if `catch_stdout` is true, the command's standard output is captured
+   and used as the function's return value --- either as a single :obj:`str` or as
+   :obj:`list[str] <list>` according to `split_lines`. If `catch_stdout` is false,
+   the command's standard output is discarded and the function returns None.
+
+   The command's standard error is always captured and made available via
+   :func:`~pysam.samtools.command.get_messages`.
+
+.. py:function:: pysam.samtools.command.get_messages()
 
-   pysam.sort.getMessage()
+   Returns the standard error from the most recent invocation of the particular
+   :obj:`!command`, either as a single :obj:`str` or as :obj:`list[str] <list>`
+   according to `split_lines` as specified in that invocation.
 
-Note that only the output from the last invocation of a command is
-stored.
+.. py:function:: pysam.samtools.command.usage()
 
-In order for pysam to make the output of samtools commands accessible
-the stdout stream needs to be redirected. This is the default
-behaviour, but can cause problems in environments such as the ipython
-notebook. A solution is to pass the ``catch_stdout`` keyword
-argument::
+   Returns the command's usage/help message, as a single :obj:`str`.
 
-   pysam.sort(catch_stdout=False)
+For each :obj:`!command` available as a `bcftools`_ subcommand, the
+:func:`!pysam.bcftools.command`, :func:`!pysam.bcftools.command.get_messages`,
+and :func:`!pysam.bcftools.command.usage` functions operate similarly.
 
-Note that this means that output from commands which produce output on
-stdout will not be available. The only solution is to run samtools
-commands through subprocess.
 
 ================================
 Working with tabix-indexed files
diff --git a/htslib/LICENSE b/htslib/LICENSE
index 925d47b40..87931faea 100644
--- a/htslib/LICENSE
+++ b/htslib/LICENSE
@@ -3,7 +3,7 @@ according to the terms of the following MIT/Expat license.]
 
 The MIT/Expat License
 
-Copyright (C) 2012-2023 Genome Research Ltd.
+Copyright (C) 2012-2024 Genome Research Ltd.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -29,7 +29,7 @@ according to the terms of the following Modified 3-Clause BSD license.]
 
 The Modified-BSD License
 
-Copyright (C) 2012-2023 Genome Research Ltd.
+Copyright (C) 2012-2024 Genome Research Ltd.
 
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are met:
diff --git a/htslib/Makefile b/htslib/Makefile
index bbe471c59..630720b34 100644
--- a/htslib/Makefile
+++ b/htslib/Makefile
@@ -1,6 +1,6 @@
 # Makefile for htslib, a C library for high-throughput sequencing data formats.
 #
-#    Copyright (C) 2013-2023 Genome Research Ltd.
+#    Copyright (C) 2013-2024 Genome Research Ltd.
 #
 #    Author: John Marshall <jm18@sanger.ac.uk>
 #
@@ -68,6 +68,7 @@ INSTALL_PROGRAM = $(INSTALL)
 plugindir =
 
 BUILT_PROGRAMS = \
+	annot-tsv \
 	bgzip \
 	htsfile \
 	tabix
@@ -84,8 +85,10 @@ BUILT_TEST_PROGRAMS = \
 	test/test_expr \
 	test/test_faidx \
 	test/test_kfunc \
+	test/test_khash \
 	test/test_kstring \
 	test/test_mod \
+	test/test_nibbles \
 	test/test_realn \
 	test/test-regidx \
 	test/test_str2int \
@@ -110,8 +113,14 @@ BUILT_THRASH_PROGRAMS = \
 	test/thrash_threads6 \
 	test/thrash_threads7
 
-all: lib-static lib-shared $(BUILT_PROGRAMS) plugins $(BUILT_TEST_PROGRAMS) \
-     htslib_static.mk htslib-uninstalled.pc
+all: lib-static lib-shared $(BUILT_PROGRAMS) plugins \
+	$(BUILT_TEST_PROGRAMS) htslib_static.mk htslib-uninstalled.pc
+
+# Report compiler and version
+cc-version:
+	-@$(CC) --version  2>/dev/null || true
+	-@$(CC) --qversion 2>/dev/null || true
+	-@$(CC) -V         2>/dev/null || true
 
 ALL_CPPFLAGS = -I. $(CPPFLAGS)
 
@@ -127,9 +136,7 @@ HTS_CFLAGS_SSE4 =
 # Control building of SIMD code.  Not used if configure has been run.
 HTS_BUILD_AVX2 =
 HTS_BUILD_AVX512 =
-HTS_BUILD_SSSE3 =
-HTS_BUILD_POPCNT =
-HTS_BUILD_SSE4_1 =
+HTS_BUILD_SSE4 =
 
 include htslib_vars.mk
 -include htscodecs.mk
@@ -144,12 +151,8 @@ LIBHTS_SOVERSION = 3
 # is not strictly necessary and should be removed the next time
 # LIBHTS_SOVERSION is bumped (see #1144 and
 # https://developer.apple.com/library/archive/documentation/DeveloperTools/Conceptual/DynamicLibraries/100-Articles/DynamicLibraryDesignGuidelines.html#//apple_ref/doc/uid/TP40002013-SW23)
-MACH_O_COMPATIBILITY_VERSION = 3.1.18
-MACH_O_CURRENT_VERSION = 3.1.18
-
-# $(NUMERIC_VERSION) is for items that must have a numeric X.Y.Z string
-# even if this is a dirty or untagged Git working tree.
-NUMERIC_VERSION := $(shell $(srcdir)/version.sh numeric)
+MACH_O_COMPATIBILITY_VERSION = 3.1.21
+MACH_O_CURRENT_VERSION = 3.1.21
 
 # Force version.h to be remade if $(PACKAGE_VERSION) has changed.
 version.h: $(if $(wildcard version.h),$(if $(findstring "$(PACKAGE_VERSION)",$(shell cat version.h)),,force))
@@ -162,7 +165,6 @@ print-version:
 
 show-version:
 	@echo PACKAGE_VERSION = $(PACKAGE_VERSION)
-	@echo NUMERIC_VERSION = $(NUMERIC_VERSION)
 
 config_vars.h: override escape=$(subst ',\x27,$(subst ",\",$(subst \,\\,$(1))))
 config_vars.h: override hts_cc_escaped=$(call escape,$(CC))
@@ -208,6 +210,7 @@ LIBHTS_OBJS = \
 	region.o \
 	sam.o \
 	sam_mods.o \
+	simd.o \
 	synced_bcf_reader.o \
 	vcf_sweep.o \
 	tbx.o \
@@ -245,6 +248,7 @@ cram_samtools_h = cram/cram_samtools.h $(htslib_sam_h)
 cram_structs_h = cram/cram_structs.h $(htslib_thread_pool_h) $(htslib_cram_h) cram/string_alloc.h cram/mFILE.h $(htslib_khash_h)
 cram_open_trace_file_h = cram/open_trace_file.h cram/mFILE.h
 bcf_sr_sort_h = bcf_sr_sort.h $(htslib_synced_bcf_reader_h) $(htslib_kbitset_h)
+fuzz_settings_h = fuzz_settings.h
 header_h = header.h cram/string_alloc.h cram/pooled_alloc.h $(htslib_khash_h) $(htslib_kstring_h) $(htslib_sam_h)
 hfile_internal_h = hfile_internal.h $(htslib_hts_defs_h) $(htslib_hfile_h) $(textutils_internal_h)
 hts_internal_h = hts_internal.h $(htslib_hts_h) $(textutils_internal_h)
@@ -276,9 +280,11 @@ config.h:
 	echo '#endif' >> $@
 	echo '#define HAVE_DRAND48 1' >> $@
 	echo '#define HAVE_LIBCURL 1' >> $@
-	if [ "x$(HTS_BUILD_POPCNT)" != "x" ] && \
-	   [ "x$(HTS_BUILD_SSE4_1)" != "x" ] && \
-	   [ "x$(HTS_BUILD_SSSE3)" != "x" ]; then \
+	if [ "x$(HTS_HAVE_CPUID)" != "x" ]; then \
+	    echo '#define HAVE_DECL___CPUID_COUNT 1' >> $@ ; \
+	    echo '#define HAVE_DECL___GET_CPUID_MAX 1' >> $@ ; \
+	fi
+	if [ "x$(HTS_BUILD_SSE4)" != "x" ]; then \
 	    echo '#define HAVE_POPCNT 1' >> $@ ; \
 	    echo '#define HAVE_SSE4_1 1' >> $@ ; \
 	    echo '#define HAVE_SSSE3 1' >> $@ ; \
@@ -292,6 +298,13 @@ config.h:
 	if [ "x$(HTS_BUILD_AVX512)" != "x" ] ; then \
 	    echo '#define HAVE_AVX512 1' >> $@ ; \
 	fi
+	echo '#if defined __x86_64__ || defined __arm__ || defined __aarch64__' >> $@
+	echo '#define HAVE_ATTRIBUTE_CONSTRUCTOR 1' >> $@
+	echo '#endif' >> $@
+	echo '#if (defined(__x86_64__) || defined(_M_X64))' >> $@
+	echo '#define HAVE_ATTRIBUTE_TARGET_SSSE3 1' >> $@
+	echo '#define HAVE_BUILTIN_CPU_SUPPORT_SSSE3 1' >> $@
+	echo '#endif' >> $@
 
 # And similarly for htslib.pc.tmp ("pkg-config template").  No dependency
 # on htslib.pc.in listed, as if that file is newer the usual way to regenerate
@@ -448,9 +461,10 @@ hfile_s3.o hfile_s3.pico: hfile_s3.c config.h $(hfile_internal_h) $(htslib_hts_h
 hts.o hts.pico: hts.c config.h os/lzma_stub.h $(htslib_hts_h) $(htslib_bgzf_h) $(cram_h) $(htslib_hfile_h) $(htslib_hts_endian_h) version.h config_vars.h $(hts_internal_h) $(hfile_internal_h) $(sam_internal_h) $(htslib_hts_expr_h) $(htslib_hts_os_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_ksort_h) $(htslib_tbx_h) $(htscodecs_htscodecs_h)
 hts_expr.o hts_expr.pico: hts_expr.c config.h $(htslib_hts_expr_h) $(htslib_hts_log_h) $(textutils_internal_h)
 hts_os.o hts_os.pico: hts_os.c config.h $(htslib_hts_defs_h) os/rand.c
-vcf.o vcf.pico: vcf.c config.h $(htslib_vcf_h) $(htslib_bgzf_h) $(htslib_tbx_h) $(htslib_hfile_h) $(hts_internal_h) $(htslib_khash_str2int_h) $(htslib_kstring_h) $(htslib_sam_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_hts_endian_h)
-sam.o sam.pico: sam.c config.h $(htslib_hts_defs_h) $(htslib_sam_h) $(htslib_bgzf_h) $(cram_h) $(hts_internal_h) $(sam_internal_h) $(htslib_hfile_h) $(htslib_hts_endian_h) $(htslib_hts_expr_h) $(header_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_kstring_h)
+vcf.o vcf.pico: vcf.c config.h $(fuzz_settings_h) $(htslib_vcf_h) $(htslib_bgzf_h) $(htslib_tbx_h) $(htslib_hfile_h) $(hts_internal_h) $(htslib_khash_str2int_h) $(htslib_kstring_h) $(htslib_sam_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_hts_endian_h)
+sam.o sam.pico: sam.c config.h $(fuzz_settings_h) $(htslib_hts_defs_h) $(htslib_sam_h) $(htslib_bgzf_h) $(cram_h) $(hts_internal_h) $(sam_internal_h) $(htslib_hfile_h) $(htslib_hts_endian_h) $(htslib_hts_expr_h) $(header_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_kstring_h)
 sam_mods.o sam_mods.pico: sam_mods.c config.h $(htslib_sam_h) $(textutils_internal_h)
+simd.o simd.pico: simd.c config.h $(htslib_sam_h) $(sam_internal_h)
 tbx.o tbx.pico: tbx.c config.h $(htslib_tbx_h) $(htslib_bgzf_h) $(htslib_hts_endian_h) $(hts_internal_h) $(htslib_khash_h)
 faidx.o faidx.pico: faidx.c config.h $(htslib_bgzf_h) $(htslib_faidx_h) $(htslib_hfile_h) $(htslib_khash_h) $(htslib_kstring_h) $(hts_internal_h)
 bcf_sr_sort.o bcf_sr_sort.pico: bcf_sr_sort.c config.h $(bcf_sr_sort_h) $(htslib_khash_str2int_h) $(htslib_kbitset_h)
@@ -467,12 +481,12 @@ probaln.o probaln.pico: probaln.c config.h $(htslib_hts_h)
 realn.o realn.pico: realn.c config.h $(htslib_hts_h) $(htslib_sam_h)
 textutils.o textutils.pico: textutils.c config.h $(htslib_hfile_h) $(htslib_kstring_h) $(htslib_sam_h) $(hts_internal_h)
 
-cram/cram_codecs.o cram/cram_codecs.pico: cram/cram_codecs.c config.h $(htslib_hts_endian_h) $(htscodecs_varint_h) $(htscodecs_pack_h) $(htscodecs_rle_h) $(cram_h)
+cram/cram_codecs.o cram/cram_codecs.pico: cram/cram_codecs.c config.h $(fuzz_settings_h) $(htslib_hts_endian_h) $(htscodecs_varint_h) $(htscodecs_pack_h) $(htscodecs_rle_h) $(cram_h)
 cram/cram_decode.o cram/cram_decode.pico: cram/cram_decode.c config.h $(cram_h) $(cram_os_h) $(htslib_hts_h)
 cram/cram_encode.o cram/cram_encode.pico: cram/cram_encode.c config.h $(cram_h) $(cram_os_h) $(sam_internal_h) $(htslib_hts_h) $(htslib_hts_endian_h) $(textutils_internal_h)
 cram/cram_external.o cram/cram_external.pico: cram/cram_external.c config.h $(htscodecs_rANS_static4x16_h) $(htslib_hfile_h) $(cram_h)
 cram/cram_index.o cram/cram_index.pico: cram/cram_index.c config.h $(htslib_bgzf_h) $(htslib_hfile_h) $(hts_internal_h) $(cram_h) $(cram_os_h)
-cram/cram_io.o cram/cram_io.pico: cram/cram_io.c config.h os/lzma_stub.h $(cram_h) $(cram_os_h) $(htslib_hts_h) $(cram_open_trace_file_h) $(htscodecs_rANS_static_h) $(htscodecs_rANS_static4x16_h) $(htscodecs_arith_dynamic_h) $(htscodecs_tokenise_name3_h) $(htscodecs_fqzcomp_qual_h) $(htscodecs_varint_h) $(htslib_hfile_h) $(htslib_bgzf_h) $(htslib_faidx_h) $(hts_internal_h)
+cram/cram_io.o cram/cram_io.pico: cram/cram_io.c config.h os/lzma_stub.h $(fuzz_settings_h) $(cram_h) $(cram_os_h) $(htslib_hts_h) $(cram_open_trace_file_h) $(htscodecs_rANS_static_h) $(htscodecs_rANS_static4x16_h) $(htscodecs_arith_dynamic_h) $(htscodecs_tokenise_name3_h) $(htscodecs_fqzcomp_qual_h) $(htscodecs_varint_h) $(htslib_hfile_h) $(htslib_bgzf_h) $(htslib_faidx_h) $(hts_internal_h)
 cram/cram_stats.o cram/cram_stats.pico: cram/cram_stats.c config.h $(cram_h) $(cram_os_h)
 cram/mFILE.o cram/mFILE.pico: cram/mFILE.c config.h $(htslib_hts_log_h) $(cram_os_h) cram/mFILE.h
 cram/open_trace_file.o cram/open_trace_file.pico: cram/open_trace_file.c config.h $(cram_os_h) $(cram_open_trace_file_h) $(cram_misc_h) $(htslib_hfile_h) $(htslib_hts_log_h) $(htslib_hts_h)
@@ -500,6 +514,9 @@ htscodecs/htscodecs/rANS_static32x16pr_avx2.o htscodecs/htscodecs/rANS_static32x
 htscodecs/htscodecs/rANS_static32x16pr_avx512.o htscodecs/htscodecs/rANS_static32x16pr_avx512.pico: TARGET_CFLAGS = $(HTS_CFLAGS_AVX512)
 htscodecs/htscodecs/rANS_static32x16pr_sse4.o htscodecs/htscodecs/rANS_static32x16pr_sse4.pico: TARGET_CFLAGS = $(HTS_CFLAGS_SSE4)
 
+annot-tsv: annot-tsv.o libhts.a
+	$(CC) $(LDFLAGS) -o $@ annot-tsv.o libhts.a $(LIBS) -lpthread
+
 bgzip: bgzip.o libhts.a
 	$(CC) $(LDFLAGS) -o $@ bgzip.o libhts.a $(LIBS) -lpthread
 
@@ -509,9 +526,10 @@ htsfile: htsfile.o libhts.a
 tabix: tabix.o libhts.a
 	$(CC) $(LDFLAGS) -o $@ tabix.o libhts.a $(LIBS) -lpthread
 
+annot-tsv.o: annot-tsv.c config.h $(htslib_hts_h) $(htslib_hts_defs_h) $(htslib_khash_str2int_h) $(htslib_kstring_h) $(htslib_kseq_h) $(htslib_bgzf_h) $(htslib_regidx_h) $(textutils_internal_h)
 bgzip.o: bgzip.c config.h $(htslib_bgzf_h) $(htslib_hts_h) $(htslib_hfile_h)
 htsfile.o: htsfile.c config.h $(htslib_hfile_h) $(htslib_hts_h) $(htslib_sam_h) $(htslib_vcf_h)
-tabix.o: tabix.c config.h $(htslib_tbx_h) $(htslib_sam_h) $(htslib_vcf_h) $(htslib_kseq_h) $(htslib_bgzf_h) $(htslib_hts_h) $(htslib_regidx_h) $(htslib_hts_defs_h) $(htslib_hts_log_h)
+tabix.o: tabix.c config.h $(htslib_tbx_h) $(htslib_sam_h) $(htslib_vcf_h) $(htslib_kseq_h) $(htslib_bgzf_h) $(htslib_hts_h) $(htslib_regidx_h) $(htslib_hts_defs_h) $(htslib_hts_log_h) $(htslib_thread_pool_h)
 
 # Runes to check that the htscodecs submodule is present
 ifdef HTSCODECS_SOURCES
@@ -548,7 +566,7 @@ htscodecs/htscodecs/version.h: force
 	  vers=`cd $(srcdir)/htscodecs && git describe --always --dirty --match 'v[0-9]\.[0-9]*'` && \
 	  case "$$vers" in \
 	    v*) vers=$${vers#v} ;; \
-	    *) iv=`awk '/^AC_INIT/ { match($$0, /^AC_INIT\(htscodecs, *([0-9](\.[0-9])*)\)/, m); print substr($$0, m[1, "start"], m[1, "length"]) }' $(srcdir)/htscodecs/configure.ac` ; vers="$$iv$${vers:+-g$$vers}" ;; \
+	    *) iv=`awk '/^AC_INIT\(htscodecs,/ { match($$0, /[0-9]+(\.[0-9]+)*/); print substr($$0, RSTART, RLENGTH) }' $(srcdir)/htscodecs/configure.ac` ; vers="$$iv$${vers:+-g$$vers}" ;; \
 	  esac ; \
 	  if ! grep -s -q '"'"$$vers"'"' $@ ; then \
 	    echo 'Updating $@ : #define HTSCODECS_VERSION_TEXT "'"$$vers"'"' ; \
@@ -557,10 +575,12 @@ htscodecs/htscodecs/version.h: force
 	fi
 endif
 
+# Maintainer extra targets built
+# - compile public headers as C++
 # Maintainer source code checks
 # - copyright boilerplate presence
 # - tab and trailing space detection
-maintainer-check:
+maintainer-check: test/usepublic.o
 	test/maintainer/check_copyright.pl .
 	test/maintainer/check_spaces.pl .
 
@@ -585,7 +605,9 @@ check test: all $(HTSCODECS_TEST_TARGETS)
 	test/hts_endian
 	test/test_expr
 	test/test_kfunc
+	test/test_khash
 	test/test_kstring
+	test/test_nibbles -v
 	test/test_str2int
 	test/test_time_funcs
 	test/fieldarith test/fieldarith.sam
@@ -611,7 +633,11 @@ check test: all $(HTSCODECS_TEST_TARGETS)
 test/hts_endian: test/hts_endian.o
 	$(CC) $(LDFLAGS) -o $@ test/hts_endian.o $(LIBS)
 
-test/fuzz/hts_open_fuzzer: test/fuzz/hts_open_fuzzer.o
+# To build the fuzzer, try:
+# make  CC="clang16 -fsanitize=address,undefined,fuzzer" \
+#     CFLAGS="-g -O3 -DFUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION" \
+#     test/fuzz/hts_open_fuzzer
+test/fuzz/hts_open_fuzzer: test/fuzz/hts_open_fuzzer.o libhts.a
 	$(CC) $(LDFLAGS) -o $@ test/fuzz/hts_open_fuzzer.o libhts.a $(LIBS) -lpthread
 
 test/fieldarith: test/fieldarith.o libhts.a
@@ -633,23 +659,29 @@ test/sam: test/sam.o libhts.a
 	$(CC) $(LDFLAGS) -o $@ test/sam.o libhts.a $(LIBS) -lpthread
 
 test/test_bgzf: test/test_bgzf.o libhts.a
-	$(CC) $(LDFLAGS) -o $@ test/test_bgzf.o libhts.a -lz $(LIBS) -lpthread
+	$(CC) $(LDFLAGS) -o $@ test/test_bgzf.o libhts.a $(LIBS) -lpthread
 
 test/test_expr: test/test_expr.o libhts.a
-	$(CC) $(LDFLAGS) -o $@ test/test_expr.o libhts.a -lz $(LIBS) -lpthread
+	$(CC) $(LDFLAGS) -o $@ test/test_expr.o libhts.a $(LIBS) -lpthread
 
 test/test_faidx: test/test_faidx.o libhts.a
-	$(CC) $(LDFLAGS) -o $@ test/test_faidx.o libhts.a -lz $(LIBS) -lpthread
+	$(CC) $(LDFLAGS) -o $@ test/test_faidx.o libhts.a $(LIBS) -lpthread
 
 test/test_kfunc: test/test_kfunc.o libhts.a
-	$(CC) $(LDFLAGS) -o $@ test/test_kfunc.o libhts.a -lz $(LIBS) -lpthread
+	$(CC) $(LDFLAGS) -o $@ test/test_kfunc.o libhts.a $(LIBS) -lpthread
+
+test/test_khash: test/test_khash.o libhts.a
+	$(CC) $(LDFLAGS) -o $@ test/test_khash.o libhts.a $(LIBS) -lpthread
 
 test/test_kstring: test/test_kstring.o libhts.a
-	$(CC) $(LDFLAGS) -o $@ test/test_kstring.o libhts.a -lz $(LIBS) -lpthread
+	$(CC) $(LDFLAGS) -o $@ test/test_kstring.o libhts.a $(LIBS) -lpthread
 
 test/test_mod: test/test_mod.o libhts.a
 	$(CC) $(LDFLAGS) -o $@ test/test_mod.o libhts.a $(LIBS) -lpthread
 
+test/test_nibbles: test/test_nibbles.o libhts.a
+	$(CC) $(LDFLAGS) -o $@ test/test_nibbles.o libhts.a $(LIBS) -lpthread
+
 test/test_realn: test/test_realn.o libhts.a
 	$(CC) $(LDFLAGS) -o $@ test/test_realn.o libhts.a $(LIBS) -lpthread
 
@@ -678,10 +710,10 @@ test/test-vcf-sweep: test/test-vcf-sweep.o libhts.a
 	$(CC) $(LDFLAGS) -o $@ test/test-vcf-sweep.o libhts.a $(LIBS) -lpthread
 
 test/test-bcf-sr: test/test-bcf-sr.o libhts.a
-	$(CC) $(LDFLAGS) -o $@ test/test-bcf-sr.o libhts.a -lz $(LIBS) -lpthread
+	$(CC) $(LDFLAGS) -o $@ test/test-bcf-sr.o libhts.a $(LIBS) -lpthread
 
 test/test-bcf-translate: test/test-bcf-translate.o libhts.a
-	$(CC) $(LDFLAGS) -o $@ test/test-bcf-translate.o libhts.a -lz $(LIBS) -lpthread
+	$(CC) $(LDFLAGS) -o $@ test/test-bcf-translate.o libhts.a $(LIBS) -lpthread
 
 test/test_introspection: test/test_introspection.o libhts.a
 	$(CC) $(LDFLAGS) -o $@ test/test_introspection.o libhts.a $(LIBS) -lpthread
@@ -750,8 +782,10 @@ test/sam.o: test/sam.c config.h $(htslib_hts_defs_h) $(htslib_sam_h) $(htslib_fa
 test/test_bgzf.o: test/test_bgzf.c config.h $(htslib_bgzf_h) $(htslib_hfile_h) $(htslib_hts_log_h) $(hfile_internal_h)
 test/test_expr.o: test/test_expr.c config.h $(htslib_hts_expr_h)
 test/test_kfunc.o: test/test_kfunc.c config.h $(htslib_kfunc_h)
+test/test_khash.o: test/test_khash.c config.h $(htslib_khash_h) $(htslib_kroundup_h)
 test/test_kstring.o: test/test_kstring.c config.h $(htslib_kstring_h)
 test/test_mod.o: test/test_mod.c config.h $(htslib_sam_h)
+test/test_nibbles.o: test/test_nibbles.c config.h $(htslib_sam_h) $(sam_internal_h)
 test/test-parse-reg.o: test/test-parse-reg.c config.h $(htslib_hts_h) $(htslib_sam_h)
 test/test_realn.o: test/test_realn.c config.h $(htslib_hts_h) $(htslib_sam_h) $(htslib_faidx_h)
 test/test-regidx.o: test/test-regidx.c config.h $(htslib_kstring_h) $(htslib_regidx_h) $(htslib_hts_defs_h) $(textutils_internal_h)
@@ -767,27 +801,32 @@ test/test-bcf-translate.o: test/test-bcf-translate.c config.h $(htslib_vcf_h)
 test/test_introspection.o: test/test_introspection.c config.h $(htslib_hts_h) $(htslib_hfile_h)
 test/test-bcf_set_variant_type.o: test/test-bcf_set_variant_type.c config.h $(htslib_hts_h) vcf.c
 
+# Standalone target not added to $(BUILT_TEST_PROGRAMS) as some may not
+# have a compiler that compiles as C++ when given a .cpp source file.
+test/usepublic.o: test/usepublic.cpp config.h $(htslib_bgzf_h) $(htslib_cram_h) $(htslib_faidx_h) $(htslib_hfile_h) $(htslib_hts_h) $(htslib_hts_defs_h) $(htslib_hts_endian_h) $(htslib_hts_expr_h) $(htslib_hts_log_h) $(htslib_hts_os_h) $(htslib_kbitset_h) $(htslib_kfunc_h) $(htslib_khash_h) $(htslib_khash_str2int_h) $(htslib_klist_h) $(HTSPREFIX)htslib/knetfile.h $(htslib_kroundup_h) $(htslib_kseq_h) $(htslib_ksort_h) $(htslib_kstring_h) $(htslib_regidx_h) $(htslib_sam_h) $(htslib_synced_bcf_reader_h) $(htslib_tbx_h) $(htslib_thread_pool_h) $(htslib_vcf_h) $(htslib_vcf_sweep_h) $(htslib_vcfutils_h)
+	$(CC) $(CFLAGS) $(TARGET_CFLAGS) $(ALL_CPPFLAGS) -c -o $@ test/usepublic.cpp
+
 
 test/thrash_threads1: test/thrash_threads1.o libhts.a
-	$(CC) $(LDFLAGS) -o $@ test/thrash_threads1.o libhts.a -lz $(LIBS) -lpthread
+	$(CC) $(LDFLAGS) -o $@ test/thrash_threads1.o libhts.a $(LIBS) -lpthread
 
 test/thrash_threads2: test/thrash_threads2.o libhts.a
-	$(CC) $(LDFLAGS) -o $@ test/thrash_threads2.o libhts.a -lz $(LIBS) -lpthread
+	$(CC) $(LDFLAGS) -o $@ test/thrash_threads2.o libhts.a $(LIBS) -lpthread
 
 test/thrash_threads3: test/thrash_threads3.o libhts.a
-	$(CC) $(LDFLAGS) -o $@ test/thrash_threads3.o libhts.a -lz $(LIBS) -lpthread
+	$(CC) $(LDFLAGS) -o $@ test/thrash_threads3.o libhts.a $(LIBS) -lpthread
 
 test/thrash_threads4: test/thrash_threads4.o libhts.a
-	$(CC) $(LDFLAGS) -o $@ test/thrash_threads4.o libhts.a -lz $(LIBS) -lpthread
+	$(CC) $(LDFLAGS) -o $@ test/thrash_threads4.o libhts.a $(LIBS) -lpthread
 
 test/thrash_threads5: test/thrash_threads5.o libhts.a
-	$(CC) $(LDFLAGS) -o $@ test/thrash_threads5.o libhts.a -lz $(LIBS) -lpthread
+	$(CC) $(LDFLAGS) -o $@ test/thrash_threads5.o libhts.a $(LIBS) -lpthread
 
 test/thrash_threads6: test/thrash_threads6.o libhts.a
-	$(CC) $(LDFLAGS) -o $@ test/thrash_threads6.o libhts.a -lz $(LIBS) -lpthread
+	$(CC) $(LDFLAGS) -o $@ test/thrash_threads6.o libhts.a $(LIBS) -lpthread
 
 test/thrash_threads7: test/thrash_threads7.o libhts.a
-	$(CC) $(LDFLAGS) -o $@ test/thrash_threads7.o libhts.a -lz $(LIBS) -lpthread
+	$(CC) $(LDFLAGS) -o $@ test/thrash_threads7.o libhts.a $(LIBS) -lpthread
 
 test_thrash: $(BUILT_THRASH_PROGRAMS)
 
@@ -847,7 +886,7 @@ install: libhts.a $(BUILT_PROGRAMS) $(BUILT_PLUGINS) installdirs install-$(SHLIB
 	if test -n "$(BUILT_PLUGINS)"; then $(INSTALL_PROGRAM) $(BUILT_PLUGINS) $(DESTDIR)$(plugindir); fi
 	$(INSTALL_DATA) $(SRC)htslib/*.h $(DESTDIR)$(includedir)/htslib
 	$(INSTALL_DATA) libhts.a $(DESTDIR)$(libdir)/libhts.a
-	$(INSTALL_MAN) $(SRC)bgzip.1 $(SRC)htsfile.1 $(SRC)tabix.1 $(DESTDIR)$(man1dir)
+	$(INSTALL_MAN) $(SRC)annot-tsv.1 $(SRC)bgzip.1 $(SRC)htsfile.1 $(SRC)tabix.1 $(DESTDIR)$(man1dir)
 	$(INSTALL_MAN) $(SRC)faidx.5 $(SRC)sam.5 $(SRC)vcf.5 $(DESTDIR)$(man5dir)
 	$(INSTALL_MAN) $(SRC)htslib-s3-plugin.7 $(DESTDIR)$(man7dir)
 
@@ -890,8 +929,9 @@ htslib-uninstalled.pc: htslib.pc.tmp
 
 
 testclean:
-	-rm -f test/*.tmp test/*.tmp.* test/faidx/*.tmp* test/faidx/FAIL* \
-               test/longrefs/*.tmp.* test/tabix/*.tmp.* test/tabix/FAIL* \
+	-rm -f test/*.tmp test/*.tmp.* test/faidx/*.tmp* \
+               test/longrefs/*.tmp.* test/tabix/*.tmp.* \
+               test/bgzf_boundaries/*.tmp.* test/*/FAIL* \
                header-exports.txt shlib-exports-$(SHLIB_FLAVOUR).txt
 	-rm -rf htscodecs/tests/test.out
 
@@ -955,3 +995,4 @@ force:
 .PHONY: clean-dylib install-dylib
 .PHONY: test_htscodecs_rans4x8 test_htscodecs_rans4x16 test_htscodecs_arith
 .PHONY: test_htscodecs_tok3 test_htscodecs_fqzcomp test_htscodecs_varint
+.PHONY: cc-version
diff --git a/htslib/bgzf.c b/htslib/bgzf.c
index 45f2b1150..8092c7b9a 100644
--- a/htslib/bgzf.c
+++ b/htslib/bgzf.c
@@ -2,7 +2,7 @@
 
    Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology
                  2011, 2012 Attractive Chaos <attractor@live.co.uk>
-   Copyright (C) 2009, 2013-2022 Genome Research Ltd
+   Copyright (C) 2009, 2013-2023 Genome Research Ltd
 
    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to deal
@@ -230,41 +230,8 @@ int bgzf_idx_push(BGZF *fp, hts_idx_t *hidx, int tid, hts_pos_t beg, hts_pos_t e
     return 0;
 }
 
-/*
- * bgzf analogue to hts_idx_amend_last.
- *
- * This is needed when multi-threading and writing indices on the fly.
- * At the point of writing a record we know the virtual offset for start
- * and end, but that end virtual offset may be the end of the current
- * block.  In standard indexing our end virtual offset becomes the start
- * of the next block.  Thus to ensure bit for bit compatibility we
- * detect this boundary case and fix it up here.
- *
- * In theory this has no behavioural change, but it also works around
- * a bug elsewhere which causes bgzf_read to return 0 when our offset
- * is the end of a block rather than the start of the next.
- */
-void bgzf_idx_amend_last(BGZF *fp, hts_idx_t *hidx, uint64_t offset) {
-    mtaux_t *mt = fp->mt;
-    if (!mt) {
-        hts_idx_amend_last(hidx, offset);
-        return;
-    }
-
-    pthread_mutex_lock(&mt->idx_m);
-    hts_idx_cache_t *ic = &mt->idx_cache;
-    if (ic->nentries > 0) {
-        hts_idx_cache_entry *e = &ic->e[ic->nentries-1];
-        if ((offset & 0xffff) == 0 && e->offset != 0) {
-            // bumped to next block number
-            e->offset = 0;
-            e->block_number++;
-        }
-    }
-    pthread_mutex_unlock(&mt->idx_m);
-}
-
-static int bgzf_idx_flush(BGZF *fp) {
+static int bgzf_idx_flush(BGZF *fp,
+                          size_t block_uncomp_len, size_t block_comp_len) {
     mtaux_t *mt = fp->mt;
 
     if (!mt->idx_cache.e) {
@@ -280,6 +247,37 @@ static int bgzf_idx_flush(BGZF *fp) {
     assert(mt->idx_cache.nentries == 0 || mt->block_written <= e[0].block_number);
 
     for (i = 0; i < mt->idx_cache.nentries && e[i].block_number == mt->block_written; i++) {
+        if (block_uncomp_len > 0 && e[i].offset == block_uncomp_len) {
+            /*
+             * If the virtual offset is at the end of the current block,
+             * adjust it to point to the start of the next one.  This
+             * is needed when on-the-fly indexing has recorded a virtual
+             * offset just before a new block has been started, and makes
+             * on-the-fly and standard indexing give exactly the same results.
+             *
+             * In theory the two virtual offsets are equivalent, but pointing
+             * to the end of a block is inefficient, and caused problems with
+             * versions of HTSlib before 1.11 where bgzf_read() would
+             * incorrectly return EOF.
+             */
+
+            // Assert that this is the last entry for the current block_number
+            assert(i == mt->idx_cache.nentries - 1
+                   || e[i].block_number < e[i + 1].block_number);
+
+            // Work out where the next block starts.  For this entry, the
+            // offset will be zero.
+            uint64_t next_block_addr = mt->block_address + block_comp_len;
+            if (hts_idx_push(mt->hts_idx, e[i].tid, e[i].beg, e[i].end,
+                             next_block_addr << 16, e[i].is_mapped) < 0) {
+                pthread_mutex_unlock(&mt->idx_m);
+                return -1;
+            }
+            // Count this entry and drop out of the loop
+            i++;
+            break;
+        }
+
         if (hts_idx_push(mt->hts_idx, e[i].tid, e[i].beg, e[i].end,
                          (mt->block_address << 16) + e[i].offset,
                          e[i].is_mapped) < 0) {
@@ -733,6 +731,10 @@ static int bgzf_uncompress(uint8_t *dst, size_t *dlen,
     }
 
     uint32_t crc = libdeflate_crc32(0, (unsigned char *)dst, *dlen);
+#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+    // Pretend the CRC was OK so the fuzzer doesn't have to get it right
+    crc = expected_crc;
+#endif
     if (crc != expected_crc) {
         hts_log_error("CRC32 checksum mismatch");
         return -2;
@@ -775,6 +777,10 @@ static int bgzf_uncompress(uint8_t *dst, size_t *dlen,
     *dlen = *dlen - zs.avail_out;
 
     uint32_t crc = crc32(crc32(0L, NULL, 0L), (unsigned char *)dst, *dlen);
+#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+    // Pretend the CRC was OK so the fuzzer doesn't have to get it right
+    crc = expected_crc;
+#endif
     if (crc != expected_crc) {
         hts_log_error("CRC32 checksum mismatch");
         return -2;
@@ -1415,7 +1421,7 @@ static void *bgzf_mt_writer(void *vp) {
         }
 
         // Flush any cached hts_idx_push calls
-        if (bgzf_idx_flush(fp) < 0)
+        if (bgzf_idx_flush(fp, j->uncomp_len, j->comp_len) < 0)
             goto err;
 
         if (hwrite(fp->fp, j->comp_data, j->comp_len) != j->comp_len)
@@ -1953,6 +1959,11 @@ int bgzf_flush(BGZF *fp)
         return ret;
     }
 #endif
+
+    if (!fp->is_compressed) {
+        return hflush(fp->fp);
+    }
+
     while (fp->block_offset > 0) {
         int block_length;
         if ( fp->idx_build_otf )
@@ -2280,7 +2291,13 @@ int bgzf_getline(BGZF *fp, int delim, kstring_t *str)
             if (fp->block_length == 0) { state = -1; break; }
         }
         unsigned char *buf = fp->uncompressed_block;
-        for (l = fp->block_offset; l < fp->block_length && buf[l] != delim; ++l);
+
+        // Equivalent to a naive byte by byte search from
+        // buf + block_offset to buf + block_length.
+        void *e = memchr(&buf[fp->block_offset], delim,
+                         fp->block_length - fp->block_offset);
+        l = e ? (unsigned char *)e - buf : fp->block_length;
+
         if (l < fp->block_length) state = 1;
         l -= fp->block_offset;
         if (ks_expand(str, l + 2) < 0) { state = -3; break; }
@@ -2552,6 +2569,7 @@ int bgzf_useek(BGZF *fp, off_t uoffset, int where)
         else break;
     }
     int i = ilo-1;
+    off_t offset = 0;
     if (bgzf_seek_common(fp, fp->idx->offs[i].caddr, 0) < 0)
         return -1;
 
@@ -2559,9 +2577,14 @@ int bgzf_useek(BGZF *fp, off_t uoffset, int where)
         fp->errcode |= BGZF_ERR_IO;
         return -1;
     }
-    if ( uoffset - fp->idx->offs[i].uaddr > 0 )
+    offset = uoffset - fp->idx->offs[i].uaddr;
+    if ( offset > 0 )
     {
-        fp->block_offset = uoffset - fp->idx->offs[i].uaddr;
+        if (offset > fp->block_length) {
+            fp->errcode |= BGZF_ERR_IO;
+            return -1;                                      //offset outside the available data
+        }
+        fp->block_offset = offset;
         assert( fp->block_offset <= fp->block_length );     // todo: skipped, unindexed, blocks
     }
     fp->uncompressed_address = uoffset;
diff --git a/htslib/bgzip.c b/htslib/bgzip.c
deleted file mode 100644
index 589f79f66..000000000
--- a/htslib/bgzip.c
+++ /dev/null
@@ -1,516 +0,0 @@
-/* bgzip.c -- Block compression/decompression utility.
-
-   Copyright (C) 2008, 2009 Broad Institute / Massachusetts Institute of Technology
-   Copyright (C) 2010, 2013-2019, 2021-2022 Genome Research Ltd.
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notices and this permission notice shall be included in
-   all copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-   THE SOFTWARE.
-*/
-
-#include <config.h>
-
-#include <stdlib.h>
-#include <string.h>
-#include <strings.h>
-#include <stdio.h>
-#include <fcntl.h>
-#include <unistd.h>
-#include <errno.h>
-#include <stdarg.h>
-#include <getopt.h>
-#include <inttypes.h>
-#include "htslib/bgzf.h"
-#include "htslib/hts.h"
-#include "htslib/hfile.h"
-
-#ifdef _WIN32
-#  define WIN32_LEAN_AND_MEAN
-#  include <windows.h>
-#endif
-
-static const int WINDOW_SIZE = BGZF_BLOCK_SIZE;
-
-static void error(const char *format, ...)
-{
-    va_list ap;
-    va_start(ap, format);
-    vfprintf(stderr, format, ap);
-    va_end(ap);
-    exit(EXIT_FAILURE);
-}
-
-static int ask_yn()
-{
-    char line[1024];
-    if (fgets(line, sizeof line, stdin) == NULL)
-        return 0;
-    return line[0] == 'Y' || line[0] == 'y';
-}
-
-static int confirm_overwrite(const char *fn)
-{
-    int save_errno = errno;
-    int ret = 0;
-
-    if (isatty(STDIN_FILENO)) {
-        fprintf(stderr, "[bgzip] %s already exists; do you wish to overwrite (y or n)? ", fn);
-        if (ask_yn()) ret = 1;
-    }
-
-    errno = save_errno;
-    return ret;
-}
-
-static int known_extension(const char *ext)
-{
-    static const char *known[] = {
-        "gz", "bgz", "bgzf",
-        NULL
-    };
-
-    const char **p;
-    for (p = known; *p; p++)
-        if (strcasecmp(ext, *p) == 0) return 1;
-    return 0;
-}
-
-static int confirm_filename(int *is_forced, const char *name, const char *ext)
-{
-    if (*is_forced) {
-        (*is_forced)--;
-        return 1;
-    }
-
-    if (!isatty(STDIN_FILENO))
-        return 0;
-
-    fprintf(stderr, "[bgzip] .%s is not a known extension; do you wish to decompress to %s (y or n)? ", ext, name);
-    return ask_yn();
-}
-
-static int bgzip_main_usage(FILE *fp, int status)
-{
-    fprintf(fp, "\n");
-    fprintf(fp, "Version: %s\n", hts_version());
-    fprintf(fp, "Usage:   bgzip [OPTIONS] [FILE] ...\n");
-    fprintf(fp, "Options:\n");
-    fprintf(fp, "   -b, --offset INT           decompress at virtual file pointer (0-based uncompressed offset)\n");
-    fprintf(fp, "   -c, --stdout               write on standard output, keep original files unchanged\n");
-    fprintf(fp, "   -d, --decompress           decompress\n");
-    fprintf(fp, "   -f, --force                overwrite files without asking\n");
-    fprintf(fp, "   -g, --rebgzip              use an index file to bgzip a file\n");
-    fprintf(fp, "   -h, --help                 give this help\n");
-    fprintf(fp, "   -i, --index                compress and create BGZF index\n");
-    fprintf(fp, "   -I, --index-name FILE      name of BGZF index file [file.gz.gzi]\n");
-    fprintf(fp, "   -k, --keep                 don't delete input files during operation\n");
-    fprintf(fp, "   -l, --compress-level INT   Compression level to use when compressing; 0 to 9, or -1 for default [-1]\n");
-    fprintf(fp, "   -r, --reindex              (re)index compressed file\n");
-    fprintf(fp, "   -s, --size INT             decompress INT bytes (uncompressed size)\n");
-    fprintf(fp, "   -t, --test                 test integrity of compressed file\n");
-    fprintf(fp, "       --binary               Don't align blocks with text lines\n");
-    fprintf(fp, "   -@, --threads INT          number of compression threads to use [1]\n");
-    return status;
-}
-
-int main(int argc, char **argv)
-{
-    int c, compress, compress_level = -1, pstdout, is_forced, test, index = 0, rebgzip = 0, reindex = 0, keep, binary;
-    BGZF *fp;
-    char *buffer;
-    long start, end, size;
-    char *index_fname = NULL;
-    int threads = 1;
-
-    static const struct option loptions[] =
-    {
-        {"help", no_argument, NULL, 'h'},
-        {"offset", required_argument, NULL, 'b'},
-        {"stdout", no_argument, NULL, 'c'},
-        {"decompress", no_argument, NULL, 'd'},
-        {"force", no_argument, NULL, 'f'},
-        {"index", no_argument, NULL, 'i'},
-        {"index-name", required_argument, NULL, 'I'},
-        {"compress-level", required_argument, NULL, 'l'},
-        {"reindex", no_argument, NULL, 'r'},
-        {"rebgzip",no_argument,NULL,'g'},
-        {"size", required_argument, NULL, 's'},
-        {"threads", required_argument, NULL, '@'},
-        {"test", no_argument, NULL, 't'},
-        {"version", no_argument, NULL, 1},
-        {"keep", no_argument, NULL, 'k'},
-        {"binary", no_argument, NULL, 2},
-        {NULL, 0, NULL, 0}
-    };
-
-    compress = 1; pstdout = 0; start = 0; size = -1; end = -1; is_forced = 0; test = 0; keep = 0; binary = 0;
-    while((c  = getopt_long(argc, argv, "cdh?fb:@:s:iI:l:grtk",loptions,NULL)) >= 0){
-        switch(c){
-        case 'd': compress = 0; break;
-        case 'c': pstdout = 1; break;
-        case 'b': start = atol(optarg); compress = 0; pstdout = 1; break;
-        case 's': size = atol(optarg); pstdout = 1; break;
-        case 'f': is_forced++; break;
-        case 'i': index = 1; break;
-        case 'I': index_fname = optarg; break;
-        case 'l': compress_level = atol(optarg); break;
-        case 'g': rebgzip = 1; break;
-        case 'r': reindex = 1; compress = 0; break;
-        case '@': threads = atoi(optarg); break;
-        case 't': test = 1; compress = 0; reindex = 0; break;
-        case 'k': keep = 1; break;
-        case 1:
-            printf(
-"bgzip (htslib) %s\n"
-"Copyright (C) 2023 Genome Research Ltd.\n", hts_version());
-            return EXIT_SUCCESS;
-        case  2:  binary = 1; break;
-        case 'h': return bgzip_main_usage(stdout, EXIT_SUCCESS);
-        case '?': return bgzip_main_usage(stderr, EXIT_FAILURE);
-        }
-    }
-    if (size >= 0) end = start + size;
-    if (end >= 0 && end < start) {
-        fprintf(stderr, "[bgzip] Illegal region: [%ld, %ld]\n", start, end);
-        return 1;
-    }
-    if (compress == 1) {
-        hFILE* f_src = NULL;
-        char out_mode[3] = "w\0";
-        char out_mode_exclusive[4] = "wx\0";
-
-        if (compress_level < -1 || compress_level > 9) {
-            fprintf(stderr, "[bgzip] Invalid compress-level: %d\n", compress_level);
-            return 1;
-        }
-        if (compress_level >= 0) {
-            out_mode[1] = compress_level + '0';
-            out_mode_exclusive[2] = compress_level + '0';
-        }
-
-        if (!(f_src = hopen(argc > optind ? argv[optind] : "-", "r"))) {
-            fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]);
-            return 1;
-        }
-
-        if ( argc>optind )
-        {
-            if (pstdout)
-                fp = bgzf_open("-", out_mode);
-            else
-            {
-                char *name = malloc(strlen(argv[optind]) + 5);
-                strcpy(name, argv[optind]);
-                strcat(name, ".gz");
-                fp = bgzf_open(name, is_forced? out_mode : out_mode_exclusive);
-                if (fp == NULL && errno == EEXIST && confirm_overwrite(name))
-                    fp = bgzf_open(name, out_mode);
-                if (fp == NULL) {
-                    fprintf(stderr, "[bgzip] can't create %s: %s\n", name, strerror(errno));
-                    free(name);
-                    return 1;
-                }
-                free(name);
-            }
-        }
-        else if (!pstdout && isatty(fileno((FILE *)stdout)) )
-            return bgzip_main_usage(stderr, EXIT_FAILURE);
-        else if ( index && !index_fname )
-        {
-            fprintf(stderr, "[bgzip] Index file name expected when writing to stdout\n");
-            return 1;
-        }
-        else
-            fp = bgzf_open("-", out_mode);
-
-        if ( index && rebgzip )
-        {
-            fprintf(stderr, "[bgzip] Can't produce a index and rebgzip simultaneously\n");
-            return 1;
-        }
-
-        if ( rebgzip && !index_fname )
-        {
-            fprintf(stderr, "[bgzip] Index file name expected when writing to stdout.  See -I option.\n");
-            return 1;
-        }
-
-        if ( index ) bgzf_index_build_init(fp);
-        if (threads > 1)
-            bgzf_mt(fp, threads, 256);
-
-        buffer = malloc(WINDOW_SIZE);
-        if (!buffer)
-            return 1;
-        if (rebgzip){
-            if ( bgzf_index_load(fp, index_fname, NULL) < 0 ) error("Could not load index: %s.gzi\n", argv[optind]);
-
-            while ((c = hread(f_src, buffer, WINDOW_SIZE)) > 0)
-                if (bgzf_block_write(fp, buffer, c) < 0) error("Could not write %d bytes: Error %d\n", c, fp->errcode);
-        }
-        else {
-            htsFormat fmt;
-            int textual = 0;
-            if (!binary
-                && hts_detect_format(f_src, &fmt) == 0
-                && fmt.compression == no_compression) {
-                switch(fmt.format) {
-                case text_format:
-                case sam:
-                case vcf:
-                case bed:
-                case fasta_format:
-                case fastq_format:
-                case fai_format:
-                case fqi_format:
-                    textual = 1;
-                    break;
-                default: break; // silence clang warnings
-                }
-            }
-
-            if (binary || !textual) {
-                // Binary data, either detected or explicit
-                while ((c = hread(f_src, buffer, WINDOW_SIZE)) > 0)
-                    if (bgzf_write(fp, buffer, c) < 0)
-                        error("Could not write %d bytes: Error %d\n",
-                              c, fp->errcode);
-            } else {
-                /* Text mode, try a flush after a newline */
-                int in_header = 1, n = 0, long_line = 0;
-                while ((c = hread(f_src, buffer+n, WINDOW_SIZE-n)) > 0) {
-                    int c2 = c+n;
-                    int flush = 0;
-                    if (in_header &&
-                        (long_line || buffer[0] == '@' || buffer[0] == '#')) {
-                        // Scan forward to find the last header line.
-                        int last_start = 0;
-                        n = 0;
-                        while (n < c2) {
-                            if (buffer[n++] != '\n')
-                                continue;
-
-                            last_start = n;
-                            if (n < c2 &&
-                                !(buffer[n] == '@' || buffer[n] == '#')) {
-                                in_header = 0;
-                                break;
-                            }
-                        }
-                        if (!last_start) {
-                            n = c2;
-                            long_line = 1;
-                        } else {
-                            n = last_start;
-                            flush = 1;
-                            long_line = 0;
-                        }
-                    } else {
-                        // Scan backwards to find the last newline.
-                        n += c; // c read plus previous n overflow
-                        while (--n >= 0 && ((char *)buffer)[n] != '\n')
-                            ;
-
-                        if (n >= 0) {
-                            flush = 1;
-                            n++;
-                        } else {
-                            n = c2;
-                        }
-                    }
-
-                    // Pos n is either at the end of the buffer with flush==0,
-                    // or the first byte after a newline and a flush point.
-                    if (bgzf_write(fp, buffer, n) < 0)
-                        error("Could not write %d bytes: Error %d\n",
-                              n, fp->errcode);
-                    if (flush)
-                        if (bgzf_flush_try(fp, 65536) < 0) // force
-                            return -1;
-
-                    memmove(buffer, buffer+n, c2-n);
-                    n = c2-n;
-                }
-
-                // Trailing data.
-                if (bgzf_write(fp, buffer, n) < 0)
-                    error("Could not write %d bytes: Error %d\n",
-                          n, fp->errcode);
-            }
-        }
-        if ( index )
-        {
-            if (index_fname) {
-                if (bgzf_index_dump(fp, index_fname, NULL) < 0)
-                    error("Could not write index to '%s'\n", index_fname);
-            } else {
-                if (bgzf_index_dump(fp, argv[optind], ".gz.gzi") < 0)
-                    error("Could not write index to '%s.gz.gzi'\n",
-                          argv[optind]);
-            }
-        }
-        if (bgzf_close(fp) < 0)
-            error("Output close failed: Error %d\n", fp->errcode);
-        if (hclose(f_src) < 0)
-            error("Input close failed\n");
-        if (argc > optind && !pstdout && !keep) unlink(argv[optind]);
-        free(buffer);
-        return 0;
-    }
-    else if ( reindex )
-    {
-        if ( argc>optind )
-        {
-            fp = bgzf_open(argv[optind], "r");
-            if ( !fp ) error("[bgzip] Could not open file: %s\n", argv[optind]);
-        }
-        else
-        {
-            if ( !index_fname ) error("[bgzip] Index file name expected when reading from stdin\n");
-            fp = bgzf_open("-", "r");
-            if ( !fp ) error("[bgzip] Could not read from stdin: %s\n", strerror(errno));
-        }
-
-        buffer = malloc(BGZF_BLOCK_SIZE);
-        bgzf_index_build_init(fp);
-        int ret;
-        while ( (ret=bgzf_read(fp, buffer, BGZF_BLOCK_SIZE))>0 ) ;
-        free(buffer);
-        if ( ret<0 ) error("Is the file gzipped or bgzipped? The latter is required for indexing.\n");
-
-        if ( index_fname ) {
-            if (bgzf_index_dump(fp, index_fname, NULL) < 0)
-                error("Could not write index to '%s'\n", index_fname);
-        } else {
-            if (bgzf_index_dump(fp, argv[optind], ".gzi") < 0)
-                error("Could not write index to '%s.gzi'\n", argv[optind]);
-        }
-
-        if ( bgzf_close(fp)<0 ) error("Close failed: Error %d\n",fp->errcode);
-        return 0;
-    }
-    else
-    {
-        int f_dst;
-
-        if ( argc>optind )
-        {
-            fp = bgzf_open(argv[optind], "r");
-            if (fp == NULL) {
-                fprintf(stderr, "[bgzip] Could not open %s: %s\n", argv[optind], strerror(errno));
-                return 1;
-            }
-            if (bgzf_compression(fp) == no_compression) {
-                fprintf(stderr, "[bgzip] %s: not a compressed file -- ignored\n", argv[optind]);
-                bgzf_close(fp);
-                return 1;
-            }
-
-            if (pstdout || test) {
-                f_dst = fileno(stdout);
-            }
-            else {
-                const int wrflags = O_WRONLY | O_CREAT | O_TRUNC;
-                char *name = argv[optind], *ext;
-                size_t pos;
-                for (pos = strlen(name); pos > 0; --pos)
-                    if (name[pos] == '.' || name[pos] == '/') break;
-                if (pos == 0 || name[pos] != '.') {
-                    fprintf(stderr, "[bgzip] can't remove an extension from %s -- please rename\n", argv[optind]);
-                    bgzf_close(fp);
-                    return 1;
-                }
-                name = strdup(argv[optind]);
-                name[pos] = '\0';
-                ext = &name[pos+1];
-                if (! (known_extension(ext) || confirm_filename(&is_forced, name, ext))) {
-                    fprintf(stderr, "[bgzip] unknown extension .%s -- declining to decompress to %s\n", ext, name);
-                    bgzf_close(fp);
-                    free(name);
-                    return 1;
-                }
-                f_dst = open(name, is_forced? wrflags : wrflags|O_EXCL, 0666);
-                if (f_dst < 0 && errno == EEXIST && confirm_overwrite(name))
-                    f_dst = open(name, wrflags, 0666);
-                if (f_dst < 0) {
-                    fprintf(stderr, "[bgzip] can't create %s: %s\n", name, strerror(errno));
-                    free(name);
-                    return 1;
-                }
-                free(name);
-            }
-        }
-        else if (!pstdout && isatty(fileno((FILE *)stdin)) )
-            return bgzip_main_usage(stderr, EXIT_FAILURE);
-        else
-        {
-            f_dst = fileno(stdout);
-            fp = bgzf_open("-", "r");
-            if (fp == NULL) {
-                fprintf(stderr, "[bgzip] Could not read from stdin: %s\n", strerror(errno));
-                return 1;
-            }
-            if (bgzf_compression(fp) == no_compression) {
-                fprintf(stderr, "[bgzip] stdin is not compressed -- ignored\n");
-                bgzf_close(fp);
-                return 1;
-            }
-        }
-
-        buffer = malloc(WINDOW_SIZE);
-        if ( start>0 )
-        {
-            if (index_fname) {
-                if ( bgzf_index_load(fp, index_fname, NULL) < 0 )
-                    error("Could not load index: %s\n", index_fname);
-            } else {
-                if (optind >= argc) {
-                    error("The -b option requires -I when reading from stdin "
-                          "(and stdin must be seekable)\n");
-                }
-                if ( bgzf_index_load(fp, argv[optind], ".gzi") < 0 )
-                    error("Could not load index: %s.gzi\n", argv[optind]);
-            }
-            if ( bgzf_useek(fp, start, SEEK_SET) < 0 ) error("Could not seek to %d-th (uncompressd) byte\n", start);
-        }
-
-        if (threads > 1)
-            bgzf_mt(fp, threads, 256);
-
-#ifdef _WIN32
-        _setmode(f_dst, O_BINARY);
-#endif
-        while (1) {
-            if (end < 0) c = bgzf_read(fp, buffer, WINDOW_SIZE);
-            else c = bgzf_read(fp, buffer, (end - start > WINDOW_SIZE)? WINDOW_SIZE:(end - start));
-            if (c == 0) break;
-            if (c < 0) error("Error %d in block starting at offset %" PRId64 "(%" PRIX64 ")\n", fp->errcode, fp->block_address, fp->block_address);
-            start += c;
-            if ( !test && write(f_dst, buffer, c) != c ) {
-#ifdef _WIN32
-                if (GetLastError() != ERROR_NO_DATA)
-#endif
-                error("Could not write %d bytes\n", c);
-            }
-            if (end >= 0 && start >= end) break;
-        }
-        free(buffer);
-        if (bgzf_close(fp) < 0) error("Close failed: Error %d\n",fp->errcode);
-        if (argc > optind && !pstdout && !test && !keep) unlink(argv[optind]);
-        return 0;
-    }
-}
diff --git a/htslib/config.guess b/htslib/config.guess
new file mode 100755
index 000000000..cdfc43920
--- /dev/null
+++ b/htslib/config.guess
@@ -0,0 +1,1807 @@
+#! /bin/sh
+# Attempt to guess a canonical system name.
+#   Copyright 1992-2023 Free Software Foundation, Inc.
+
+# shellcheck disable=SC2006,SC2268 # see below for rationale
+
+timestamp='2023-08-22'
+
+# This file is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, see <https://www.gnu.org/licenses/>.
+#
+# As a special exception to the GNU General Public License, if you
+# distribute this file as part of a program that contains a
+# configuration script generated by Autoconf, you may include it under
+# the same distribution terms that you use for the rest of that
+# program.  This Exception is an additional permission under section 7
+# of the GNU General Public License, version 3 ("GPLv3").
+#
+# Originally written by Per Bothner; maintained since 2000 by Ben Elliston.
+#
+# You can get the latest version of this script from:
+# https://git.savannah.gnu.org/cgit/config.git/plain/config.guess
+#
+# Please send patches to <config-patches@gnu.org>.
+
+
+# The "shellcheck disable" line above the timestamp inhibits complaints
+# about features and limitations of the classic Bourne shell that were
+# superseded or lifted in POSIX.  However, this script identifies a wide
+# variety of pre-POSIX systems that do not have POSIX shells at all, and
+# even some reasonably current systems (Solaris 10 as case-in-point) still
+# have a pre-POSIX /bin/sh.
+
+
+me=`echo "$0" | sed -e 's,.*/,,'`
+
+usage="\
+Usage: $0 [OPTION]
+
+Output the configuration name of the system '$me' is run on.
+
+Options:
+  -h, --help         print this help, then exit
+  -t, --time-stamp   print date of last modification, then exit
+  -v, --version      print version number, then exit
+
+Report bugs and patches to <config-patches@gnu.org>."
+
+version="\
+GNU config.guess ($timestamp)
+
+Originally written by Per Bothner.
+Copyright 1992-2023 Free Software Foundation, Inc.
+
+This is free software; see the source for copying conditions.  There is NO
+warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE."
+
+help="
+Try '$me --help' for more information."
+
+# Parse command line
+while test $# -gt 0 ; do
+  case $1 in
+    --time-stamp | --time* | -t )
+       echo "$timestamp" ; exit ;;
+    --version | -v )
+       echo "$version" ; exit ;;
+    --help | --h* | -h )
+       echo "$usage"; exit ;;
+    -- )     # Stop option processing
+       shift; break ;;
+    - )	# Use stdin as input.
+       break ;;
+    -* )
+       echo "$me: invalid option $1$help" >&2
+       exit 1 ;;
+    * )
+       break ;;
+  esac
+done
+
+if test $# != 0; then
+  echo "$me: too many arguments$help" >&2
+  exit 1
+fi
+
+# Just in case it came from the environment.
+GUESS=
+
+# CC_FOR_BUILD -- compiler used by this script. Note that the use of a
+# compiler to aid in system detection is discouraged as it requires
+# temporary files to be created and, as you can see below, it is a
+# headache to deal with in a portable fashion.
+
+# Historically, 'CC_FOR_BUILD' used to be named 'HOST_CC'. We still
+# use 'HOST_CC' if defined, but it is deprecated.
+
+# Portable tmp directory creation inspired by the Autoconf team.
+
+tmp=
+# shellcheck disable=SC2172
+trap 'test -z "$tmp" || rm -fr "$tmp"' 0 1 2 13 15
+
+set_cc_for_build() {
+    # prevent multiple calls if $tmp is already set
+    test "$tmp" && return 0
+    : "${TMPDIR=/tmp}"
+    # shellcheck disable=SC2039,SC3028
+    { tmp=`(umask 077 && mktemp -d "$TMPDIR/cgXXXXXX") 2>/dev/null` && test -n "$tmp" && test -d "$tmp" ; } ||
+	{ test -n "$RANDOM" && tmp=$TMPDIR/cg$$-$RANDOM && (umask 077 && mkdir "$tmp" 2>/dev/null) ; } ||
+	{ tmp=$TMPDIR/cg-$$ && (umask 077 && mkdir "$tmp" 2>/dev/null) && echo "Warning: creating insecure temp directory" >&2 ; } ||
+	{ echo "$me: cannot create a temporary directory in $TMPDIR" >&2 ; exit 1 ; }
+    dummy=$tmp/dummy
+    case ${CC_FOR_BUILD-},${HOST_CC-},${CC-} in
+	,,)    echo "int x;" > "$dummy.c"
+	       for driver in cc gcc c89 c99 ; do
+		   if ($driver -c -o "$dummy.o" "$dummy.c") >/dev/null 2>&1 ; then
+		       CC_FOR_BUILD=$driver
+		       break
+		   fi
+	       done
+	       if test x"$CC_FOR_BUILD" = x ; then
+		   CC_FOR_BUILD=no_compiler_found
+	       fi
+	       ;;
+	,,*)   CC_FOR_BUILD=$CC ;;
+	,*,*)  CC_FOR_BUILD=$HOST_CC ;;
+    esac
+}
+
+# This is needed to find uname on a Pyramid OSx when run in the BSD universe.
+# (ghazi@noc.rutgers.edu 1994-08-24)
+if test -f /.attbin/uname ; then
+	PATH=$PATH:/.attbin ; export PATH
+fi
+
+UNAME_MACHINE=`(uname -m) 2>/dev/null` || UNAME_MACHINE=unknown
+UNAME_RELEASE=`(uname -r) 2>/dev/null` || UNAME_RELEASE=unknown
+UNAME_SYSTEM=`(uname -s) 2>/dev/null` || UNAME_SYSTEM=unknown
+UNAME_VERSION=`(uname -v) 2>/dev/null` || UNAME_VERSION=unknown
+
+case $UNAME_SYSTEM in
+Linux|GNU|GNU/*)
+	LIBC=unknown
+
+	set_cc_for_build
+	cat <<-EOF > "$dummy.c"
+	#if defined(__ANDROID__)
+	LIBC=android
+	#else
+	#include <features.h>
+	#if defined(__UCLIBC__)
+	LIBC=uclibc
+	#elif defined(__dietlibc__)
+	LIBC=dietlibc
+	#elif defined(__GLIBC__)
+	LIBC=gnu
+	#else
+	#include <stdarg.h>
+	/* First heuristic to detect musl libc.  */
+	#ifdef __DEFINED_va_list
+	LIBC=musl
+	#endif
+	#endif
+	#endif
+	EOF
+	cc_set_libc=`$CC_FOR_BUILD -E "$dummy.c" 2>/dev/null | grep '^LIBC' | sed 's, ,,g'`
+	eval "$cc_set_libc"
+
+	# Second heuristic to detect musl libc.
+	if [ "$LIBC" = unknown ] &&
+	   command -v ldd >/dev/null &&
+	   ldd --version 2>&1 | grep -q ^musl; then
+		LIBC=musl
+	fi
+
+	# If the system lacks a compiler, then just pick glibc.
+	# We could probably try harder.
+	if [ "$LIBC" = unknown ]; then
+		LIBC=gnu
+	fi
+	;;
+esac
+
+# Note: order is significant - the case branches are not exclusive.
+
+case $UNAME_MACHINE:$UNAME_SYSTEM:$UNAME_RELEASE:$UNAME_VERSION in
+    *:NetBSD:*:*)
+	# NetBSD (nbsd) targets should (where applicable) match one or
+	# more of the tuples: *-*-netbsdelf*, *-*-netbsdaout*,
+	# *-*-netbsdecoff* and *-*-netbsd*.  For targets that recently
+	# switched to ELF, *-*-netbsd* would select the old
+	# object file format.  This provides both forward
+	# compatibility and a consistent mechanism for selecting the
+	# object file format.
+	#
+	# Note: NetBSD doesn't particularly care about the vendor
+	# portion of the name.  We always set it to "unknown".
+	UNAME_MACHINE_ARCH=`(uname -p 2>/dev/null || \
+	    /sbin/sysctl -n hw.machine_arch 2>/dev/null || \
+	    /usr/sbin/sysctl -n hw.machine_arch 2>/dev/null || \
+	    echo unknown)`
+	case $UNAME_MACHINE_ARCH in
+	    aarch64eb) machine=aarch64_be-unknown ;;
+	    armeb) machine=armeb-unknown ;;
+	    arm*) machine=arm-unknown ;;
+	    sh3el) machine=shl-unknown ;;
+	    sh3eb) machine=sh-unknown ;;
+	    sh5el) machine=sh5le-unknown ;;
+	    earmv*)
+		arch=`echo "$UNAME_MACHINE_ARCH" | sed -e 's,^e\(armv[0-9]\).*$,\1,'`
+		endian=`echo "$UNAME_MACHINE_ARCH" | sed -ne 's,^.*\(eb\)$,\1,p'`
+		machine=${arch}${endian}-unknown
+		;;
+	    *) machine=$UNAME_MACHINE_ARCH-unknown ;;
+	esac
+	# The Operating System including object format, if it has switched
+	# to ELF recently (or will in the future) and ABI.
+	case $UNAME_MACHINE_ARCH in
+	    earm*)
+		os=netbsdelf
+		;;
+	    arm*|i386|m68k|ns32k|sh3*|sparc|vax)
+		set_cc_for_build
+		if echo __ELF__ | $CC_FOR_BUILD -E - 2>/dev/null \
+			| grep -q __ELF__
+		then
+		    # Once all utilities can be ECOFF (netbsdecoff) or a.out (netbsdaout).
+		    # Return netbsd for either.  FIX?
+		    os=netbsd
+		else
+		    os=netbsdelf
+		fi
+		;;
+	    *)
+		os=netbsd
+		;;
+	esac
+	# Determine ABI tags.
+	case $UNAME_MACHINE_ARCH in
+	    earm*)
+		expr='s/^earmv[0-9]/-eabi/;s/eb$//'
+		abi=`echo "$UNAME_MACHINE_ARCH" | sed -e "$expr"`
+		;;
+	esac
+	# The OS release
+	# Debian GNU/NetBSD machines have a different userland, and
+	# thus, need a distinct triplet. However, they do not need
+	# kernel version information, so it can be replaced with a
+	# suitable tag, in the style of linux-gnu.
+	case $UNAME_VERSION in
+	    Debian*)
+		release='-gnu'
+		;;
+	    *)
+		release=`echo "$UNAME_RELEASE" | sed -e 's/[-_].*//' | cut -d. -f1,2`
+		;;
+	esac
+	# Since CPU_TYPE-MANUFACTURER-KERNEL-OPERATING_SYSTEM:
+	# contains redundant information, the shorter form:
+	# CPU_TYPE-MANUFACTURER-OPERATING_SYSTEM is used.
+	GUESS=$machine-${os}${release}${abi-}
+	;;
+    *:Bitrig:*:*)
+	UNAME_MACHINE_ARCH=`arch | sed 's/Bitrig.//'`
+	GUESS=$UNAME_MACHINE_ARCH-unknown-bitrig$UNAME_RELEASE
+	;;
+    *:OpenBSD:*:*)
+	UNAME_MACHINE_ARCH=`arch | sed 's/OpenBSD.//'`
+	GUESS=$UNAME_MACHINE_ARCH-unknown-openbsd$UNAME_RELEASE
+	;;
+    *:SecBSD:*:*)
+	UNAME_MACHINE_ARCH=`arch | sed 's/SecBSD.//'`
+	GUESS=$UNAME_MACHINE_ARCH-unknown-secbsd$UNAME_RELEASE
+	;;
+    *:LibertyBSD:*:*)
+	UNAME_MACHINE_ARCH=`arch | sed 's/^.*BSD\.//'`
+	GUESS=$UNAME_MACHINE_ARCH-unknown-libertybsd$UNAME_RELEASE
+	;;
+    *:MidnightBSD:*:*)
+	GUESS=$UNAME_MACHINE-unknown-midnightbsd$UNAME_RELEASE
+	;;
+    *:ekkoBSD:*:*)
+	GUESS=$UNAME_MACHINE-unknown-ekkobsd$UNAME_RELEASE
+	;;
+    *:SolidBSD:*:*)
+	GUESS=$UNAME_MACHINE-unknown-solidbsd$UNAME_RELEASE
+	;;
+    *:OS108:*:*)
+	GUESS=$UNAME_MACHINE-unknown-os108_$UNAME_RELEASE
+	;;
+    macppc:MirBSD:*:*)
+	GUESS=powerpc-unknown-mirbsd$UNAME_RELEASE
+	;;
+    *:MirBSD:*:*)
+	GUESS=$UNAME_MACHINE-unknown-mirbsd$UNAME_RELEASE
+	;;
+    *:Sortix:*:*)
+	GUESS=$UNAME_MACHINE-unknown-sortix
+	;;
+    *:Twizzler:*:*)
+	GUESS=$UNAME_MACHINE-unknown-twizzler
+	;;
+    *:Redox:*:*)
+	GUESS=$UNAME_MACHINE-unknown-redox
+	;;
+    mips:OSF1:*.*)
+	GUESS=mips-dec-osf1
+	;;
+    alpha:OSF1:*:*)
+	# Reset EXIT trap before exiting to avoid spurious non-zero exit code.
+	trap '' 0
+	case $UNAME_RELEASE in
+	*4.0)
+		UNAME_RELEASE=`/usr/sbin/sizer -v | awk '{print $3}'`
+		;;
+	*5.*)
+		UNAME_RELEASE=`/usr/sbin/sizer -v | awk '{print $4}'`
+		;;
+	esac
+	# According to Compaq, /usr/sbin/psrinfo has been available on
+	# OSF/1 and Tru64 systems produced since 1995.  I hope that
+	# covers most systems running today.  This code pipes the CPU
+	# types through head -n 1, so we only detect the type of CPU 0.
+	ALPHA_CPU_TYPE=`/usr/sbin/psrinfo -v | sed -n -e 's/^  The alpha \(.*\) processor.*$/\1/p' | head -n 1`
+	case $ALPHA_CPU_TYPE in
+	    "EV4 (21064)")
+		UNAME_MACHINE=alpha ;;
+	    "EV4.5 (21064)")
+		UNAME_MACHINE=alpha ;;
+	    "LCA4 (21066/21068)")
+		UNAME_MACHINE=alpha ;;
+	    "EV5 (21164)")
+		UNAME_MACHINE=alphaev5 ;;
+	    "EV5.6 (21164A)")
+		UNAME_MACHINE=alphaev56 ;;
+	    "EV5.6 (21164PC)")
+		UNAME_MACHINE=alphapca56 ;;
+	    "EV5.7 (21164PC)")
+		UNAME_MACHINE=alphapca57 ;;
+	    "EV6 (21264)")
+		UNAME_MACHINE=alphaev6 ;;
+	    "EV6.7 (21264A)")
+		UNAME_MACHINE=alphaev67 ;;
+	    "EV6.8CB (21264C)")
+		UNAME_MACHINE=alphaev68 ;;
+	    "EV6.8AL (21264B)")
+		UNAME_MACHINE=alphaev68 ;;
+	    "EV6.8CX (21264D)")
+		UNAME_MACHINE=alphaev68 ;;
+	    "EV6.9A (21264/EV69A)")
+		UNAME_MACHINE=alphaev69 ;;
+	    "EV7 (21364)")
+		UNAME_MACHINE=alphaev7 ;;
+	    "EV7.9 (21364A)")
+		UNAME_MACHINE=alphaev79 ;;
+	esac
+	# A Pn.n version is a patched version.
+	# A Vn.n version is a released version.
+	# A Tn.n version is a released field test version.
+	# A Xn.n version is an unreleased experimental baselevel.
+	# 1.2 uses "1.2" for uname -r.
+	OSF_REL=`echo "$UNAME_RELEASE" | sed -e 's/^[PVTX]//' | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz`
+	GUESS=$UNAME_MACHINE-dec-osf$OSF_REL
+	;;
+    Amiga*:UNIX_System_V:4.0:*)
+	GUESS=m68k-unknown-sysv4
+	;;
+    *:[Aa]miga[Oo][Ss]:*:*)
+	GUESS=$UNAME_MACHINE-unknown-amigaos
+	;;
+    *:[Mm]orph[Oo][Ss]:*:*)
+	GUESS=$UNAME_MACHINE-unknown-morphos
+	;;
+    *:OS/390:*:*)
+	GUESS=i370-ibm-openedition
+	;;
+    *:z/VM:*:*)
+	GUESS=s390-ibm-zvmoe
+	;;
+    *:OS400:*:*)
+	GUESS=powerpc-ibm-os400
+	;;
+    arm:RISC*:1.[012]*:*|arm:riscix:1.[012]*:*)
+	GUESS=arm-acorn-riscix$UNAME_RELEASE
+	;;
+    arm*:riscos:*:*|arm*:RISCOS:*:*)
+	GUESS=arm-unknown-riscos
+	;;
+    SR2?01:HI-UX/MPP:*:* | SR8000:HI-UX/MPP:*:*)
+	GUESS=hppa1.1-hitachi-hiuxmpp
+	;;
+    Pyramid*:OSx*:*:* | MIS*:OSx*:*:* | MIS*:SMP_DC-OSx*:*:*)
+	# akee@wpdis03.wpafb.af.mil (Earle F. Ake) contributed MIS and NILE.
+	case `(/bin/universe) 2>/dev/null` in
+	    att) GUESS=pyramid-pyramid-sysv3 ;;
+	    *)   GUESS=pyramid-pyramid-bsd   ;;
+	esac
+	;;
+    NILE*:*:*:dcosx)
+	GUESS=pyramid-pyramid-svr4
+	;;
+    DRS?6000:unix:4.0:6*)
+	GUESS=sparc-icl-nx6
+	;;
+    DRS?6000:UNIX_SV:4.2*:7* | DRS?6000:isis:4.2*:7*)
+	case `/usr/bin/uname -p` in
+	    sparc) GUESS=sparc-icl-nx7 ;;
+	esac
+	;;
+    s390x:SunOS:*:*)
+	SUN_REL=`echo "$UNAME_RELEASE" | sed -e 's/[^.]*//'`
+	GUESS=$UNAME_MACHINE-ibm-solaris2$SUN_REL
+	;;
+    sun4H:SunOS:5.*:*)
+	SUN_REL=`echo "$UNAME_RELEASE" | sed -e 's/[^.]*//'`
+	GUESS=sparc-hal-solaris2$SUN_REL
+	;;
+    sun4*:SunOS:5.*:* | tadpole*:SunOS:5.*:*)
+	SUN_REL=`echo "$UNAME_RELEASE" | sed -e 's/[^.]*//'`
+	GUESS=sparc-sun-solaris2$SUN_REL
+	;;
+    i86pc:AuroraUX:5.*:* | i86xen:AuroraUX:5.*:*)
+	GUESS=i386-pc-auroraux$UNAME_RELEASE
+	;;
+    i86pc:SunOS:5.*:* | i86xen:SunOS:5.*:*)
+	set_cc_for_build
+	SUN_ARCH=i386
+	# If there is a compiler, see if it is configured for 64-bit objects.
+	# Note that the Sun cc does not turn __LP64__ into 1 like gcc does.
+	# This test works for both compilers.
+	if test "$CC_FOR_BUILD" != no_compiler_found; then
+	    if (echo '#ifdef __amd64'; echo IS_64BIT_ARCH; echo '#endif') | \
+		(CCOPTS="" $CC_FOR_BUILD -m64 -E - 2>/dev/null) | \
+		grep IS_64BIT_ARCH >/dev/null
+	    then
+		SUN_ARCH=x86_64
+	    fi
+	fi
+	SUN_REL=`echo "$UNAME_RELEASE" | sed -e 's/[^.]*//'`
+	GUESS=$SUN_ARCH-pc-solaris2$SUN_REL
+	;;
+    sun4*:SunOS:6*:*)
+	# According to config.sub, this is the proper way to canonicalize
+	# SunOS6.  Hard to guess exactly what SunOS6 will be like, but
+	# it's likely to be more like Solaris than SunOS4.
+	SUN_REL=`echo "$UNAME_RELEASE" | sed -e 's/[^.]*//'`
+	GUESS=sparc-sun-solaris3$SUN_REL
+	;;
+    sun4*:SunOS:*:*)
+	case `/usr/bin/arch -k` in
+	    Series*|S4*)
+		UNAME_RELEASE=`uname -v`
+		;;
+	esac
+	# Japanese Language versions have a version number like '4.1.3-JL'.
+	SUN_REL=`echo "$UNAME_RELEASE" | sed -e 's/-/_/'`
+	GUESS=sparc-sun-sunos$SUN_REL
+	;;
+    sun3*:SunOS:*:*)
+	GUESS=m68k-sun-sunos$UNAME_RELEASE
+	;;
+    sun*:*:4.2BSD:*)
+	UNAME_RELEASE=`(sed 1q /etc/motd | awk '{print substr($5,1,3)}') 2>/dev/null`
+	test "x$UNAME_RELEASE" = x && UNAME_RELEASE=3
+	case `/bin/arch` in
+	    sun3)
+		GUESS=m68k-sun-sunos$UNAME_RELEASE
+		;;
+	    sun4)
+		GUESS=sparc-sun-sunos$UNAME_RELEASE
+		;;
+	esac
+	;;
+    aushp:SunOS:*:*)
+	GUESS=sparc-auspex-sunos$UNAME_RELEASE
+	;;
+    # The situation for MiNT is a little confusing.  The machine name
+    # can be virtually everything (everything which is not
+    # "atarist" or "atariste" at least should have a processor
+    # > m68000).  The system name ranges from "MiNT" over "FreeMiNT"
+    # to the lowercase version "mint" (or "freemint").  Finally
+    # the system name "TOS" denotes a system which is actually not
+    # MiNT.  But MiNT is downward compatible to TOS, so this should
+    # be no problem.
+    atarist[e]:*MiNT:*:* | atarist[e]:*mint:*:* | atarist[e]:*TOS:*:*)
+	GUESS=m68k-atari-mint$UNAME_RELEASE
+	;;
+    atari*:*MiNT:*:* | atari*:*mint:*:* | atarist[e]:*TOS:*:*)
+	GUESS=m68k-atari-mint$UNAME_RELEASE
+	;;
+    *falcon*:*MiNT:*:* | *falcon*:*mint:*:* | *falcon*:*TOS:*:*)
+	GUESS=m68k-atari-mint$UNAME_RELEASE
+	;;
+    milan*:*MiNT:*:* | milan*:*mint:*:* | *milan*:*TOS:*:*)
+	GUESS=m68k-milan-mint$UNAME_RELEASE
+	;;
+    hades*:*MiNT:*:* | hades*:*mint:*:* | *hades*:*TOS:*:*)
+	GUESS=m68k-hades-mint$UNAME_RELEASE
+	;;
+    *:*MiNT:*:* | *:*mint:*:* | *:*TOS:*:*)
+	GUESS=m68k-unknown-mint$UNAME_RELEASE
+	;;
+    m68k:machten:*:*)
+	GUESS=m68k-apple-machten$UNAME_RELEASE
+	;;
+    powerpc:machten:*:*)
+	GUESS=powerpc-apple-machten$UNAME_RELEASE
+	;;
+    RISC*:Mach:*:*)
+	GUESS=mips-dec-mach_bsd4.3
+	;;
+    RISC*:ULTRIX:*:*)
+	GUESS=mips-dec-ultrix$UNAME_RELEASE
+	;;
+    VAX*:ULTRIX*:*:*)
+	GUESS=vax-dec-ultrix$UNAME_RELEASE
+	;;
+    2020:CLIX:*:* | 2430:CLIX:*:*)
+	GUESS=clipper-intergraph-clix$UNAME_RELEASE
+	;;
+    mips:*:*:UMIPS | mips:*:*:RISCos)
+	set_cc_for_build
+	sed 's/^	//' << EOF > "$dummy.c"
+#ifdef __cplusplus
+#include <stdio.h>  /* for printf() prototype */
+	int main (int argc, char *argv[]) {
+#else
+	int main (argc, argv) int argc; char *argv[]; {
+#endif
+	#if defined (host_mips) && defined (MIPSEB)
+	#if defined (SYSTYPE_SYSV)
+	  printf ("mips-mips-riscos%ssysv\\n", argv[1]); exit (0);
+	#endif
+	#if defined (SYSTYPE_SVR4)
+	  printf ("mips-mips-riscos%ssvr4\\n", argv[1]); exit (0);
+	#endif
+	#if defined (SYSTYPE_BSD43) || defined(SYSTYPE_BSD)
+	  printf ("mips-mips-riscos%sbsd\\n", argv[1]); exit (0);
+	#endif
+	#endif
+	  exit (-1);
+	}
+EOF
+	$CC_FOR_BUILD -o "$dummy" "$dummy.c" &&
+	  dummyarg=`echo "$UNAME_RELEASE" | sed -n 's/\([0-9]*\).*/\1/p'` &&
+	  SYSTEM_NAME=`"$dummy" "$dummyarg"` &&
+	    { echo "$SYSTEM_NAME"; exit; }
+	GUESS=mips-mips-riscos$UNAME_RELEASE
+	;;
+    Motorola:PowerMAX_OS:*:*)
+	GUESS=powerpc-motorola-powermax
+	;;
+    Motorola:*:4.3:PL8-*)
+	GUESS=powerpc-harris-powermax
+	;;
+    Night_Hawk:*:*:PowerMAX_OS | Synergy:PowerMAX_OS:*:*)
+	GUESS=powerpc-harris-powermax
+	;;
+    Night_Hawk:Power_UNIX:*:*)
+	GUESS=powerpc-harris-powerunix
+	;;
+    m88k:CX/UX:7*:*)
+	GUESS=m88k-harris-cxux7
+	;;
+    m88k:*:4*:R4*)
+	GUESS=m88k-motorola-sysv4
+	;;
+    m88k:*:3*:R3*)
+	GUESS=m88k-motorola-sysv3
+	;;
+    AViiON:dgux:*:*)
+	# DG/UX returns AViiON for all architectures
+	UNAME_PROCESSOR=`/usr/bin/uname -p`
+	if test "$UNAME_PROCESSOR" = mc88100 || test "$UNAME_PROCESSOR" = mc88110
+	then
+	    if test "$TARGET_BINARY_INTERFACE"x = m88kdguxelfx || \
+	       test "$TARGET_BINARY_INTERFACE"x = x
+	    then
+		GUESS=m88k-dg-dgux$UNAME_RELEASE
+	    else
+		GUESS=m88k-dg-dguxbcs$UNAME_RELEASE
+	    fi
+	else
+	    GUESS=i586-dg-dgux$UNAME_RELEASE
+	fi
+	;;
+    M88*:DolphinOS:*:*)	# DolphinOS (SVR3)
+	GUESS=m88k-dolphin-sysv3
+	;;
+    M88*:*:R3*:*)
+	# Delta 88k system running SVR3
+	GUESS=m88k-motorola-sysv3
+	;;
+    XD88*:*:*:*) # Tektronix XD88 system running UTekV (SVR3)
+	GUESS=m88k-tektronix-sysv3
+	;;
+    Tek43[0-9][0-9]:UTek:*:*) # Tektronix 4300 system running UTek (BSD)
+	GUESS=m68k-tektronix-bsd
+	;;
+    *:IRIX*:*:*)
+	IRIX_REL=`echo "$UNAME_RELEASE" | sed -e 's/-/_/g'`
+	GUESS=mips-sgi-irix$IRIX_REL
+	;;
+    ????????:AIX?:[12].1:2)   # AIX 2.2.1 or AIX 2.1.1 is RT/PC AIX.
+	GUESS=romp-ibm-aix    # uname -m gives an 8 hex-code CPU id
+	;;                    # Note that: echo "'`uname -s`'" gives 'AIX '
+    i*86:AIX:*:*)
+	GUESS=i386-ibm-aix
+	;;
+    ia64:AIX:*:*)
+	if test -x /usr/bin/oslevel ; then
+		IBM_REV=`/usr/bin/oslevel`
+	else
+		IBM_REV=$UNAME_VERSION.$UNAME_RELEASE
+	fi
+	GUESS=$UNAME_MACHINE-ibm-aix$IBM_REV
+	;;
+    *:AIX:2:3)
+	if grep bos325 /usr/include/stdio.h >/dev/null 2>&1; then
+		set_cc_for_build
+		sed 's/^		//' << EOF > "$dummy.c"
+		#include <sys/systemcfg.h>
+
+		main()
+			{
+			if (!__power_pc())
+				exit(1);
+			puts("powerpc-ibm-aix3.2.5");
+			exit(0);
+			}
+EOF
+		if $CC_FOR_BUILD -o "$dummy" "$dummy.c" && SYSTEM_NAME=`"$dummy"`
+		then
+			GUESS=$SYSTEM_NAME
+		else
+			GUESS=rs6000-ibm-aix3.2.5
+		fi
+	elif grep bos324 /usr/include/stdio.h >/dev/null 2>&1; then
+		GUESS=rs6000-ibm-aix3.2.4
+	else
+		GUESS=rs6000-ibm-aix3.2
+	fi
+	;;
+    *:AIX:*:[4567])
+	IBM_CPU_ID=`/usr/sbin/lsdev -C -c processor -S available | sed 1q | awk '{ print $1 }'`
+	if /usr/sbin/lsattr -El "$IBM_CPU_ID" | grep ' POWER' >/dev/null 2>&1; then
+		IBM_ARCH=rs6000
+	else
+		IBM_ARCH=powerpc
+	fi
+	if test -x /usr/bin/lslpp ; then
+		IBM_REV=`/usr/bin/lslpp -Lqc bos.rte.libc | \
+			   awk -F: '{ print $3 }' | sed s/[0-9]*$/0/`
+	else
+		IBM_REV=$UNAME_VERSION.$UNAME_RELEASE
+	fi
+	GUESS=$IBM_ARCH-ibm-aix$IBM_REV
+	;;
+    *:AIX:*:*)
+	GUESS=rs6000-ibm-aix
+	;;
+    ibmrt:4.4BSD:*|romp-ibm:4.4BSD:*)
+	GUESS=romp-ibm-bsd4.4
+	;;
+    ibmrt:*BSD:*|romp-ibm:BSD:*)            # covers RT/PC BSD and
+	GUESS=romp-ibm-bsd$UNAME_RELEASE    # 4.3 with uname added to
+	;;                                  # report: romp-ibm BSD 4.3
+    *:BOSX:*:*)
+	GUESS=rs6000-bull-bosx
+	;;
+    DPX/2?00:B.O.S.:*:*)
+	GUESS=m68k-bull-sysv3
+	;;
+    9000/[34]??:4.3bsd:1.*:*)
+	GUESS=m68k-hp-bsd
+	;;
+    hp300:4.4BSD:*:* | 9000/[34]??:4.3bsd:2.*:*)
+	GUESS=m68k-hp-bsd4.4
+	;;
+    9000/[34678]??:HP-UX:*:*)
+	HPUX_REV=`echo "$UNAME_RELEASE" | sed -e 's/[^.]*.[0B]*//'`
+	case $UNAME_MACHINE in
+	    9000/31?)            HP_ARCH=m68000 ;;
+	    9000/[34]??)         HP_ARCH=m68k ;;
+	    9000/[678][0-9][0-9])
+		if test -x /usr/bin/getconf; then
+		    sc_cpu_version=`/usr/bin/getconf SC_CPU_VERSION 2>/dev/null`
+		    sc_kernel_bits=`/usr/bin/getconf SC_KERNEL_BITS 2>/dev/null`
+		    case $sc_cpu_version in
+		      523) HP_ARCH=hppa1.0 ;; # CPU_PA_RISC1_0
+		      528) HP_ARCH=hppa1.1 ;; # CPU_PA_RISC1_1
+		      532)                      # CPU_PA_RISC2_0
+			case $sc_kernel_bits in
+			  32) HP_ARCH=hppa2.0n ;;
+			  64) HP_ARCH=hppa2.0w ;;
+			  '') HP_ARCH=hppa2.0 ;;   # HP-UX 10.20
+			esac ;;
+		    esac
+		fi
+		if test "$HP_ARCH" = ""; then
+		    set_cc_for_build
+		    sed 's/^		//' << EOF > "$dummy.c"
+
+		#define _HPUX_SOURCE
+		#include <stdlib.h>
+		#include <unistd.h>
+
+		int main ()
+		{
+		#if defined(_SC_KERNEL_BITS)
+		    long bits = sysconf(_SC_KERNEL_BITS);
+		#endif
+		    long cpu  = sysconf (_SC_CPU_VERSION);
+
+		    switch (cpu)
+			{
+			case CPU_PA_RISC1_0: puts ("hppa1.0"); break;
+			case CPU_PA_RISC1_1: puts ("hppa1.1"); break;
+			case CPU_PA_RISC2_0:
+		#if defined(_SC_KERNEL_BITS)
+			    switch (bits)
+				{
+				case 64: puts ("hppa2.0w"); break;
+				case 32: puts ("hppa2.0n"); break;
+				default: puts ("hppa2.0"); break;
+				} break;
+		#else  /* !defined(_SC_KERNEL_BITS) */
+			    puts ("hppa2.0"); break;
+		#endif
+			default: puts ("hppa1.0"); break;
+			}
+		    exit (0);
+		}
+EOF
+		    (CCOPTS="" $CC_FOR_BUILD -o "$dummy" "$dummy.c" 2>/dev/null) && HP_ARCH=`"$dummy"`
+		    test -z "$HP_ARCH" && HP_ARCH=hppa
+		fi ;;
+	esac
+	if test "$HP_ARCH" = hppa2.0w
+	then
+	    set_cc_for_build
+
+	    # hppa2.0w-hp-hpux* has a 64-bit kernel and a compiler generating
+	    # 32-bit code.  hppa64-hp-hpux* has the same kernel and a compiler
+	    # generating 64-bit code.  GNU and HP use different nomenclature:
+	    #
+	    # $ CC_FOR_BUILD=cc ./config.guess
+	    # => hppa2.0w-hp-hpux11.23
+	    # $ CC_FOR_BUILD="cc +DA2.0w" ./config.guess
+	    # => hppa64-hp-hpux11.23
+
+	    if echo __LP64__ | (CCOPTS="" $CC_FOR_BUILD -E - 2>/dev/null) |
+		grep -q __LP64__
+	    then
+		HP_ARCH=hppa2.0w
+	    else
+		HP_ARCH=hppa64
+	    fi
+	fi
+	GUESS=$HP_ARCH-hp-hpux$HPUX_REV
+	;;
+    ia64:HP-UX:*:*)
+	HPUX_REV=`echo "$UNAME_RELEASE" | sed -e 's/[^.]*.[0B]*//'`
+	GUESS=ia64-hp-hpux$HPUX_REV
+	;;
+    3050*:HI-UX:*:*)
+	set_cc_for_build
+	sed 's/^	//' << EOF > "$dummy.c"
+	#include <unistd.h>
+	int
+	main ()
+	{
+	  long cpu = sysconf (_SC_CPU_VERSION);
+	  /* The order matters, because CPU_IS_HP_MC68K erroneously returns
+	     true for CPU_PA_RISC1_0.  CPU_IS_PA_RISC returns correct
+	     results, however.  */
+	  if (CPU_IS_PA_RISC (cpu))
+	    {
+	      switch (cpu)
+		{
+		  case CPU_PA_RISC1_0: puts ("hppa1.0-hitachi-hiuxwe2"); break;
+		  case CPU_PA_RISC1_1: puts ("hppa1.1-hitachi-hiuxwe2"); break;
+		  case CPU_PA_RISC2_0: puts ("hppa2.0-hitachi-hiuxwe2"); break;
+		  default: puts ("hppa-hitachi-hiuxwe2"); break;
+		}
+	    }
+	  else if (CPU_IS_HP_MC68K (cpu))
+	    puts ("m68k-hitachi-hiuxwe2");
+	  else puts ("unknown-hitachi-hiuxwe2");
+	  exit (0);
+	}
+EOF
+	$CC_FOR_BUILD -o "$dummy" "$dummy.c" && SYSTEM_NAME=`"$dummy"` &&
+		{ echo "$SYSTEM_NAME"; exit; }
+	GUESS=unknown-hitachi-hiuxwe2
+	;;
+    9000/7??:4.3bsd:*:* | 9000/8?[79]:4.3bsd:*:*)
+	GUESS=hppa1.1-hp-bsd
+	;;
+    9000/8??:4.3bsd:*:*)
+	GUESS=hppa1.0-hp-bsd
+	;;
+    *9??*:MPE/iX:*:* | *3000*:MPE/iX:*:*)
+	GUESS=hppa1.0-hp-mpeix
+	;;
+    hp7??:OSF1:*:* | hp8?[79]:OSF1:*:*)
+	GUESS=hppa1.1-hp-osf
+	;;
+    hp8??:OSF1:*:*)
+	GUESS=hppa1.0-hp-osf
+	;;
+    i*86:OSF1:*:*)
+	if test -x /usr/sbin/sysversion ; then
+	    GUESS=$UNAME_MACHINE-unknown-osf1mk
+	else
+	    GUESS=$UNAME_MACHINE-unknown-osf1
+	fi
+	;;
+    parisc*:Lites*:*:*)
+	GUESS=hppa1.1-hp-lites
+	;;
+    C1*:ConvexOS:*:* | convex:ConvexOS:C1*:*)
+	GUESS=c1-convex-bsd
+	;;
+    C2*:ConvexOS:*:* | convex:ConvexOS:C2*:*)
+	if getsysinfo -f scalar_acc
+	then echo c32-convex-bsd
+	else echo c2-convex-bsd
+	fi
+	exit ;;
+    C34*:ConvexOS:*:* | convex:ConvexOS:C34*:*)
+	GUESS=c34-convex-bsd
+	;;
+    C38*:ConvexOS:*:* | convex:ConvexOS:C38*:*)
+	GUESS=c38-convex-bsd
+	;;
+    C4*:ConvexOS:*:* | convex:ConvexOS:C4*:*)
+	GUESS=c4-convex-bsd
+	;;
+    CRAY*Y-MP:*:*:*)
+	CRAY_REL=`echo "$UNAME_RELEASE" | sed -e 's/\.[^.]*$/.X/'`
+	GUESS=ymp-cray-unicos$CRAY_REL
+	;;
+    CRAY*[A-Z]90:*:*:*)
+	echo "$UNAME_MACHINE"-cray-unicos"$UNAME_RELEASE" \
+	| sed -e 's/CRAY.*\([A-Z]90\)/\1/' \
+	      -e y/ABCDEFGHIJKLMNOPQRSTUVWXYZ/abcdefghijklmnopqrstuvwxyz/ \
+	      -e 's/\.[^.]*$/.X/'
+	exit ;;
+    CRAY*TS:*:*:*)
+	CRAY_REL=`echo "$UNAME_RELEASE" | sed -e 's/\.[^.]*$/.X/'`
+	GUESS=t90-cray-unicos$CRAY_REL
+	;;
+    CRAY*T3E:*:*:*)
+	CRAY_REL=`echo "$UNAME_RELEASE" | sed -e 's/\.[^.]*$/.X/'`
+	GUESS=alphaev5-cray-unicosmk$CRAY_REL
+	;;
+    CRAY*SV1:*:*:*)
+	CRAY_REL=`echo "$UNAME_RELEASE" | sed -e 's/\.[^.]*$/.X/'`
+	GUESS=sv1-cray-unicos$CRAY_REL
+	;;
+    *:UNICOS/mp:*:*)
+	CRAY_REL=`echo "$UNAME_RELEASE" | sed -e 's/\.[^.]*$/.X/'`
+	GUESS=craynv-cray-unicosmp$CRAY_REL
+	;;
+    F30[01]:UNIX_System_V:*:* | F700:UNIX_System_V:*:*)
+	FUJITSU_PROC=`uname -m | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz`
+	FUJITSU_SYS=`uname -p | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz | sed -e 's/\///'`
+	FUJITSU_REL=`echo "$UNAME_RELEASE" | sed -e 's/ /_/'`
+	GUESS=${FUJITSU_PROC}-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}
+	;;
+    5000:UNIX_System_V:4.*:*)
+	FUJITSU_SYS=`uname -p | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz | sed -e 's/\///'`
+	FUJITSU_REL=`echo "$UNAME_RELEASE" | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz | sed -e 's/ /_/'`
+	GUESS=sparc-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}
+	;;
+    i*86:BSD/386:*:* | i*86:BSD/OS:*:* | *:Ascend\ Embedded/OS:*:*)
+	GUESS=$UNAME_MACHINE-pc-bsdi$UNAME_RELEASE
+	;;
+    sparc*:BSD/OS:*:*)
+	GUESS=sparc-unknown-bsdi$UNAME_RELEASE
+	;;
+    *:BSD/OS:*:*)
+	GUESS=$UNAME_MACHINE-unknown-bsdi$UNAME_RELEASE
+	;;
+    arm:FreeBSD:*:*)
+	UNAME_PROCESSOR=`uname -p`
+	set_cc_for_build
+	if echo __ARM_PCS_VFP | $CC_FOR_BUILD -E - 2>/dev/null \
+	    | grep -q __ARM_PCS_VFP
+	then
+	    FREEBSD_REL=`echo "$UNAME_RELEASE" | sed -e 's/[-(].*//'`
+	    GUESS=$UNAME_PROCESSOR-unknown-freebsd$FREEBSD_REL-gnueabi
+	else
+	    FREEBSD_REL=`echo "$UNAME_RELEASE" | sed -e 's/[-(].*//'`
+	    GUESS=$UNAME_PROCESSOR-unknown-freebsd$FREEBSD_REL-gnueabihf
+	fi
+	;;
+    *:FreeBSD:*:*)
+	UNAME_PROCESSOR=`uname -p`
+	case $UNAME_PROCESSOR in
+	    amd64)
+		UNAME_PROCESSOR=x86_64 ;;
+	    i386)
+		UNAME_PROCESSOR=i586 ;;
+	esac
+	FREEBSD_REL=`echo "$UNAME_RELEASE" | sed -e 's/[-(].*//'`
+	GUESS=$UNAME_PROCESSOR-unknown-freebsd$FREEBSD_REL
+	;;
+    i*:CYGWIN*:*)
+	GUESS=$UNAME_MACHINE-pc-cygwin
+	;;
+    *:MINGW64*:*)
+	GUESS=$UNAME_MACHINE-pc-mingw64
+	;;
+    *:MINGW*:*)
+	GUESS=$UNAME_MACHINE-pc-mingw32
+	;;
+    *:MSYS*:*)
+	GUESS=$UNAME_MACHINE-pc-msys
+	;;
+    i*:PW*:*)
+	GUESS=$UNAME_MACHINE-pc-pw32
+	;;
+    *:SerenityOS:*:*)
+        GUESS=$UNAME_MACHINE-pc-serenity
+        ;;
+    *:Interix*:*)
+	case $UNAME_MACHINE in
+	    x86)
+		GUESS=i586-pc-interix$UNAME_RELEASE
+		;;
+	    authenticamd | genuineintel | EM64T)
+		GUESS=x86_64-unknown-interix$UNAME_RELEASE
+		;;
+	    IA64)
+		GUESS=ia64-unknown-interix$UNAME_RELEASE
+		;;
+	esac ;;
+    i*:UWIN*:*)
+	GUESS=$UNAME_MACHINE-pc-uwin
+	;;
+    amd64:CYGWIN*:*:* | x86_64:CYGWIN*:*:*)
+	GUESS=x86_64-pc-cygwin
+	;;
+    prep*:SunOS:5.*:*)
+	SUN_REL=`echo "$UNAME_RELEASE" | sed -e 's/[^.]*//'`
+	GUESS=powerpcle-unknown-solaris2$SUN_REL
+	;;
+    *:GNU:*:*)
+	# the GNU system
+	GNU_ARCH=`echo "$UNAME_MACHINE" | sed -e 's,[-/].*$,,'`
+	GNU_REL=`echo "$UNAME_RELEASE" | sed -e 's,/.*$,,'`
+	GUESS=$GNU_ARCH-unknown-$LIBC$GNU_REL
+	;;
+    *:GNU/*:*:*)
+	# other systems with GNU libc and userland
+	GNU_SYS=`echo "$UNAME_SYSTEM" | sed 's,^[^/]*/,,' | tr "[:upper:]" "[:lower:]"`
+	GNU_REL=`echo "$UNAME_RELEASE" | sed -e 's/[-(].*//'`
+	GUESS=$UNAME_MACHINE-unknown-$GNU_SYS$GNU_REL-$LIBC
+	;;
+    x86_64:[Mm]anagarm:*:*|i?86:[Mm]anagarm:*:*)
+	GUESS="$UNAME_MACHINE-pc-managarm-mlibc"
+	;;
+    *:[Mm]anagarm:*:*)
+	GUESS="$UNAME_MACHINE-unknown-managarm-mlibc"
+	;;
+    *:Minix:*:*)
+	GUESS=$UNAME_MACHINE-unknown-minix
+	;;
+    aarch64:Linux:*:*)
+	set_cc_for_build
+	CPU=$UNAME_MACHINE
+	LIBCABI=$LIBC
+	if test "$CC_FOR_BUILD" != no_compiler_found; then
+	    ABI=64
+	    sed 's/^	    //' << EOF > "$dummy.c"
+	    #ifdef __ARM_EABI__
+	    #ifdef __ARM_PCS_VFP
+	    ABI=eabihf
+	    #else
+	    ABI=eabi
+	    #endif
+	    #endif
+EOF
+	    cc_set_abi=`$CC_FOR_BUILD -E "$dummy.c" 2>/dev/null | grep '^ABI' | sed 's, ,,g'`
+	    eval "$cc_set_abi"
+	    case $ABI in
+		eabi | eabihf) CPU=armv8l; LIBCABI=$LIBC$ABI ;;
+	    esac
+	fi
+	GUESS=$CPU-unknown-linux-$LIBCABI
+	;;
+    aarch64_be:Linux:*:*)
+	UNAME_MACHINE=aarch64_be
+	GUESS=$UNAME_MACHINE-unknown-linux-$LIBC
+	;;
+    alpha:Linux:*:*)
+	case `sed -n '/^cpu model/s/^.*: \(.*\)/\1/p' /proc/cpuinfo 2>/dev/null` in
+	  EV5)   UNAME_MACHINE=alphaev5 ;;
+	  EV56)  UNAME_MACHINE=alphaev56 ;;
+	  PCA56) UNAME_MACHINE=alphapca56 ;;
+	  PCA57) UNAME_MACHINE=alphapca56 ;;
+	  EV6)   UNAME_MACHINE=alphaev6 ;;
+	  EV67)  UNAME_MACHINE=alphaev67 ;;
+	  EV68*) UNAME_MACHINE=alphaev68 ;;
+	esac
+	objdump --private-headers /bin/sh | grep -q ld.so.1
+	if test "$?" = 0 ; then LIBC=gnulibc1 ; fi
+	GUESS=$UNAME_MACHINE-unknown-linux-$LIBC
+	;;
+    arc:Linux:*:* | arceb:Linux:*:* | arc32:Linux:*:* | arc64:Linux:*:*)
+	GUESS=$UNAME_MACHINE-unknown-linux-$LIBC
+	;;
+    arm*:Linux:*:*)
+	set_cc_for_build
+	if echo __ARM_EABI__ | $CC_FOR_BUILD -E - 2>/dev/null \
+	    | grep -q __ARM_EABI__
+	then
+	    GUESS=$UNAME_MACHINE-unknown-linux-$LIBC
+	else
+	    if echo __ARM_PCS_VFP | $CC_FOR_BUILD -E - 2>/dev/null \
+		| grep -q __ARM_PCS_VFP
+	    then
+		GUESS=$UNAME_MACHINE-unknown-linux-${LIBC}eabi
+	    else
+		GUESS=$UNAME_MACHINE-unknown-linux-${LIBC}eabihf
+	    fi
+	fi
+	;;
+    avr32*:Linux:*:*)
+	GUESS=$UNAME_MACHINE-unknown-linux-$LIBC
+	;;
+    cris:Linux:*:*)
+	GUESS=$UNAME_MACHINE-axis-linux-$LIBC
+	;;
+    crisv32:Linux:*:*)
+	GUESS=$UNAME_MACHINE-axis-linux-$LIBC
+	;;
+    e2k:Linux:*:*)
+	GUESS=$UNAME_MACHINE-unknown-linux-$LIBC
+	;;
+    frv:Linux:*:*)
+	GUESS=$UNAME_MACHINE-unknown-linux-$LIBC
+	;;
+    hexagon:Linux:*:*)
+	GUESS=$UNAME_MACHINE-unknown-linux-$LIBC
+	;;
+    i*86:Linux:*:*)
+	GUESS=$UNAME_MACHINE-pc-linux-$LIBC
+	;;
+    ia64:Linux:*:*)
+	GUESS=$UNAME_MACHINE-unknown-linux-$LIBC
+	;;
+    k1om:Linux:*:*)
+	GUESS=$UNAME_MACHINE-unknown-linux-$LIBC
+	;;
+    kvx:Linux:*:*)
+	GUESS=$UNAME_MACHINE-unknown-linux-$LIBC
+	;;
+    kvx:cos:*:*)
+	GUESS=$UNAME_MACHINE-unknown-cos
+	;;
+    kvx:mbr:*:*)
+	GUESS=$UNAME_MACHINE-unknown-mbr
+	;;
+    loongarch32:Linux:*:* | loongarch64:Linux:*:*)
+	GUESS=$UNAME_MACHINE-unknown-linux-$LIBC
+	;;
+    m32r*:Linux:*:*)
+	GUESS=$UNAME_MACHINE-unknown-linux-$LIBC
+	;;
+    m68*:Linux:*:*)
+	GUESS=$UNAME_MACHINE-unknown-linux-$LIBC
+	;;
+    mips:Linux:*:* | mips64:Linux:*:*)
+	set_cc_for_build
+	IS_GLIBC=0
+	test x"${LIBC}" = xgnu && IS_GLIBC=1
+	sed 's/^	//' << EOF > "$dummy.c"
+	#undef CPU
+	#undef mips
+	#undef mipsel
+	#undef mips64
+	#undef mips64el
+	#if ${IS_GLIBC} && defined(_ABI64)
+	LIBCABI=gnuabi64
+	#else
+	#if ${IS_GLIBC} && defined(_ABIN32)
+	LIBCABI=gnuabin32
+	#else
+	LIBCABI=${LIBC}
+	#endif
+	#endif
+
+	#if ${IS_GLIBC} && defined(__mips64) && defined(__mips_isa_rev) && __mips_isa_rev>=6
+	CPU=mipsisa64r6
+	#else
+	#if ${IS_GLIBC} && !defined(__mips64) && defined(__mips_isa_rev) && __mips_isa_rev>=6
+	CPU=mipsisa32r6
+	#else
+	#if defined(__mips64)
+	CPU=mips64
+	#else
+	CPU=mips
+	#endif
+	#endif
+	#endif
+
+	#if defined(__MIPSEL__) || defined(__MIPSEL) || defined(_MIPSEL) || defined(MIPSEL)
+	MIPS_ENDIAN=el
+	#else
+	#if defined(__MIPSEB__) || defined(__MIPSEB) || defined(_MIPSEB) || defined(MIPSEB)
+	MIPS_ENDIAN=
+	#else
+	MIPS_ENDIAN=
+	#endif
+	#endif
+EOF
+	cc_set_vars=`$CC_FOR_BUILD -E "$dummy.c" 2>/dev/null | grep '^CPU\|^MIPS_ENDIAN\|^LIBCABI'`
+	eval "$cc_set_vars"
+	test "x$CPU" != x && { echo "$CPU${MIPS_ENDIAN}-unknown-linux-$LIBCABI"; exit; }
+	;;
+    mips64el:Linux:*:*)
+	GUESS=$UNAME_MACHINE-unknown-linux-$LIBC
+	;;
+    openrisc*:Linux:*:*)
+	GUESS=or1k-unknown-linux-$LIBC
+	;;
+    or32:Linux:*:* | or1k*:Linux:*:*)
+	GUESS=$UNAME_MACHINE-unknown-linux-$LIBC
+	;;
+    padre:Linux:*:*)
+	GUESS=sparc-unknown-linux-$LIBC
+	;;
+    parisc64:Linux:*:* | hppa64:Linux:*:*)
+	GUESS=hppa64-unknown-linux-$LIBC
+	;;
+    parisc:Linux:*:* | hppa:Linux:*:*)
+	# Look for CPU level
+	case `grep '^cpu[^a-z]*:' /proc/cpuinfo 2>/dev/null | cut -d' ' -f2` in
+	  PA7*) GUESS=hppa1.1-unknown-linux-$LIBC ;;
+	  PA8*) GUESS=hppa2.0-unknown-linux-$LIBC ;;
+	  *)    GUESS=hppa-unknown-linux-$LIBC ;;
+	esac
+	;;
+    ppc64:Linux:*:*)
+	GUESS=powerpc64-unknown-linux-$LIBC
+	;;
+    ppc:Linux:*:*)
+	GUESS=powerpc-unknown-linux-$LIBC
+	;;
+    ppc64le:Linux:*:*)
+	GUESS=powerpc64le-unknown-linux-$LIBC
+	;;
+    ppcle:Linux:*:*)
+	GUESS=powerpcle-unknown-linux-$LIBC
+	;;
+    riscv32:Linux:*:* | riscv32be:Linux:*:* | riscv64:Linux:*:* | riscv64be:Linux:*:*)
+	GUESS=$UNAME_MACHINE-unknown-linux-$LIBC
+	;;
+    s390:Linux:*:* | s390x:Linux:*:*)
+	GUESS=$UNAME_MACHINE-ibm-linux-$LIBC
+	;;
+    sh64*:Linux:*:*)
+	GUESS=$UNAME_MACHINE-unknown-linux-$LIBC
+	;;
+    sh*:Linux:*:*)
+	GUESS=$UNAME_MACHINE-unknown-linux-$LIBC
+	;;
+    sparc:Linux:*:* | sparc64:Linux:*:*)
+	GUESS=$UNAME_MACHINE-unknown-linux-$LIBC
+	;;
+    tile*:Linux:*:*)
+	GUESS=$UNAME_MACHINE-unknown-linux-$LIBC
+	;;
+    vax:Linux:*:*)
+	GUESS=$UNAME_MACHINE-dec-linux-$LIBC
+	;;
+    x86_64:Linux:*:*)
+	set_cc_for_build
+	CPU=$UNAME_MACHINE
+	LIBCABI=$LIBC
+	if test "$CC_FOR_BUILD" != no_compiler_found; then
+	    ABI=64
+	    sed 's/^	    //' << EOF > "$dummy.c"
+	    #ifdef __i386__
+	    ABI=x86
+	    #else
+	    #ifdef __ILP32__
+	    ABI=x32
+	    #endif
+	    #endif
+EOF
+	    cc_set_abi=`$CC_FOR_BUILD -E "$dummy.c" 2>/dev/null | grep '^ABI' | sed 's, ,,g'`
+	    eval "$cc_set_abi"
+	    case $ABI in
+		x86) CPU=i686 ;;
+		x32) LIBCABI=${LIBC}x32 ;;
+	    esac
+	fi
+	GUESS=$CPU-pc-linux-$LIBCABI
+	;;
+    xtensa*:Linux:*:*)
+	GUESS=$UNAME_MACHINE-unknown-linux-$LIBC
+	;;
+    i*86:DYNIX/ptx:4*:*)
+	# ptx 4.0 does uname -s correctly, with DYNIX/ptx in there.
+	# earlier versions are messed up and put the nodename in both
+	# sysname and nodename.
+	GUESS=i386-sequent-sysv4
+	;;
+    i*86:UNIX_SV:4.2MP:2.*)
+	# Unixware is an offshoot of SVR4, but it has its own version
+	# number series starting with 2...
+	# I am not positive that other SVR4 systems won't match this,
+	# I just have to hope.  -- rms.
+	# Use sysv4.2uw... so that sysv4* matches it.
+	GUESS=$UNAME_MACHINE-pc-sysv4.2uw$UNAME_VERSION
+	;;
+    i*86:OS/2:*:*)
+	# If we were able to find 'uname', then EMX Unix compatibility
+	# is probably installed.
+	GUESS=$UNAME_MACHINE-pc-os2-emx
+	;;
+    i*86:XTS-300:*:STOP)
+	GUESS=$UNAME_MACHINE-unknown-stop
+	;;
+    i*86:atheos:*:*)
+	GUESS=$UNAME_MACHINE-unknown-atheos
+	;;
+    i*86:syllable:*:*)
+	GUESS=$UNAME_MACHINE-pc-syllable
+	;;
+    i*86:LynxOS:2.*:* | i*86:LynxOS:3.[01]*:* | i*86:LynxOS:4.[02]*:*)
+	GUESS=i386-unknown-lynxos$UNAME_RELEASE
+	;;
+    i*86:*DOS:*:*)
+	GUESS=$UNAME_MACHINE-pc-msdosdjgpp
+	;;
+    i*86:*:4.*:*)
+	UNAME_REL=`echo "$UNAME_RELEASE" | sed 's/\/MP$//'`
+	if grep Novell /usr/include/link.h >/dev/null 2>/dev/null; then
+		GUESS=$UNAME_MACHINE-univel-sysv$UNAME_REL
+	else
+		GUESS=$UNAME_MACHINE-pc-sysv$UNAME_REL
+	fi
+	;;
+    i*86:*:5:[678]*)
+	# UnixWare 7.x, OpenUNIX and OpenServer 6.
+	case `/bin/uname -X | grep "^Machine"` in
+	    *486*)	     UNAME_MACHINE=i486 ;;
+	    *Pentium)	     UNAME_MACHINE=i586 ;;
+	    *Pent*|*Celeron) UNAME_MACHINE=i686 ;;
+	esac
+	GUESS=$UNAME_MACHINE-unknown-sysv${UNAME_RELEASE}${UNAME_SYSTEM}${UNAME_VERSION}
+	;;
+    i*86:*:3.2:*)
+	if test -f /usr/options/cb.name; then
+		UNAME_REL=`sed -n 's/.*Version //p' </usr/options/cb.name`
+		GUESS=$UNAME_MACHINE-pc-isc$UNAME_REL
+	elif /bin/uname -X 2>/dev/null >/dev/null ; then
+		UNAME_REL=`(/bin/uname -X|grep Release|sed -e 's/.*= //')`
+		(/bin/uname -X|grep i80486 >/dev/null) && UNAME_MACHINE=i486
+		(/bin/uname -X|grep '^Machine.*Pentium' >/dev/null) \
+			&& UNAME_MACHINE=i586
+		(/bin/uname -X|grep '^Machine.*Pent *II' >/dev/null) \
+			&& UNAME_MACHINE=i686
+		(/bin/uname -X|grep '^Machine.*Pentium Pro' >/dev/null) \
+			&& UNAME_MACHINE=i686
+		GUESS=$UNAME_MACHINE-pc-sco$UNAME_REL
+	else
+		GUESS=$UNAME_MACHINE-pc-sysv32
+	fi
+	;;
+    pc:*:*:*)
+	# Left here for compatibility:
+	# uname -m prints for DJGPP always 'pc', but it prints nothing about
+	# the processor, so we play safe by assuming i586.
+	# Note: whatever this is, it MUST be the same as what config.sub
+	# prints for the "djgpp" host, or else GDB configure will decide that
+	# this is a cross-build.
+	GUESS=i586-pc-msdosdjgpp
+	;;
+    Intel:Mach:3*:*)
+	GUESS=i386-pc-mach3
+	;;
+    paragon:*:*:*)
+	GUESS=i860-intel-osf1
+	;;
+    i860:*:4.*:*) # i860-SVR4
+	if grep Stardent /usr/include/sys/uadmin.h >/dev/null 2>&1 ; then
+	  GUESS=i860-stardent-sysv$UNAME_RELEASE    # Stardent Vistra i860-SVR4
+	else # Add other i860-SVR4 vendors below as they are discovered.
+	  GUESS=i860-unknown-sysv$UNAME_RELEASE     # Unknown i860-SVR4
+	fi
+	;;
+    mini*:CTIX:SYS*5:*)
+	# "miniframe"
+	GUESS=m68010-convergent-sysv
+	;;
+    mc68k:UNIX:SYSTEM5:3.51m)
+	GUESS=m68k-convergent-sysv
+	;;
+    M680?0:D-NIX:5.3:*)
+	GUESS=m68k-diab-dnix
+	;;
+    M68*:*:R3V[5678]*:*)
+	test -r /sysV68 && { echo 'm68k-motorola-sysv'; exit; } ;;
+    3[345]??:*:4.0:3.0 | 3[34]??A:*:4.0:3.0 | 3[34]??,*:*:4.0:3.0 | 3[34]??/*:*:4.0:3.0 | 4400:*:4.0:3.0 | 4850:*:4.0:3.0 | SKA40:*:4.0:3.0 | SDS2:*:4.0:3.0 | SHG2:*:4.0:3.0 | S7501*:*:4.0:3.0)
+	OS_REL=''
+	test -r /etc/.relid \
+	&& OS_REL=.`sed -n 's/[^ ]* [^ ]* \([0-9][0-9]\).*/\1/p' < /etc/.relid`
+	/bin/uname -p 2>/dev/null | grep 86 >/dev/null \
+	  && { echo i486-ncr-sysv4.3"$OS_REL"; exit; }
+	/bin/uname -p 2>/dev/null | /bin/grep entium >/dev/null \
+	  && { echo i586-ncr-sysv4.3"$OS_REL"; exit; } ;;
+    3[34]??:*:4.0:* | 3[34]??,*:*:4.0:*)
+	/bin/uname -p 2>/dev/null | grep 86 >/dev/null \
+	  && { echo i486-ncr-sysv4; exit; } ;;
+    NCR*:*:4.2:* | MPRAS*:*:4.2:*)
+	OS_REL='.3'
+	test -r /etc/.relid \
+	    && OS_REL=.`sed -n 's/[^ ]* [^ ]* \([0-9][0-9]\).*/\1/p' < /etc/.relid`
+	/bin/uname -p 2>/dev/null | grep 86 >/dev/null \
+	    && { echo i486-ncr-sysv4.3"$OS_REL"; exit; }
+	/bin/uname -p 2>/dev/null | /bin/grep entium >/dev/null \
+	    && { echo i586-ncr-sysv4.3"$OS_REL"; exit; }
+	/bin/uname -p 2>/dev/null | /bin/grep pteron >/dev/null \
+	    && { echo i586-ncr-sysv4.3"$OS_REL"; exit; } ;;
+    m68*:LynxOS:2.*:* | m68*:LynxOS:3.0*:*)
+	GUESS=m68k-unknown-lynxos$UNAME_RELEASE
+	;;
+    mc68030:UNIX_System_V:4.*:*)
+	GUESS=m68k-atari-sysv4
+	;;
+    TSUNAMI:LynxOS:2.*:*)
+	GUESS=sparc-unknown-lynxos$UNAME_RELEASE
+	;;
+    rs6000:LynxOS:2.*:*)
+	GUESS=rs6000-unknown-lynxos$UNAME_RELEASE
+	;;
+    PowerPC:LynxOS:2.*:* | PowerPC:LynxOS:3.[01]*:* | PowerPC:LynxOS:4.[02]*:*)
+	GUESS=powerpc-unknown-lynxos$UNAME_RELEASE
+	;;
+    SM[BE]S:UNIX_SV:*:*)
+	GUESS=mips-dde-sysv$UNAME_RELEASE
+	;;
+    RM*:ReliantUNIX-*:*:*)
+	GUESS=mips-sni-sysv4
+	;;
+    RM*:SINIX-*:*:*)
+	GUESS=mips-sni-sysv4
+	;;
+    *:SINIX-*:*:*)
+	if uname -p 2>/dev/null >/dev/null ; then
+		UNAME_MACHINE=`(uname -p) 2>/dev/null`
+		GUESS=$UNAME_MACHINE-sni-sysv4
+	else
+		GUESS=ns32k-sni-sysv
+	fi
+	;;
+    PENTIUM:*:4.0*:*)	# Unisys 'ClearPath HMP IX 4000' SVR4/MP effort
+			# says <Richard.M.Bartel@ccMail.Census.GOV>
+	GUESS=i586-unisys-sysv4
+	;;
+    *:UNIX_System_V:4*:FTX*)
+	# From Gerald Hewes <hewes@openmarket.com>.
+	# How about differentiating between stratus architectures? -djm
+	GUESS=hppa1.1-stratus-sysv4
+	;;
+    *:*:*:FTX*)
+	# From seanf@swdc.stratus.com.
+	GUESS=i860-stratus-sysv4
+	;;
+    i*86:VOS:*:*)
+	# From Paul.Green@stratus.com.
+	GUESS=$UNAME_MACHINE-stratus-vos
+	;;
+    *:VOS:*:*)
+	# From Paul.Green@stratus.com.
+	GUESS=hppa1.1-stratus-vos
+	;;
+    mc68*:A/UX:*:*)
+	GUESS=m68k-apple-aux$UNAME_RELEASE
+	;;
+    news*:NEWS-OS:6*:*)
+	GUESS=mips-sony-newsos6
+	;;
+    R[34]000:*System_V*:*:* | R4000:UNIX_SYSV:*:* | R*000:UNIX_SV:*:*)
+	if test -d /usr/nec; then
+		GUESS=mips-nec-sysv$UNAME_RELEASE
+	else
+		GUESS=mips-unknown-sysv$UNAME_RELEASE
+	fi
+	;;
+    BeBox:BeOS:*:*)	# BeOS running on hardware made by Be, PPC only.
+	GUESS=powerpc-be-beos
+	;;
+    BeMac:BeOS:*:*)	# BeOS running on Mac or Mac clone, PPC only.
+	GUESS=powerpc-apple-beos
+	;;
+    BePC:BeOS:*:*)	# BeOS running on Intel PC compatible.
+	GUESS=i586-pc-beos
+	;;
+    BePC:Haiku:*:*)	# Haiku running on Intel PC compatible.
+	GUESS=i586-pc-haiku
+	;;
+    ppc:Haiku:*:*)	# Haiku running on Apple PowerPC
+	GUESS=powerpc-apple-haiku
+	;;
+    *:Haiku:*:*)	# Haiku modern gcc (not bound by BeOS compat)
+	GUESS=$UNAME_MACHINE-unknown-haiku
+	;;
+    SX-4:SUPER-UX:*:*)
+	GUESS=sx4-nec-superux$UNAME_RELEASE
+	;;
+    SX-5:SUPER-UX:*:*)
+	GUESS=sx5-nec-superux$UNAME_RELEASE
+	;;
+    SX-6:SUPER-UX:*:*)
+	GUESS=sx6-nec-superux$UNAME_RELEASE
+	;;
+    SX-7:SUPER-UX:*:*)
+	GUESS=sx7-nec-superux$UNAME_RELEASE
+	;;
+    SX-8:SUPER-UX:*:*)
+	GUESS=sx8-nec-superux$UNAME_RELEASE
+	;;
+    SX-8R:SUPER-UX:*:*)
+	GUESS=sx8r-nec-superux$UNAME_RELEASE
+	;;
+    SX-ACE:SUPER-UX:*:*)
+	GUESS=sxace-nec-superux$UNAME_RELEASE
+	;;
+    Power*:Rhapsody:*:*)
+	GUESS=powerpc-apple-rhapsody$UNAME_RELEASE
+	;;
+    *:Rhapsody:*:*)
+	GUESS=$UNAME_MACHINE-apple-rhapsody$UNAME_RELEASE
+	;;
+    arm64:Darwin:*:*)
+	GUESS=aarch64-apple-darwin$UNAME_RELEASE
+	;;
+    *:Darwin:*:*)
+	UNAME_PROCESSOR=`uname -p`
+	case $UNAME_PROCESSOR in
+	    unknown) UNAME_PROCESSOR=powerpc ;;
+	esac
+	if command -v xcode-select > /dev/null 2> /dev/null && \
+		! xcode-select --print-path > /dev/null 2> /dev/null ; then
+	    # Avoid executing cc if there is no toolchain installed as
+	    # cc will be a stub that puts up a graphical alert
+	    # prompting the user to install developer tools.
+	    CC_FOR_BUILD=no_compiler_found
+	else
+	    set_cc_for_build
+	fi
+	if test "$CC_FOR_BUILD" != no_compiler_found; then
+	    if (echo '#ifdef __LP64__'; echo IS_64BIT_ARCH; echo '#endif') | \
+		   (CCOPTS="" $CC_FOR_BUILD -E - 2>/dev/null) | \
+		   grep IS_64BIT_ARCH >/dev/null
+	    then
+		case $UNAME_PROCESSOR in
+		    i386) UNAME_PROCESSOR=x86_64 ;;
+		    powerpc) UNAME_PROCESSOR=powerpc64 ;;
+		esac
+	    fi
+	    # On 10.4-10.6 one might compile for PowerPC via gcc -arch ppc
+	    if (echo '#ifdef __POWERPC__'; echo IS_PPC; echo '#endif') | \
+		   (CCOPTS="" $CC_FOR_BUILD -E - 2>/dev/null) | \
+		   grep IS_PPC >/dev/null
+	    then
+		UNAME_PROCESSOR=powerpc
+	    fi
+	elif test "$UNAME_PROCESSOR" = i386 ; then
+	    # uname -m returns i386 or x86_64
+	    UNAME_PROCESSOR=$UNAME_MACHINE
+	fi
+	GUESS=$UNAME_PROCESSOR-apple-darwin$UNAME_RELEASE
+	;;
+    *:procnto*:*:* | *:QNX:[0123456789]*:*)
+	UNAME_PROCESSOR=`uname -p`
+	if test "$UNAME_PROCESSOR" = x86; then
+		UNAME_PROCESSOR=i386
+		UNAME_MACHINE=pc
+	fi
+	GUESS=$UNAME_PROCESSOR-$UNAME_MACHINE-nto-qnx$UNAME_RELEASE
+	;;
+    *:QNX:*:4*)
+	GUESS=i386-pc-qnx
+	;;
+    NEO-*:NONSTOP_KERNEL:*:*)
+	GUESS=neo-tandem-nsk$UNAME_RELEASE
+	;;
+    NSE-*:NONSTOP_KERNEL:*:*)
+	GUESS=nse-tandem-nsk$UNAME_RELEASE
+	;;
+    NSR-*:NONSTOP_KERNEL:*:*)
+	GUESS=nsr-tandem-nsk$UNAME_RELEASE
+	;;
+    NSV-*:NONSTOP_KERNEL:*:*)
+	GUESS=nsv-tandem-nsk$UNAME_RELEASE
+	;;
+    NSX-*:NONSTOP_KERNEL:*:*)
+	GUESS=nsx-tandem-nsk$UNAME_RELEASE
+	;;
+    *:NonStop-UX:*:*)
+	GUESS=mips-compaq-nonstopux
+	;;
+    BS2000:POSIX*:*:*)
+	GUESS=bs2000-siemens-sysv
+	;;
+    DS/*:UNIX_System_V:*:*)
+	GUESS=$UNAME_MACHINE-$UNAME_SYSTEM-$UNAME_RELEASE
+	;;
+    *:Plan9:*:*)
+	# "uname -m" is not consistent, so use $cputype instead. 386
+	# is converted to i386 for consistency with other x86
+	# operating systems.
+	if test "${cputype-}" = 386; then
+	    UNAME_MACHINE=i386
+	elif test "x${cputype-}" != x; then
+	    UNAME_MACHINE=$cputype
+	fi
+	GUESS=$UNAME_MACHINE-unknown-plan9
+	;;
+    *:TOPS-10:*:*)
+	GUESS=pdp10-unknown-tops10
+	;;
+    *:TENEX:*:*)
+	GUESS=pdp10-unknown-tenex
+	;;
+    KS10:TOPS-20:*:* | KL10:TOPS-20:*:* | TYPE4:TOPS-20:*:*)
+	GUESS=pdp10-dec-tops20
+	;;
+    XKL-1:TOPS-20:*:* | TYPE5:TOPS-20:*:*)
+	GUESS=pdp10-xkl-tops20
+	;;
+    *:TOPS-20:*:*)
+	GUESS=pdp10-unknown-tops20
+	;;
+    *:ITS:*:*)
+	GUESS=pdp10-unknown-its
+	;;
+    SEI:*:*:SEIUX)
+	GUESS=mips-sei-seiux$UNAME_RELEASE
+	;;
+    *:DragonFly:*:*)
+	DRAGONFLY_REL=`echo "$UNAME_RELEASE" | sed -e 's/[-(].*//'`
+	GUESS=$UNAME_MACHINE-unknown-dragonfly$DRAGONFLY_REL
+	;;
+    *:*VMS:*:*)
+	UNAME_MACHINE=`(uname -p) 2>/dev/null`
+	case $UNAME_MACHINE in
+	    A*) GUESS=alpha-dec-vms ;;
+	    I*) GUESS=ia64-dec-vms ;;
+	    V*) GUESS=vax-dec-vms ;;
+	esac ;;
+    *:XENIX:*:SysV)
+	GUESS=i386-pc-xenix
+	;;
+    i*86:skyos:*:*)
+	SKYOS_REL=`echo "$UNAME_RELEASE" | sed -e 's/ .*$//'`
+	GUESS=$UNAME_MACHINE-pc-skyos$SKYOS_REL
+	;;
+    i*86:rdos:*:*)
+	GUESS=$UNAME_MACHINE-pc-rdos
+	;;
+    i*86:Fiwix:*:*)
+	GUESS=$UNAME_MACHINE-pc-fiwix
+	;;
+    *:AROS:*:*)
+	GUESS=$UNAME_MACHINE-unknown-aros
+	;;
+    x86_64:VMkernel:*:*)
+	GUESS=$UNAME_MACHINE-unknown-esx
+	;;
+    amd64:Isilon\ OneFS:*:*)
+	GUESS=x86_64-unknown-onefs
+	;;
+    *:Unleashed:*:*)
+	GUESS=$UNAME_MACHINE-unknown-unleashed$UNAME_RELEASE
+	;;
+esac
+
+# Do we have a guess based on uname results?
+if test "x$GUESS" != x; then
+    echo "$GUESS"
+    exit
+fi
+
+# No uname command or uname output not recognized.
+set_cc_for_build
+cat > "$dummy.c" <<EOF
+#ifdef _SEQUENT_
+#include <sys/types.h>
+#include <sys/utsname.h>
+#endif
+#if defined(ultrix) || defined(_ultrix) || defined(__ultrix) || defined(__ultrix__)
+#if defined (vax) || defined (__vax) || defined (__vax__) || defined(mips) || defined(__mips) || defined(__mips__) || defined(MIPS) || defined(__MIPS__)
+#include <signal.h>
+#if defined(_SIZE_T_) || defined(SIGLOST)
+#include <sys/utsname.h>
+#endif
+#endif
+#endif
+main ()
+{
+#if defined (sony)
+#if defined (MIPSEB)
+  /* BFD wants "bsd" instead of "newsos".  Perhaps BFD should be changed,
+     I don't know....  */
+  printf ("mips-sony-bsd\n"); exit (0);
+#else
+#include <sys/param.h>
+  printf ("m68k-sony-newsos%s\n",
+#ifdef NEWSOS4
+  "4"
+#else
+  ""
+#endif
+  ); exit (0);
+#endif
+#endif
+
+#if defined (NeXT)
+#if !defined (__ARCHITECTURE__)
+#define __ARCHITECTURE__ "m68k"
+#endif
+  int version;
+  version=`(hostinfo | sed -n 's/.*NeXT Mach \([0-9]*\).*/\1/p') 2>/dev/null`;
+  if (version < 4)
+    printf ("%s-next-nextstep%d\n", __ARCHITECTURE__, version);
+  else
+    printf ("%s-next-openstep%d\n", __ARCHITECTURE__, version);
+  exit (0);
+#endif
+
+#if defined (MULTIMAX) || defined (n16)
+#if defined (UMAXV)
+  printf ("ns32k-encore-sysv\n"); exit (0);
+#else
+#if defined (CMU)
+  printf ("ns32k-encore-mach\n"); exit (0);
+#else
+  printf ("ns32k-encore-bsd\n"); exit (0);
+#endif
+#endif
+#endif
+
+#if defined (__386BSD__)
+  printf ("i386-pc-bsd\n"); exit (0);
+#endif
+
+#if defined (sequent)
+#if defined (i386)
+  printf ("i386-sequent-dynix\n"); exit (0);
+#endif
+#if defined (ns32000)
+  printf ("ns32k-sequent-dynix\n"); exit (0);
+#endif
+#endif
+
+#if defined (_SEQUENT_)
+  struct utsname un;
+
+  uname(&un);
+  if (strncmp(un.version, "V2", 2) == 0) {
+    printf ("i386-sequent-ptx2\n"); exit (0);
+  }
+  if (strncmp(un.version, "V1", 2) == 0) { /* XXX is V1 correct? */
+    printf ("i386-sequent-ptx1\n"); exit (0);
+  }
+  printf ("i386-sequent-ptx\n"); exit (0);
+#endif
+
+#if defined (vax)
+#if !defined (ultrix)
+#include <sys/param.h>
+#if defined (BSD)
+#if BSD == 43
+  printf ("vax-dec-bsd4.3\n"); exit (0);
+#else
+#if BSD == 199006
+  printf ("vax-dec-bsd4.3reno\n"); exit (0);
+#else
+  printf ("vax-dec-bsd\n"); exit (0);
+#endif
+#endif
+#else
+  printf ("vax-dec-bsd\n"); exit (0);
+#endif
+#else
+#if defined(_SIZE_T_) || defined(SIGLOST)
+  struct utsname un;
+  uname (&un);
+  printf ("vax-dec-ultrix%s\n", un.release); exit (0);
+#else
+  printf ("vax-dec-ultrix\n"); exit (0);
+#endif
+#endif
+#endif
+#if defined(ultrix) || defined(_ultrix) || defined(__ultrix) || defined(__ultrix__)
+#if defined(mips) || defined(__mips) || defined(__mips__) || defined(MIPS) || defined(__MIPS__)
+#if defined(_SIZE_T_) || defined(SIGLOST)
+  struct utsname *un;
+  uname (&un);
+  printf ("mips-dec-ultrix%s\n", un.release); exit (0);
+#else
+  printf ("mips-dec-ultrix\n"); exit (0);
+#endif
+#endif
+#endif
+
+#if defined (alliant) && defined (i860)
+  printf ("i860-alliant-bsd\n"); exit (0);
+#endif
+
+  exit (1);
+}
+EOF
+
+$CC_FOR_BUILD -o "$dummy" "$dummy.c" 2>/dev/null && SYSTEM_NAME=`"$dummy"` &&
+	{ echo "$SYSTEM_NAME"; exit; }
+
+# Apollos put the system type in the environment.
+test -d /usr/apollo && { echo "$ISP-apollo-$SYSTYPE"; exit; }
+
+echo "$0: unable to guess system type" >&2
+
+case $UNAME_MACHINE:$UNAME_SYSTEM in
+    mips:Linux | mips64:Linux)
+	# If we got here on MIPS GNU/Linux, output extra information.
+	cat >&2 <<EOF
+
+NOTE: MIPS GNU/Linux systems require a C compiler to fully recognize
+the system type. Please install a C compiler and try again.
+EOF
+	;;
+esac
+
+cat >&2 <<EOF
+
+This script (version $timestamp), has failed to recognize the
+operating system you are using. If your script is old, overwrite *all*
+copies of config.guess and config.sub with the latest versions from:
+
+  https://git.savannah.gnu.org/cgit/config.git/plain/config.guess
+and
+  https://git.savannah.gnu.org/cgit/config.git/plain/config.sub
+EOF
+
+our_year=`echo $timestamp | sed 's,-.*,,'`
+thisyear=`date +%Y`
+# shellcheck disable=SC2003
+script_age=`expr "$thisyear" - "$our_year"`
+if test "$script_age" -lt 3 ; then
+   cat >&2 <<EOF
+
+If $0 has already been updated, send the following data and any
+information you think might be pertinent to config-patches@gnu.org to
+provide the necessary information to handle your system.
+
+config.guess timestamp = $timestamp
+
+uname -m = `(uname -m) 2>/dev/null || echo unknown`
+uname -r = `(uname -r) 2>/dev/null || echo unknown`
+uname -s = `(uname -s) 2>/dev/null || echo unknown`
+uname -v = `(uname -v) 2>/dev/null || echo unknown`
+
+/usr/bin/uname -p = `(/usr/bin/uname -p) 2>/dev/null`
+/bin/uname -X     = `(/bin/uname -X) 2>/dev/null`
+
+hostinfo               = `(hostinfo) 2>/dev/null`
+/bin/universe          = `(/bin/universe) 2>/dev/null`
+/usr/bin/arch -k       = `(/usr/bin/arch -k) 2>/dev/null`
+/bin/arch              = `(/bin/arch) 2>/dev/null`
+/usr/bin/oslevel       = `(/usr/bin/oslevel) 2>/dev/null`
+/usr/convex/getsysinfo = `(/usr/convex/getsysinfo) 2>/dev/null`
+
+UNAME_MACHINE = "$UNAME_MACHINE"
+UNAME_RELEASE = "$UNAME_RELEASE"
+UNAME_SYSTEM  = "$UNAME_SYSTEM"
+UNAME_VERSION = "$UNAME_VERSION"
+EOF
+fi
+
+exit 1
+
+# Local variables:
+# eval: (add-hook 'before-save-hook 'time-stamp)
+# time-stamp-start: "timestamp='"
+# time-stamp-format: "%:y-%02m-%02d"
+# time-stamp-end: "'"
+# End:
diff --git a/htslib/config.h.in b/htslib/config.h.in
index 08358aa80..70c6d341c 100644
--- a/htslib/config.h.in
+++ b/htslib/config.h.in
@@ -19,31 +19,51 @@
 /* Define if HTSlib should enable S3 support. */
 #undef ENABLE_S3
 
+/* Define if __attribute__((constructor)) is available. */
+#undef HAVE_ATTRIBUTE_CONSTRUCTOR
+
+/* Define if __attribute__((target("ssse3"))) works. */
+#undef HAVE_ATTRIBUTE_TARGET_SSSE3
+
 /* Defined to 1 if rANS source using AVX2 can be compiled. */
 #undef HAVE_AVX2
 
 /* Defined to 1 if rANS source using AVX512F can be compiled. */
 #undef HAVE_AVX512
 
+/* Defined to 1 if __builtin_cpu_supports("ssse3") works */
+#undef HAVE_BUILTIN_CPU_SUPPORT_SSSE3
+
+/* Define if clock_gettime exists and accepts CLOCK_PROCESS_CPUTIME_ID. */
+#undef HAVE_CLOCK_GETTIME_CPUTIME
+
 /* Define if you have the Common Crypto library. */
 #undef HAVE_COMMONCRYPTO
 
-/* Define to 1 if you have the `drand48' function. */
+/* Define to 1 if you have the declaration of '__cpuid_count', and to 0 if you
+   don't. */
+#undef HAVE_DECL___CPUID_COUNT
+
+/* Define to 1 if you have the declaration of '__get_cpuid_max', and to 0 if
+   you don't. */
+#undef HAVE_DECL___GET_CPUID_MAX
+
+/* Define to 1 if you have the 'drand48' function. */
 #undef HAVE_DRAND48
 
 /* Define if using an external libhtscodecs */
 #undef HAVE_EXTERNAL_LIBHTSCODECS
 
-/* Define to 1 if you have the `fdatasync' function. */
+/* Define to 1 if you have the 'fdatasync' function. */
 #undef HAVE_FDATASYNC
 
-/* Define to 1 if you have the `fsync' function. */
+/* Define to 1 if you have the 'fsync' function. */
 #undef HAVE_FSYNC
 
-/* Define to 1 if you have the `getpagesize' function. */
+/* Define to 1 if you have the 'getpagesize' function. */
 #undef HAVE_GETPAGESIZE
 
-/* Define to 1 if you have the `gmtime_r' function. */
+/* Define to 1 if you have the 'gmtime_r' function. */
 #undef HAVE_GMTIME_R
 
 /* Define if you have libcrypto-style HMAC(). */
@@ -52,7 +72,7 @@
 /* Define to 1 if you have the <inttypes.h> header file. */
 #undef HAVE_INTTYPES_H
 
-/* Define to 1 if you have the `bz2' library (-lbz2). */
+/* Define to 1 if you have the 'bz2' library (-lbz2). */
 #undef HAVE_LIBBZ2
 
 /* Define if libcurl file access is enabled. */
@@ -61,25 +81,22 @@
 /* Define if libdeflate is available. */
 #undef HAVE_LIBDEFLATE
 
-/* Define to 1 if you have the `lzma' library (-llzma). */
+/* Define to 1 if you have the 'lzma' library (-llzma). */
 #undef HAVE_LIBLZMA
 
-/* Define to 1 if you have the `z' library (-lz). */
+/* Define to 1 if you have the 'z' library (-lz). */
 #undef HAVE_LIBZ
 
 /* Define to 1 if you have the <lzma.h> header file. */
 #undef HAVE_LZMA_H
 
-/* Define to 1 if you have the <memory.h> header file. */
-#undef HAVE_MEMORY_H
-
-/* Define to 1 if you have a working `mmap' system call. */
+/* Define to 1 if you have a working 'mmap' system call. */
 #undef HAVE_MMAP
 
 /* Defined to 1 if rANS source using popcnt can be compiled. */
 #undef HAVE_POPCNT
 
-/* Define to 1 if you have the `srand48_deterministic' function. */
+/* Define to 1 if you have the 'srand48_deterministic' function. */
 #undef HAVE_SRAND48_DETERMINISTIC
 
 /* Defined to 1 if rANS source using SSE4.1 can be compiled. */
@@ -91,6 +108,9 @@
 /* Define to 1 if you have the <stdint.h> header file. */
 #undef HAVE_STDINT_H
 
+/* Define to 1 if you have the <stdio.h> header file. */
+#undef HAVE_STDIO_H
+
 /* Define to 1 if you have the <stdlib.h> header file. */
 #undef HAVE_STDLIB_H
 
@@ -133,7 +153,9 @@
 /* Platform-dependent plugin filename extension. */
 #undef PLUGIN_EXT
 
-/* Define to 1 if you have the ANSI C header files. */
+/* Define to 1 if all of the C89 standard headers exist (not just the ones
+   required in a freestanding environment). This macro is provided for
+   backward compatibility; new code need not use it. */
 #undef STDC_HEADERS
 
 
@@ -142,16 +164,17 @@
 #undef UBSAN
 #endif
 
-/* Enable large inode numbers on Mac OS X 10.5.  */
-#ifndef _DARWIN_USE_64_BIT_INODE
-# define _DARWIN_USE_64_BIT_INODE 1
-#endif
-
 /* Number of bits in a file offset, on hosts where this is settable. */
 #undef _FILE_OFFSET_BITS
 
-/* Define for large files, on AIX-style hosts. */
+/* Define to 1 on platforms where this makes off_t a 64-bit type. */
 #undef _LARGE_FILES
 
+/* Number of bits in time_t, on hosts where this is settable. */
+#undef _TIME_BITS
+
 /* Specify X/Open requirements */
 #undef _XOPEN_SOURCE
+
+/* Define to 1 on platforms where this makes time_t a 64-bit type. */
+#undef __MINGW_USE_VC2005_COMPAT
diff --git a/htslib/config.sub b/htslib/config.sub
new file mode 100755
index 000000000..defe52c0c
--- /dev/null
+++ b/htslib/config.sub
@@ -0,0 +1,1960 @@
+#! /bin/sh
+# Configuration validation subroutine script.
+#   Copyright 1992-2023 Free Software Foundation, Inc.
+
+# shellcheck disable=SC2006,SC2268 # see below for rationale
+
+timestamp='2023-09-19'
+
+# This file is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, see <https://www.gnu.org/licenses/>.
+#
+# As a special exception to the GNU General Public License, if you
+# distribute this file as part of a program that contains a
+# configuration script generated by Autoconf, you may include it under
+# the same distribution terms that you use for the rest of that
+# program.  This Exception is an additional permission under section 7
+# of the GNU General Public License, version 3 ("GPLv3").
+
+
+# Please send patches to <config-patches@gnu.org>.
+#
+# Configuration subroutine to validate and canonicalize a configuration type.
+# Supply the specified configuration type as an argument.
+# If it is invalid, we print an error message on stderr and exit with code 1.
+# Otherwise, we print the canonical config type on stdout and succeed.
+
+# You can get the latest version of this script from:
+# https://git.savannah.gnu.org/cgit/config.git/plain/config.sub
+
+# This file is supposed to be the same for all GNU packages
+# and recognize all the CPU types, system types and aliases
+# that are meaningful with *any* GNU software.
+# Each package is responsible for reporting which valid configurations
+# it does not support.  The user should be able to distinguish
+# a failure to support a valid configuration from a meaningless
+# configuration.
+
+# The goal of this file is to map all the various variations of a given
+# machine specification into a single specification in the form:
+#	CPU_TYPE-MANUFACTURER-OPERATING_SYSTEM
+# or in some cases, the newer four-part form:
+#	CPU_TYPE-MANUFACTURER-KERNEL-OPERATING_SYSTEM
+# It is wrong to echo any other type of specification.
+
+# The "shellcheck disable" line above the timestamp inhibits complaints
+# about features and limitations of the classic Bourne shell that were
+# superseded or lifted in POSIX.  However, this script identifies a wide
+# variety of pre-POSIX systems that do not have POSIX shells at all, and
+# even some reasonably current systems (Solaris 10 as case-in-point) still
+# have a pre-POSIX /bin/sh.
+
+me=`echo "$0" | sed -e 's,.*/,,'`
+
+usage="\
+Usage: $0 [OPTION] CPU-MFR-OPSYS or ALIAS
+
+Canonicalize a configuration name.
+
+Options:
+  -h, --help         print this help, then exit
+  -t, --time-stamp   print date of last modification, then exit
+  -v, --version      print version number, then exit
+
+Report bugs and patches to <config-patches@gnu.org>."
+
+version="\
+GNU config.sub ($timestamp)
+
+Copyright 1992-2023 Free Software Foundation, Inc.
+
+This is free software; see the source for copying conditions.  There is NO
+warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE."
+
+help="
+Try '$me --help' for more information."
+
+# Parse command line
+while test $# -gt 0 ; do
+  case $1 in
+    --time-stamp | --time* | -t )
+       echo "$timestamp" ; exit ;;
+    --version | -v )
+       echo "$version" ; exit ;;
+    --help | --h* | -h )
+       echo "$usage"; exit ;;
+    -- )     # Stop option processing
+       shift; break ;;
+    - )	# Use stdin as input.
+       break ;;
+    -* )
+       echo "$me: invalid option $1$help" >&2
+       exit 1 ;;
+
+    *local*)
+       # First pass through any local machine types.
+       echo "$1"
+       exit ;;
+
+    * )
+       break ;;
+  esac
+done
+
+case $# in
+ 0) echo "$me: missing argument$help" >&2
+    exit 1;;
+ 1) ;;
+ *) echo "$me: too many arguments$help" >&2
+    exit 1;;
+esac
+
+# Split fields of configuration type
+# shellcheck disable=SC2162
+saved_IFS=$IFS
+IFS="-" read field1 field2 field3 field4 <<EOF
+$1
+EOF
+IFS=$saved_IFS
+
+# Separate into logical components for further validation
+case $1 in
+	*-*-*-*-*)
+		echo "Invalid configuration '$1': more than four components" >&2
+		exit 1
+		;;
+	*-*-*-*)
+		basic_machine=$field1-$field2
+		basic_os=$field3-$field4
+		;;
+	*-*-*)
+		# Ambiguous whether COMPANY is present, or skipped and KERNEL-OS is two
+		# parts
+		maybe_os=$field2-$field3
+		case $maybe_os in
+			nto-qnx* | linux-* | uclinux-uclibc* \
+			| uclinux-gnu* | kfreebsd*-gnu* | knetbsd*-gnu* | netbsd*-gnu* \
+			| netbsd*-eabi* | kopensolaris*-gnu* | cloudabi*-eabi* \
+			| storm-chaos* | os2-emx* | rtmk-nova* | managarm-* \
+			| windows-* )
+				basic_machine=$field1
+				basic_os=$maybe_os
+				;;
+			android-linux)
+				basic_machine=$field1-unknown
+				basic_os=linux-android
+				;;
+			*)
+				basic_machine=$field1-$field2
+				basic_os=$field3
+				;;
+		esac
+		;;
+	*-*)
+		# A lone config we happen to match not fitting any pattern
+		case $field1-$field2 in
+			decstation-3100)
+				basic_machine=mips-dec
+				basic_os=
+				;;
+			*-*)
+				# Second component is usually, but not always the OS
+				case $field2 in
+					# Prevent following clause from handling this valid os
+					sun*os*)
+						basic_machine=$field1
+						basic_os=$field2
+						;;
+					zephyr*)
+						basic_machine=$field1-unknown
+						basic_os=$field2
+						;;
+					# Manufacturers
+					dec* | mips* | sequent* | encore* | pc533* | sgi* | sony* \
+					| att* | 7300* | 3300* | delta* | motorola* | sun[234]* \
+					| unicom* | ibm* | next | hp | isi* | apollo | altos* \
+					| convergent* | ncr* | news | 32* | 3600* | 3100* \
+					| hitachi* | c[123]* | convex* | sun | crds | omron* | dg \
+					| ultra | tti* | harris | dolphin | highlevel | gould \
+					| cbm | ns | masscomp | apple | axis | knuth | cray \
+					| microblaze* | sim | cisco \
+					| oki | wec | wrs | winbond)
+						basic_machine=$field1-$field2
+						basic_os=
+						;;
+					*)
+						basic_machine=$field1
+						basic_os=$field2
+						;;
+				esac
+			;;
+		esac
+		;;
+	*)
+		# Convert single-component short-hands not valid as part of
+		# multi-component configurations.
+		case $field1 in
+			386bsd)
+				basic_machine=i386-pc
+				basic_os=bsd
+				;;
+			a29khif)
+				basic_machine=a29k-amd
+				basic_os=udi
+				;;
+			adobe68k)
+				basic_machine=m68010-adobe
+				basic_os=scout
+				;;
+			alliant)
+				basic_machine=fx80-alliant
+				basic_os=
+				;;
+			altos | altos3068)
+				basic_machine=m68k-altos
+				basic_os=
+				;;
+			am29k)
+				basic_machine=a29k-none
+				basic_os=bsd
+				;;
+			amdahl)
+				basic_machine=580-amdahl
+				basic_os=sysv
+				;;
+			amiga)
+				basic_machine=m68k-unknown
+				basic_os=
+				;;
+			amigaos | amigados)
+				basic_machine=m68k-unknown
+				basic_os=amigaos
+				;;
+			amigaunix | amix)
+				basic_machine=m68k-unknown
+				basic_os=sysv4
+				;;
+			apollo68)
+				basic_machine=m68k-apollo
+				basic_os=sysv
+				;;
+			apollo68bsd)
+				basic_machine=m68k-apollo
+				basic_os=bsd
+				;;
+			aros)
+				basic_machine=i386-pc
+				basic_os=aros
+				;;
+			aux)
+				basic_machine=m68k-apple
+				basic_os=aux
+				;;
+			balance)
+				basic_machine=ns32k-sequent
+				basic_os=dynix
+				;;
+			blackfin)
+				basic_machine=bfin-unknown
+				basic_os=linux
+				;;
+			cegcc)
+				basic_machine=arm-unknown
+				basic_os=cegcc
+				;;
+			convex-c1)
+				basic_machine=c1-convex
+				basic_os=bsd
+				;;
+			convex-c2)
+				basic_machine=c2-convex
+				basic_os=bsd
+				;;
+			convex-c32)
+				basic_machine=c32-convex
+				basic_os=bsd
+				;;
+			convex-c34)
+				basic_machine=c34-convex
+				basic_os=bsd
+				;;
+			convex-c38)
+				basic_machine=c38-convex
+				basic_os=bsd
+				;;
+			cray)
+				basic_machine=j90-cray
+				basic_os=unicos
+				;;
+			crds | unos)
+				basic_machine=m68k-crds
+				basic_os=
+				;;
+			da30)
+				basic_machine=m68k-da30
+				basic_os=
+				;;
+			decstation | pmax | pmin | dec3100 | decstatn)
+				basic_machine=mips-dec
+				basic_os=
+				;;
+			delta88)
+				basic_machine=m88k-motorola
+				basic_os=sysv3
+				;;
+			dicos)
+				basic_machine=i686-pc
+				basic_os=dicos
+				;;
+			djgpp)
+				basic_machine=i586-pc
+				basic_os=msdosdjgpp
+				;;
+			ebmon29k)
+				basic_machine=a29k-amd
+				basic_os=ebmon
+				;;
+			es1800 | OSE68k | ose68k | ose | OSE)
+				basic_machine=m68k-ericsson
+				basic_os=ose
+				;;
+			gmicro)
+				basic_machine=tron-gmicro
+				basic_os=sysv
+				;;
+			go32)
+				basic_machine=i386-pc
+				basic_os=go32
+				;;
+			h8300hms)
+				basic_machine=h8300-hitachi
+				basic_os=hms
+				;;
+			h8300xray)
+				basic_machine=h8300-hitachi
+				basic_os=xray
+				;;
+			h8500hms)
+				basic_machine=h8500-hitachi
+				basic_os=hms
+				;;
+			harris)
+				basic_machine=m88k-harris
+				basic_os=sysv3
+				;;
+			hp300 | hp300hpux)
+				basic_machine=m68k-hp
+				basic_os=hpux
+				;;
+			hp300bsd)
+				basic_machine=m68k-hp
+				basic_os=bsd
+				;;
+			hppaosf)
+				basic_machine=hppa1.1-hp
+				basic_os=osf
+				;;
+			hppro)
+				basic_machine=hppa1.1-hp
+				basic_os=proelf
+				;;
+			i386mach)
+				basic_machine=i386-mach
+				basic_os=mach
+				;;
+			isi68 | isi)
+				basic_machine=m68k-isi
+				basic_os=sysv
+				;;
+			m68knommu)
+				basic_machine=m68k-unknown
+				basic_os=linux
+				;;
+			magnum | m3230)
+				basic_machine=mips-mips
+				basic_os=sysv
+				;;
+			merlin)
+				basic_machine=ns32k-utek
+				basic_os=sysv
+				;;
+			mingw64)
+				basic_machine=x86_64-pc
+				basic_os=mingw64
+				;;
+			mingw32)
+				basic_machine=i686-pc
+				basic_os=mingw32
+				;;
+			mingw32ce)
+				basic_machine=arm-unknown
+				basic_os=mingw32ce
+				;;
+			monitor)
+				basic_machine=m68k-rom68k
+				basic_os=coff
+				;;
+			morphos)
+				basic_machine=powerpc-unknown
+				basic_os=morphos
+				;;
+			moxiebox)
+				basic_machine=moxie-unknown
+				basic_os=moxiebox
+				;;
+			msdos)
+				basic_machine=i386-pc
+				basic_os=msdos
+				;;
+			msys)
+				basic_machine=i686-pc
+				basic_os=msys
+				;;
+			mvs)
+				basic_machine=i370-ibm
+				basic_os=mvs
+				;;
+			nacl)
+				basic_machine=le32-unknown
+				basic_os=nacl
+				;;
+			ncr3000)
+				basic_machine=i486-ncr
+				basic_os=sysv4
+				;;
+			netbsd386)
+				basic_machine=i386-pc
+				basic_os=netbsd
+				;;
+			netwinder)
+				basic_machine=armv4l-rebel
+				basic_os=linux
+				;;
+			news | news700 | news800 | news900)
+				basic_machine=m68k-sony
+				basic_os=newsos
+				;;
+			news1000)
+				basic_machine=m68030-sony
+				basic_os=newsos
+				;;
+			necv70)
+				basic_machine=v70-nec
+				basic_os=sysv
+				;;
+			nh3000)
+				basic_machine=m68k-harris
+				basic_os=cxux
+				;;
+			nh[45]000)
+				basic_machine=m88k-harris
+				basic_os=cxux
+				;;
+			nindy960)
+				basic_machine=i960-intel
+				basic_os=nindy
+				;;
+			mon960)
+				basic_machine=i960-intel
+				basic_os=mon960
+				;;
+			nonstopux)
+				basic_machine=mips-compaq
+				basic_os=nonstopux
+				;;
+			os400)
+				basic_machine=powerpc-ibm
+				basic_os=os400
+				;;
+			OSE68000 | ose68000)
+				basic_machine=m68000-ericsson
+				basic_os=ose
+				;;
+			os68k)
+				basic_machine=m68k-none
+				basic_os=os68k
+				;;
+			paragon)
+				basic_machine=i860-intel
+				basic_os=osf
+				;;
+			parisc)
+				basic_machine=hppa-unknown
+				basic_os=linux
+				;;
+			psp)
+				basic_machine=mipsallegrexel-sony
+				basic_os=psp
+				;;
+			pw32)
+				basic_machine=i586-unknown
+				basic_os=pw32
+				;;
+			rdos | rdos64)
+				basic_machine=x86_64-pc
+				basic_os=rdos
+				;;
+			rdos32)
+				basic_machine=i386-pc
+				basic_os=rdos
+				;;
+			rom68k)
+				basic_machine=m68k-rom68k
+				basic_os=coff
+				;;
+			sa29200)
+				basic_machine=a29k-amd
+				basic_os=udi
+				;;
+			sei)
+				basic_machine=mips-sei
+				basic_os=seiux
+				;;
+			sequent)
+				basic_machine=i386-sequent
+				basic_os=
+				;;
+			sps7)
+				basic_machine=m68k-bull
+				basic_os=sysv2
+				;;
+			st2000)
+				basic_machine=m68k-tandem
+				basic_os=
+				;;
+			stratus)
+				basic_machine=i860-stratus
+				basic_os=sysv4
+				;;
+			sun2)
+				basic_machine=m68000-sun
+				basic_os=
+				;;
+			sun2os3)
+				basic_machine=m68000-sun
+				basic_os=sunos3
+				;;
+			sun2os4)
+				basic_machine=m68000-sun
+				basic_os=sunos4
+				;;
+			sun3)
+				basic_machine=m68k-sun
+				basic_os=
+				;;
+			sun3os3)
+				basic_machine=m68k-sun
+				basic_os=sunos3
+				;;
+			sun3os4)
+				basic_machine=m68k-sun
+				basic_os=sunos4
+				;;
+			sun4)
+				basic_machine=sparc-sun
+				basic_os=
+				;;
+			sun4os3)
+				basic_machine=sparc-sun
+				basic_os=sunos3
+				;;
+			sun4os4)
+				basic_machine=sparc-sun
+				basic_os=sunos4
+				;;
+			sun4sol2)
+				basic_machine=sparc-sun
+				basic_os=solaris2
+				;;
+			sun386 | sun386i | roadrunner)
+				basic_machine=i386-sun
+				basic_os=
+				;;
+			sv1)
+				basic_machine=sv1-cray
+				basic_os=unicos
+				;;
+			symmetry)
+				basic_machine=i386-sequent
+				basic_os=dynix
+				;;
+			t3e)
+				basic_machine=alphaev5-cray
+				basic_os=unicos
+				;;
+			t90)
+				basic_machine=t90-cray
+				basic_os=unicos
+				;;
+			toad1)
+				basic_machine=pdp10-xkl
+				basic_os=tops20
+				;;
+			tpf)
+				basic_machine=s390x-ibm
+				basic_os=tpf
+				;;
+			udi29k)
+				basic_machine=a29k-amd
+				basic_os=udi
+				;;
+			ultra3)
+				basic_machine=a29k-nyu
+				basic_os=sym1
+				;;
+			v810 | necv810)
+				basic_machine=v810-nec
+				basic_os=none
+				;;
+			vaxv)
+				basic_machine=vax-dec
+				basic_os=sysv
+				;;
+			vms)
+				basic_machine=vax-dec
+				basic_os=vms
+				;;
+			vsta)
+				basic_machine=i386-pc
+				basic_os=vsta
+				;;
+			vxworks960)
+				basic_machine=i960-wrs
+				basic_os=vxworks
+				;;
+			vxworks68)
+				basic_machine=m68k-wrs
+				basic_os=vxworks
+				;;
+			vxworks29k)
+				basic_machine=a29k-wrs
+				basic_os=vxworks
+				;;
+			xbox)
+				basic_machine=i686-pc
+				basic_os=mingw32
+				;;
+			ymp)
+				basic_machine=ymp-cray
+				basic_os=unicos
+				;;
+			*)
+				basic_machine=$1
+				basic_os=
+				;;
+		esac
+		;;
+esac
+
+# Decode 1-component or ad-hoc basic machines
+case $basic_machine in
+	# Here we handle the default manufacturer of certain CPU types.  It is in
+	# some cases the only manufacturer, in others, it is the most popular.
+	w89k)
+		cpu=hppa1.1
+		vendor=winbond
+		;;
+	op50n)
+		cpu=hppa1.1
+		vendor=oki
+		;;
+	op60c)
+		cpu=hppa1.1
+		vendor=oki
+		;;
+	ibm*)
+		cpu=i370
+		vendor=ibm
+		;;
+	orion105)
+		cpu=clipper
+		vendor=highlevel
+		;;
+	mac | mpw | mac-mpw)
+		cpu=m68k
+		vendor=apple
+		;;
+	pmac | pmac-mpw)
+		cpu=powerpc
+		vendor=apple
+		;;
+
+	# Recognize the various machine names and aliases which stand
+	# for a CPU type and a company and sometimes even an OS.
+	3b1 | 7300 | 7300-att | att-7300 | pc7300 | safari | unixpc)
+		cpu=m68000
+		vendor=att
+		;;
+	3b*)
+		cpu=we32k
+		vendor=att
+		;;
+	bluegene*)
+		cpu=powerpc
+		vendor=ibm
+		basic_os=cnk
+		;;
+	decsystem10* | dec10*)
+		cpu=pdp10
+		vendor=dec
+		basic_os=tops10
+		;;
+	decsystem20* | dec20*)
+		cpu=pdp10
+		vendor=dec
+		basic_os=tops20
+		;;
+	delta | 3300 | motorola-3300 | motorola-delta \
+	      | 3300-motorola | delta-motorola)
+		cpu=m68k
+		vendor=motorola
+		;;
+	dpx2*)
+		cpu=m68k
+		vendor=bull
+		basic_os=sysv3
+		;;
+	encore | umax | mmax)
+		cpu=ns32k
+		vendor=encore
+		;;
+	elxsi)
+		cpu=elxsi
+		vendor=elxsi
+		basic_os=${basic_os:-bsd}
+		;;
+	fx2800)
+		cpu=i860
+		vendor=alliant
+		;;
+	genix)
+		cpu=ns32k
+		vendor=ns
+		;;
+	h3050r* | hiux*)
+		cpu=hppa1.1
+		vendor=hitachi
+		basic_os=hiuxwe2
+		;;
+	hp3k9[0-9][0-9] | hp9[0-9][0-9])
+		cpu=hppa1.0
+		vendor=hp
+		;;
+	hp9k2[0-9][0-9] | hp9k31[0-9])
+		cpu=m68000
+		vendor=hp
+		;;
+	hp9k3[2-9][0-9])
+		cpu=m68k
+		vendor=hp
+		;;
+	hp9k6[0-9][0-9] | hp6[0-9][0-9])
+		cpu=hppa1.0
+		vendor=hp
+		;;
+	hp9k7[0-79][0-9] | hp7[0-79][0-9])
+		cpu=hppa1.1
+		vendor=hp
+		;;
+	hp9k78[0-9] | hp78[0-9])
+		# FIXME: really hppa2.0-hp
+		cpu=hppa1.1
+		vendor=hp
+		;;
+	hp9k8[67]1 | hp8[67]1 | hp9k80[24] | hp80[24] | hp9k8[78]9 | hp8[78]9 | hp9k893 | hp893)
+		# FIXME: really hppa2.0-hp
+		cpu=hppa1.1
+		vendor=hp
+		;;
+	hp9k8[0-9][13679] | hp8[0-9][13679])
+		cpu=hppa1.1
+		vendor=hp
+		;;
+	hp9k8[0-9][0-9] | hp8[0-9][0-9])
+		cpu=hppa1.0
+		vendor=hp
+		;;
+	i*86v32)
+		cpu=`echo "$1" | sed -e 's/86.*/86/'`
+		vendor=pc
+		basic_os=sysv32
+		;;
+	i*86v4*)
+		cpu=`echo "$1" | sed -e 's/86.*/86/'`
+		vendor=pc
+		basic_os=sysv4
+		;;
+	i*86v)
+		cpu=`echo "$1" | sed -e 's/86.*/86/'`
+		vendor=pc
+		basic_os=sysv
+		;;
+	i*86sol2)
+		cpu=`echo "$1" | sed -e 's/86.*/86/'`
+		vendor=pc
+		basic_os=solaris2
+		;;
+	j90 | j90-cray)
+		cpu=j90
+		vendor=cray
+		basic_os=${basic_os:-unicos}
+		;;
+	iris | iris4d)
+		cpu=mips
+		vendor=sgi
+		case $basic_os in
+		    irix*)
+			;;
+		    *)
+			basic_os=irix4
+			;;
+		esac
+		;;
+	miniframe)
+		cpu=m68000
+		vendor=convergent
+		;;
+	*mint | mint[0-9]* | *MiNT | *MiNT[0-9]*)
+		cpu=m68k
+		vendor=atari
+		basic_os=mint
+		;;
+	news-3600 | risc-news)
+		cpu=mips
+		vendor=sony
+		basic_os=newsos
+		;;
+	next | m*-next)
+		cpu=m68k
+		vendor=next
+		case $basic_os in
+		    openstep*)
+		        ;;
+		    nextstep*)
+			;;
+		    ns2*)
+		      basic_os=nextstep2
+			;;
+		    *)
+		      basic_os=nextstep3
+			;;
+		esac
+		;;
+	np1)
+		cpu=np1
+		vendor=gould
+		;;
+	op50n-* | op60c-*)
+		cpu=hppa1.1
+		vendor=oki
+		basic_os=proelf
+		;;
+	pa-hitachi)
+		cpu=hppa1.1
+		vendor=hitachi
+		basic_os=hiuxwe2
+		;;
+	pbd)
+		cpu=sparc
+		vendor=tti
+		;;
+	pbb)
+		cpu=m68k
+		vendor=tti
+		;;
+	pc532)
+		cpu=ns32k
+		vendor=pc532
+		;;
+	pn)
+		cpu=pn
+		vendor=gould
+		;;
+	power)
+		cpu=power
+		vendor=ibm
+		;;
+	ps2)
+		cpu=i386
+		vendor=ibm
+		;;
+	rm[46]00)
+		cpu=mips
+		vendor=siemens
+		;;
+	rtpc | rtpc-*)
+		cpu=romp
+		vendor=ibm
+		;;
+	sde)
+		cpu=mipsisa32
+		vendor=sde
+		basic_os=${basic_os:-elf}
+		;;
+	simso-wrs)
+		cpu=sparclite
+		vendor=wrs
+		basic_os=vxworks
+		;;
+	tower | tower-32)
+		cpu=m68k
+		vendor=ncr
+		;;
+	vpp*|vx|vx-*)
+		cpu=f301
+		vendor=fujitsu
+		;;
+	w65)
+		cpu=w65
+		vendor=wdc
+		;;
+	w89k-*)
+		cpu=hppa1.1
+		vendor=winbond
+		basic_os=proelf
+		;;
+	none)
+		cpu=none
+		vendor=none
+		;;
+	leon|leon[3-9])
+		cpu=sparc
+		vendor=$basic_machine
+		;;
+	leon-*|leon[3-9]-*)
+		cpu=sparc
+		vendor=`echo "$basic_machine" | sed 's/-.*//'`
+		;;
+
+	*-*)
+		# shellcheck disable=SC2162
+		saved_IFS=$IFS
+		IFS="-" read cpu vendor <<EOF
+$basic_machine
+EOF
+		IFS=$saved_IFS
+		;;
+	# We use 'pc' rather than 'unknown'
+	# because (1) that's what they normally are, and
+	# (2) the word "unknown" tends to confuse beginning users.
+	i*86 | x86_64)
+		cpu=$basic_machine
+		vendor=pc
+		;;
+	# These rules are duplicated from below for sake of the special case above;
+	# i.e. things that normalized to x86 arches should also default to "pc"
+	pc98)
+		cpu=i386
+		vendor=pc
+		;;
+	x64 | amd64)
+		cpu=x86_64
+		vendor=pc
+		;;
+	# Recognize the basic CPU types without company name.
+	*)
+		cpu=$basic_machine
+		vendor=unknown
+		;;
+esac
+
+unset -v basic_machine
+
+# Decode basic machines in the full and proper CPU-Company form.
+case $cpu-$vendor in
+	# Here we handle the default manufacturer of certain CPU types in canonical form. It is in
+	# some cases the only manufacturer, in others, it is the most popular.
+	craynv-unknown)
+		vendor=cray
+		basic_os=${basic_os:-unicosmp}
+		;;
+	c90-unknown | c90-cray)
+		vendor=cray
+		basic_os=${Basic_os:-unicos}
+		;;
+	fx80-unknown)
+		vendor=alliant
+		;;
+	romp-unknown)
+		vendor=ibm
+		;;
+	mmix-unknown)
+		vendor=knuth
+		;;
+	microblaze-unknown | microblazeel-unknown)
+		vendor=xilinx
+		;;
+	rs6000-unknown)
+		vendor=ibm
+		;;
+	vax-unknown)
+		vendor=dec
+		;;
+	pdp11-unknown)
+		vendor=dec
+		;;
+	we32k-unknown)
+		vendor=att
+		;;
+	cydra-unknown)
+		vendor=cydrome
+		;;
+	i370-ibm*)
+		vendor=ibm
+		;;
+	orion-unknown)
+		vendor=highlevel
+		;;
+	xps-unknown | xps100-unknown)
+		cpu=xps100
+		vendor=honeywell
+		;;
+
+	# Here we normalize CPU types with a missing or matching vendor
+	armh-unknown | armh-alt)
+		cpu=armv7l
+		vendor=alt
+		basic_os=${basic_os:-linux-gnueabihf}
+		;;
+	dpx20-unknown | dpx20-bull)
+		cpu=rs6000
+		vendor=bull
+		basic_os=${basic_os:-bosx}
+		;;
+
+	# Here we normalize CPU types irrespective of the vendor
+	amd64-*)
+		cpu=x86_64
+		;;
+	blackfin-*)
+		cpu=bfin
+		basic_os=linux
+		;;
+	c54x-*)
+		cpu=tic54x
+		;;
+	c55x-*)
+		cpu=tic55x
+		;;
+	c6x-*)
+		cpu=tic6x
+		;;
+	e500v[12]-*)
+		cpu=powerpc
+		basic_os=${basic_os}"spe"
+		;;
+	mips3*-*)
+		cpu=mips64
+		;;
+	ms1-*)
+		cpu=mt
+		;;
+	m68knommu-*)
+		cpu=m68k
+		basic_os=linux
+		;;
+	m9s12z-* | m68hcs12z-* | hcs12z-* | s12z-*)
+		cpu=s12z
+		;;
+	openrisc-*)
+		cpu=or32
+		;;
+	parisc-*)
+		cpu=hppa
+		basic_os=linux
+		;;
+	pentium-* | p5-* | k5-* | k6-* | nexgen-* | viac3-*)
+		cpu=i586
+		;;
+	pentiumpro-* | p6-* | 6x86-* | athlon-* | athlon_*-*)
+		cpu=i686
+		;;
+	pentiumii-* | pentium2-* | pentiumiii-* | pentium3-*)
+		cpu=i686
+		;;
+	pentium4-*)
+		cpu=i786
+		;;
+	pc98-*)
+		cpu=i386
+		;;
+	ppc-* | ppcbe-*)
+		cpu=powerpc
+		;;
+	ppcle-* | powerpclittle-*)
+		cpu=powerpcle
+		;;
+	ppc64-*)
+		cpu=powerpc64
+		;;
+	ppc64le-* | powerpc64little-*)
+		cpu=powerpc64le
+		;;
+	sb1-*)
+		cpu=mipsisa64sb1
+		;;
+	sb1el-*)
+		cpu=mipsisa64sb1el
+		;;
+	sh5e[lb]-*)
+		cpu=`echo "$cpu" | sed 's/^\(sh.\)e\(.\)$/\1\2e/'`
+		;;
+	spur-*)
+		cpu=spur
+		;;
+	strongarm-* | thumb-*)
+		cpu=arm
+		;;
+	tx39-*)
+		cpu=mipstx39
+		;;
+	tx39el-*)
+		cpu=mipstx39el
+		;;
+	x64-*)
+		cpu=x86_64
+		;;
+	xscale-* | xscalee[bl]-*)
+		cpu=`echo "$cpu" | sed 's/^xscale/arm/'`
+		;;
+	arm64-* | aarch64le-*)
+		cpu=aarch64
+		;;
+
+	# Recognize the canonical CPU Types that limit and/or modify the
+	# company names they are paired with.
+	cr16-*)
+		basic_os=${basic_os:-elf}
+		;;
+	crisv32-* | etraxfs*-*)
+		cpu=crisv32
+		vendor=axis
+		;;
+	cris-* | etrax*-*)
+		cpu=cris
+		vendor=axis
+		;;
+	crx-*)
+		basic_os=${basic_os:-elf}
+		;;
+	neo-tandem)
+		cpu=neo
+		vendor=tandem
+		;;
+	nse-tandem)
+		cpu=nse
+		vendor=tandem
+		;;
+	nsr-tandem)
+		cpu=nsr
+		vendor=tandem
+		;;
+	nsv-tandem)
+		cpu=nsv
+		vendor=tandem
+		;;
+	nsx-tandem)
+		cpu=nsx
+		vendor=tandem
+		;;
+	mipsallegrexel-sony)
+		cpu=mipsallegrexel
+		vendor=sony
+		;;
+	tile*-*)
+		basic_os=${basic_os:-linux-gnu}
+		;;
+
+	*)
+		# Recognize the canonical CPU types that are allowed with any
+		# company name.
+		case $cpu in
+			1750a | 580 \
+			| a29k \
+			| aarch64 | aarch64_be | aarch64c | arm64ec \
+			| abacus \
+			| alpha | alphaev[4-8] | alphaev56 | alphaev6[78] \
+			| alpha64 | alpha64ev[4-8] | alpha64ev56 | alpha64ev6[78] \
+			| alphapca5[67] | alpha64pca5[67] \
+			| am33_2.0 \
+			| amdgcn \
+			| arc | arceb | arc32 | arc64 \
+			| arm | arm[lb]e | arme[lb] | armv* \
+			| avr | avr32 \
+			| asmjs \
+			| ba \
+			| be32 | be64 \
+			| bfin | bpf | bs2000 \
+			| c[123]* | c30 | [cjt]90 | c4x \
+			| c8051 | clipper | craynv | csky | cydra \
+			| d10v | d30v | dlx | dsp16xx \
+			| e2k | elxsi | epiphany \
+			| f30[01] | f700 | fido | fr30 | frv | ft32 | fx80 \
+			| javascript \
+			| h8300 | h8500 \
+			| hppa | hppa1.[01] | hppa2.0 | hppa2.0[nw] | hppa64 \
+			| hexagon \
+			| i370 | i*86 | i860 | i960 | ia16 | ia64 \
+			| ip2k | iq2000 \
+			| k1om \
+			| kvx \
+			| le32 | le64 \
+			| lm32 \
+			| loongarch32 | loongarch64 \
+			| m32c | m32r | m32rle \
+			| m5200 | m68000 | m680[012346]0 | m68360 | m683?2 | m68k \
+			| m6811 | m68hc11 | m6812 | m68hc12 | m68hcs12x \
+			| m88110 | m88k | maxq | mb | mcore | mep | metag \
+			| microblaze | microblazeel \
+			| mips* \
+			| mmix \
+			| mn10200 | mn10300 \
+			| moxie \
+			| mt \
+			| msp430 \
+			| nds32 | nds32le | nds32be \
+			| nfp \
+			| nios | nios2 | nios2eb | nios2el \
+			| none | np1 | ns16k | ns32k | nvptx \
+			| open8 \
+			| or1k* \
+			| or32 \
+			| orion \
+			| picochip \
+			| pdp10 | pdp11 | pj | pjl | pn | power \
+			| powerpc | powerpc64 | powerpc64le | powerpcle | powerpcspe \
+			| pru \
+			| pyramid \
+			| riscv | riscv32 | riscv32be | riscv64 | riscv64be \
+			| rl78 | romp | rs6000 | rx \
+			| s390 | s390x \
+			| score \
+			| sh | shl \
+			| sh[1234] | sh[24]a | sh[24]ae[lb] | sh[23]e | she[lb] | sh[lb]e \
+			| sh[1234]e[lb] |  sh[12345][lb]e | sh[23]ele | sh64 | sh64le \
+			| sparc | sparc64 | sparc64b | sparc64v | sparc86x | sparclet \
+			| sparclite \
+			| sparcv8 | sparcv9 | sparcv9b | sparcv9v | sv1 | sx* \
+			| spu \
+			| tahoe \
+			| thumbv7* \
+			| tic30 | tic4x | tic54x | tic55x | tic6x | tic80 \
+			| tron \
+			| ubicom32 \
+			| v70 | v850 | v850e | v850e1 | v850es | v850e2 | v850e2v3 \
+			| vax \
+			| visium \
+			| w65 \
+			| wasm32 | wasm64 \
+			| we32k \
+			| x86 | x86_64 | xc16x | xgate | xps100 \
+			| xstormy16 | xtensa* \
+			| ymp \
+			| z8k | z80)
+				;;
+
+			*)
+				echo "Invalid configuration '$1': machine '$cpu-$vendor' not recognized" 1>&2
+				exit 1
+				;;
+		esac
+		;;
+esac
+
+# Here we canonicalize certain aliases for manufacturers.
+case $vendor in
+	digital*)
+		vendor=dec
+		;;
+	commodore*)
+		vendor=cbm
+		;;
+	*)
+		;;
+esac
+
+# Decode manufacturer-specific aliases for certain operating systems.
+
+if test x"$basic_os" != x
+then
+
+# First recognize some ad-hoc cases, or perhaps split kernel-os, or else just
+# set os.
+obj=
+case $basic_os in
+	gnu/linux*)
+		kernel=linux
+		os=`echo "$basic_os" | sed -e 's|gnu/linux|gnu|'`
+		;;
+	os2-emx)
+		kernel=os2
+		os=`echo "$basic_os" | sed -e 's|os2-emx|emx|'`
+		;;
+	nto-qnx*)
+		kernel=nto
+		os=`echo "$basic_os" | sed -e 's|nto-qnx|qnx|'`
+		;;
+	*-*)
+		# shellcheck disable=SC2162
+		saved_IFS=$IFS
+		IFS="-" read kernel os <<EOF
+$basic_os
+EOF
+		IFS=$saved_IFS
+		;;
+	# Default OS when just kernel was specified
+	nto*)
+		kernel=nto
+		os=`echo "$basic_os" | sed -e 's|nto|qnx|'`
+		;;
+	linux*)
+		kernel=linux
+		os=`echo "$basic_os" | sed -e 's|linux|gnu|'`
+		;;
+	managarm*)
+		kernel=managarm
+		os=`echo "$basic_os" | sed -e 's|managarm|mlibc|'`
+		;;
+	*)
+		kernel=
+		os=$basic_os
+		;;
+esac
+
+# Now, normalize the OS (knowing we just have one component, it's not a kernel,
+# etc.)
+case $os in
+	# First match some system type aliases that might get confused
+	# with valid system types.
+	# solaris* is a basic system type, with this one exception.
+	auroraux)
+		os=auroraux
+		;;
+	bluegene*)
+		os=cnk
+		;;
+	solaris1 | solaris1.*)
+		os=`echo "$os" | sed -e 's|solaris1|sunos4|'`
+		;;
+	solaris)
+		os=solaris2
+		;;
+	unixware*)
+		os=sysv4.2uw
+		;;
+	# es1800 is here to avoid being matched by es* (a different OS)
+	es1800*)
+		os=ose
+		;;
+	# Some version numbers need modification
+	chorusos*)
+		os=chorusos
+		;;
+	isc)
+		os=isc2.2
+		;;
+	sco6)
+		os=sco5v6
+		;;
+	sco5)
+		os=sco3.2v5
+		;;
+	sco4)
+		os=sco3.2v4
+		;;
+	sco3.2.[4-9]*)
+		os=`echo "$os" | sed -e 's/sco3.2./sco3.2v/'`
+		;;
+	sco*v* | scout)
+		# Don't match below
+		;;
+	sco*)
+		os=sco3.2v2
+		;;
+	psos*)
+		os=psos
+		;;
+	qnx*)
+		os=qnx
+		;;
+	hiux*)
+		os=hiuxwe2
+		;;
+	lynx*178)
+		os=lynxos178
+		;;
+	lynx*5)
+		os=lynxos5
+		;;
+	lynxos*)
+		# don't get caught up in next wildcard
+		;;
+	lynx*)
+		os=lynxos
+		;;
+	mac[0-9]*)
+		os=`echo "$os" | sed -e 's|mac|macos|'`
+		;;
+	opened*)
+		os=openedition
+		;;
+	os400*)
+		os=os400
+		;;
+	sunos5*)
+		os=`echo "$os" | sed -e 's|sunos5|solaris2|'`
+		;;
+	sunos6*)
+		os=`echo "$os" | sed -e 's|sunos6|solaris3|'`
+		;;
+	wince*)
+		os=wince
+		;;
+	utek*)
+		os=bsd
+		;;
+	dynix*)
+		os=bsd
+		;;
+	acis*)
+		os=aos
+		;;
+	atheos*)
+		os=atheos
+		;;
+	syllable*)
+		os=syllable
+		;;
+	386bsd)
+		os=bsd
+		;;
+	ctix* | uts*)
+		os=sysv
+		;;
+	nova*)
+		os=rtmk-nova
+		;;
+	ns2)
+		os=nextstep2
+		;;
+	# Preserve the version number of sinix5.
+	sinix5.*)
+		os=`echo "$os" | sed -e 's|sinix|sysv|'`
+		;;
+	sinix*)
+		os=sysv4
+		;;
+	tpf*)
+		os=tpf
+		;;
+	triton*)
+		os=sysv3
+		;;
+	oss*)
+		os=sysv3
+		;;
+	svr4*)
+		os=sysv4
+		;;
+	svr3)
+		os=sysv3
+		;;
+	sysvr4)
+		os=sysv4
+		;;
+	ose*)
+		os=ose
+		;;
+	*mint | mint[0-9]* | *MiNT | MiNT[0-9]*)
+		os=mint
+		;;
+	dicos*)
+		os=dicos
+		;;
+	pikeos*)
+		# Until real need of OS specific support for
+		# particular features comes up, bare metal
+		# configurations are quite functional.
+		case $cpu in
+		    arm*)
+			os=eabi
+			;;
+		    *)
+			os=
+			obj=elf
+			;;
+		esac
+		;;
+	aout* | coff* | elf* | pe*)
+		# These are machine code file formats, not OSes
+		obj=$os
+		os=
+		;;
+	*)
+		# No normalization, but not necessarily accepted, that comes below.
+		;;
+esac
+
+else
+
+# Here we handle the default operating systems that come with various machines.
+# The value should be what the vendor currently ships out the door with their
+# machine or put another way, the most popular os provided with the machine.
+
+# Note that if you're going to try to match "-MANUFACTURER" here (say,
+# "-sun"), then you have to tell the case statement up towards the top
+# that MANUFACTURER isn't an operating system.  Otherwise, code above
+# will signal an error saying that MANUFACTURER isn't an operating
+# system, and we'll never get to this point.
+
+kernel=
+obj=
+case $cpu-$vendor in
+	score-*)
+		os=
+		obj=elf
+		;;
+	spu-*)
+		os=
+		obj=elf
+		;;
+	*-acorn)
+		os=riscix1.2
+		;;
+	arm*-rebel)
+		kernel=linux
+		os=gnu
+		;;
+	arm*-semi)
+		os=
+		obj=aout
+		;;
+	c4x-* | tic4x-*)
+		os=
+		obj=coff
+		;;
+	c8051-*)
+		os=
+		obj=elf
+		;;
+	clipper-intergraph)
+		os=clix
+		;;
+	hexagon-*)
+		os=
+		obj=elf
+		;;
+	tic54x-*)
+		os=
+		obj=coff
+		;;
+	tic55x-*)
+		os=
+		obj=coff
+		;;
+	tic6x-*)
+		os=
+		obj=coff
+		;;
+	# This must come before the *-dec entry.
+	pdp10-*)
+		os=tops20
+		;;
+	pdp11-*)
+		os=none
+		;;
+	*-dec | vax-*)
+		os=ultrix4.2
+		;;
+	m68*-apollo)
+		os=domain
+		;;
+	i386-sun)
+		os=sunos4.0.2
+		;;
+	m68000-sun)
+		os=sunos3
+		;;
+	m68*-cisco)
+		os=
+		obj=aout
+		;;
+	mep-*)
+		os=
+		obj=elf
+		;;
+	mips*-cisco)
+		os=
+		obj=elf
+		;;
+	mips*-*)
+		os=
+		obj=elf
+		;;
+	or32-*)
+		os=
+		obj=coff
+		;;
+	*-tti)	# must be before sparc entry or we get the wrong os.
+		os=sysv3
+		;;
+	sparc-* | *-sun)
+		os=sunos4.1.1
+		;;
+	pru-*)
+		os=
+		obj=elf
+		;;
+	*-be)
+		os=beos
+		;;
+	*-ibm)
+		os=aix
+		;;
+	*-knuth)
+		os=mmixware
+		;;
+	*-wec)
+		os=proelf
+		;;
+	*-winbond)
+		os=proelf
+		;;
+	*-oki)
+		os=proelf
+		;;
+	*-hp)
+		os=hpux
+		;;
+	*-hitachi)
+		os=hiux
+		;;
+	i860-* | *-att | *-ncr | *-altos | *-motorola | *-convergent)
+		os=sysv
+		;;
+	*-cbm)
+		os=amigaos
+		;;
+	*-dg)
+		os=dgux
+		;;
+	*-dolphin)
+		os=sysv3
+		;;
+	m68k-ccur)
+		os=rtu
+		;;
+	m88k-omron*)
+		os=luna
+		;;
+	*-next)
+		os=nextstep
+		;;
+	*-sequent)
+		os=ptx
+		;;
+	*-crds)
+		os=unos
+		;;
+	*-ns)
+		os=genix
+		;;
+	i370-*)
+		os=mvs
+		;;
+	*-gould)
+		os=sysv
+		;;
+	*-highlevel)
+		os=bsd
+		;;
+	*-encore)
+		os=bsd
+		;;
+	*-sgi)
+		os=irix
+		;;
+	*-siemens)
+		os=sysv4
+		;;
+	*-masscomp)
+		os=rtu
+		;;
+	f30[01]-fujitsu | f700-fujitsu)
+		os=uxpv
+		;;
+	*-rom68k)
+		os=
+		obj=coff
+		;;
+	*-*bug)
+		os=
+		obj=coff
+		;;
+	*-apple)
+		os=macos
+		;;
+	*-atari*)
+		os=mint
+		;;
+	*-wrs)
+		os=vxworks
+		;;
+	*)
+		os=none
+		;;
+esac
+
+fi
+
+# Now, validate our (potentially fixed-up) individual pieces (OS, OBJ).
+
+case $os in
+	# Sometimes we do "kernel-libc", so those need to count as OSes.
+	musl* | newlib* | relibc* | uclibc*)
+		;;
+	# Likewise for "kernel-abi"
+	eabi* | gnueabi*)
+		;;
+	# VxWorks passes extra cpu info in the 4th filed.
+	simlinux | simwindows | spe)
+		;;
+	# See `case $cpu-$os` validation below
+	ghcjs)
+		;;
+	# Now accept the basic system types.
+	# The portable systems comes first.
+	# Each alternative MUST end in a * to match a version number.
+	gnu* | android* | bsd* | mach* | minix* | genix* | ultrix* | irix* \
+	     | *vms* | esix* | aix* | cnk* | sunos | sunos[34]* \
+	     | hpux* | unos* | osf* | luna* | dgux* | auroraux* | solaris* \
+	     | sym* |  plan9* | psp* | sim* | xray* | os68k* | v88r* \
+	     | hiux* | abug | nacl* | netware* | windows* \
+	     | os9* | macos* | osx* | ios* | tvos* | watchos* \
+	     | mpw* | magic* | mmixware* | mon960* | lnews* \
+	     | amigaos* | amigados* | msdos* | newsos* | unicos* | aof* \
+	     | aos* | aros* | cloudabi* | sortix* | twizzler* \
+	     | nindy* | vxsim* | vxworks* | ebmon* | hms* | mvs* \
+	     | clix* | riscos* | uniplus* | iris* | isc* | rtu* | xenix* \
+	     | mirbsd* | netbsd* | dicos* | openedition* | ose* \
+	     | bitrig* | openbsd* | secbsd* | solidbsd* | libertybsd* | os108* \
+	     | ekkobsd* | freebsd* | riscix* | lynxos* | os400* \
+	     | bosx* | nextstep* | cxux* | oabi* \
+	     | ptx* | ecoff* | winnt* | domain* | vsta* \
+	     | udi* | lites* | ieee* | go32* | aux* | hcos* \
+	     | chorusrdb* | cegcc* | glidix* | serenity* \
+	     | cygwin* | msys* | moss* | proelf* | rtems* \
+	     | midipix* | mingw32* | mingw64* | mint* \
+	     | uxpv* | beos* | mpeix* | udk* | moxiebox* \
+	     | interix* | uwin* | mks* | rhapsody* | darwin* \
+	     | openstep* | oskit* | conix* | pw32* | nonstopux* \
+	     | storm-chaos* | tops10* | tenex* | tops20* | its* \
+	     | os2* | vos* | palmos* | uclinux* | nucleus* | morphos* \
+	     | scout* | superux* | sysv* | rtmk* | tpf* | windiss* \
+	     | powermax* | dnix* | nx6 | nx7 | sei* | dragonfly* \
+	     | skyos* | haiku* | rdos* | toppers* | drops* | es* \
+	     | onefs* | tirtos* | phoenix* | fuchsia* | redox* | bme* \
+	     | midnightbsd* | amdhsa* | unleashed* | emscripten* | wasi* \
+	     | nsk* | powerunix* | genode* | zvmoe* | qnx* | emx* | zephyr* \
+	     | fiwix* | mlibc* | cos* | mbr* )
+		;;
+	# This one is extra strict with allowed versions
+	sco3.2v2 | sco3.2v[4-9]* | sco5v6*)
+		# Don't forget version if it is 3.2v4 or newer.
+		;;
+	none)
+		;;
+	kernel* | msvc* )
+		# Restricted further below
+		;;
+	'')
+		if test x"$obj" = x
+		then
+			echo "Invalid configuration '$1': Blank OS only allowed with explicit machine code file format" 1>&2
+		fi
+		;;
+	*)
+		echo "Invalid configuration '$1': OS '$os' not recognized" 1>&2
+		exit 1
+		;;
+esac
+
+case $obj in
+	aout* | coff* | elf* | pe*)
+		;;
+	'')
+		# empty is fine
+		;;
+	*)
+		echo "Invalid configuration '$1': Machine code format '$obj' not recognized" 1>&2
+		exit 1
+		;;
+esac
+
+# Here we handle the constraint that a (synthetic) cpu and os are
+# valid only in combination with each other and nowhere else.
+case $cpu-$os in
+	# The "javascript-unknown-ghcjs" triple is used by GHC; we
+	# accept it here in order to tolerate that, but reject any
+	# variations.
+	javascript-ghcjs)
+		;;
+	javascript-* | *-ghcjs)
+		echo "Invalid configuration '$1': cpu '$cpu' is not valid with os '$os$obj'" 1>&2
+		exit 1
+		;;
+esac
+
+# As a final step for OS-related things, validate the OS-kernel combination
+# (given a valid OS), if there is a kernel.
+case $kernel-$os-$obj in
+	linux-gnu*- | linux-dietlibc*- | linux-android*- | linux-newlib*- \
+		   | linux-musl*- | linux-relibc*- | linux-uclibc*- | linux-mlibc*- )
+		;;
+	uclinux-uclibc*- )
+		;;
+	managarm-mlibc*- | managarm-kernel*- )
+		;;
+	windows*-msvc*-)
+		;;
+	-dietlibc*- | -newlib*- | -musl*- | -relibc*- | -uclibc*- | -mlibc*- )
+		# These are just libc implementations, not actual OSes, and thus
+		# require a kernel.
+		echo "Invalid configuration '$1': libc '$os' needs explicit kernel." 1>&2
+		exit 1
+		;;
+	-kernel*- )
+		echo "Invalid configuration '$1': '$os' needs explicit kernel." 1>&2
+		exit 1
+		;;
+	*-kernel*- )
+		echo "Invalid configuration '$1': '$kernel' does not support '$os'." 1>&2
+		exit 1
+		;;
+	*-msvc*- )
+		echo "Invalid configuration '$1': '$os' needs 'windows'." 1>&2
+		exit 1
+		;;
+	kfreebsd*-gnu*- | kopensolaris*-gnu*-)
+		;;
+	vxworks-simlinux- | vxworks-simwindows- | vxworks-spe-)
+		;;
+	nto-qnx*-)
+		;;
+	os2-emx-)
+		;;
+	*-eabi*- | *-gnueabi*-)
+		;;
+	none--*)
+		# None (no kernel, i.e. freestanding / bare metal),
+		# can be paired with an machine code file format
+		;;
+	-*-)
+		# Blank kernel with real OS is always fine.
+		;;
+	--*)
+		# Blank kernel and OS with real machine code file format is always fine.
+		;;
+	*-*-*)
+		echo "Invalid configuration '$1': Kernel '$kernel' not known to work with OS '$os'." 1>&2
+		exit 1
+		;;
+esac
+
+# Here we handle the case where we know the os, and the CPU type, but not the
+# manufacturer.  We pick the logical manufacturer.
+case $vendor in
+	unknown)
+		case $cpu-$os in
+			*-riscix*)
+				vendor=acorn
+				;;
+			*-sunos*)
+				vendor=sun
+				;;
+			*-cnk* | *-aix*)
+				vendor=ibm
+				;;
+			*-beos*)
+				vendor=be
+				;;
+			*-hpux*)
+				vendor=hp
+				;;
+			*-mpeix*)
+				vendor=hp
+				;;
+			*-hiux*)
+				vendor=hitachi
+				;;
+			*-unos*)
+				vendor=crds
+				;;
+			*-dgux*)
+				vendor=dg
+				;;
+			*-luna*)
+				vendor=omron
+				;;
+			*-genix*)
+				vendor=ns
+				;;
+			*-clix*)
+				vendor=intergraph
+				;;
+			*-mvs* | *-opened*)
+				vendor=ibm
+				;;
+			*-os400*)
+				vendor=ibm
+				;;
+			s390-* | s390x-*)
+				vendor=ibm
+				;;
+			*-ptx*)
+				vendor=sequent
+				;;
+			*-tpf*)
+				vendor=ibm
+				;;
+			*-vxsim* | *-vxworks* | *-windiss*)
+				vendor=wrs
+				;;
+			*-aux*)
+				vendor=apple
+				;;
+			*-hms*)
+				vendor=hitachi
+				;;
+			*-mpw* | *-macos*)
+				vendor=apple
+				;;
+			*-*mint | *-mint[0-9]* | *-*MiNT | *-MiNT[0-9]*)
+				vendor=atari
+				;;
+			*-vos*)
+				vendor=stratus
+				;;
+		esac
+		;;
+esac
+
+echo "$cpu-$vendor${kernel:+-$kernel}${os:+-$os}${obj:+-$obj}"
+exit
+
+# Local variables:
+# eval: (add-hook 'before-save-hook 'time-stamp)
+# time-stamp-start: "timestamp='"
+# time-stamp-format: "%:y-%02m-%02d"
+# time-stamp-end: "'"
+# End:
diff --git a/htslib/configure b/htslib/configure
index ac63dee2b..1a6ec7d3f 100755
--- a/htslib/configure
+++ b/htslib/configure
@@ -1,17 +1,18 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.69 for HTSlib 1.18.
+# Generated by GNU Autoconf 2.72 for HTSlib 1.21.
 #
 # Report bugs to <samtools-help@lists.sourceforge.net>.
 #
 #
-# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
+# Copyright (C) 1992-1996, 1998-2017, 2020-2023 Free Software Foundation,
+# Inc.
 #
 #
 # This configure script is free software; the Free Software Foundation
 # gives unlimited permission to copy, distribute and modify it.
 #
-# Portions copyright (C) 2020-2023 Genome Research Ltd.
+# Portions copyright (C) 2020-2024 Genome Research Ltd.
 #
 # This configure script is free software: you are free to change and
 # redistribute it.  There is NO WARRANTY, to the extent permitted by law.
@@ -21,63 +22,65 @@
 
 # Be more Bourne compatible
 DUALCASE=1; export DUALCASE # for MKS sh
-if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then :
+if test ${ZSH_VERSION+y} && (emulate sh) >/dev/null 2>&1
+then :
   emulate sh
   NULLCMD=:
   # Pre-4.2 versions of Zsh do word splitting on ${1+"$@"}, which
   # is contrary to our usage.  Disable this feature.
   alias -g '${1+"$@"}'='"$@"'
   setopt NO_GLOB_SUBST
-else
-  case `(set -o) 2>/dev/null` in #(
+else case e in #(
+  e) case `(set -o) 2>/dev/null` in #(
   *posix*) :
     set -o posix ;; #(
   *) :
      ;;
+esac ;;
 esac
 fi
 
 
+
+# Reset variables that may have inherited troublesome values from
+# the environment.
+
+# IFS needs to be set, to space, tab, and newline, in precisely that order.
+# (If _AS_PATH_WALK were called with IFS unset, it would have the
+# side effect of setting IFS to empty, thus disabling word splitting.)
+# Quoting is to prevent editors from complaining about space-tab.
 as_nl='
 '
 export as_nl
-# Printing a long string crashes Solaris 7 /usr/bin/printf.
-as_echo='\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\'
-as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo
-as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo$as_echo
-# Prefer a ksh shell builtin over an external printf program on Solaris,
-# but without wasting forks for bash or zsh.
-if test -z "$BASH_VERSION$ZSH_VERSION" \
-    && (test "X`print -r -- $as_echo`" = "X$as_echo") 2>/dev/null; then
-  as_echo='print -r --'
-  as_echo_n='print -rn --'
-elif (test "X`printf %s $as_echo`" = "X$as_echo") 2>/dev/null; then
-  as_echo='printf %s\n'
-  as_echo_n='printf %s'
-else
-  if test "X`(/usr/ucb/echo -n -n $as_echo) 2>/dev/null`" = "X-n $as_echo"; then
-    as_echo_body='eval /usr/ucb/echo -n "$1$as_nl"'
-    as_echo_n='/usr/ucb/echo -n'
-  else
-    as_echo_body='eval expr "X$1" : "X\\(.*\\)"'
-    as_echo_n_body='eval
-      arg=$1;
-      case $arg in #(
-      *"$as_nl"*)
-	expr "X$arg" : "X\\(.*\\)$as_nl";
-	arg=`expr "X$arg" : ".*$as_nl\\(.*\\)"`;;
-      esac;
-      expr "X$arg" : "X\\(.*\\)" | tr -d "$as_nl"
-    '
-    export as_echo_n_body
-    as_echo_n='sh -c $as_echo_n_body as_echo'
-  fi
-  export as_echo_body
-  as_echo='sh -c $as_echo_body as_echo'
-fi
+IFS=" ""	$as_nl"
+
+PS1='$ '
+PS2='> '
+PS4='+ '
+
+# Ensure predictable behavior from utilities with locale-dependent output.
+LC_ALL=C
+export LC_ALL
+LANGUAGE=C
+export LANGUAGE
+
+# We cannot yet rely on "unset" to work, but we need these variables
+# to be unset--not just set to an empty or harmless value--now, to
+# avoid bugs in old shells (e.g. pre-3.0 UWIN ksh).  This construct
+# also avoids known problems related to "unset" and subshell syntax
+# in other old shells (e.g. bash 2.01 and pdksh 5.2.14).
+for as_var in BASH_ENV ENV MAIL MAILPATH CDPATH
+do eval test \${$as_var+y} \
+  && ( (unset $as_var) || exit 1) >/dev/null 2>&1 && unset $as_var || :
+done
+
+# Ensure that fds 0, 1, and 2 are open.
+if (exec 3>&0) 2>/dev/null; then :; else exec 0</dev/null; fi
+if (exec 3>&1) 2>/dev/null; then :; else exec 1>/dev/null; fi
+if (exec 3>&2)            ; then :; else exec 2>/dev/null; fi
 
 # The user is always right.
-if test "${PATH_SEPARATOR+set}" != set; then
+if ${PATH_SEPARATOR+false} :; then
   PATH_SEPARATOR=:
   (PATH='/bin;/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 && {
     (PATH='/bin:/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 ||
@@ -86,13 +89,6 @@ if test "${PATH_SEPARATOR+set}" != set; then
 fi
 
 
-# IFS
-# We need space, tab and new line, in precisely that order.  Quoting is
-# there to prevent editors from complaining about space-tab.
-# (If _AS_PATH_WALK were called with IFS unset, it would disable word
-# splitting by setting IFS to empty value.)
-IFS=" ""	$as_nl"
-
 # Find who we are.  Look in the path if we contain no directory separator.
 as_myself=
 case $0 in #((
@@ -101,43 +97,27 @@ case $0 in #((
 for as_dir in $PATH
 do
   IFS=$as_save_IFS
-  test -z "$as_dir" && as_dir=.
-    test -r "$as_dir/$0" && as_myself=$as_dir/$0 && break
+  case $as_dir in #(((
+    '') as_dir=./ ;;
+    */) ;;
+    *) as_dir=$as_dir/ ;;
+  esac
+    test -r "$as_dir$0" && as_myself=$as_dir$0 && break
   done
 IFS=$as_save_IFS
 
      ;;
 esac
-# We did not find ourselves, most probably we were run as `sh COMMAND'
+# We did not find ourselves, most probably we were run as 'sh COMMAND'
 # in which case we are not to be found in the path.
 if test "x$as_myself" = x; then
   as_myself=$0
 fi
 if test ! -f "$as_myself"; then
-  $as_echo "$as_myself: error: cannot find myself; rerun with an absolute file name" >&2
+  printf "%s\n" "$as_myself: error: cannot find myself; rerun with an absolute file name" >&2
   exit 1
 fi
 
-# Unset variables that we do not need and which cause bugs (e.g. in
-# pre-3.0 UWIN ksh).  But do not cause bugs in bash 2.01; the "|| exit 1"
-# suppresses any "Segmentation fault" message there.  '((' could
-# trigger a bug in pdksh 5.2.14.
-for as_var in BASH_ENV ENV MAIL MAILPATH
-do eval test x\${$as_var+set} = xset \
-  && ( (unset $as_var) || exit 1) >/dev/null 2>&1 && unset $as_var || :
-done
-PS1='$ '
-PS2='> '
-PS4='+ '
-
-# NLS nuisances.
-LC_ALL=C
-export LC_ALL
-LANGUAGE=C
-export LANGUAGE
-
-# CDPATH.
-(unset CDPATH) >/dev/null 2>&1 && unset CDPATH
 
 # Use a proper internal environment variable to ensure we don't fall
   # into an infinite loop, continuously re-executing ourselves.
@@ -158,26 +138,28 @@ case $- in # ((((
 esac
 exec $CONFIG_SHELL $as_opts "$as_myself" ${1+"$@"}
 # Admittedly, this is quite paranoid, since all the known shells bail
-# out after a failed `exec'.
-$as_echo "$0: could not re-execute with $CONFIG_SHELL" >&2
-as_fn_exit 255
+# out after a failed 'exec'.
+printf "%s\n" "$0: could not re-execute with $CONFIG_SHELL" >&2
+exit 255
   fi
   # We don't want this to propagate to other subprocesses.
           { _as_can_reexec=; unset _as_can_reexec;}
 if test "x$CONFIG_SHELL" = x; then
-  as_bourne_compatible="if test -n \"\${ZSH_VERSION+set}\" && (emulate sh) >/dev/null 2>&1; then :
+  as_bourne_compatible="if test \${ZSH_VERSION+y} && (emulate sh) >/dev/null 2>&1
+then :
   emulate sh
   NULLCMD=:
   # Pre-4.2 versions of Zsh do word splitting on \${1+\"\$@\"}, which
   # is contrary to our usage.  Disable this feature.
   alias -g '\${1+\"\$@\"}'='\"\$@\"'
   setopt NO_GLOB_SUBST
-else
-  case \`(set -o) 2>/dev/null\` in #(
+else case e in #(
+  e) case \`(set -o) 2>/dev/null\` in #(
   *posix*) :
     set -o posix ;; #(
   *) :
      ;;
+esac ;;
 esac
 fi
 "
@@ -192,42 +174,55 @@ as_fn_success || { exitcode=1; echo as_fn_success failed.; }
 as_fn_failure && { exitcode=1; echo as_fn_failure succeeded.; }
 as_fn_ret_success || { exitcode=1; echo as_fn_ret_success failed.; }
 as_fn_ret_failure && { exitcode=1; echo as_fn_ret_failure succeeded.; }
-if ( set x; as_fn_ret_success y && test x = \"\$1\" ); then :
+if ( set x; as_fn_ret_success y && test x = \"\$1\" )
+then :
 
-else
-  exitcode=1; echo positional parameters were not saved.
+else case e in #(
+  e) exitcode=1; echo positional parameters were not saved. ;;
+esac
 fi
 test x\$exitcode = x0 || exit 1
+blah=\$(echo \$(echo blah))
+test x\"\$blah\" = xblah || exit 1
 test -x / || exit 1"
   as_suggested="  as_lineno_1=";as_suggested=$as_suggested$LINENO;as_suggested=$as_suggested" as_lineno_1a=\$LINENO
   as_lineno_2=";as_suggested=$as_suggested$LINENO;as_suggested=$as_suggested" as_lineno_2a=\$LINENO
   eval 'test \"x\$as_lineno_1'\$as_run'\" != \"x\$as_lineno_2'\$as_run'\" &&
   test \"x\`expr \$as_lineno_1'\$as_run' + 1\`\" = \"x\$as_lineno_2'\$as_run'\"' || exit 1
 test \$(( 1 + 1 )) = 2 || exit 1"
-  if (eval "$as_required") 2>/dev/null; then :
+  if (eval "$as_required") 2>/dev/null
+then :
   as_have_required=yes
-else
-  as_have_required=no
+else case e in #(
+  e) as_have_required=no ;;
+esac
 fi
-  if test x$as_have_required = xyes && (eval "$as_suggested") 2>/dev/null; then :
+  if test x$as_have_required = xyes && (eval "$as_suggested") 2>/dev/null
+then :
 
-else
-  as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+else case e in #(
+  e) as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
 as_found=false
 for as_dir in /bin$PATH_SEPARATOR/usr/bin$PATH_SEPARATOR$PATH
 do
   IFS=$as_save_IFS
-  test -z "$as_dir" && as_dir=.
+  case $as_dir in #(((
+    '') as_dir=./ ;;
+    */) ;;
+    *) as_dir=$as_dir/ ;;
+  esac
   as_found=:
   case $as_dir in #(
 	 /*)
 	   for as_base in sh bash ksh sh5; do
 	     # Try only shells that exist, to save several forks.
-	     as_shell=$as_dir/$as_base
+	     as_shell=$as_dir$as_base
 	     if { test -f "$as_shell" || test -f "$as_shell.exe"; } &&
-		    { $as_echo "$as_bourne_compatible""$as_required" | as_run=a "$as_shell"; } 2>/dev/null; then :
+		    as_run=a "$as_shell" -c "$as_bourne_compatible""$as_required" 2>/dev/null
+then :
   CONFIG_SHELL=$as_shell as_have_required=yes
-		   if { $as_echo "$as_bourne_compatible""$as_suggested" | as_run=a "$as_shell"; } 2>/dev/null; then :
+		   if as_run=a "$as_shell" -c "$as_bourne_compatible""$as_suggested" 2>/dev/null
+then :
   break 2
 fi
 fi
@@ -235,14 +230,22 @@ fi
        esac
   as_found=false
 done
-$as_found || { if { test -f "$SHELL" || test -f "$SHELL.exe"; } &&
-	      { $as_echo "$as_bourne_compatible""$as_required" | as_run=a "$SHELL"; } 2>/dev/null; then :
-  CONFIG_SHELL=$SHELL as_have_required=yes
-fi; }
 IFS=$as_save_IFS
+if $as_found
+then :
+
+else case e in #(
+  e) if { test -f "$SHELL" || test -f "$SHELL.exe"; } &&
+	      as_run=a "$SHELL" -c "$as_bourne_compatible""$as_required" 2>/dev/null
+then :
+  CONFIG_SHELL=$SHELL as_have_required=yes
+fi ;;
+esac
+fi
 
 
-      if test "x$CONFIG_SHELL" != x; then :
+      if test "x$CONFIG_SHELL" != x
+then :
   export CONFIG_SHELL
              # We cannot yet assume a decent shell, so we have to provide a
 # neutralization value for shells without unset; and this also
@@ -259,26 +262,28 @@ case $- in # ((((
 esac
 exec $CONFIG_SHELL $as_opts "$as_myself" ${1+"$@"}
 # Admittedly, this is quite paranoid, since all the known shells bail
-# out after a failed `exec'.
-$as_echo "$0: could not re-execute with $CONFIG_SHELL" >&2
+# out after a failed 'exec'.
+printf "%s\n" "$0: could not re-execute with $CONFIG_SHELL" >&2
 exit 255
 fi
 
-    if test x$as_have_required = xno; then :
-  $as_echo "$0: This script requires a shell more modern than all"
-  $as_echo "$0: the shells that I found on your system."
-  if test x${ZSH_VERSION+set} = xset ; then
-    $as_echo "$0: In particular, zsh $ZSH_VERSION has bugs and should"
-    $as_echo "$0: be upgraded to zsh 4.3.4 or later."
+    if test x$as_have_required = xno
+then :
+  printf "%s\n" "$0: This script requires a shell more modern than all"
+  printf "%s\n" "$0: the shells that I found on your system."
+  if test ${ZSH_VERSION+y} ; then
+    printf "%s\n" "$0: In particular, zsh $ZSH_VERSION has bugs and should"
+    printf "%s\n" "$0: be upgraded to zsh 4.3.4 or later."
   else
-    $as_echo "$0: Please tell bug-autoconf@gnu.org and
+    printf "%s\n" "$0: Please tell bug-autoconf@gnu.org and
 $0: samtools-help@lists.sourceforge.net about your system,
 $0: including any error possibly output before this
 $0: message. Then install a modern shell, or manually run
 $0: the script under such a shell if you do have one."
   fi
   exit 1
-fi
+fi ;;
+esac
 fi
 fi
 SHELL=${CONFIG_SHELL-/bin/sh}
@@ -299,6 +304,7 @@ as_fn_unset ()
 }
 as_unset=as_fn_unset
 
+
 # as_fn_set_status STATUS
 # -----------------------
 # Set $? to STATUS, without forking.
@@ -330,7 +336,7 @@ as_fn_mkdir_p ()
     as_dirs=
     while :; do
       case $as_dir in #(
-      *\'*) as_qdir=`$as_echo "$as_dir" | sed "s/'/'\\\\\\\\''/g"`;; #'(
+      *\'*) as_qdir=`printf "%s\n" "$as_dir" | sed "s/'/'\\\\\\\\''/g"`;; #'(
       *) as_qdir=$as_dir;;
       esac
       as_dirs="'$as_qdir' $as_dirs"
@@ -339,7 +345,7 @@ $as_expr X"$as_dir" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \
 	 X"$as_dir" : 'X\(//\)[^/]' \| \
 	 X"$as_dir" : 'X\(//\)$' \| \
 	 X"$as_dir" : 'X\(/\)' \| . 2>/dev/null ||
-$as_echo X"$as_dir" |
+printf "%s\n" X"$as_dir" |
     sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{
 	    s//\1/
 	    q
@@ -378,16 +384,18 @@ as_fn_executable_p ()
 # advantage of any shell optimizations that allow amortized linear growth over
 # repeated appends, instead of the typical quadratic growth present in naive
 # implementations.
-if (eval "as_var=1; as_var+=2; test x\$as_var = x12") 2>/dev/null; then :
+if (eval "as_var=1; as_var+=2; test x\$as_var = x12") 2>/dev/null
+then :
   eval 'as_fn_append ()
   {
     eval $1+=\$2
   }'
-else
-  as_fn_append ()
+else case e in #(
+  e) as_fn_append ()
   {
     eval $1=\$$1\$2
-  }
+  } ;;
+esac
 fi # as_fn_append
 
 # as_fn_arith ARG...
@@ -395,16 +403,18 @@ fi # as_fn_append
 # Perform arithmetic evaluation on the ARGs, and store the result in the
 # global $as_val. Take advantage of shells that can avoid forks. The arguments
 # must be portable across $(()) and expr.
-if (eval "test \$(( 1 + 1 )) = 2") 2>/dev/null; then :
+if (eval "test \$(( 1 + 1 )) = 2") 2>/dev/null
+then :
   eval 'as_fn_arith ()
   {
     as_val=$(( $* ))
   }'
-else
-  as_fn_arith ()
+else case e in #(
+  e) as_fn_arith ()
   {
     as_val=`expr "$@" || test $? -eq 1`
-  }
+  } ;;
+esac
 fi # as_fn_arith
 
 
@@ -418,9 +428,9 @@ as_fn_error ()
   as_status=$1; test $as_status -eq 0 && as_status=1
   if test "$4"; then
     as_lineno=${as_lineno-"$3"} as_lineno_stack=as_lineno_stack=$as_lineno_stack
-    $as_echo "$as_me:${as_lineno-$LINENO}: error: $2" >&$4
+    printf "%s\n" "$as_me:${as_lineno-$LINENO}: error: $2" >&$4
   fi
-  $as_echo "$as_me: error: $2" >&2
+  printf "%s\n" "$as_me: error: $2" >&2
   as_fn_exit $as_status
 } # as_fn_error
 
@@ -447,7 +457,7 @@ as_me=`$as_basename -- "$0" ||
 $as_expr X/"$0" : '.*/\([^/][^/]*\)/*$' \| \
 	 X"$0" : 'X\(//\)$' \| \
 	 X"$0" : 'X\(/\)' \| . 2>/dev/null ||
-$as_echo X/"$0" |
+printf "%s\n" X/"$0" |
     sed '/^.*\/\([^/][^/]*\)\/*$/{
 	    s//\1/
 	    q
@@ -480,6 +490,8 @@ as_cr_alnum=$as_cr_Letters$as_cr_digits
     /[$]LINENO/=
   ' <$as_myself |
     sed '
+      t clear
+      :clear
       s/[$]LINENO.*/&-/
       t lineno
       b
@@ -491,7 +503,7 @@ as_cr_alnum=$as_cr_Letters$as_cr_digits
       s/-\n.*//
     ' >$as_me.lineno &&
   chmod +x "$as_me.lineno" ||
-    { $as_echo "$as_me: error: cannot create $as_me.lineno; rerun with a POSIX shell" >&2; as_fn_exit 1; }
+    { printf "%s\n" "$as_me: error: cannot create $as_me.lineno; rerun with a POSIX shell" >&2; as_fn_exit 1; }
 
   # If we had to re-execute with $CONFIG_SHELL, we're ensured to have
   # already done that, so ensure we don't try to do so again and fall
@@ -505,6 +517,10 @@ as_cr_alnum=$as_cr_Letters$as_cr_digits
   exit
 }
 
+
+# Determine whether it's possible to make 'echo' print without a newline.
+# These variables are no longer used directly by Autoconf, but are AC_SUBSTed
+# for compatibility with existing Makefiles.
 ECHO_C= ECHO_N= ECHO_T=
 case `echo -n x` in #(((((
 -n*)
@@ -518,6 +534,12 @@ case `echo -n x` in #(((((
   ECHO_N='-n';;
 esac
 
+# For backward compatibility with old third-party macros, we provide
+# the shell variables $as_echo and $as_echo_n.  New code should use
+# AS_ECHO(["message"]) and AS_ECHO_N(["message"]), respectively.
+as_echo='printf %s\n'
+as_echo_n='printf %s'
+
 rm -f conf$$ conf$$.exe conf$$.file
 if test -d conf$$.dir; then
   rm -f conf$$.dir/conf$$.file
@@ -529,9 +551,9 @@ if (echo >conf$$.file) 2>/dev/null; then
   if ln -s conf$$.file conf$$ 2>/dev/null; then
     as_ln_s='ln -s'
     # ... but there are two gotchas:
-    # 1) On MSYS, both `ln -s file dir' and `ln file dir' fail.
-    # 2) DJGPP < 2.04 has no symlinks; `ln -s' creates a wrapper executable.
-    # In both cases, we have to default to `cp -pR'.
+    # 1) On MSYS, both 'ln -s file dir' and 'ln file dir' fail.
+    # 2) DJGPP < 2.04 has no symlinks; 'ln -s' creates a wrapper executable.
+    # In both cases, we have to default to 'cp -pR'.
     ln -s conf$$.file conf$$.dir 2>/dev/null && test ! -f conf$$.exe ||
       as_ln_s='cp -pR'
   elif ln conf$$.file conf$$ 2>/dev/null; then
@@ -556,10 +578,12 @@ as_test_x='test -x'
 as_executable_p=as_fn_executable_p
 
 # Sed expression to map a string onto a valid CPP name.
-as_tr_cpp="eval sed 'y%*$as_cr_letters%P$as_cr_LETTERS%;s%[^_$as_cr_alnum]%_%g'"
+as_sed_cpp="y%*$as_cr_letters%P$as_cr_LETTERS%;s%[^_$as_cr_alnum]%_%g"
+as_tr_cpp="eval sed '$as_sed_cpp'" # deprecated
 
 # Sed expression to map a string onto a valid variable name.
-as_tr_sh="eval sed 'y%*+%pp%;s%[^_$as_cr_alnum]%_%g'"
+as_sed_sh="y%*+%pp%;s%[^_$as_cr_alnum]%_%g"
+as_tr_sh="eval sed '$as_sed_sh'" # deprecated
 
 
 test -n "$DJDIR" || exec 7<&0 </dev/null
@@ -585,49 +609,46 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='HTSlib'
 PACKAGE_TARNAME='htslib'
-PACKAGE_VERSION='1.18'
-PACKAGE_STRING='HTSlib 1.18'
+PACKAGE_VERSION='1.21'
+PACKAGE_STRING='HTSlib 1.21'
 PACKAGE_BUGREPORT='samtools-help@lists.sourceforge.net'
 PACKAGE_URL='http://www.htslib.org/'
 
 ac_unique_file="hts.c"
 # Factoring default headers for most tests.
 ac_includes_default="\
-#include <stdio.h>
-#ifdef HAVE_SYS_TYPES_H
-# include <sys/types.h>
-#endif
-#ifdef HAVE_SYS_STAT_H
-# include <sys/stat.h>
+#include <stddef.h>
+#ifdef HAVE_STDIO_H
+# include <stdio.h>
 #endif
-#ifdef STDC_HEADERS
+#ifdef HAVE_STDLIB_H
 # include <stdlib.h>
-# include <stddef.h>
-#else
-# ifdef HAVE_STDLIB_H
-#  include <stdlib.h>
-# endif
 #endif
 #ifdef HAVE_STRING_H
-# if !defined STDC_HEADERS && defined HAVE_MEMORY_H
-#  include <memory.h>
-# endif
 # include <string.h>
 #endif
-#ifdef HAVE_STRINGS_H
-# include <strings.h>
-#endif
 #ifdef HAVE_INTTYPES_H
 # include <inttypes.h>
 #endif
 #ifdef HAVE_STDINT_H
 # include <stdint.h>
 #endif
+#ifdef HAVE_STRINGS_H
+# include <strings.h>
+#endif
+#ifdef HAVE_SYS_TYPES_H
+# include <sys/types.h>
+#endif
+#ifdef HAVE_SYS_STAT_H
+# include <sys/stat.h>
+#endif
 #ifdef HAVE_UNISTD_H
 # include <unistd.h>
 #endif"
 
-ac_header_list=
+ac_header_c_list=
+ac_func_c_list=
+enable_year2038=no
 ac_subst_vars='LTLIBOBJS
 LIBOBJS
 HTSDIRslash_if_relsrcdir
@@ -640,6 +661,14 @@ s3
 gcs
 libcurl
 PLUGIN_EXT
+host_os
+host_vendor
+host_cpu
+host
+build_os
+build_vendor
+build_cpu
+build
 VERSION_SCRIPT_LDFLAGS
 PLATFORM
 pluginpath
@@ -652,8 +681,6 @@ PKG_CONFIG
 hts_cflags_avx512
 hts_cflags_avx2
 hts_cflags_sse4
-EGREP
-CPP
 GREP
 RANLIB
 OBJEXT
@@ -719,6 +746,7 @@ with_libdeflate
 with_plugin_dir
 with_plugin_path
 enable_s3
+enable_year2038
 '
       ac_precious_vars='build_alias
 host_alias
@@ -728,7 +756,6 @@ CFLAGS
 LDFLAGS
 LIBS
 CPPFLAGS
-CPP
 PKG_CONFIG
 PKG_CONFIG_PATH
 PKG_CONFIG_LIBDIR'
@@ -800,8 +827,6 @@ do
   *)    ac_optarg=yes ;;
   esac
 
-  # Accept the important Cygnus configure options, so we can diagnose typos.
-
   case $ac_dashdash$ac_option in
   --)
     ac_dashdash=yes ;;
@@ -842,9 +867,9 @@ do
     ac_useropt=`expr "x$ac_option" : 'x-*disable-\(.*\)'`
     # Reject names that are not valid shell variable names.
     expr "x$ac_useropt" : ".*[^-+._$as_cr_alnum]" >/dev/null &&
-      as_fn_error $? "invalid feature name: $ac_useropt"
+      as_fn_error $? "invalid feature name: '$ac_useropt'"
     ac_useropt_orig=$ac_useropt
-    ac_useropt=`$as_echo "$ac_useropt" | sed 's/[-+.]/_/g'`
+    ac_useropt=`printf "%s\n" "$ac_useropt" | sed 's/[-+.]/_/g'`
     case $ac_user_opts in
       *"
 "enable_$ac_useropt"
@@ -868,9 +893,9 @@ do
     ac_useropt=`expr "x$ac_option" : 'x-*enable-\([^=]*\)'`
     # Reject names that are not valid shell variable names.
     expr "x$ac_useropt" : ".*[^-+._$as_cr_alnum]" >/dev/null &&
-      as_fn_error $? "invalid feature name: $ac_useropt"
+      as_fn_error $? "invalid feature name: '$ac_useropt'"
     ac_useropt_orig=$ac_useropt
-    ac_useropt=`$as_echo "$ac_useropt" | sed 's/[-+.]/_/g'`
+    ac_useropt=`printf "%s\n" "$ac_useropt" | sed 's/[-+.]/_/g'`
     case $ac_user_opts in
       *"
 "enable_$ac_useropt"
@@ -1081,9 +1106,9 @@ do
     ac_useropt=`expr "x$ac_option" : 'x-*with-\([^=]*\)'`
     # Reject names that are not valid shell variable names.
     expr "x$ac_useropt" : ".*[^-+._$as_cr_alnum]" >/dev/null &&
-      as_fn_error $? "invalid package name: $ac_useropt"
+      as_fn_error $? "invalid package name: '$ac_useropt'"
     ac_useropt_orig=$ac_useropt
-    ac_useropt=`$as_echo "$ac_useropt" | sed 's/[-+.]/_/g'`
+    ac_useropt=`printf "%s\n" "$ac_useropt" | sed 's/[-+.]/_/g'`
     case $ac_user_opts in
       *"
 "with_$ac_useropt"
@@ -1097,9 +1122,9 @@ do
     ac_useropt=`expr "x$ac_option" : 'x-*without-\(.*\)'`
     # Reject names that are not valid shell variable names.
     expr "x$ac_useropt" : ".*[^-+._$as_cr_alnum]" >/dev/null &&
-      as_fn_error $? "invalid package name: $ac_useropt"
+      as_fn_error $? "invalid package name: '$ac_useropt'"
     ac_useropt_orig=$ac_useropt
-    ac_useropt=`$as_echo "$ac_useropt" | sed 's/[-+.]/_/g'`
+    ac_useropt=`printf "%s\n" "$ac_useropt" | sed 's/[-+.]/_/g'`
     case $ac_user_opts in
       *"
 "with_$ac_useropt"
@@ -1127,8 +1152,8 @@ do
   | --x-librar=* | --x-libra=* | --x-libr=* | --x-lib=* | --x-li=* | --x-l=*)
     x_libraries=$ac_optarg ;;
 
-  -*) as_fn_error $? "unrecognized option: \`$ac_option'
-Try \`$0 --help' for more information"
+  -*) as_fn_error $? "unrecognized option: '$ac_option'
+Try '$0 --help' for more information"
     ;;
 
   *=*)
@@ -1136,16 +1161,16 @@ Try \`$0 --help' for more information"
     # Reject names that are not valid shell variable names.
     case $ac_envvar in #(
       '' | [0-9]* | *[!_$as_cr_alnum]* )
-      as_fn_error $? "invalid variable name: \`$ac_envvar'" ;;
+      as_fn_error $? "invalid variable name: '$ac_envvar'" ;;
     esac
     eval $ac_envvar=\$ac_optarg
     export $ac_envvar ;;
 
   *)
     # FIXME: should be removed in autoconf 3.0.
-    $as_echo "$as_me: WARNING: you should use --build, --host, --target" >&2
+    printf "%s\n" "$as_me: WARNING: you should use --build, --host, --target" >&2
     expr "x$ac_option" : ".*[^-._$as_cr_alnum]" >/dev/null &&
-      $as_echo "$as_me: WARNING: invalid host type: $ac_option" >&2
+      printf "%s\n" "$as_me: WARNING: invalid host type: $ac_option" >&2
     : "${build_alias=$ac_option} ${host_alias=$ac_option} ${target_alias=$ac_option}"
     ;;
 
@@ -1161,7 +1186,7 @@ if test -n "$ac_unrecognized_opts"; then
   case $enable_option_checking in
     no) ;;
     fatal) as_fn_error $? "unrecognized options: $ac_unrecognized_opts" ;;
-    *)     $as_echo "$as_me: WARNING: unrecognized options: $ac_unrecognized_opts" >&2 ;;
+    *)     printf "%s\n" "$as_me: WARNING: unrecognized options: $ac_unrecognized_opts" >&2 ;;
   esac
 fi
 
@@ -1186,7 +1211,7 @@ do
   as_fn_error $? "expected an absolute directory name for --$ac_var: $ac_val"
 done
 
-# There might be people who depend on the old broken behavior: `$host'
+# There might be people who depend on the old broken behavior: '$host'
 # used to hold the argument of --host etc.
 # FIXME: To remove some day.
 build=$build_alias
@@ -1225,7 +1250,7 @@ $as_expr X"$as_myself" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \
 	 X"$as_myself" : 'X\(//\)[^/]' \| \
 	 X"$as_myself" : 'X\(//\)$' \| \
 	 X"$as_myself" : 'X\(/\)' \| . 2>/dev/null ||
-$as_echo X"$as_myself" |
+printf "%s\n" X"$as_myself" |
     sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{
 	    s//\1/
 	    q
@@ -1254,7 +1279,7 @@ if test ! -r "$srcdir/$ac_unique_file"; then
   test "$ac_srcdir_defaulted" = yes && srcdir="$ac_confdir or .."
   as_fn_error $? "cannot find sources ($ac_unique_file) in $srcdir"
 fi
-ac_msg="sources are in $srcdir, but \`cd $srcdir' does not work"
+ac_msg="sources are in $srcdir, but 'cd $srcdir' does not work"
 ac_abs_confdir=`(
 	cd "$srcdir" && test -r "./$ac_unique_file" || as_fn_error $? "$ac_msg"
 	pwd)`
@@ -1282,7 +1307,7 @@ if test "$ac_init_help" = "long"; then
   # Omit some internal or obsolete options to make the list less imposing.
   # This message is too long to be a string in the A/UX 3.1 sh.
   cat <<_ACEOF
-\`configure' configures HTSlib 1.18 to adapt to many kinds of systems.
+'configure' configures HTSlib 1.21 to adapt to many kinds of systems.
 
 Usage: $0 [OPTION]... [VAR=VALUE]...
 
@@ -1296,11 +1321,11 @@ Configuration:
       --help=short        display options specific to this package
       --help=recursive    display the short help of all the included packages
   -V, --version           display version information and exit
-  -q, --quiet, --silent   do not print \`checking ...' messages
+  -q, --quiet, --silent   do not print 'checking ...' messages
       --cache-file=FILE   cache test results in FILE [disabled]
-  -C, --config-cache      alias for \`--cache-file=config.cache'
+  -C, --config-cache      alias for '--cache-file=config.cache'
   -n, --no-create         do not create output files
-      --srcdir=DIR        find the sources in DIR [configure dir or \`..']
+      --srcdir=DIR        find the sources in DIR [configure dir or '..']
 
 Installation directories:
   --prefix=PREFIX         install architecture-independent files in PREFIX
@@ -1308,10 +1333,10 @@ Installation directories:
   --exec-prefix=EPREFIX   install architecture-dependent files in EPREFIX
                           [PREFIX]
 
-By default, \`make install' will install all the files in
-\`$ac_default_prefix/bin', \`$ac_default_prefix/lib' etc.  You can specify
-an installation prefix other than \`$ac_default_prefix' using \`--prefix',
-for instance \`--prefix=\$HOME'.
+By default, 'make install' will install all the files in
+'$ac_default_prefix/bin', '$ac_default_prefix/lib' etc.  You can specify
+an installation prefix other than '$ac_default_prefix' using '--prefix',
+for instance '--prefix=\$HOME'.
 
 For better control, use the options below.
 
@@ -1339,12 +1364,16 @@ Fine tuning of the installation directories:
 _ACEOF
 
   cat <<\_ACEOF
+
+System types:
+  --build=BUILD     configure for building on BUILD [guessed]
+  --host=HOST       cross-compile to build programs to run on HOST [BUILD]
 _ACEOF
 fi
 
 if test -n "$ac_init_help"; then
   case $ac_init_help in
-     short | recursive ) echo "Configuration of HTSlib 1.18:";;
+     short | recursive ) echo "Configuration of HTSlib 1.21:";;
    esac
   cat <<\_ACEOF
 
@@ -1363,6 +1392,7 @@ Optional Features:
   --disable-lzma          omit support for LZMA-compressed CRAM files
   --enable-plugins        enable separately-compiled plugins for file access
   --enable-s3             support Amazon AWS S3 URLs
+  --enable-year2038       support timestamps after 2038
 
 Optional Packages:
   --with-PACKAGE[=ARG]    use PACKAGE [ARG=yes]
@@ -1381,14 +1411,13 @@ Some influential environment variables:
   LIBS        libraries to pass to the linker, e.g. -l<library>
   CPPFLAGS    (Objective) C/C++ preprocessor flags, e.g. -I<include dir> if
               you have headers in a nonstandard directory <include dir>
-  CPP         C preprocessor
   PKG_CONFIG  path to pkg-config utility
   PKG_CONFIG_PATH
               directories to add to pkg-config's search path
   PKG_CONFIG_LIBDIR
               path overriding pkg-config's built-in search path
 
-Use these variables to override the choices made by `configure' or to help
+Use these variables to override the choices made by 'configure' or to help
 it to find libraries and programs with nonstandard names/locations.
 
 Report bugs to <samtools-help@lists.sourceforge.net>.
@@ -1408,9 +1437,9 @@ if test "$ac_init_help" = "recursive"; then
 case "$ac_dir" in
 .) ac_dir_suffix= ac_top_builddir_sub=. ac_top_build_prefix= ;;
 *)
-  ac_dir_suffix=/`$as_echo "$ac_dir" | sed 's|^\.[\\/]||'`
+  ac_dir_suffix=/`printf "%s\n" "$ac_dir" | sed 's|^\.[\\/]||'`
   # A ".." for each directory in $ac_dir_suffix.
-  ac_top_builddir_sub=`$as_echo "$ac_dir_suffix" | sed 's|/[^\\/]*|/..|g;s|/||'`
+  ac_top_builddir_sub=`printf "%s\n" "$ac_dir_suffix" | sed 's|/[^\\/]*|/..|g;s|/||'`
   case $ac_top_builddir_sub in
   "") ac_top_builddir_sub=. ac_top_build_prefix= ;;
   *)  ac_top_build_prefix=$ac_top_builddir_sub/ ;;
@@ -1438,7 +1467,8 @@ esac
 ac_abs_srcdir=$ac_abs_top_srcdir$ac_dir_suffix
 
     cd "$ac_dir" || { ac_status=$?; continue; }
-    # Check for guested configure.
+    # Check for configure.gnu first; this name is used for a wrapper for
+    # Metaconfig's "Configure" on case-insensitive file systems.
     if test -f "$ac_srcdir/configure.gnu"; then
       echo &&
       $SHELL "$ac_srcdir/configure.gnu" --help=recursive
@@ -1446,7 +1476,7 @@ ac_abs_srcdir=$ac_abs_top_srcdir$ac_dir_suffix
       echo &&
       $SHELL "$ac_srcdir/configure" --help=recursive
     else
-      $as_echo "$as_me: WARNING: no configuration information is in $ac_dir" >&2
+      printf "%s\n" "$as_me: WARNING: no configuration information is in $ac_dir" >&2
     fi || ac_status=$?
     cd "$ac_pwd" || { ac_status=$?; break; }
   done
@@ -1455,14 +1485,14 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
   cat <<\_ACEOF
-HTSlib configure 1.18
-generated by GNU Autoconf 2.69
+HTSlib configure 1.21
+generated by GNU Autoconf 2.72
 
-Copyright (C) 2012 Free Software Foundation, Inc.
+Copyright (C) 2023 Free Software Foundation, Inc.
 This configure script is free software; the Free Software Foundation
 gives unlimited permission to copy, distribute and modify it.
 
-Portions copyright (C) 2020-2023 Genome Research Ltd.
+Portions copyright (C) 2020-2024 Genome Research Ltd.
 
 This configure script is free software: you are free to change and
 redistribute it.  There is NO WARRANTY, to the extent permitted by law.
@@ -1480,14 +1510,14 @@ fi
 ac_fn_c_try_compile ()
 {
   as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack
-  rm -f conftest.$ac_objext
+  rm -f conftest.$ac_objext conftest.beam
   if { { ac_try="$ac_compile"
 case "(($ac_try" in
   *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
   *) ac_try_echo=$ac_try;;
 esac
 eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
-$as_echo "$ac_try_echo"; } >&5
+printf "%s\n" "$ac_try_echo"; } >&5
   (eval "$ac_compile") 2>conftest.err
   ac_status=$?
   if test -s conftest.err; then
@@ -1495,42 +1525,47 @@ $as_echo "$ac_try_echo"; } >&5
     cat conftest.er1 >&5
     mv -f conftest.er1 conftest.err
   fi
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
   test $ac_status = 0; } && {
 	 test -z "$ac_c_werror_flag" ||
 	 test ! -s conftest.err
-       } && test -s conftest.$ac_objext; then :
+       } && test -s conftest.$ac_objext
+then :
   ac_retval=0
-else
-  $as_echo "$as_me: failed program was:" >&5
+else case e in #(
+  e) printf "%s\n" "$as_me: failed program was:" >&5
 sed 's/^/| /' conftest.$ac_ext >&5
 
-	ac_retval=1
+	ac_retval=1 ;;
+esac
 fi
   eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno
   as_fn_set_status $ac_retval
 
 } # ac_fn_c_try_compile
 
-# ac_fn_c_check_decl LINENO SYMBOL VAR INCLUDES
-# ---------------------------------------------
+# ac_fn_check_decl LINENO SYMBOL VAR INCLUDES EXTRA-OPTIONS FLAG-VAR
+# ------------------------------------------------------------------
 # Tests whether SYMBOL is declared in INCLUDES, setting cache variable VAR
-# accordingly.
-ac_fn_c_check_decl ()
+# accordingly. Pass EXTRA-OPTIONS to the compiler, using FLAG-VAR.
+ac_fn_check_decl ()
 {
   as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack
   as_decl_name=`echo $2|sed 's/ *(.*//'`
-  as_decl_use=`echo $2|sed -e 's/(/((/' -e 's/)/) 0&/' -e 's/,/) 0& (/g'`
-  { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $as_decl_name is declared" >&5
-$as_echo_n "checking whether $as_decl_name is declared... " >&6; }
-if eval \${$3+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether $as_decl_name is declared" >&5
+printf %s "checking whether $as_decl_name is declared... " >&6; }
+if eval test \${$3+y}
+then :
+  printf %s "(cached) " >&6
+else case e in #(
+  e) as_decl_use=`echo $2|sed -e 's/(/((/' -e 's/)/) 0&/' -e 's/,/) 0& (/g'`
+  eval ac_save_FLAGS=\$$6
+  as_fn_append $6 " $5"
   cat confdefs.h - <<_ACEOF >conftest.$ac_ext
 /* end confdefs.h.  */
 $4
 int
-main ()
+main (void)
 {
 #ifndef $as_decl_name
 #ifdef __cplusplus
@@ -1544,98 +1579,24 @@ main ()
   return 0;
 }
 _ACEOF
-if ac_fn_c_try_compile "$LINENO"; then :
+if ac_fn_c_try_compile "$LINENO"
+then :
   eval "$3=yes"
-else
-  eval "$3=no"
-fi
-rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
-fi
-eval ac_res=\$$3
-	       { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5
-$as_echo "$ac_res" >&6; }
-  eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno
-
-} # ac_fn_c_check_decl
-
-# ac_fn_c_try_cpp LINENO
-# ----------------------
-# Try to preprocess conftest.$ac_ext, and return whether this succeeded.
-ac_fn_c_try_cpp ()
-{
-  as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack
-  if { { ac_try="$ac_cpp conftest.$ac_ext"
-case "(($ac_try" in
-  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
-  *) ac_try_echo=$ac_try;;
+else case e in #(
+  e) eval "$3=no" ;;
 esac
-eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
-$as_echo "$ac_try_echo"; } >&5
-  (eval "$ac_cpp conftest.$ac_ext") 2>conftest.err
-  ac_status=$?
-  if test -s conftest.err; then
-    grep -v '^ *+' conftest.err >conftest.er1
-    cat conftest.er1 >&5
-    mv -f conftest.er1 conftest.err
-  fi
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; } > conftest.i && {
-	 test -z "$ac_c_preproc_warn_flag$ac_c_werror_flag" ||
-	 test ! -s conftest.err
-       }; then :
-  ac_retval=0
-else
-  $as_echo "$as_me: failed program was:" >&5
-sed 's/^/| /' conftest.$ac_ext >&5
-
-    ac_retval=1
 fi
-  eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno
-  as_fn_set_status $ac_retval
-
-} # ac_fn_c_try_cpp
-
-# ac_fn_c_try_run LINENO
-# ----------------------
-# Try to link conftest.$ac_ext, and return whether this succeeded. Assumes
-# that executables *can* be run.
-ac_fn_c_try_run ()
-{
-  as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack
-  if { { ac_try="$ac_link"
-case "(($ac_try" in
-  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
-  *) ac_try_echo=$ac_try;;
-esac
-eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
-$as_echo "$ac_try_echo"; } >&5
-  (eval "$ac_link") 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; } && { ac_try='./conftest$ac_exeext'
-  { { case "(($ac_try" in
-  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
-  *) ac_try_echo=$ac_try;;
+rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext
+  eval $6=\$ac_save_FLAGS
+ ;;
 esac
-eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
-$as_echo "$ac_try_echo"; } >&5
-  (eval "$ac_try") 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }; }; then :
-  ac_retval=0
-else
-  $as_echo "$as_me: program exited with status $ac_status" >&5
-       $as_echo "$as_me: failed program was:" >&5
-sed 's/^/| /' conftest.$ac_ext >&5
-
-       ac_retval=$ac_status
 fi
-  rm -rf conftest.dSYM conftest_ipa8_conftest.oo
+eval ac_res=\$$3
+	       { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5
+printf "%s\n" "$ac_res" >&6; }
   eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno
-  as_fn_set_status $ac_retval
 
-} # ac_fn_c_try_run
+} # ac_fn_check_decl
 
 # ac_fn_c_check_header_compile LINENO HEADER VAR INCLUDES
 # -------------------------------------------------------
@@ -1644,26 +1605,30 @@ fi
 ac_fn_c_check_header_compile ()
 {
   as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack
-  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $2" >&5
-$as_echo_n "checking for $2... " >&6; }
-if eval \${$3+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for $2" >&5
+printf %s "checking for $2... " >&6; }
+if eval test \${$3+y}
+then :
+  printf %s "(cached) " >&6
+else case e in #(
+  e) cat confdefs.h - <<_ACEOF >conftest.$ac_ext
 /* end confdefs.h.  */
 $4
 #include <$2>
 _ACEOF
-if ac_fn_c_try_compile "$LINENO"; then :
+if ac_fn_c_try_compile "$LINENO"
+then :
   eval "$3=yes"
-else
-  eval "$3=no"
+else case e in #(
+  e) eval "$3=no" ;;
+esac
 fi
-rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext ;;
+esac
 fi
 eval ac_res=\$$3
-	       { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5
-$as_echo "$ac_res" >&6; }
+	       { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5
+printf "%s\n" "$ac_res" >&6; }
   eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno
 
 } # ac_fn_c_check_header_compile
@@ -1674,14 +1639,14 @@ $as_echo "$ac_res" >&6; }
 ac_fn_c_try_link ()
 {
   as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack
-  rm -f conftest.$ac_objext conftest$ac_exeext
+  rm -f conftest.$ac_objext conftest.beam conftest$ac_exeext
   if { { ac_try="$ac_link"
 case "(($ac_try" in
   *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
   *) ac_try_echo=$ac_try;;
 esac
 eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
-$as_echo "$ac_try_echo"; } >&5
+printf "%s\n" "$ac_try_echo"; } >&5
   (eval "$ac_link") 2>conftest.err
   ac_status=$?
   if test -s conftest.err; then
@@ -1689,20 +1654,22 @@ $as_echo "$ac_try_echo"; } >&5
     cat conftest.er1 >&5
     mv -f conftest.er1 conftest.err
   fi
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
   test $ac_status = 0; } && {
 	 test -z "$ac_c_werror_flag" ||
 	 test ! -s conftest.err
        } && test -s conftest$ac_exeext && {
 	 test "$cross_compiling" = yes ||
 	 test -x conftest$ac_exeext
-       }; then :
+       }
+then :
   ac_retval=0
-else
-  $as_echo "$as_me: failed program was:" >&5
+else case e in #(
+  e) printf "%s\n" "$as_me: failed program was:" >&5
 sed 's/^/| /' conftest.$ac_ext >&5
 
-	ac_retval=1
+	ac_retval=1 ;;
+esac
 fi
   # Delete the IPA/IPO (Inter Procedural Analysis/Optimization) information
   # created by the PGI compiler (conftest_ipa8_conftest.oo), as it would
@@ -1720,28 +1687,22 @@ fi
 ac_fn_c_check_func ()
 {
   as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack
-  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $2" >&5
-$as_echo_n "checking for $2... " >&6; }
-if eval \${$3+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for $2" >&5
+printf %s "checking for $2... " >&6; }
+if eval test \${$3+y}
+then :
+  printf %s "(cached) " >&6
+else case e in #(
+  e) cat confdefs.h - <<_ACEOF >conftest.$ac_ext
 /* end confdefs.h.  */
 /* Define $2 to an innocuous variant, in case <limits.h> declares $2.
    For example, HP-UX 11i <limits.h> declares gettimeofday.  */
 #define $2 innocuous_$2
 
 /* System header to define __stub macros and hopefully few prototypes,
-    which can conflict with char $2 (); below.
-    Prefer <limits.h> to <assert.h> if __STDC__ is defined, since
-    <limits.h> exists even on freestanding compilers.  */
-
-#ifdef __STDC__
-# include <limits.h>
-#else
-# include <assert.h>
-#endif
+   which can conflict with char $2 (void); below.  */
 
+#include <limits.h>
 #undef $2
 
 /* Override any GCC internal prototype to avoid an error.
@@ -1750,7 +1711,7 @@ else
 #ifdef __cplusplus
 extern "C"
 #endif
-char $2 ();
+char $2 (void);
 /* The GNU C library defines this for functions which it implements
     to always fail with ENOSYS.  Some functions are actually named
     something starting with __ and the normal name is an alias.  */
@@ -1759,35 +1720,102 @@ choke me
 #endif
 
 int
-main ()
+main (void)
 {
 return $2 ();
   ;
   return 0;
 }
 _ACEOF
-if ac_fn_c_try_link "$LINENO"; then :
+if ac_fn_c_try_link "$LINENO"
+then :
   eval "$3=yes"
-else
-  eval "$3=no"
+else case e in #(
+  e) eval "$3=no" ;;
+esac
 fi
-rm -f core conftest.err conftest.$ac_objext \
-    conftest$ac_exeext conftest.$ac_ext
+rm -f core conftest.err conftest.$ac_objext conftest.beam \
+    conftest$ac_exeext conftest.$ac_ext ;;
+esac
 fi
 eval ac_res=\$$3
-	       { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5
-$as_echo "$ac_res" >&6; }
+	       { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5
+printf "%s\n" "$ac_res" >&6; }
   eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno
 
 } # ac_fn_c_check_func
+
+# ac_fn_c_try_run LINENO
+# ----------------------
+# Try to run conftest.$ac_ext, and return whether this succeeded. Assumes that
+# executables *can* be run.
+ac_fn_c_try_run ()
+{
+  as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack
+  if { { ac_try="$ac_link"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
+printf "%s\n" "$ac_try_echo"; } >&5
+  (eval "$ac_link") 2>&5
+  ac_status=$?
+  printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; } && { ac_try='./conftest$ac_exeext'
+  { { case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
+printf "%s\n" "$ac_try_echo"; } >&5
+  (eval "$ac_try") 2>&5
+  ac_status=$?
+  printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }; }
+then :
+  ac_retval=0
+else case e in #(
+  e) printf "%s\n" "$as_me: program exited with status $ac_status" >&5
+       printf "%s\n" "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+       ac_retval=$ac_status ;;
+esac
+fi
+  rm -rf conftest.dSYM conftest_ipa8_conftest.oo
+  eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno
+  as_fn_set_status $ac_retval
+
+} # ac_fn_c_try_run
+ac_configure_args_raw=
+for ac_arg
+do
+  case $ac_arg in
+  *\'*)
+    ac_arg=`printf "%s\n" "$ac_arg" | sed "s/'/'\\\\\\\\''/g"` ;;
+  esac
+  as_fn_append ac_configure_args_raw " '$ac_arg'"
+done
+
+case $ac_configure_args_raw in
+  *$as_nl*)
+    ac_safe_unquote= ;;
+  *)
+    ac_unsafe_z='|&;<>()$`\\"*?[ ''	' # This string ends in space, tab.
+    ac_unsafe_a="$ac_unsafe_z#~"
+    ac_safe_unquote="s/ '\\([^$ac_unsafe_a][^$ac_unsafe_z]*\\)'/ \\1/g"
+    ac_configure_args_raw=`      printf "%s\n" "$ac_configure_args_raw" | sed "$ac_safe_unquote"`;;
+esac
+
 cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.
 
-It was created by HTSlib $as_me 1.18, which was
-generated by GNU Autoconf 2.69.  Invocation command line was
+It was created by HTSlib $as_me 1.21, which was
+generated by GNU Autoconf 2.72.  Invocation command line was
 
-  $ $0 $@
+  $ $0$ac_configure_args_raw
 
 _ACEOF
 exec 5>>config.log
@@ -1820,8 +1848,12 @@ as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
 for as_dir in $PATH
 do
   IFS=$as_save_IFS
-  test -z "$as_dir" && as_dir=.
-    $as_echo "PATH: $as_dir"
+  case $as_dir in #(((
+    '') as_dir=./ ;;
+    */) ;;
+    *) as_dir=$as_dir/ ;;
+  esac
+    printf "%s\n" "PATH: $as_dir"
   done
 IFS=$as_save_IFS
 
@@ -1856,7 +1888,7 @@ do
     | -silent | --silent | --silen | --sile | --sil)
       continue ;;
     *\'*)
-      ac_arg=`$as_echo "$ac_arg" | sed "s/'/'\\\\\\\\''/g"` ;;
+      ac_arg=`printf "%s\n" "$ac_arg" | sed "s/'/'\\\\\\\\''/g"` ;;
     esac
     case $ac_pass in
     1) as_fn_append ac_configure_args0 " '$ac_arg'" ;;
@@ -1891,11 +1923,13 @@ done
 # WARNING: Use '\'' to represent an apostrophe within the trap.
 # WARNING: Do not start the trap code with a newline, due to a FreeBSD 4.0 bug.
 trap 'exit_status=$?
+  # Sanitize IFS.
+  IFS=" ""	$as_nl"
   # Save into config.log some information that might help in debugging.
   {
     echo
 
-    $as_echo "## ---------------- ##
+    printf "%s\n" "## ---------------- ##
 ## Cache variables. ##
 ## ---------------- ##"
     echo
@@ -1906,8 +1940,8 @@ trap 'exit_status=$?
     case $ac_val in #(
     *${as_nl}*)
       case $ac_var in #(
-      *_cv_*) { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: cache variable $ac_var contains a newline" >&5
-$as_echo "$as_me: WARNING: cache variable $ac_var contains a newline" >&2;} ;;
+      *_cv_*) { printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: cache variable $ac_var contains a newline" >&5
+printf "%s\n" "$as_me: WARNING: cache variable $ac_var contains a newline" >&2;} ;;
       esac
       case $ac_var in #(
       _ | IFS | as_nl) ;; #(
@@ -1931,7 +1965,7 @@ $as_echo "$as_me: WARNING: cache variable $ac_var contains a newline" >&2;} ;;
 )
     echo
 
-    $as_echo "## ----------------- ##
+    printf "%s\n" "## ----------------- ##
 ## Output variables. ##
 ## ----------------- ##"
     echo
@@ -1939,14 +1973,14 @@ $as_echo "$as_me: WARNING: cache variable $ac_var contains a newline" >&2;} ;;
     do
       eval ac_val=\$$ac_var
       case $ac_val in
-      *\'\''*) ac_val=`$as_echo "$ac_val" | sed "s/'\''/'\''\\\\\\\\'\'''\''/g"`;;
+      *\'\''*) ac_val=`printf "%s\n" "$ac_val" | sed "s/'\''/'\''\\\\\\\\'\'''\''/g"`;;
       esac
-      $as_echo "$ac_var='\''$ac_val'\''"
+      printf "%s\n" "$ac_var='\''$ac_val'\''"
     done | sort
     echo
 
     if test -n "$ac_subst_files"; then
-      $as_echo "## ------------------- ##
+      printf "%s\n" "## ------------------- ##
 ## File substitutions. ##
 ## ------------------- ##"
       echo
@@ -1954,15 +1988,15 @@ $as_echo "$as_me: WARNING: cache variable $ac_var contains a newline" >&2;} ;;
       do
 	eval ac_val=\$$ac_var
 	case $ac_val in
-	*\'\''*) ac_val=`$as_echo "$ac_val" | sed "s/'\''/'\''\\\\\\\\'\'''\''/g"`;;
+	*\'\''*) ac_val=`printf "%s\n" "$ac_val" | sed "s/'\''/'\''\\\\\\\\'\'''\''/g"`;;
 	esac
-	$as_echo "$ac_var='\''$ac_val'\''"
+	printf "%s\n" "$ac_var='\''$ac_val'\''"
       done | sort
       echo
     fi
 
     if test -s confdefs.h; then
-      $as_echo "## ----------- ##
+      printf "%s\n" "## ----------- ##
 ## confdefs.h. ##
 ## ----------- ##"
       echo
@@ -1970,8 +2004,8 @@ $as_echo "$as_me: WARNING: cache variable $ac_var contains a newline" >&2;} ;;
       echo
     fi
     test "$ac_signal" != 0 &&
-      $as_echo "$as_me: caught signal $ac_signal"
-    $as_echo "$as_me: exit $exit_status"
+      printf "%s\n" "$as_me: caught signal $ac_signal"
+    printf "%s\n" "$as_me: exit $exit_status"
   } >&5
   rm -f core *.core core.conftest.* &&
     rm -f -r conftest* confdefs* conf$$* $ac_clean_files &&
@@ -1985,65 +2019,50 @@ ac_signal=0
 # confdefs.h avoids OS command line length limits that DEFS can exceed.
 rm -f -r conftest* confdefs.h
 
-$as_echo "/* confdefs.h */" > confdefs.h
+printf "%s\n" "/* confdefs.h */" > confdefs.h
 
 # Predefined preprocessor variables.
 
-cat >>confdefs.h <<_ACEOF
-#define PACKAGE_NAME "$PACKAGE_NAME"
-_ACEOF
+printf "%s\n" "#define PACKAGE_NAME \"$PACKAGE_NAME\"" >>confdefs.h
 
-cat >>confdefs.h <<_ACEOF
-#define PACKAGE_TARNAME "$PACKAGE_TARNAME"
-_ACEOF
+printf "%s\n" "#define PACKAGE_TARNAME \"$PACKAGE_TARNAME\"" >>confdefs.h
 
-cat >>confdefs.h <<_ACEOF
-#define PACKAGE_VERSION "$PACKAGE_VERSION"
-_ACEOF
+printf "%s\n" "#define PACKAGE_VERSION \"$PACKAGE_VERSION\"" >>confdefs.h
 
-cat >>confdefs.h <<_ACEOF
-#define PACKAGE_STRING "$PACKAGE_STRING"
-_ACEOF
+printf "%s\n" "#define PACKAGE_STRING \"$PACKAGE_STRING\"" >>confdefs.h
 
-cat >>confdefs.h <<_ACEOF
-#define PACKAGE_BUGREPORT "$PACKAGE_BUGREPORT"
-_ACEOF
+printf "%s\n" "#define PACKAGE_BUGREPORT \"$PACKAGE_BUGREPORT\"" >>confdefs.h
 
-cat >>confdefs.h <<_ACEOF
-#define PACKAGE_URL "$PACKAGE_URL"
-_ACEOF
+printf "%s\n" "#define PACKAGE_URL \"$PACKAGE_URL\"" >>confdefs.h
 
 
 # Let the site file select an alternate cache file if it wants to.
 # Prefer an explicitly selected file to automatically selected ones.
-ac_site_file1=NONE
-ac_site_file2=NONE
 if test -n "$CONFIG_SITE"; then
-  # We do not want a PATH search for config.site.
-  case $CONFIG_SITE in #((
-    -*)  ac_site_file1=./$CONFIG_SITE;;
-    */*) ac_site_file1=$CONFIG_SITE;;
-    *)   ac_site_file1=./$CONFIG_SITE;;
-  esac
+  ac_site_files="$CONFIG_SITE"
 elif test "x$prefix" != xNONE; then
-  ac_site_file1=$prefix/share/config.site
-  ac_site_file2=$prefix/etc/config.site
+  ac_site_files="$prefix/share/config.site $prefix/etc/config.site"
 else
-  ac_site_file1=$ac_default_prefix/share/config.site
-  ac_site_file2=$ac_default_prefix/etc/config.site
+  ac_site_files="$ac_default_prefix/share/config.site $ac_default_prefix/etc/config.site"
 fi
-for ac_site_file in "$ac_site_file1" "$ac_site_file2"
+
+for ac_site_file in $ac_site_files
 do
-  test "x$ac_site_file" = xNONE && continue
-  if test /dev/null != "$ac_site_file" && test -r "$ac_site_file"; then
-    { $as_echo "$as_me:${as_lineno-$LINENO}: loading site script $ac_site_file" >&5
-$as_echo "$as_me: loading site script $ac_site_file" >&6;}
+  case $ac_site_file in #(
+  */*) :
+     ;; #(
+  *) :
+    ac_site_file=./$ac_site_file ;;
+esac
+  if test -f "$ac_site_file" && test -r "$ac_site_file"; then
+    { printf "%s\n" "$as_me:${as_lineno-$LINENO}: loading site script $ac_site_file" >&5
+printf "%s\n" "$as_me: loading site script $ac_site_file" >&6;}
     sed 's/^/| /' "$ac_site_file" >&5
     . "$ac_site_file" \
-      || { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
-$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+      || { { printf "%s\n" "$as_me:${as_lineno-$LINENO}: error: in '$ac_pwd':" >&5
+printf "%s\n" "$as_me: error: in '$ac_pwd':" >&2;}
 as_fn_error $? "failed to load site script $ac_site_file
-See \`config.log' for more details" "$LINENO" 5; }
+See 'config.log' for more details" "$LINENO" 5; }
   fi
 done
 
@@ -2051,84 +2070,517 @@ if test -r "$cache_file"; then
   # Some versions of bash will fail to source /dev/null (special files
   # actually), so we avoid doing that.  DJGPP emulates it as a regular file.
   if test /dev/null != "$cache_file" && test -f "$cache_file"; then
-    { $as_echo "$as_me:${as_lineno-$LINENO}: loading cache $cache_file" >&5
-$as_echo "$as_me: loading cache $cache_file" >&6;}
+    { printf "%s\n" "$as_me:${as_lineno-$LINENO}: loading cache $cache_file" >&5
+printf "%s\n" "$as_me: loading cache $cache_file" >&6;}
     case $cache_file in
       [\\/]* | ?:[\\/]* ) . "$cache_file";;
       *)                      . "./$cache_file";;
     esac
   fi
 else
-  { $as_echo "$as_me:${as_lineno-$LINENO}: creating cache $cache_file" >&5
-$as_echo "$as_me: creating cache $cache_file" >&6;}
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: creating cache $cache_file" >&5
+printf "%s\n" "$as_me: creating cache $cache_file" >&6;}
   >$cache_file
 fi
 
-as_fn_append ac_header_list " stdlib.h"
-as_fn_append ac_header_list " unistd.h"
-as_fn_append ac_header_list " sys/param.h"
-# Check that the precious variables saved in the cache have kept the same
-# value.
-ac_cache_corrupted=false
-for ac_var in $ac_precious_vars; do
-  eval ac_old_set=\$ac_cv_env_${ac_var}_set
-  eval ac_new_set=\$ac_env_${ac_var}_set
-  eval ac_old_val=\$ac_cv_env_${ac_var}_value
-  eval ac_new_val=\$ac_env_${ac_var}_value
-  case $ac_old_set,$ac_new_set in
-    set,)
-      { $as_echo "$as_me:${as_lineno-$LINENO}: error: \`$ac_var' was set to \`$ac_old_val' in the previous run" >&5
-$as_echo "$as_me: error: \`$ac_var' was set to \`$ac_old_val' in the previous run" >&2;}
-      ac_cache_corrupted=: ;;
-    ,set)
-      { $as_echo "$as_me:${as_lineno-$LINENO}: error: \`$ac_var' was not set in the previous run" >&5
-$as_echo "$as_me: error: \`$ac_var' was not set in the previous run" >&2;}
-      ac_cache_corrupted=: ;;
-    ,);;
-    *)
-      if test "x$ac_old_val" != "x$ac_new_val"; then
-	# differences in whitespace do not lead to failure.
-	ac_old_val_w=`echo x $ac_old_val`
-	ac_new_val_w=`echo x $ac_new_val`
-	if test "$ac_old_val_w" != "$ac_new_val_w"; then
-	  { $as_echo "$as_me:${as_lineno-$LINENO}: error: \`$ac_var' has changed since the previous run:" >&5
-$as_echo "$as_me: error: \`$ac_var' has changed since the previous run:" >&2;}
-	  ac_cache_corrupted=:
-	else
-	  { $as_echo "$as_me:${as_lineno-$LINENO}: warning: ignoring whitespace changes in \`$ac_var' since the previous run:" >&5
-$as_echo "$as_me: warning: ignoring whitespace changes in \`$ac_var' since the previous run:" >&2;}
-	  eval $ac_var=\$ac_old_val
-	fi
-	{ $as_echo "$as_me:${as_lineno-$LINENO}:   former value:  \`$ac_old_val'" >&5
-$as_echo "$as_me:   former value:  \`$ac_old_val'" >&2;}
-	{ $as_echo "$as_me:${as_lineno-$LINENO}:   current value: \`$ac_new_val'" >&5
-$as_echo "$as_me:   current value: \`$ac_new_val'" >&2;}
-      fi;;
-  esac
-  # Pass precious variables to config.status.
-  if test "$ac_new_set" = set; then
-    case $ac_new_val in
-    *\'*) ac_arg=$ac_var=`$as_echo "$ac_new_val" | sed "s/'/'\\\\\\\\''/g"` ;;
-    *) ac_arg=$ac_var=$ac_new_val ;;
-    esac
-    case " $ac_configure_args " in
-      *" '$ac_arg' "*) ;; # Avoid dups.  Use of quotes ensures accuracy.
-      *) as_fn_append ac_configure_args " '$ac_arg'" ;;
-    esac
-  fi
-done
-if $ac_cache_corrupted; then
-  { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
-$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
-  { $as_echo "$as_me:${as_lineno-$LINENO}: error: changes in the environment can compromise the build" >&5
-$as_echo "$as_me: error: changes in the environment can compromise the build" >&2;}
-  as_fn_error $? "run \`make distclean' and/or \`rm $cache_file' and start over" "$LINENO" 5
-fi
-## -------------------- ##
-## Main body of script. ##
-## -------------------- ##
+# Test code for whether the C compiler supports C89 (global declarations)
+ac_c_conftest_c89_globals='
+/* Does the compiler advertise C89 conformance?
+   Do not test the value of __STDC__, because some compilers set it to 0
+   while being otherwise adequately conformant. */
+#if !defined __STDC__
+# error "Compiler does not advertise C89 conformance"
+#endif
 
-ac_ext=c
+#include <stddef.h>
+#include <stdarg.h>
+struct stat;
+/* Most of the following tests are stolen from RCS 5.7 src/conf.sh.  */
+struct buf { int x; };
+struct buf * (*rcsopen) (struct buf *, struct stat *, int);
+static char *e (char **p, int i)
+{
+  return p[i];
+}
+static char *f (char * (*g) (char **, int), char **p, ...)
+{
+  char *s;
+  va_list v;
+  va_start (v,p);
+  s = g (p, va_arg (v,int));
+  va_end (v);
+  return s;
+}
+
+/* C89 style stringification. */
+#define noexpand_stringify(a) #a
+const char *stringified = noexpand_stringify(arbitrary+token=sequence);
+
+/* C89 style token pasting.  Exercises some of the corner cases that
+   e.g. old MSVC gets wrong, but not very hard. */
+#define noexpand_concat(a,b) a##b
+#define expand_concat(a,b) noexpand_concat(a,b)
+extern int vA;
+extern int vbee;
+#define aye A
+#define bee B
+int *pvA = &expand_concat(v,aye);
+int *pvbee = &noexpand_concat(v,bee);
+
+/* OSF 4.0 Compaq cc is some sort of almost-ANSI by default.  It has
+   function prototypes and stuff, but not \xHH hex character constants.
+   These do not provoke an error unfortunately, instead are silently treated
+   as an "x".  The following induces an error, until -std is added to get
+   proper ANSI mode.  Curiously \x00 != x always comes out true, for an
+   array size at least.  It is necessary to write \x00 == 0 to get something
+   that is true only with -std.  */
+int osf4_cc_array ['\''\x00'\'' == 0 ? 1 : -1];
+
+/* IBM C 6 for AIX is almost-ANSI by default, but it replaces macro parameters
+   inside strings and character constants.  */
+#define FOO(x) '\''x'\''
+int xlc6_cc_array[FOO(a) == '\''x'\'' ? 1 : -1];
+
+int test (int i, double x);
+struct s1 {int (*f) (int a);};
+struct s2 {int (*f) (double a);};
+int pairnames (int, char **, int *(*)(struct buf *, struct stat *, int),
+               int, int);'
+
+# Test code for whether the C compiler supports C89 (body of main).
+ac_c_conftest_c89_main='
+ok |= (argc == 0 || f (e, argv, 0) != argv[0] || f (e, argv, 1) != argv[1]);
+'
+
+# Test code for whether the C compiler supports C99 (global declarations)
+ac_c_conftest_c99_globals='
+/* Does the compiler advertise C99 conformance? */
+#if !defined __STDC_VERSION__ || __STDC_VERSION__ < 199901L
+# error "Compiler does not advertise C99 conformance"
+#endif
+
+// See if C++-style comments work.
+
+#include <stdbool.h>
+extern int puts (const char *);
+extern int printf (const char *, ...);
+extern int dprintf (int, const char *, ...);
+extern void *malloc (size_t);
+extern void free (void *);
+
+// Check varargs macros.  These examples are taken from C99 6.10.3.5.
+// dprintf is used instead of fprintf to avoid needing to declare
+// FILE and stderr.
+#define debug(...) dprintf (2, __VA_ARGS__)
+#define showlist(...) puts (#__VA_ARGS__)
+#define report(test,...) ((test) ? puts (#test) : printf (__VA_ARGS__))
+static void
+test_varargs_macros (void)
+{
+  int x = 1234;
+  int y = 5678;
+  debug ("Flag");
+  debug ("X = %d\n", x);
+  showlist (The first, second, and third items.);
+  report (x>y, "x is %d but y is %d", x, y);
+}
+
+// Check long long types.
+#define BIG64 18446744073709551615ull
+#define BIG32 4294967295ul
+#define BIG_OK (BIG64 / BIG32 == 4294967297ull && BIG64 % BIG32 == 0)
+#if !BIG_OK
+  #error "your preprocessor is broken"
+#endif
+#if BIG_OK
+#else
+  #error "your preprocessor is broken"
+#endif
+static long long int bignum = -9223372036854775807LL;
+static unsigned long long int ubignum = BIG64;
+
+struct incomplete_array
+{
+  int datasize;
+  double data[];
+};
+
+struct named_init {
+  int number;
+  const wchar_t *name;
+  double average;
+};
+
+typedef const char *ccp;
+
+static inline int
+test_restrict (ccp restrict text)
+{
+  // Iterate through items via the restricted pointer.
+  // Also check for declarations in for loops.
+  for (unsigned int i = 0; *(text+i) != '\''\0'\''; ++i)
+    continue;
+  return 0;
+}
+
+// Check varargs and va_copy.
+static bool
+test_varargs (const char *format, ...)
+{
+  va_list args;
+  va_start (args, format);
+  va_list args_copy;
+  va_copy (args_copy, args);
+
+  const char *str = "";
+  int number = 0;
+  float fnumber = 0;
+
+  while (*format)
+    {
+      switch (*format++)
+	{
+	case '\''s'\'': // string
+	  str = va_arg (args_copy, const char *);
+	  break;
+	case '\''d'\'': // int
+	  number = va_arg (args_copy, int);
+	  break;
+	case '\''f'\'': // float
+	  fnumber = va_arg (args_copy, double);
+	  break;
+	default:
+	  break;
+	}
+    }
+  va_end (args_copy);
+  va_end (args);
+
+  return *str && number && fnumber;
+}
+'
+
+# Test code for whether the C compiler supports C99 (body of main).
+ac_c_conftest_c99_main='
+  // Check bool.
+  _Bool success = false;
+  success |= (argc != 0);
+
+  // Check restrict.
+  if (test_restrict ("String literal") == 0)
+    success = true;
+  char *restrict newvar = "Another string";
+
+  // Check varargs.
+  success &= test_varargs ("s, d'\'' f .", "string", 65, 34.234);
+  test_varargs_macros ();
+
+  // Check flexible array members.
+  struct incomplete_array *ia =
+    malloc (sizeof (struct incomplete_array) + (sizeof (double) * 10));
+  ia->datasize = 10;
+  for (int i = 0; i < ia->datasize; ++i)
+    ia->data[i] = i * 1.234;
+  // Work around memory leak warnings.
+  free (ia);
+
+  // Check named initializers.
+  struct named_init ni = {
+    .number = 34,
+    .name = L"Test wide string",
+    .average = 543.34343,
+  };
+
+  ni.number = 58;
+
+  int dynamic_array[ni.number];
+  dynamic_array[0] = argv[0][0];
+  dynamic_array[ni.number - 1] = 543;
+
+  // work around unused variable warnings
+  ok |= (!success || bignum == 0LL || ubignum == 0uLL || newvar[0] == '\''x'\''
+	 || dynamic_array[ni.number - 1] != 543);
+'
+
+# Test code for whether the C compiler supports C11 (global declarations)
+ac_c_conftest_c11_globals='
+/* Does the compiler advertise C11 conformance? */
+#if !defined __STDC_VERSION__ || __STDC_VERSION__ < 201112L
+# error "Compiler does not advertise C11 conformance"
+#endif
+
+// Check _Alignas.
+char _Alignas (double) aligned_as_double;
+char _Alignas (0) no_special_alignment;
+extern char aligned_as_int;
+char _Alignas (0) _Alignas (int) aligned_as_int;
+
+// Check _Alignof.
+enum
+{
+  int_alignment = _Alignof (int),
+  int_array_alignment = _Alignof (int[100]),
+  char_alignment = _Alignof (char)
+};
+_Static_assert (0 < -_Alignof (int), "_Alignof is signed");
+
+// Check _Noreturn.
+int _Noreturn does_not_return (void) { for (;;) continue; }
+
+// Check _Static_assert.
+struct test_static_assert
+{
+  int x;
+  _Static_assert (sizeof (int) <= sizeof (long int),
+                  "_Static_assert does not work in struct");
+  long int y;
+};
+
+// Check UTF-8 literals.
+#define u8 syntax error!
+char const utf8_literal[] = u8"happens to be ASCII" "another string";
+
+// Check duplicate typedefs.
+typedef long *long_ptr;
+typedef long int *long_ptr;
+typedef long_ptr long_ptr;
+
+// Anonymous structures and unions -- taken from C11 6.7.2.1 Example 1.
+struct anonymous
+{
+  union {
+    struct { int i; int j; };
+    struct { int k; long int l; } w;
+  };
+  int m;
+} v1;
+'
+
+# Test code for whether the C compiler supports C11 (body of main).
+ac_c_conftest_c11_main='
+  _Static_assert ((offsetof (struct anonymous, i)
+		   == offsetof (struct anonymous, w.k)),
+		  "Anonymous union alignment botch");
+  v1.i = 2;
+  v1.w.k = 5;
+  ok |= v1.i != 5;
+'
+
+# Test code for whether the C compiler supports C11 (complete).
+ac_c_conftest_c11_program="${ac_c_conftest_c89_globals}
+${ac_c_conftest_c99_globals}
+${ac_c_conftest_c11_globals}
+
+int
+main (int argc, char **argv)
+{
+  int ok = 0;
+  ${ac_c_conftest_c89_main}
+  ${ac_c_conftest_c99_main}
+  ${ac_c_conftest_c11_main}
+  return ok;
+}
+"
+
+# Test code for whether the C compiler supports C99 (complete).
+ac_c_conftest_c99_program="${ac_c_conftest_c89_globals}
+${ac_c_conftest_c99_globals}
+
+int
+main (int argc, char **argv)
+{
+  int ok = 0;
+  ${ac_c_conftest_c89_main}
+  ${ac_c_conftest_c99_main}
+  return ok;
+}
+"
+
+# Test code for whether the C compiler supports C89 (complete).
+ac_c_conftest_c89_program="${ac_c_conftest_c89_globals}
+
+int
+main (int argc, char **argv)
+{
+  int ok = 0;
+  ${ac_c_conftest_c89_main}
+  return ok;
+}
+"
+
+as_fn_append ac_header_c_list " stdio.h stdio_h HAVE_STDIO_H"
+as_fn_append ac_header_c_list " stdlib.h stdlib_h HAVE_STDLIB_H"
+as_fn_append ac_header_c_list " string.h string_h HAVE_STRING_H"
+as_fn_append ac_header_c_list " inttypes.h inttypes_h HAVE_INTTYPES_H"
+as_fn_append ac_header_c_list " stdint.h stdint_h HAVE_STDINT_H"
+as_fn_append ac_header_c_list " strings.h strings_h HAVE_STRINGS_H"
+as_fn_append ac_header_c_list " sys/stat.h sys_stat_h HAVE_SYS_STAT_H"
+as_fn_append ac_header_c_list " sys/types.h sys_types_h HAVE_SYS_TYPES_H"
+as_fn_append ac_header_c_list " unistd.h unistd_h HAVE_UNISTD_H"
+as_fn_append ac_header_c_list " sys/param.h sys_param_h HAVE_SYS_PARAM_H"
+as_fn_append ac_func_c_list " getpagesize HAVE_GETPAGESIZE"
+
+# Auxiliary files required by this configure script.
+ac_aux_files="config.guess config.sub"
+
+# Locations in which to look for auxiliary files.
+ac_aux_dir_candidates="${srcdir}${PATH_SEPARATOR}${srcdir}/..${PATH_SEPARATOR}${srcdir}/../.."
+
+# Search for a directory containing all of the required auxiliary files,
+# $ac_aux_files, from the $PATH-style list $ac_aux_dir_candidates.
+# If we don't find one directory that contains all the files we need,
+# we report the set of missing files from the *first* directory in
+# $ac_aux_dir_candidates and give up.
+ac_missing_aux_files=""
+ac_first_candidate=:
+printf "%s\n" "$as_me:${as_lineno-$LINENO}: looking for aux files: $ac_aux_files" >&5
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+as_found=false
+for as_dir in $ac_aux_dir_candidates
+do
+  IFS=$as_save_IFS
+  case $as_dir in #(((
+    '') as_dir=./ ;;
+    */) ;;
+    *) as_dir=$as_dir/ ;;
+  esac
+  as_found=:
+
+  printf "%s\n" "$as_me:${as_lineno-$LINENO}:  trying $as_dir" >&5
+  ac_aux_dir_found=yes
+  ac_install_sh=
+  for ac_aux in $ac_aux_files
+  do
+    # As a special case, if "install-sh" is required, that requirement
+    # can be satisfied by any of "install-sh", "install.sh", or "shtool",
+    # and $ac_install_sh is set appropriately for whichever one is found.
+    if test x"$ac_aux" = x"install-sh"
+    then
+      if test -f "${as_dir}install-sh"; then
+        printf "%s\n" "$as_me:${as_lineno-$LINENO}:   ${as_dir}install-sh found" >&5
+        ac_install_sh="${as_dir}install-sh -c"
+      elif test -f "${as_dir}install.sh"; then
+        printf "%s\n" "$as_me:${as_lineno-$LINENO}:   ${as_dir}install.sh found" >&5
+        ac_install_sh="${as_dir}install.sh -c"
+      elif test -f "${as_dir}shtool"; then
+        printf "%s\n" "$as_me:${as_lineno-$LINENO}:   ${as_dir}shtool found" >&5
+        ac_install_sh="${as_dir}shtool install -c"
+      else
+        ac_aux_dir_found=no
+        if $ac_first_candidate; then
+          ac_missing_aux_files="${ac_missing_aux_files} install-sh"
+        else
+          break
+        fi
+      fi
+    else
+      if test -f "${as_dir}${ac_aux}"; then
+        printf "%s\n" "$as_me:${as_lineno-$LINENO}:   ${as_dir}${ac_aux} found" >&5
+      else
+        ac_aux_dir_found=no
+        if $ac_first_candidate; then
+          ac_missing_aux_files="${ac_missing_aux_files} ${ac_aux}"
+        else
+          break
+        fi
+      fi
+    fi
+  done
+  if test "$ac_aux_dir_found" = yes; then
+    ac_aux_dir="$as_dir"
+    break
+  fi
+  ac_first_candidate=false
+
+  as_found=false
+done
+IFS=$as_save_IFS
+if $as_found
+then :
+
+else case e in #(
+  e) as_fn_error $? "cannot find required auxiliary files:$ac_missing_aux_files" "$LINENO" 5 ;;
+esac
+fi
+
+
+# These three variables are undocumented and unsupported,
+# and are intended to be withdrawn in a future Autoconf release.
+# They can cause serious problems if a builder's source tree is in a directory
+# whose full name contains unusual characters.
+if test -f "${ac_aux_dir}config.guess"; then
+  ac_config_guess="$SHELL ${ac_aux_dir}config.guess"
+fi
+if test -f "${ac_aux_dir}config.sub"; then
+  ac_config_sub="$SHELL ${ac_aux_dir}config.sub"
+fi
+if test -f "$ac_aux_dir/configure"; then
+  ac_configure="$SHELL ${ac_aux_dir}configure"
+fi
+
+# Check that the precious variables saved in the cache have kept the same
+# value.
+ac_cache_corrupted=false
+for ac_var in $ac_precious_vars; do
+  eval ac_old_set=\$ac_cv_env_${ac_var}_set
+  eval ac_new_set=\$ac_env_${ac_var}_set
+  eval ac_old_val=\$ac_cv_env_${ac_var}_value
+  eval ac_new_val=\$ac_env_${ac_var}_value
+  case $ac_old_set,$ac_new_set in
+    set,)
+      { printf "%s\n" "$as_me:${as_lineno-$LINENO}: error: '$ac_var' was set to '$ac_old_val' in the previous run" >&5
+printf "%s\n" "$as_me: error: '$ac_var' was set to '$ac_old_val' in the previous run" >&2;}
+      ac_cache_corrupted=: ;;
+    ,set)
+      { printf "%s\n" "$as_me:${as_lineno-$LINENO}: error: '$ac_var' was not set in the previous run" >&5
+printf "%s\n" "$as_me: error: '$ac_var' was not set in the previous run" >&2;}
+      ac_cache_corrupted=: ;;
+    ,);;
+    *)
+      if test "x$ac_old_val" != "x$ac_new_val"; then
+	# differences in whitespace do not lead to failure.
+	ac_old_val_w=`echo x $ac_old_val`
+	ac_new_val_w=`echo x $ac_new_val`
+	if test "$ac_old_val_w" != "$ac_new_val_w"; then
+	  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: error: '$ac_var' has changed since the previous run:" >&5
+printf "%s\n" "$as_me: error: '$ac_var' has changed since the previous run:" >&2;}
+	  ac_cache_corrupted=:
+	else
+	  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: warning: ignoring whitespace changes in '$ac_var' since the previous run:" >&5
+printf "%s\n" "$as_me: warning: ignoring whitespace changes in '$ac_var' since the previous run:" >&2;}
+	  eval $ac_var=\$ac_old_val
+	fi
+	{ printf "%s\n" "$as_me:${as_lineno-$LINENO}:   former value:  '$ac_old_val'" >&5
+printf "%s\n" "$as_me:   former value:  '$ac_old_val'" >&2;}
+	{ printf "%s\n" "$as_me:${as_lineno-$LINENO}:   current value: '$ac_new_val'" >&5
+printf "%s\n" "$as_me:   current value: '$ac_new_val'" >&2;}
+      fi;;
+  esac
+  # Pass precious variables to config.status.
+  if test "$ac_new_set" = set; then
+    case $ac_new_val in
+    *\'*) ac_arg=$ac_var=`printf "%s\n" "$ac_new_val" | sed "s/'/'\\\\\\\\''/g"` ;;
+    *) ac_arg=$ac_var=$ac_new_val ;;
+    esac
+    case " $ac_configure_args " in
+      *" '$ac_arg' "*) ;; # Avoid dups.  Use of quotes ensures accuracy.
+      *) as_fn_append ac_configure_args " '$ac_arg'" ;;
+    esac
+  fi
+done
+if $ac_cache_corrupted; then
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: error: in '$ac_pwd':" >&5
+printf "%s\n" "$as_me: error: in '$ac_pwd':" >&2;}
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: error: changes in the environment can compromise the build" >&5
+printf "%s\n" "$as_me: error: changes in the environment can compromise the build" >&2;}
+  as_fn_error $? "run '${MAKE-make} distclean' and/or 'rm $cache_file'
+	    and start over" "$LINENO" 5
+fi
+## -------------------- ##
+## Main body of script. ##
+## -------------------- ##
+
+ac_ext=c
 ac_cpp='$CPP $CPPFLAGS'
 ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
 ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
@@ -2190,7 +2642,7 @@ ac_config_headers="$ac_config_headers config.h"
 #   and this notice are preserved.  This file is offered as-is, without any
 #   warranty.
 
-#   AX_CHECK_COMPILE_FLAGS_NEEDED(FEATURE, FLAG, [ACTION-SUCCESS], [ACTION-FAILURE], [EXTRA-FLAGS], [INPUT])
+#   HTS_CHECK_COMPILE_FLAGS_NEEDED(FEATURE, FLAGS, [INPUT], [ACTION-SUCCESS], [ACTION-FAILURE], [EXTRA-FLAGS])
 
 
 
@@ -2222,6 +2674,15 @@ ac_config_headers="$ac_config_headers config.h"
 
 
 
+
+
+
+
+
+
+
+
+
 
 
 
@@ -2240,38 +2701,44 @@ ac_compiler_gnu=$ac_cv_c_compiler_gnu
 if test -n "$ac_tool_prefix"; then
   # Extract the first word of "${ac_tool_prefix}gcc", so it can be a program name with args.
 set dummy ${ac_tool_prefix}gcc; ac_word=$2
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
-$as_echo_n "checking for $ac_word... " >&6; }
-if ${ac_cv_prog_CC+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  if test -n "$CC"; then
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+printf %s "checking for $ac_word... " >&6; }
+if test ${ac_cv_prog_CC+y}
+then :
+  printf %s "(cached) " >&6
+else case e in #(
+  e) if test -n "$CC"; then
   ac_cv_prog_CC="$CC" # Let the user override the test.
 else
 as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
 for as_dir in $PATH
 do
   IFS=$as_save_IFS
-  test -z "$as_dir" && as_dir=.
+  case $as_dir in #(((
+    '') as_dir=./ ;;
+    */) ;;
+    *) as_dir=$as_dir/ ;;
+  esac
     for ac_exec_ext in '' $ac_executable_extensions; do
-  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+  if as_fn_executable_p "$as_dir$ac_word$ac_exec_ext"; then
     ac_cv_prog_CC="${ac_tool_prefix}gcc"
-    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    printf "%s\n" "$as_me:${as_lineno-$LINENO}: found $as_dir$ac_word$ac_exec_ext" >&5
     break 2
   fi
 done
   done
 IFS=$as_save_IFS
 
-fi
+fi ;;
+esac
 fi
 CC=$ac_cv_prog_CC
 if test -n "$CC"; then
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CC" >&5
-$as_echo "$CC" >&6; }
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $CC" >&5
+printf "%s\n" "$CC" >&6; }
 else
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
+printf "%s\n" "no" >&6; }
 fi
 
 
@@ -2280,38 +2747,44 @@ if test -z "$ac_cv_prog_CC"; then
   ac_ct_CC=$CC
   # Extract the first word of "gcc", so it can be a program name with args.
 set dummy gcc; ac_word=$2
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
-$as_echo_n "checking for $ac_word... " >&6; }
-if ${ac_cv_prog_ac_ct_CC+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  if test -n "$ac_ct_CC"; then
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+printf %s "checking for $ac_word... " >&6; }
+if test ${ac_cv_prog_ac_ct_CC+y}
+then :
+  printf %s "(cached) " >&6
+else case e in #(
+  e) if test -n "$ac_ct_CC"; then
   ac_cv_prog_ac_ct_CC="$ac_ct_CC" # Let the user override the test.
 else
 as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
 for as_dir in $PATH
 do
   IFS=$as_save_IFS
-  test -z "$as_dir" && as_dir=.
+  case $as_dir in #(((
+    '') as_dir=./ ;;
+    */) ;;
+    *) as_dir=$as_dir/ ;;
+  esac
     for ac_exec_ext in '' $ac_executable_extensions; do
-  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+  if as_fn_executable_p "$as_dir$ac_word$ac_exec_ext"; then
     ac_cv_prog_ac_ct_CC="gcc"
-    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    printf "%s\n" "$as_me:${as_lineno-$LINENO}: found $as_dir$ac_word$ac_exec_ext" >&5
     break 2
   fi
 done
   done
 IFS=$as_save_IFS
 
-fi
+fi ;;
+esac
 fi
 ac_ct_CC=$ac_cv_prog_ac_ct_CC
 if test -n "$ac_ct_CC"; then
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_CC" >&5
-$as_echo "$ac_ct_CC" >&6; }
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_ct_CC" >&5
+printf "%s\n" "$ac_ct_CC" >&6; }
 else
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
+printf "%s\n" "no" >&6; }
 fi
 
   if test "x$ac_ct_CC" = x; then
@@ -2319,8 +2792,8 @@ fi
   else
     case $cross_compiling:$ac_tool_warned in
 yes:)
-{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5
-$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;}
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5
+printf "%s\n" "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;}
 ac_tool_warned=yes ;;
 esac
     CC=$ac_ct_CC
@@ -2333,38 +2806,44 @@ if test -z "$CC"; then
           if test -n "$ac_tool_prefix"; then
     # Extract the first word of "${ac_tool_prefix}cc", so it can be a program name with args.
 set dummy ${ac_tool_prefix}cc; ac_word=$2
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
-$as_echo_n "checking for $ac_word... " >&6; }
-if ${ac_cv_prog_CC+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  if test -n "$CC"; then
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+printf %s "checking for $ac_word... " >&6; }
+if test ${ac_cv_prog_CC+y}
+then :
+  printf %s "(cached) " >&6
+else case e in #(
+  e) if test -n "$CC"; then
   ac_cv_prog_CC="$CC" # Let the user override the test.
 else
 as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
 for as_dir in $PATH
 do
   IFS=$as_save_IFS
-  test -z "$as_dir" && as_dir=.
+  case $as_dir in #(((
+    '') as_dir=./ ;;
+    */) ;;
+    *) as_dir=$as_dir/ ;;
+  esac
     for ac_exec_ext in '' $ac_executable_extensions; do
-  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+  if as_fn_executable_p "$as_dir$ac_word$ac_exec_ext"; then
     ac_cv_prog_CC="${ac_tool_prefix}cc"
-    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    printf "%s\n" "$as_me:${as_lineno-$LINENO}: found $as_dir$ac_word$ac_exec_ext" >&5
     break 2
   fi
 done
   done
 IFS=$as_save_IFS
 
-fi
+fi ;;
+esac
 fi
 CC=$ac_cv_prog_CC
 if test -n "$CC"; then
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CC" >&5
-$as_echo "$CC" >&6; }
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $CC" >&5
+printf "%s\n" "$CC" >&6; }
 else
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
+printf "%s\n" "no" >&6; }
 fi
 
 
@@ -2373,12 +2852,13 @@ fi
 if test -z "$CC"; then
   # Extract the first word of "cc", so it can be a program name with args.
 set dummy cc; ac_word=$2
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
-$as_echo_n "checking for $ac_word... " >&6; }
-if ${ac_cv_prog_CC+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  if test -n "$CC"; then
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+printf %s "checking for $ac_word... " >&6; }
+if test ${ac_cv_prog_CC+y}
+then :
+  printf %s "(cached) " >&6
+else case e in #(
+  e) if test -n "$CC"; then
   ac_cv_prog_CC="$CC" # Let the user override the test.
 else
   ac_prog_rejected=no
@@ -2386,15 +2866,19 @@ as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
 for as_dir in $PATH
 do
   IFS=$as_save_IFS
-  test -z "$as_dir" && as_dir=.
+  case $as_dir in #(((
+    '') as_dir=./ ;;
+    */) ;;
+    *) as_dir=$as_dir/ ;;
+  esac
     for ac_exec_ext in '' $ac_executable_extensions; do
-  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
-    if test "$as_dir/$ac_word$ac_exec_ext" = "/usr/ucb/cc"; then
+  if as_fn_executable_p "$as_dir$ac_word$ac_exec_ext"; then
+    if test "$as_dir$ac_word$ac_exec_ext" = "/usr/ucb/cc"; then
        ac_prog_rejected=yes
        continue
      fi
     ac_cv_prog_CC="cc"
-    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    printf "%s\n" "$as_me:${as_lineno-$LINENO}: found $as_dir$ac_word$ac_exec_ext" >&5
     break 2
   fi
 done
@@ -2410,18 +2894,19 @@ if test $ac_prog_rejected = yes; then
     # However, it has the same basename, so the bogon will be chosen
     # first if we set CC to just the basename; use the full file name.
     shift
-    ac_cv_prog_CC="$as_dir/$ac_word${1+' '}$@"
+    ac_cv_prog_CC="$as_dir$ac_word${1+' '}$@"
   fi
 fi
-fi
+fi ;;
+esac
 fi
 CC=$ac_cv_prog_CC
 if test -n "$CC"; then
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CC" >&5
-$as_echo "$CC" >&6; }
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $CC" >&5
+printf "%s\n" "$CC" >&6; }
 else
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
+printf "%s\n" "no" >&6; }
 fi
 
 
@@ -2432,38 +2917,44 @@ if test -z "$CC"; then
   do
     # Extract the first word of "$ac_tool_prefix$ac_prog", so it can be a program name with args.
 set dummy $ac_tool_prefix$ac_prog; ac_word=$2
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
-$as_echo_n "checking for $ac_word... " >&6; }
-if ${ac_cv_prog_CC+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  if test -n "$CC"; then
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+printf %s "checking for $ac_word... " >&6; }
+if test ${ac_cv_prog_CC+y}
+then :
+  printf %s "(cached) " >&6
+else case e in #(
+  e) if test -n "$CC"; then
   ac_cv_prog_CC="$CC" # Let the user override the test.
 else
 as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
 for as_dir in $PATH
 do
   IFS=$as_save_IFS
-  test -z "$as_dir" && as_dir=.
+  case $as_dir in #(((
+    '') as_dir=./ ;;
+    */) ;;
+    *) as_dir=$as_dir/ ;;
+  esac
     for ac_exec_ext in '' $ac_executable_extensions; do
-  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+  if as_fn_executable_p "$as_dir$ac_word$ac_exec_ext"; then
     ac_cv_prog_CC="$ac_tool_prefix$ac_prog"
-    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    printf "%s\n" "$as_me:${as_lineno-$LINENO}: found $as_dir$ac_word$ac_exec_ext" >&5
     break 2
   fi
 done
   done
 IFS=$as_save_IFS
 
-fi
+fi ;;
+esac
 fi
 CC=$ac_cv_prog_CC
 if test -n "$CC"; then
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CC" >&5
-$as_echo "$CC" >&6; }
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $CC" >&5
+printf "%s\n" "$CC" >&6; }
 else
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
+printf "%s\n" "no" >&6; }
 fi
 
 
@@ -2476,38 +2967,44 @@ if test -z "$CC"; then
 do
   # Extract the first word of "$ac_prog", so it can be a program name with args.
 set dummy $ac_prog; ac_word=$2
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
-$as_echo_n "checking for $ac_word... " >&6; }
-if ${ac_cv_prog_ac_ct_CC+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  if test -n "$ac_ct_CC"; then
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+printf %s "checking for $ac_word... " >&6; }
+if test ${ac_cv_prog_ac_ct_CC+y}
+then :
+  printf %s "(cached) " >&6
+else case e in #(
+  e) if test -n "$ac_ct_CC"; then
   ac_cv_prog_ac_ct_CC="$ac_ct_CC" # Let the user override the test.
 else
 as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
 for as_dir in $PATH
 do
   IFS=$as_save_IFS
-  test -z "$as_dir" && as_dir=.
+  case $as_dir in #(((
+    '') as_dir=./ ;;
+    */) ;;
+    *) as_dir=$as_dir/ ;;
+  esac
     for ac_exec_ext in '' $ac_executable_extensions; do
-  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+  if as_fn_executable_p "$as_dir$ac_word$ac_exec_ext"; then
     ac_cv_prog_ac_ct_CC="$ac_prog"
-    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    printf "%s\n" "$as_me:${as_lineno-$LINENO}: found $as_dir$ac_word$ac_exec_ext" >&5
     break 2
   fi
 done
   done
 IFS=$as_save_IFS
 
-fi
+fi ;;
+esac
 fi
 ac_ct_CC=$ac_cv_prog_ac_ct_CC
 if test -n "$ac_ct_CC"; then
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_CC" >&5
-$as_echo "$ac_ct_CC" >&6; }
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_ct_CC" >&5
+printf "%s\n" "$ac_ct_CC" >&6; }
 else
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
+printf "%s\n" "no" >&6; }
 fi
 
 
@@ -2519,34 +3016,140 @@ done
   else
     case $cross_compiling:$ac_tool_warned in
 yes:)
-{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5
-$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;}
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5
+printf "%s\n" "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;}
+ac_tool_warned=yes ;;
+esac
+    CC=$ac_ct_CC
+  fi
+fi
+
+fi
+if test -z "$CC"; then
+  if test -n "$ac_tool_prefix"; then
+  # Extract the first word of "${ac_tool_prefix}clang", so it can be a program name with args.
+set dummy ${ac_tool_prefix}clang; ac_word=$2
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+printf %s "checking for $ac_word... " >&6; }
+if test ${ac_cv_prog_CC+y}
+then :
+  printf %s "(cached) " >&6
+else case e in #(
+  e) if test -n "$CC"; then
+  ac_cv_prog_CC="$CC" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  case $as_dir in #(((
+    '') as_dir=./ ;;
+    */) ;;
+    *) as_dir=$as_dir/ ;;
+  esac
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if as_fn_executable_p "$as_dir$ac_word$ac_exec_ext"; then
+    ac_cv_prog_CC="${ac_tool_prefix}clang"
+    printf "%s\n" "$as_me:${as_lineno-$LINENO}: found $as_dir$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+  done
+IFS=$as_save_IFS
+
+fi ;;
+esac
+fi
+CC=$ac_cv_prog_CC
+if test -n "$CC"; then
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $CC" >&5
+printf "%s\n" "$CC" >&6; }
+else
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
+printf "%s\n" "no" >&6; }
+fi
+
+
+fi
+if test -z "$ac_cv_prog_CC"; then
+  ac_ct_CC=$CC
+  # Extract the first word of "clang", so it can be a program name with args.
+set dummy clang; ac_word=$2
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+printf %s "checking for $ac_word... " >&6; }
+if test ${ac_cv_prog_ac_ct_CC+y}
+then :
+  printf %s "(cached) " >&6
+else case e in #(
+  e) if test -n "$ac_ct_CC"; then
+  ac_cv_prog_ac_ct_CC="$ac_ct_CC" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  case $as_dir in #(((
+    '') as_dir=./ ;;
+    */) ;;
+    *) as_dir=$as_dir/ ;;
+  esac
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if as_fn_executable_p "$as_dir$ac_word$ac_exec_ext"; then
+    ac_cv_prog_ac_ct_CC="clang"
+    printf "%s\n" "$as_me:${as_lineno-$LINENO}: found $as_dir$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+  done
+IFS=$as_save_IFS
+
+fi ;;
+esac
+fi
+ac_ct_CC=$ac_cv_prog_ac_ct_CC
+if test -n "$ac_ct_CC"; then
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_ct_CC" >&5
+printf "%s\n" "$ac_ct_CC" >&6; }
+else
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
+printf "%s\n" "no" >&6; }
+fi
+
+  if test "x$ac_ct_CC" = x; then
+    CC=""
+  else
+    case $cross_compiling:$ac_tool_warned in
+yes:)
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5
+printf "%s\n" "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;}
 ac_tool_warned=yes ;;
 esac
     CC=$ac_ct_CC
   fi
+else
+  CC="$ac_cv_prog_CC"
 fi
 
 fi
 
 
-test -z "$CC" && { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
-$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+test -z "$CC" && { { printf "%s\n" "$as_me:${as_lineno-$LINENO}: error: in '$ac_pwd':" >&5
+printf "%s\n" "$as_me: error: in '$ac_pwd':" >&2;}
 as_fn_error $? "no acceptable C compiler found in \$PATH
-See \`config.log' for more details" "$LINENO" 5; }
+See 'config.log' for more details" "$LINENO" 5; }
 
 # Provide some information about the compiler.
-$as_echo "$as_me:${as_lineno-$LINENO}: checking for C compiler version" >&5
+printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for C compiler version" >&5
 set X $ac_compile
 ac_compiler=$2
-for ac_option in --version -v -V -qversion; do
+for ac_option in --version -v -V -qversion -version; do
   { { ac_try="$ac_compiler $ac_option >&5"
 case "(($ac_try" in
   *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
   *) ac_try_echo=$ac_try;;
 esac
 eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
-$as_echo "$ac_try_echo"; } >&5
+printf "%s\n" "$ac_try_echo"; } >&5
   (eval "$ac_compiler $ac_option >&5") 2>conftest.err
   ac_status=$?
   if test -s conftest.err; then
@@ -2556,7 +3159,7 @@ $as_echo "$ac_try_echo"; } >&5
     cat conftest.er1 >&5
   fi
   rm -f conftest.er1 conftest.err
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
   test $ac_status = 0; }
 done
 
@@ -2564,7 +3167,7 @@ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
 /* end confdefs.h.  */
 
 int
-main ()
+main (void)
 {
 
   ;
@@ -2576,9 +3179,9 @@ ac_clean_files="$ac_clean_files a.out a.out.dSYM a.exe b.out"
 # Try to create an executable without -o first, disregard a.out.
 # It will help us diagnose broken compilers, and finding out an intuition
 # of exeext.
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether the C compiler works" >&5
-$as_echo_n "checking whether the C compiler works... " >&6; }
-ac_link_default=`$as_echo "$ac_link" | sed 's/ -o *conftest[^ ]*//'`
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether the C compiler works" >&5
+printf %s "checking whether the C compiler works... " >&6; }
+ac_link_default=`printf "%s\n" "$ac_link" | sed 's/ -o *conftest[^ ]*//'`
 
 # The possible output files:
 ac_files="a.out conftest.exe conftest a.exe a_out.exe b.out conftest.*"
@@ -2599,13 +3202,14 @@ case "(($ac_try" in
   *) ac_try_echo=$ac_try;;
 esac
 eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
-$as_echo "$ac_try_echo"; } >&5
+printf "%s\n" "$ac_try_echo"; } >&5
   (eval "$ac_link_default") 2>&5
   ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }; then :
-  # Autoconf-2.13 could set the ac_cv_exeext variable to `no'.
-# So ignore a value of `no', otherwise this would lead to `EXEEXT = no'
+  printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }
+then :
+  # Autoconf-2.13 could set the ac_cv_exeext variable to 'no'.
+# So ignore a value of 'no', otherwise this would lead to 'EXEEXT = no'
 # in a Makefile.  We should not override ac_cv_exeext if it was cached,
 # so that the user can short-circuit this test for compilers unknown to
 # Autoconf.
@@ -2620,12 +3224,12 @@ do
 	# certainly right.
 	break;;
     *.* )
-	if test "${ac_cv_exeext+set}" = set && test "$ac_cv_exeext" != no;
+	if test ${ac_cv_exeext+y} && test "$ac_cv_exeext" != no;
 	then :; else
 	   ac_cv_exeext=`expr "$ac_file" : '[^.]*\(\..*\)'`
 	fi
 	# We set ac_cv_exeext here because the later test for it is not
-	# safe: cross compilers may not add the suffix if given an `-o'
+	# safe: cross compilers may not add the suffix if given an '-o'
 	# argument, so we may need to know it at that point already.
 	# Even if this section looks crufty: it has the advantage of
 	# actually working.
@@ -2636,48 +3240,52 @@ do
 done
 test "$ac_cv_exeext" = no && ac_cv_exeext=
 
-else
-  ac_file=''
+else case e in #(
+  e) ac_file='' ;;
+esac
 fi
-if test -z "$ac_file"; then :
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
-$as_echo "$as_me: failed program was:" >&5
+if test -z "$ac_file"
+then :
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
+printf "%s\n" "no" >&6; }
+printf "%s\n" "$as_me: failed program was:" >&5
 sed 's/^/| /' conftest.$ac_ext >&5
 
-{ { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
-$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+{ { printf "%s\n" "$as_me:${as_lineno-$LINENO}: error: in '$ac_pwd':" >&5
+printf "%s\n" "$as_me: error: in '$ac_pwd':" >&2;}
 as_fn_error 77 "C compiler cannot create executables
-See \`config.log' for more details" "$LINENO" 5; }
-else
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
-$as_echo "yes" >&6; }
+See 'config.log' for more details" "$LINENO" 5; }
+else case e in #(
+  e) { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+printf "%s\n" "yes" >&6; } ;;
+esac
 fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for C compiler default output file name" >&5
-$as_echo_n "checking for C compiler default output file name... " >&6; }
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_file" >&5
-$as_echo "$ac_file" >&6; }
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for C compiler default output file name" >&5
+printf %s "checking for C compiler default output file name... " >&6; }
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_file" >&5
+printf "%s\n" "$ac_file" >&6; }
 ac_exeext=$ac_cv_exeext
 
 rm -f -r a.out a.out.dSYM a.exe conftest$ac_cv_exeext b.out
 ac_clean_files=$ac_clean_files_save
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for suffix of executables" >&5
-$as_echo_n "checking for suffix of executables... " >&6; }
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for suffix of executables" >&5
+printf %s "checking for suffix of executables... " >&6; }
 if { { ac_try="$ac_link"
 case "(($ac_try" in
   *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
   *) ac_try_echo=$ac_try;;
 esac
 eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
-$as_echo "$ac_try_echo"; } >&5
+printf "%s\n" "$ac_try_echo"; } >&5
   (eval "$ac_link") 2>&5
   ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }; then :
-  # If both `conftest.exe' and `conftest' are `present' (well, observable)
-# catch `conftest.exe'.  For instance with Cygwin, `ls conftest' will
-# work properly (i.e., refer to `conftest.exe'), while it won't with
-# `rm'.
+  printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }
+then :
+  # If both 'conftest.exe' and 'conftest' are 'present' (well, observable)
+# catch 'conftest.exe'.  For instance with Cygwin, 'ls conftest' will
+# work properly (i.e., refer to 'conftest.exe'), while it won't with
+# 'rm'.
 for ac_file in conftest.exe conftest conftest.*; do
   test -f "$ac_file" || continue
   case $ac_file in
@@ -2687,15 +3295,16 @@ for ac_file in conftest.exe conftest conftest.*; do
     * ) break;;
   esac
 done
-else
-  { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
-$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+else case e in #(
+  e) { { printf "%s\n" "$as_me:${as_lineno-$LINENO}: error: in '$ac_pwd':" >&5
+printf "%s\n" "$as_me: error: in '$ac_pwd':" >&2;}
 as_fn_error $? "cannot compute suffix of executables: cannot compile and link
-See \`config.log' for more details" "$LINENO" 5; }
+See 'config.log' for more details" "$LINENO" 5; } ;;
+esac
 fi
 rm -f conftest conftest$ac_cv_exeext
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_exeext" >&5
-$as_echo "$ac_cv_exeext" >&6; }
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_exeext" >&5
+printf "%s\n" "$ac_cv_exeext" >&6; }
 
 rm -f conftest.$ac_ext
 EXEEXT=$ac_cv_exeext
@@ -2704,9 +3313,11 @@ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
 /* end confdefs.h.  */
 #include <stdio.h>
 int
-main ()
+main (void)
 {
 FILE *f = fopen ("conftest.out", "w");
+ if (!f)
+  return 1;
  return ferror (f) || fclose (f) != 0;
 
   ;
@@ -2716,8 +3327,8 @@ _ACEOF
 ac_clean_files="$ac_clean_files conftest.out"
 # Check that the compiler produces executables we can run.  If not, either
 # the compiler is broken, or we cross compile.
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we are cross compiling" >&5
-$as_echo_n "checking whether we are cross compiling... " >&6; }
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether we are cross compiling" >&5
+printf %s "checking whether we are cross compiling... " >&6; }
 if test "$cross_compiling" != yes; then
   { { ac_try="$ac_link"
 case "(($ac_try" in
@@ -2725,10 +3336,10 @@ case "(($ac_try" in
   *) ac_try_echo=$ac_try;;
 esac
 eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
-$as_echo "$ac_try_echo"; } >&5
+printf "%s\n" "$ac_try_echo"; } >&5
   (eval "$ac_link") 2>&5
   ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
   test $ac_status = 0; }
   if { ac_try='./conftest$ac_cv_exeext'
   { { case "(($ac_try" in
@@ -2736,39 +3347,41 @@ $as_echo "$ac_try_echo"; } >&5
   *) ac_try_echo=$ac_try;;
 esac
 eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
-$as_echo "$ac_try_echo"; } >&5
+printf "%s\n" "$ac_try_echo"; } >&5
   (eval "$ac_try") 2>&5
   ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
   test $ac_status = 0; }; }; then
     cross_compiling=no
   else
     if test "$cross_compiling" = maybe; then
 	cross_compiling=yes
     else
-	{ { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
-$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
-as_fn_error $? "cannot run C compiled programs.
-If you meant to cross compile, use \`--host'.
-See \`config.log' for more details" "$LINENO" 5; }
+	{ { printf "%s\n" "$as_me:${as_lineno-$LINENO}: error: in '$ac_pwd':" >&5
+printf "%s\n" "$as_me: error: in '$ac_pwd':" >&2;}
+as_fn_error 77 "cannot run C compiled programs.
+If you meant to cross compile, use '--host'.
+See 'config.log' for more details" "$LINENO" 5; }
     fi
   fi
 fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $cross_compiling" >&5
-$as_echo "$cross_compiling" >&6; }
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $cross_compiling" >&5
+printf "%s\n" "$cross_compiling" >&6; }
 
-rm -f conftest.$ac_ext conftest$ac_cv_exeext conftest.out
+rm -f conftest.$ac_ext conftest$ac_cv_exeext \
+  conftest.o conftest.obj conftest.out
 ac_clean_files=$ac_clean_files_save
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for suffix of object files" >&5
-$as_echo_n "checking for suffix of object files... " >&6; }
-if ${ac_cv_objext+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for suffix of object files" >&5
+printf %s "checking for suffix of object files... " >&6; }
+if test ${ac_cv_objext+y}
+then :
+  printf %s "(cached) " >&6
+else case e in #(
+  e) cat confdefs.h - <<_ACEOF >conftest.$ac_ext
 /* end confdefs.h.  */
 
 int
-main ()
+main (void)
 {
 
   ;
@@ -2782,11 +3395,12 @@ case "(($ac_try" in
   *) ac_try_echo=$ac_try;;
 esac
 eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
-$as_echo "$ac_try_echo"; } >&5
+printf "%s\n" "$ac_try_echo"; } >&5
   (eval "$ac_compile") 2>&5
   ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }; then :
+  printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }
+then :
   for ac_file in conftest.o conftest.obj conftest.*; do
   test -f "$ac_file" || continue;
   case $ac_file in
@@ -2795,31 +3409,34 @@ $as_echo "$ac_try_echo"; } >&5
        break;;
   esac
 done
-else
-  $as_echo "$as_me: failed program was:" >&5
+else case e in #(
+  e) printf "%s\n" "$as_me: failed program was:" >&5
 sed 's/^/| /' conftest.$ac_ext >&5
 
-{ { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
-$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+{ { printf "%s\n" "$as_me:${as_lineno-$LINENO}: error: in '$ac_pwd':" >&5
+printf "%s\n" "$as_me: error: in '$ac_pwd':" >&2;}
 as_fn_error $? "cannot compute suffix of object files: cannot compile
-See \`config.log' for more details" "$LINENO" 5; }
+See 'config.log' for more details" "$LINENO" 5; } ;;
+esac
 fi
-rm -f conftest.$ac_cv_objext conftest.$ac_ext
+rm -f conftest.$ac_cv_objext conftest.$ac_ext ;;
+esac
 fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_objext" >&5
-$as_echo "$ac_cv_objext" >&6; }
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_objext" >&5
+printf "%s\n" "$ac_cv_objext" >&6; }
 OBJEXT=$ac_cv_objext
 ac_objext=$OBJEXT
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we are using the GNU C compiler" >&5
-$as_echo_n "checking whether we are using the GNU C compiler... " >&6; }
-if ${ac_cv_c_compiler_gnu+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether the compiler supports GNU C" >&5
+printf %s "checking whether the compiler supports GNU C... " >&6; }
+if test ${ac_cv_c_compiler_gnu+y}
+then :
+  printf %s "(cached) " >&6
+else case e in #(
+  e) cat confdefs.h - <<_ACEOF >conftest.$ac_ext
 /* end confdefs.h.  */
 
 int
-main ()
+main (void)
 {
 #ifndef __GNUC__
        choke me
@@ -2829,30 +3446,36 @@ main ()
   return 0;
 }
 _ACEOF
-if ac_fn_c_try_compile "$LINENO"; then :
+if ac_fn_c_try_compile "$LINENO"
+then :
   ac_compiler_gnu=yes
-else
-  ac_compiler_gnu=no
+else case e in #(
+  e) ac_compiler_gnu=no ;;
+esac
 fi
-rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext
 ac_cv_c_compiler_gnu=$ac_compiler_gnu
-
+ ;;
+esac
 fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_c_compiler_gnu" >&5
-$as_echo "$ac_cv_c_compiler_gnu" >&6; }
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_c_compiler_gnu" >&5
+printf "%s\n" "$ac_cv_c_compiler_gnu" >&6; }
+ac_compiler_gnu=$ac_cv_c_compiler_gnu
+
 if test $ac_compiler_gnu = yes; then
   GCC=yes
 else
   GCC=
 fi
-ac_test_CFLAGS=${CFLAGS+set}
+ac_test_CFLAGS=${CFLAGS+y}
 ac_save_CFLAGS=$CFLAGS
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $CC accepts -g" >&5
-$as_echo_n "checking whether $CC accepts -g... " >&6; }
-if ${ac_cv_prog_cc_g+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  ac_save_c_werror_flag=$ac_c_werror_flag
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether $CC accepts -g" >&5
+printf %s "checking whether $CC accepts -g... " >&6; }
+if test ${ac_cv_prog_cc_g+y}
+then :
+  printf %s "(cached) " >&6
+else case e in #(
+  e) ac_save_c_werror_flag=$ac_c_werror_flag
    ac_c_werror_flag=yes
    ac_cv_prog_cc_g=no
    CFLAGS="-g"
@@ -2860,57 +3483,63 @@ else
 /* end confdefs.h.  */
 
 int
-main ()
+main (void)
 {
 
   ;
   return 0;
 }
 _ACEOF
-if ac_fn_c_try_compile "$LINENO"; then :
+if ac_fn_c_try_compile "$LINENO"
+then :
   ac_cv_prog_cc_g=yes
-else
-  CFLAGS=""
+else case e in #(
+  e) CFLAGS=""
       cat confdefs.h - <<_ACEOF >conftest.$ac_ext
 /* end confdefs.h.  */
 
 int
-main ()
+main (void)
 {
 
   ;
   return 0;
 }
 _ACEOF
-if ac_fn_c_try_compile "$LINENO"; then :
+if ac_fn_c_try_compile "$LINENO"
+then :
 
-else
-  ac_c_werror_flag=$ac_save_c_werror_flag
+else case e in #(
+  e) ac_c_werror_flag=$ac_save_c_werror_flag
 	 CFLAGS="-g"
 	 cat confdefs.h - <<_ACEOF >conftest.$ac_ext
 /* end confdefs.h.  */
 
 int
-main ()
+main (void)
 {
 
   ;
   return 0;
 }
 _ACEOF
-if ac_fn_c_try_compile "$LINENO"; then :
+if ac_fn_c_try_compile "$LINENO"
+then :
   ac_cv_prog_cc_g=yes
 fi
-rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext ;;
+esac
 fi
-rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext ;;
+esac
 fi
-rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
-   ac_c_werror_flag=$ac_save_c_werror_flag
+rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext
+   ac_c_werror_flag=$ac_save_c_werror_flag ;;
+esac
 fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_cc_g" >&5
-$as_echo "$ac_cv_prog_cc_g" >&6; }
-if test "$ac_test_CFLAGS" = set; then
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_cc_g" >&5
+printf "%s\n" "$ac_cv_prog_cc_g" >&6; }
+if test $ac_test_CFLAGS; then
   CFLAGS=$ac_save_CFLAGS
 elif test $ac_cv_prog_cc_g = yes; then
   if test "$GCC" = yes; then
@@ -2925,94 +3554,153 @@ else
     CFLAGS=
   fi
 fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $CC option to accept ISO C89" >&5
-$as_echo_n "checking for $CC option to accept ISO C89... " >&6; }
-if ${ac_cv_prog_cc_c89+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  ac_cv_prog_cc_c89=no
+ac_prog_cc_stdc=no
+if test x$ac_prog_cc_stdc = xno
+then :
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for $CC option to enable C11 features" >&5
+printf %s "checking for $CC option to enable C11 features... " >&6; }
+if test ${ac_cv_prog_cc_c11+y}
+then :
+  printf %s "(cached) " >&6
+else case e in #(
+  e) ac_cv_prog_cc_c11=no
 ac_save_CC=$CC
 cat confdefs.h - <<_ACEOF >conftest.$ac_ext
 /* end confdefs.h.  */
-#include <stdarg.h>
-#include <stdio.h>
-struct stat;
-/* Most of the following tests are stolen from RCS 5.7's src/conf.sh.  */
-struct buf { int x; };
-FILE * (*rcsopen) (struct buf *, struct stat *, int);
-static char *e (p, i)
-     char **p;
-     int i;
-{
-  return p[i];
-}
-static char *f (char * (*g) (char **, int), char **p, ...)
-{
-  char *s;
-  va_list v;
-  va_start (v,p);
-  s = g (p, va_arg (v,int));
-  va_end (v);
-  return s;
-}
-
-/* OSF 4.0 Compaq cc is some sort of almost-ANSI by default.  It has
-   function prototypes and stuff, but not '\xHH' hex character constants.
-   These don't provoke an error unfortunately, instead are silently treated
-   as 'x'.  The following induces an error, until -std is added to get
-   proper ANSI mode.  Curiously '\x00'!='x' always comes out true, for an
-   array size at least.  It's necessary to write '\x00'==0 to get something
-   that's true only with -std.  */
-int osf4_cc_array ['\x00' == 0 ? 1 : -1];
+$ac_c_conftest_c11_program
+_ACEOF
+for ac_arg in '' -std=gnu11
+do
+  CC="$ac_save_CC $ac_arg"
+  if ac_fn_c_try_compile "$LINENO"
+then :
+  ac_cv_prog_cc_c11=$ac_arg
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.beam
+  test "x$ac_cv_prog_cc_c11" != "xno" && break
+done
+rm -f conftest.$ac_ext
+CC=$ac_save_CC ;;
+esac
+fi
 
-/* IBM C 6 for AIX is almost-ANSI by default, but it replaces macro parameters
-   inside strings and character constants.  */
-#define FOO(x) 'x'
-int xlc6_cc_array[FOO(a) == 'x' ? 1 : -1];
+if test "x$ac_cv_prog_cc_c11" = xno
+then :
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: unsupported" >&5
+printf "%s\n" "unsupported" >&6; }
+else case e in #(
+  e) if test "x$ac_cv_prog_cc_c11" = x
+then :
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: none needed" >&5
+printf "%s\n" "none needed" >&6; }
+else case e in #(
+  e) { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_cc_c11" >&5
+printf "%s\n" "$ac_cv_prog_cc_c11" >&6; }
+     CC="$CC $ac_cv_prog_cc_c11" ;;
+esac
+fi
+  ac_cv_prog_cc_stdc=$ac_cv_prog_cc_c11
+  ac_prog_cc_stdc=c11 ;;
+esac
+fi
+fi
+if test x$ac_prog_cc_stdc = xno
+then :
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for $CC option to enable C99 features" >&5
+printf %s "checking for $CC option to enable C99 features... " >&6; }
+if test ${ac_cv_prog_cc_c99+y}
+then :
+  printf %s "(cached) " >&6
+else case e in #(
+  e) ac_cv_prog_cc_c99=no
+ac_save_CC=$CC
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+$ac_c_conftest_c99_program
+_ACEOF
+for ac_arg in '' -std=gnu99 -std=c99 -c99 -qlanglvl=extc1x -qlanglvl=extc99 -AC99 -D_STDC_C99=
+do
+  CC="$ac_save_CC $ac_arg"
+  if ac_fn_c_try_compile "$LINENO"
+then :
+  ac_cv_prog_cc_c99=$ac_arg
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.beam
+  test "x$ac_cv_prog_cc_c99" != "xno" && break
+done
+rm -f conftest.$ac_ext
+CC=$ac_save_CC ;;
+esac
+fi
 
-int test (int i, double x);
-struct s1 {int (*f) (int a);};
-struct s2 {int (*f) (double a);};
-int pairnames (int, char **, FILE *(*)(struct buf *, struct stat *, int), int, int);
-int argc;
-char **argv;
-int
-main ()
-{
-return f (e, argv, 0) != argv[0]  ||  f (e, argv, 1) != argv[1];
-  ;
-  return 0;
-}
+if test "x$ac_cv_prog_cc_c99" = xno
+then :
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: unsupported" >&5
+printf "%s\n" "unsupported" >&6; }
+else case e in #(
+  e) if test "x$ac_cv_prog_cc_c99" = x
+then :
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: none needed" >&5
+printf "%s\n" "none needed" >&6; }
+else case e in #(
+  e) { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_cc_c99" >&5
+printf "%s\n" "$ac_cv_prog_cc_c99" >&6; }
+     CC="$CC $ac_cv_prog_cc_c99" ;;
+esac
+fi
+  ac_cv_prog_cc_stdc=$ac_cv_prog_cc_c99
+  ac_prog_cc_stdc=c99 ;;
+esac
+fi
+fi
+if test x$ac_prog_cc_stdc = xno
+then :
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for $CC option to enable C89 features" >&5
+printf %s "checking for $CC option to enable C89 features... " >&6; }
+if test ${ac_cv_prog_cc_c89+y}
+then :
+  printf %s "(cached) " >&6
+else case e in #(
+  e) ac_cv_prog_cc_c89=no
+ac_save_CC=$CC
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+$ac_c_conftest_c89_program
 _ACEOF
-for ac_arg in '' -qlanglvl=extc89 -qlanglvl=ansi -std \
-	-Ae "-Aa -D_HPUX_SOURCE" "-Xc -D__EXTENSIONS__"
+for ac_arg in '' -qlanglvl=extc89 -qlanglvl=ansi -std -Ae "-Aa -D_HPUX_SOURCE" "-Xc -D__EXTENSIONS__"
 do
   CC="$ac_save_CC $ac_arg"
-  if ac_fn_c_try_compile "$LINENO"; then :
+  if ac_fn_c_try_compile "$LINENO"
+then :
   ac_cv_prog_cc_c89=$ac_arg
 fi
-rm -f core conftest.err conftest.$ac_objext
+rm -f core conftest.err conftest.$ac_objext conftest.beam
   test "x$ac_cv_prog_cc_c89" != "xno" && break
 done
 rm -f conftest.$ac_ext
-CC=$ac_save_CC
+CC=$ac_save_CC ;;
+esac
+fi
 
+if test "x$ac_cv_prog_cc_c89" = xno
+then :
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: unsupported" >&5
+printf "%s\n" "unsupported" >&6; }
+else case e in #(
+  e) if test "x$ac_cv_prog_cc_c89" = x
+then :
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: none needed" >&5
+printf "%s\n" "none needed" >&6; }
+else case e in #(
+  e) { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_cc_c89" >&5
+printf "%s\n" "$ac_cv_prog_cc_c89" >&6; }
+     CC="$CC $ac_cv_prog_cc_c89" ;;
+esac
 fi
-# AC_CACHE_VAL
-case "x$ac_cv_prog_cc_c89" in
-  x)
-    { $as_echo "$as_me:${as_lineno-$LINENO}: result: none needed" >&5
-$as_echo "none needed" >&6; } ;;
-  xno)
-    { $as_echo "$as_me:${as_lineno-$LINENO}: result: unsupported" >&5
-$as_echo "unsupported" >&6; } ;;
-  *)
-    CC="$CC $ac_cv_prog_cc_c89"
-    { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_cc_c89" >&5
-$as_echo "$ac_cv_prog_cc_c89" >&6; } ;;
+  ac_cv_prog_cc_stdc=$ac_cv_prog_cc_c89
+  ac_prog_cc_stdc=c89 ;;
 esac
-if test "x$ac_cv_prog_cc_c89" != xno; then :
-
+fi
 fi
 
 ac_ext=c
@@ -3024,38 +3712,44 @@ ac_compiler_gnu=$ac_cv_c_compiler_gnu
 if test -n "$ac_tool_prefix"; then
   # Extract the first word of "${ac_tool_prefix}ranlib", so it can be a program name with args.
 set dummy ${ac_tool_prefix}ranlib; ac_word=$2
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
-$as_echo_n "checking for $ac_word... " >&6; }
-if ${ac_cv_prog_RANLIB+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  if test -n "$RANLIB"; then
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+printf %s "checking for $ac_word... " >&6; }
+if test ${ac_cv_prog_RANLIB+y}
+then :
+  printf %s "(cached) " >&6
+else case e in #(
+  e) if test -n "$RANLIB"; then
   ac_cv_prog_RANLIB="$RANLIB" # Let the user override the test.
 else
 as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
 for as_dir in $PATH
 do
   IFS=$as_save_IFS
-  test -z "$as_dir" && as_dir=.
+  case $as_dir in #(((
+    '') as_dir=./ ;;
+    */) ;;
+    *) as_dir=$as_dir/ ;;
+  esac
     for ac_exec_ext in '' $ac_executable_extensions; do
-  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+  if as_fn_executable_p "$as_dir$ac_word$ac_exec_ext"; then
     ac_cv_prog_RANLIB="${ac_tool_prefix}ranlib"
-    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    printf "%s\n" "$as_me:${as_lineno-$LINENO}: found $as_dir$ac_word$ac_exec_ext" >&5
     break 2
   fi
 done
   done
 IFS=$as_save_IFS
 
-fi
+fi ;;
+esac
 fi
 RANLIB=$ac_cv_prog_RANLIB
 if test -n "$RANLIB"; then
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $RANLIB" >&5
-$as_echo "$RANLIB" >&6; }
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $RANLIB" >&5
+printf "%s\n" "$RANLIB" >&6; }
 else
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
+printf "%s\n" "no" >&6; }
 fi
 
 
@@ -3064,38 +3758,44 @@ if test -z "$ac_cv_prog_RANLIB"; then
   ac_ct_RANLIB=$RANLIB
   # Extract the first word of "ranlib", so it can be a program name with args.
 set dummy ranlib; ac_word=$2
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
-$as_echo_n "checking for $ac_word... " >&6; }
-if ${ac_cv_prog_ac_ct_RANLIB+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  if test -n "$ac_ct_RANLIB"; then
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+printf %s "checking for $ac_word... " >&6; }
+if test ${ac_cv_prog_ac_ct_RANLIB+y}
+then :
+  printf %s "(cached) " >&6
+else case e in #(
+  e) if test -n "$ac_ct_RANLIB"; then
   ac_cv_prog_ac_ct_RANLIB="$ac_ct_RANLIB" # Let the user override the test.
 else
 as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
 for as_dir in $PATH
 do
   IFS=$as_save_IFS
-  test -z "$as_dir" && as_dir=.
+  case $as_dir in #(((
+    '') as_dir=./ ;;
+    */) ;;
+    *) as_dir=$as_dir/ ;;
+  esac
     for ac_exec_ext in '' $ac_executable_extensions; do
-  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+  if as_fn_executable_p "$as_dir$ac_word$ac_exec_ext"; then
     ac_cv_prog_ac_ct_RANLIB="ranlib"
-    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    printf "%s\n" "$as_me:${as_lineno-$LINENO}: found $as_dir$ac_word$ac_exec_ext" >&5
     break 2
   fi
 done
   done
 IFS=$as_save_IFS
 
-fi
+fi ;;
+esac
 fi
 ac_ct_RANLIB=$ac_cv_prog_ac_ct_RANLIB
 if test -n "$ac_ct_RANLIB"; then
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_RANLIB" >&5
-$as_echo "$ac_ct_RANLIB" >&6; }
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_ct_RANLIB" >&5
+printf "%s\n" "$ac_ct_RANLIB" >&6; }
 else
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
+printf "%s\n" "no" >&6; }
 fi
 
   if test "x$ac_ct_RANLIB" = x; then
@@ -3103,8 +3803,8 @@ fi
   else
     case $cross_compiling:$ac_tool_warned in
 yes:)
-{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5
-$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;}
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5
+printf "%s\n" "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;}
 ac_tool_warned=yes ;;
 esac
     RANLIB=$ac_ct_RANLIB
@@ -3114,37 +3814,44 @@ else
 fi
 
 
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for grep that handles long lines and -e" >&5
-$as_echo_n "checking for grep that handles long lines and -e... " >&6; }
-if ${ac_cv_path_GREP+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  if test -z "$GREP"; then
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for grep that handles long lines and -e" >&5
+printf %s "checking for grep that handles long lines and -e... " >&6; }
+if test ${ac_cv_path_GREP+y}
+then :
+  printf %s "(cached) " >&6
+else case e in #(
+  e) if test -z "$GREP"; then
   ac_path_GREP_found=false
   # Loop through the user's path and test for each of PROGNAME-LIST
   as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
 for as_dir in $PATH$PATH_SEPARATOR/usr/xpg4/bin
 do
   IFS=$as_save_IFS
-  test -z "$as_dir" && as_dir=.
-    for ac_prog in grep ggrep; do
+  case $as_dir in #(((
+    '') as_dir=./ ;;
+    */) ;;
+    *) as_dir=$as_dir/ ;;
+  esac
+    for ac_prog in grep ggrep
+   do
     for ac_exec_ext in '' $ac_executable_extensions; do
-      ac_path_GREP="$as_dir/$ac_prog$ac_exec_ext"
+      ac_path_GREP="$as_dir$ac_prog$ac_exec_ext"
       as_fn_executable_p "$ac_path_GREP" || continue
 # Check for GNU ac_path_GREP and select it if it is found.
   # Check for GNU $ac_path_GREP
-case `"$ac_path_GREP" --version 2>&1` in
+case `"$ac_path_GREP" --version 2>&1` in #(
 *GNU*)
   ac_cv_path_GREP="$ac_path_GREP" ac_path_GREP_found=:;;
+#(
 *)
   ac_count=0
-  $as_echo_n 0123456789 >"conftest.in"
+  printf %s 0123456789 >"conftest.in"
   while :
   do
     cat "conftest.in" "conftest.in" >"conftest.tmp"
     mv "conftest.tmp" "conftest.in"
     cp "conftest.in" "conftest.nl"
-    $as_echo 'GREP' >> "conftest.nl"
+    printf "%s\n" 'GREP' >> "conftest.nl"
     "$ac_path_GREP" -e 'GREP$' -e '-(cannot match)-' < "conftest.nl" >"conftest.out" 2>/dev/null || break
     diff "conftest.out" "conftest.nl" >/dev/null 2>&1 || break
     as_fn_arith $ac_count + 1 && ac_count=$as_val
@@ -3170,40 +3877,48 @@ IFS=$as_save_IFS
 else
   ac_cv_path_GREP=$GREP
 fi
-
+ ;;
+esac
 fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_path_GREP" >&5
-$as_echo "$ac_cv_path_GREP" >&6; }
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_path_GREP" >&5
+printf "%s\n" "$ac_cv_path_GREP" >&6; }
  GREP="$ac_cv_path_GREP"
 
 
 
   # Check whether --enable-warnings was given.
-if test "${enable_warnings+set}" = set; then :
+if test ${enable_warnings+y}
+then :
   enableval=$enable_warnings;
-else
-  enable_warnings=yes
+else case e in #(
+  e) enable_warnings=yes ;;
+esac
 fi
 
 
-  if test "x$enable_warnings" != xno; then :
+  if test "x$enable_warnings" != xno
+then :
 
 
 
     ansi=""
-    if test "x$ansi" = "x"; then :
+    if test "x$ansi" = "x"
+then :
   msg="for C compiler warning flags"
-else
-  msg="for C compiler warning and ANSI conformance flags"
+else case e in #(
+  e) msg="for C compiler warning and ANSI conformance flags" ;;
+esac
 fi
 
-    { $as_echo "$as_me:${as_lineno-$LINENO}: checking $msg" >&5
-$as_echo_n "checking $msg... " >&6; }
-    if ${hts_cv_prog_cc_warnings+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-        hts_cv_prog_cc_warnings=""
-      if test "x$CC" != "x"; then :
+    { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking $msg" >&5
+printf %s "checking $msg... " >&6; }
+    if test ${hts_cv_prog_cc_warnings+y}
+then :
+  printf %s "(cached) " >&6
+else case e in #(
+  e)       hts_cv_prog_cc_warnings=""
+      if test "x$CC" != "x"
+then :
 
         cat > conftest.c <<EOF
 int main(int argc, char **argv) { return 0; }
@@ -3215,103 +3930,130 @@ EOF
         # GCC compatible
         if test "x$GCC" = "xyes" &&
                "$CC" -c -Wall conftest.c > /dev/null 2>&1 &&
-               test -f conftest.o; then :
-            if test "x$ansi" = "x"; then :
+               test -f conftest.o
+then :
+            if test "x$ansi" = "x"
+then :
   hts_cv_prog_cc_warnings="-Wall"
-else
-  hts_cv_prog_cc_warnings="-Wall -ansi -pedantic"
+else case e in #(
+  e) hts_cv_prog_cc_warnings="-Wall -ansi -pedantic" ;;
+esac
 fi
 
 elif # Sun Studio or Solaris C compiler
         "$CC" -V 2>&1 | $GREP -i -E "WorkShop|Sun C" > /dev/null 2>&1 &&
          "$CC" -c -v -Xc conftest.c > /dev/null 2>&1 &&
-         test -f conftest.o; then :
-          if test "x$ansi" = "x"; then :
+         test -f conftest.o
+then :
+          if test "x$ansi" = "x"
+then :
   hts_cv_prog_cc_warnings="-v"
-else
-  hts_cv_prog_cc_warnings="-v -Xc"
+else case e in #(
+  e) hts_cv_prog_cc_warnings="-v -Xc" ;;
+esac
 fi
 
 elif # Digital Unix C compiler
         "$CC" -V 2>&1 | $GREP -i "Digital UNIX Compiler" > /dev/null 2>&1 &&
          "$CC" -c -verbose -w0 -warnprotos -std1 conftest.c > /dev/null 2>&1 &&
-         test -f conftest.o; then :
-             if test "x$ansi" = "x"; then :
+         test -f conftest.o
+then :
+             if test "x$ansi" = "x"
+then :
   hts_cv_prog_cc_warnings="-verbose -w0 -warnprotos"
-else
-  hts_cv_prog_cc_warnings="-verbose -w0 -warnprotos -std1"
+else case e in #(
+  e) hts_cv_prog_cc_warnings="-verbose -w0 -warnprotos -std1" ;;
+esac
 fi
 
 elif # C for AIX Compiler
         "$CC" 2>&1 | $GREP -i "C for AIX Compiler" > /dev/null 2>&1 &&
          "$CC" -c -qlanglvl=ansi -qinfo=all conftest.c > /dev/null 2>&1 &&
-         test -f conftest.o; then :
-          if test "x$ansi" = "x"; then :
+         test -f conftest.o
+then :
+          if test "x$ansi" = "x"
+then :
   hts_cv_prog_cc_warnings="-qsrcmsg -qinfo=all:noppt:noppc:noobs:nocnd"
-else
-  hts_cv_prog_cc_warnings="-qsrcmsg -qinfo=all:noppt:noppc:noobs:nocnd -qlanglvl=ansi"
+else case e in #(
+  e) hts_cv_prog_cc_warnings="-qsrcmsg -qinfo=all:noppt:noppc:noobs:nocnd -qlanglvl=ansi" ;;
+esac
 fi
 
 elif # IRIX C compiler
         "$CC" -version 2>&1 | $GREP -i "MIPSpro Compilers" > /dev/null 2>&1 &&
          "$CC" -c -fullwarn -ansi -ansiE conftest.c > /dev/null 2>&1 &&
-         test -f conftest.o; then :
-             if test "x$ansi" = "x"; then :
+         test -f conftest.o
+then :
+             if test "x$ansi" = "x"
+then :
   hts_cv_prog_cc_warnings="-fullwarn"
-else
-  hts_cv_prog_cc_warnings="-fullwarn -ansi -ansiE"
+else case e in #(
+  e) hts_cv_prog_cc_warnings="-fullwarn -ansi -ansiE" ;;
+esac
 fi
 
 elif # HP-UX C compiler
         what "$CC" 2>&1 | $GREP -i "HP C Compiler" > /dev/null 2>&1 &&
          "$CC" -c -Aa +w1 conftest.c > /dev/null 2>&1 &&
-         test -f conftest.o; then :
-          if test "x$ansi" = "x"; then :
+         test -f conftest.o
+then :
+          if test "x$ansi" = "x"
+then :
   hts_cv_prog_cc_warnings="+w1"
-else
-  hts_cv_prog_cc_warnings="+w1 -Aa"
+else case e in #(
+  e) hts_cv_prog_cc_warnings="+w1 -Aa" ;;
+esac
 fi
 
 elif # The NEC SX series (Super-UX 10) C compiler
         "$CC" -V 2>&1 | $GREP "/SX" > /dev/null 2>&1 &&
          "$CC" -c -pvctl,fullmsg -Xc conftest.c > /dev/null 2>&1 &&
-         test -f conftest.o; then :
+         test -f conftest.o
+then :
 
-        if test "x$ansi" = "x"; then :
+        if test "x$ansi" = "x"
+then :
   hts_cv_prog_cc_warnings="-pvctl,fullmsg"
-else
-  hts_cv_prog_cc_warnings="-pvctl,fullmsg -Xc"
+else case e in #(
+  e) hts_cv_prog_cc_warnings="-pvctl,fullmsg -Xc" ;;
+esac
 fi
 
 elif # The Cray C compiler (Unicos)
         "$CC" -V 2>&1 | $GREP -i "Cray" > /dev/null 2>&1 &&
          "$CC" -c -h msglevel_2 conftest.c > /dev/null 2>&1 &&
-         test -f conftest.o; then :
-          if test "x$ansi" = "x"; then :
+         test -f conftest.o
+then :
+          if test "x$ansi" = "x"
+then :
   hts_cv_prog_cc_warnings="-h#msglevel_2"
-else
-  hts_cv_prog_cc_warnings="-h#msglevel_2,conform"
+else case e in #(
+  e) hts_cv_prog_cc_warnings="-h#msglevel_2,conform" ;;
+esac
 fi
 
 elif # The Tiny C Compiler
         "$CC" -v 2>&1 | $GREP "tcc version" > /dev/null &&
          "$CC" -Wall -c conftest.c > /dev/null 2>&1 &&
-         test -f conftest.o; then :
+         test -f conftest.o
+then :
            hts_cv_prog_cc_warnings="-Wall"
 
 fi
         rm -f conftest.*
 
 fi
-
+     ;;
+esac
 fi
 
 
-    if test "x$hts_cv_prog_cc_warnings" != "x"; then :
+    if test "x$hts_cv_prog_cc_warnings" != "x"
+then :
 
 ac_arg_result=`echo "$hts_cv_prog_cc_warnings" | tr '#' ' '`
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_arg_result" >&5
-$as_echo "$ac_arg_result" >&6; }
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_arg_result" >&5
+printf "%s\n" "$ac_arg_result" >&6; }
 
 ac_arg_needed=""
 for ac_arg in $hts_cv_prog_cc_warnings
@@ -3325,32 +4067,38 @@ do
 esac
 done
 CFLAGS="$ac_arg_needed $CFLAGS"
-else
-        { $as_echo "$as_me:${as_lineno-$LINENO}: result: unknown" >&5
-$as_echo "unknown" >&6; }
-
+else case e in #(
+  e)       { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: unknown" >&5
+printf "%s\n" "unknown" >&6; }
+     ;;
+esac
 fi
 
 fi
 
 
   # Check whether --enable-werror was given.
-if test "${enable_werror+set}" = set; then :
+if test ${enable_werror+y}
+then :
   enableval=$enable_werror;
-else
-  enable_werror=no
+else case e in #(
+  e) enable_werror=no ;;
+esac
 fi
 
 
-  if test "x$enable_werror" != xno; then :
+  if test "x$enable_werror" != xno
+then :
 
-    { $as_echo "$as_me:${as_lineno-$LINENO}: checking for C compiler flags to error on warnings" >&5
-$as_echo_n "checking for C compiler flags to error on warnings... " >&6; }
-    if ${hts_cv_prog_cc_werror+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-        hts_cv_prog_cc_werror=""
-      if test "x$CC" != "x"; then :
+    { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for C compiler flags to error on warnings" >&5
+printf %s "checking for C compiler flags to error on warnings... " >&6; }
+    if test ${hts_cv_prog_cc_werror+y}
+then :
+  printf %s "(cached) " >&6
+else case e in #(
+  e)       hts_cv_prog_cc_werror=""
+      if test "x$CC" != "x"
+then :
 
         cat > conftest.c <<EOF
 int main(int argc, char **argv) { return 0; }
@@ -3360,38 +4108,45 @@ EOF
          # GCC compatible
          test "x$GCC" = "xyes" &&
           "$CC" -c -Werror conftest.c > /dev/null 2>&1 &&
-          test -f conftest.o; then :
+          test -f conftest.o
+then :
   hts_cv_prog_cc_werror="-Werror"
 elif # Sun Studio or Solaris C compiler
          "$CC" -V 2>&1 | $GREP -i -E "WorkShop|Sun C" > /dev/null 2>&1 &&
           "$CC" -c -errwarn=%all conftest.c > /dev/null 2>&1 &&
-          test -f conftest.o; then :
+          test -f conftest.o
+then :
   hts_cv_prog_cc_werror="-errwarn=%all"
 elif # The Tiny C Compiler
          "$CC" -v 2>&1 | $GREP "tcc version" > /dev/null &&
           "$CC" -Wall -c conftest.c > /dev/null 2>&1 &&
-          test -f conftest.o; then :
+          test -f conftest.o
+then :
   hts_cv_prog_cc_werror="-Werror"
 
 fi
         rm -f conftest.*
 
 fi
-
+     ;;
+esac
 fi
 
-    if test "x$hts_cv_prog_cc_werror" != x; then :
+    if test "x$hts_cv_prog_cc_werror" != x
+then :
 
-      { $as_echo "$as_me:${as_lineno-$LINENO}: result: $hts_cv_prog_cc_werror" >&5
-$as_echo "$hts_cv_prog_cc_werror" >&6; }
-      if test "xhts_late_cflags" != x; then :
+      { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $hts_cv_prog_cc_werror" >&5
+printf "%s\n" "$hts_cv_prog_cc_werror" >&6; }
+      if test "xhts_late_cflags" != x
+then :
   eval hts_late_cflags="$hts_cv_prog_cc_werror"
 fi
 
-else
-        { $as_echo "$as_me:${as_lineno-$LINENO}: result: unknown" >&5
-$as_echo "unknown" >&6; }
-
+else case e in #(
+  e)       { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: unknown" >&5
+printf "%s\n" "unknown" >&6; }
+     ;;
+esac
 fi
 
 fi
@@ -3406,532 +4161,183 @@ fi
 
 # Define _XOPEN_SOURCE unless the user has already done so via $CPPFLAGS etc.
 
-ac_ext=c
-ac_cpp='$CPP $CPPFLAGS'
-ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
-ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
-ac_compiler_gnu=$ac_cv_c_compiler_gnu
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking how to run the C preprocessor" >&5
-$as_echo_n "checking how to run the C preprocessor... " >&6; }
-# On Suns, sometimes $CPP names a directory.
-if test -n "$CPP" && test -d "$CPP"; then
-  CPP=
-fi
-if test -z "$CPP"; then
-  if ${ac_cv_prog_CPP+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-      # Double quotes because CPP needs to be expanded
-    for CPP in "$CC -E" "$CC -E -traditional-cpp" "/lib/cpp"
-    do
-      ac_preproc_ok=false
-for ac_c_preproc_warn_flag in '' yes
-do
-  # Use a header file that comes with gcc, so configuring glibc
-  # with a fresh cross-compiler works.
-  # Prefer <limits.h> to <assert.h> if __STDC__ is defined, since
-  # <limits.h> exists even on freestanding compilers.
-  # On the NeXT, cc -E runs the code through the compiler's parser,
-  # not just through cpp. "Syntax error" is here to catch this case.
-  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
-/* end confdefs.h.  */
-#ifdef __STDC__
-# include <limits.h>
-#else
-# include <assert.h>
-#endif
-		     Syntax error
-_ACEOF
-if ac_fn_c_try_cpp "$LINENO"; then :
-
-else
-  # Broken: fails on valid input.
-continue
-fi
-rm -f conftest.err conftest.i conftest.$ac_ext
-
-  # OK, works on sane cases.  Now check whether nonexistent headers
-  # can be detected and how.
-  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
-/* end confdefs.h.  */
-#include <ac_nonexistent.h>
-_ACEOF
-if ac_fn_c_try_cpp "$LINENO"; then :
-  # Broken: success on invalid input.
-continue
-else
-  # Passes both tests.
-ac_preproc_ok=:
-break
-fi
-rm -f conftest.err conftest.i conftest.$ac_ext
-
-done
-# Because of `break', _AC_PREPROC_IFELSE's cleaning code was skipped.
-rm -f conftest.i conftest.err conftest.$ac_ext
-if $ac_preproc_ok; then :
-  break
-fi
-
-    done
-    ac_cv_prog_CPP=$CPP
-
-fi
-  CPP=$ac_cv_prog_CPP
-else
-  ac_cv_prog_CPP=$CPP
-fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $CPP" >&5
-$as_echo "$CPP" >&6; }
-ac_preproc_ok=false
-for ac_c_preproc_warn_flag in '' yes
-do
-  # Use a header file that comes with gcc, so configuring glibc
-  # with a fresh cross-compiler works.
-  # Prefer <limits.h> to <assert.h> if __STDC__ is defined, since
-  # <limits.h> exists even on freestanding compilers.
-  # On the NeXT, cc -E runs the code through the compiler's parser,
-  # not just through cpp. "Syntax error" is here to catch this case.
-  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
-/* end confdefs.h.  */
-#ifdef __STDC__
-# include <limits.h>
-#else
-# include <assert.h>
-#endif
-		     Syntax error
-_ACEOF
-if ac_fn_c_try_cpp "$LINENO"; then :
-
-else
-  # Broken: fails on valid input.
-continue
-fi
-rm -f conftest.err conftest.i conftest.$ac_ext
-
-  # OK, works on sane cases.  Now check whether nonexistent headers
-  # can be detected and how.
-  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
-/* end confdefs.h.  */
-#include <ac_nonexistent.h>
-_ACEOF
-if ac_fn_c_try_cpp "$LINENO"; then :
-  # Broken: success on invalid input.
-continue
-else
-  # Passes both tests.
-ac_preproc_ok=:
-break
-fi
-rm -f conftest.err conftest.i conftest.$ac_ext
-
-done
-# Because of `break', _AC_PREPROC_IFELSE's cleaning code was skipped.
-rm -f conftest.i conftest.err conftest.$ac_ext
-if $ac_preproc_ok; then :
-
-else
-  { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
-$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
-as_fn_error $? "C preprocessor \"$CPP\" fails sanity check
-See \`config.log' for more details" "$LINENO" 5; }
-fi
-
-ac_ext=c
-ac_cpp='$CPP $CPPFLAGS'
-ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
-ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
-ac_compiler_gnu=$ac_cv_c_compiler_gnu
-
-
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for egrep" >&5
-$as_echo_n "checking for egrep... " >&6; }
-if ${ac_cv_path_EGREP+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  if echo a | $GREP -E '(a|b)' >/dev/null 2>&1
-   then ac_cv_path_EGREP="$GREP -E"
-   else
-     if test -z "$EGREP"; then
-  ac_path_EGREP_found=false
-  # Loop through the user's path and test for each of PROGNAME-LIST
-  as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
-for as_dir in $PATH$PATH_SEPARATOR/usr/xpg4/bin
-do
-  IFS=$as_save_IFS
-  test -z "$as_dir" && as_dir=.
-    for ac_prog in egrep; do
-    for ac_exec_ext in '' $ac_executable_extensions; do
-      ac_path_EGREP="$as_dir/$ac_prog$ac_exec_ext"
-      as_fn_executable_p "$ac_path_EGREP" || continue
-# Check for GNU ac_path_EGREP and select it if it is found.
-  # Check for GNU $ac_path_EGREP
-case `"$ac_path_EGREP" --version 2>&1` in
-*GNU*)
-  ac_cv_path_EGREP="$ac_path_EGREP" ac_path_EGREP_found=:;;
-*)
-  ac_count=0
-  $as_echo_n 0123456789 >"conftest.in"
-  while :
-  do
-    cat "conftest.in" "conftest.in" >"conftest.tmp"
-    mv "conftest.tmp" "conftest.in"
-    cp "conftest.in" "conftest.nl"
-    $as_echo 'EGREP' >> "conftest.nl"
-    "$ac_path_EGREP" 'EGREP$' < "conftest.nl" >"conftest.out" 2>/dev/null || break
-    diff "conftest.out" "conftest.nl" >/dev/null 2>&1 || break
-    as_fn_arith $ac_count + 1 && ac_count=$as_val
-    if test $ac_count -gt ${ac_path_EGREP_max-0}; then
-      # Best one so far, save it but keep looking for a better one
-      ac_cv_path_EGREP="$ac_path_EGREP"
-      ac_path_EGREP_max=$ac_count
-    fi
-    # 10*(2^10) chars as input seems more than enough
-    test $ac_count -gt 10 && break
-  done
-  rm -f conftest.in conftest.tmp conftest.nl conftest.out;;
-esac
-
-      $ac_path_EGREP_found && break 3
-    done
-  done
-  done
-IFS=$as_save_IFS
-  if test -z "$ac_cv_path_EGREP"; then
-    as_fn_error $? "no acceptable egrep could be found in $PATH$PATH_SEPARATOR/usr/xpg4/bin" "$LINENO" 5
-  fi
-else
-  ac_cv_path_EGREP=$EGREP
-fi
-
-   fi
-fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_path_EGREP" >&5
-$as_echo "$ac_cv_path_EGREP" >&6; }
- EGREP="$ac_cv_path_EGREP"
-
-
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for ANSI C header files" >&5
-$as_echo_n "checking for ANSI C header files... " >&6; }
-if ${ac_cv_header_stdc+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
-/* end confdefs.h.  */
-#include <stdlib.h>
-#include <stdarg.h>
-#include <string.h>
-#include <float.h>
-
-int
-main ()
-{
-
-  ;
-  return 0;
-}
-_ACEOF
-if ac_fn_c_try_compile "$LINENO"; then :
-  ac_cv_header_stdc=yes
-else
-  ac_cv_header_stdc=no
-fi
-rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
-
-if test $ac_cv_header_stdc = yes; then
-  # SunOS 4.x string.h does not declare mem*, contrary to ANSI.
-  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
-/* end confdefs.h.  */
-#include <string.h>
-
-_ACEOF
-if (eval "$ac_cpp conftest.$ac_ext") 2>&5 |
-  $EGREP "memchr" >/dev/null 2>&1; then :
-
-else
-  ac_cv_header_stdc=no
-fi
-rm -f conftest*
-
-fi
-
-if test $ac_cv_header_stdc = yes; then
-  # ISC 2.0.2 stdlib.h does not declare free, contrary to ANSI.
-  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
-/* end confdefs.h.  */
-#include <stdlib.h>
-
-_ACEOF
-if (eval "$ac_cpp conftest.$ac_ext") 2>&5 |
-  $EGREP "free" >/dev/null 2>&1; then :
-
-else
-  ac_cv_header_stdc=no
-fi
-rm -f conftest*
-
-fi
-
-if test $ac_cv_header_stdc = yes; then
-  # /bin/cc in Irix-4.0.5 gets non-ANSI ctype macros unless using -ansi.
-  if test "$cross_compiling" = yes; then :
-  :
-else
-  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
-/* end confdefs.h.  */
-#include <ctype.h>
-#include <stdlib.h>
-#if ((' ' & 0x0FF) == 0x020)
-# define ISLOWER(c) ('a' <= (c) && (c) <= 'z')
-# define TOUPPER(c) (ISLOWER(c) ? 'A' + ((c) - 'a') : (c))
-#else
-# define ISLOWER(c) \
-		   (('a' <= (c) && (c) <= 'i') \
-		     || ('j' <= (c) && (c) <= 'r') \
-		     || ('s' <= (c) && (c) <= 'z'))
-# define TOUPPER(c) (ISLOWER(c) ? ((c) | 0x40) : (c))
-#endif
-
-#define XOR(e, f) (((e) && !(f)) || (!(e) && (f)))
-int
-main ()
-{
-  int i;
-  for (i = 0; i < 256; i++)
-    if (XOR (islower (i), ISLOWER (i))
-	|| toupper (i) != TOUPPER (i))
-      return 2;
-  return 0;
-}
-_ACEOF
-if ac_fn_c_try_run "$LINENO"; then :
-
-else
-  ac_cv_header_stdc=no
-fi
-rm -f core *.core core.conftest.* gmon.out bb.out conftest$ac_exeext \
-  conftest.$ac_objext conftest.beam conftest.$ac_ext
-fi
-
-fi
-fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_header_stdc" >&5
-$as_echo "$ac_cv_header_stdc" >&6; }
-if test $ac_cv_header_stdc = yes; then
-
-$as_echo "#define STDC_HEADERS 1" >>confdefs.h
-
-fi
-
-# On IRIX 5.3, sys/types and inttypes.h are conflicting.
-for ac_header in sys/types.h sys/stat.h stdlib.h string.h memory.h strings.h \
-		  inttypes.h stdint.h unistd.h
-do :
-  as_ac_Header=`$as_echo "ac_cv_header_$ac_header" | $as_tr_sh`
-ac_fn_c_check_header_compile "$LINENO" "$ac_header" "$as_ac_Header" "$ac_includes_default
-"
-if eval test \"x\$"$as_ac_Header"\" = x"yes"; then :
-  cat >>confdefs.h <<_ACEOF
-#define `$as_echo "HAVE_$ac_header" | $as_tr_cpp` 1
-_ACEOF
-
-fi
-
-done
-
-
-ac_fn_c_check_decl "$LINENO" "_XOPEN_SOURCE" "ac_cv_have_decl__XOPEN_SOURCE" "$ac_includes_default"
-if test "x$ac_cv_have_decl__XOPEN_SOURCE" = xyes; then :
-
-else
-
-$as_echo "#define _XOPEN_SOURCE 600" >>confdefs.h
-
-fi
-
-
-
-hts_cflags_sse4=""
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking C compiler flags needed for ssse3" >&5
-$as_echo_n "checking C compiler flags needed for ssse3... " >&6; }
-if ${hts_cv_check_cflags_needed_ssse3___mssse3+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-
-  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for $CC options needed to detect all undeclared functions" >&5
+printf %s "checking for $CC options needed to detect all undeclared functions... " >&6; }
+if test ${ac_cv_c_undeclared_builtin_options+y}
+then :
+  printf %s "(cached) " >&6
+else case e in #(
+  e) ac_save_CFLAGS=$CFLAGS
+   ac_cv_c_undeclared_builtin_options='cannot detect'
+   for ac_arg in '' -fno-builtin; do
+     CFLAGS="$ac_save_CFLAGS $ac_arg"
+     # This test program should *not* compile successfully.
+     cat confdefs.h - <<_ACEOF >conftest.$ac_ext
 /* end confdefs.h.  */
 
-    #ifdef __x86_64__
-    #include "x86intrin.h"
-    #endif
-
 int
-main ()
+main (void)
 {
-
-    #ifdef __x86_64__
-    __m128i a = _mm_set_epi32(1, 2, 3, 4), b = _mm_set_epi32(4, 3, 2, 1);
-    __m128i c = _mm_shuffle_epi8(a, b);
-    return *((char *) &c);
-    #endif
-
+(void) strchr;
   ;
   return 0;
 }
 _ACEOF
-if ac_fn_c_try_link "$LINENO"; then :
-  hts_cv_check_cflags_needed_ssse3___mssse3=none
-else
-  ax_check_save_flags=$CFLAGS
-     CFLAGS="$CFLAGS  -mssse3"
-     cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+if ac_fn_c_try_compile "$LINENO"
+then :
+
+else case e in #(
+  e) # This test program should compile successfully.
+        # No library function is consistently available on
+        # freestanding implementations, so test against a dummy
+        # declaration.  Include always-available headers on the
+        # off chance that they somehow elicit warnings.
+        cat confdefs.h - <<_ACEOF >conftest.$ac_ext
 /* end confdefs.h.  */
-
-    #ifdef __x86_64__
-    #include "x86intrin.h"
-    #endif
+#include <float.h>
+#include <limits.h>
+#include <stdarg.h>
+#include <stddef.h>
+extern void ac_decl (int, char *);
 
 int
-main ()
+main (void)
 {
-
-    #ifdef __x86_64__
-    __m128i a = _mm_set_epi32(1, 2, 3, 4), b = _mm_set_epi32(4, 3, 2, 1);
-    __m128i c = _mm_shuffle_epi8(a, b);
-    return *((char *) &c);
-    #endif
+(void) ac_decl (0, (char *) 0);
+  (void) ac_decl;
 
   ;
   return 0;
 }
 _ACEOF
-if ac_fn_c_try_link "$LINENO"; then :
-  hts_cv_check_cflags_needed_ssse3___mssse3=-mssse3
-else
-  hts_cv_check_cflags_needed_ssse3___mssse3=unsupported
-fi
-rm -f core conftest.err conftest.$ac_objext \
-    conftest$ac_exeext conftest.$ac_ext
-     CFLAGS=$ax_check_save_flags
+if ac_fn_c_try_compile "$LINENO"
+then :
+  if test x"$ac_arg" = x
+then :
+  ac_cv_c_undeclared_builtin_options='none needed'
+else case e in #(
+  e) ac_cv_c_undeclared_builtin_options=$ac_arg ;;
+esac
 fi
-rm -f core conftest.err conftest.$ac_objext \
-    conftest$ac_exeext conftest.$ac_ext
+          break
 fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $hts_cv_check_cflags_needed_ssse3___mssse3" >&5
-$as_echo "$hts_cv_check_cflags_needed_ssse3___mssse3" >&6; }
-if test "x$hts_cv_check_cflags_needed_ssse3___mssse3" = xunsupported; then :
-
-  :
-
-else
-
-  if test "x$hts_cv_check_cflags_needed_ssse3___mssse3" = xnone; then :
-  flags_needed=""
-else
-  flags_needed="$hts_cv_check_cflags_needed_ssse3___mssse3"
+rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext ;;
+esac
 fi
-
-  hts_cflags_sse4="$flags_needed $hts_cflags_sse4"
-
-$as_echo "#define HAVE_SSSE3 1" >>confdefs.h
-
-
-
+rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext
+    done
+    CFLAGS=$ac_save_CFLAGS
+   ;;
+esac
 fi
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_c_undeclared_builtin_options" >&5
+printf "%s\n" "$ac_cv_c_undeclared_builtin_options" >&6; }
+  case $ac_cv_c_undeclared_builtin_options in #(
+  'cannot detect') :
+    { { printf "%s\n" "$as_me:${as_lineno-$LINENO}: error: in '$ac_pwd':" >&5
+printf "%s\n" "$as_me: error: in '$ac_pwd':" >&2;}
+as_fn_error $? "cannot make $CC report undeclared builtins
+See 'config.log' for more details" "$LINENO" 5; } ;; #(
+  'none needed') :
+    ac_c_undeclared_builtin_options='' ;; #(
+  *) :
+    ac_c_undeclared_builtin_options=$ac_cv_c_undeclared_builtin_options ;;
+esac
 
+ac_header= ac_cache=
+for ac_item in $ac_header_c_list
+do
+  if test $ac_cache; then
+    ac_fn_c_check_header_compile "$LINENO" $ac_header ac_cv_header_$ac_cache "$ac_includes_default"
+    if eval test \"x\$ac_cv_header_$ac_cache\" = xyes; then
+      printf "%s\n" "#define $ac_item 1" >> confdefs.h
+    fi
+    ac_header= ac_cache=
+  elif test $ac_header; then
+    ac_cache=$ac_item
+  else
+    ac_header=$ac_item
+  fi
+done
 
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking C compiler flags needed for popcnt" >&5
-$as_echo_n "checking C compiler flags needed for popcnt... " >&6; }
-if ${hts_cv_check_cflags_needed_popcnt___mpopcnt+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
 
-  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
-/* end confdefs.h.  */
 
-    #ifdef __x86_64__
-    #include "x86intrin.h"
-    #endif
 
-int
-main ()
-{
 
-    #ifdef __x86_64__
-    unsigned int i = _mm_popcnt_u32(1);
-    return i != 1;
-    #endif
 
-  ;
-  return 0;
-}
-_ACEOF
-if ac_fn_c_try_link "$LINENO"; then :
-  hts_cv_check_cflags_needed_popcnt___mpopcnt=none
-else
-  ax_check_save_flags=$CFLAGS
-     CFLAGS="$CFLAGS  -mpopcnt"
-     cat confdefs.h - <<_ACEOF >conftest.$ac_ext
-/* end confdefs.h.  */
 
-    #ifdef __x86_64__
-    #include "x86intrin.h"
-    #endif
 
-int
-main ()
-{
+if test $ac_cv_header_stdlib_h = yes && test $ac_cv_header_string_h = yes
+then :
 
-    #ifdef __x86_64__
-    unsigned int i = _mm_popcnt_u32(1);
-    return i != 1;
-    #endif
+printf "%s\n" "#define STDC_HEADERS 1" >>confdefs.h
 
-  ;
-  return 0;
-}
-_ACEOF
-if ac_fn_c_try_link "$LINENO"; then :
-  hts_cv_check_cflags_needed_popcnt___mpopcnt=-mpopcnt
-else
-  hts_cv_check_cflags_needed_popcnt___mpopcnt=unsupported
-fi
-rm -f core conftest.err conftest.$ac_objext \
-    conftest$ac_exeext conftest.$ac_ext
-     CFLAGS=$ax_check_save_flags
 fi
-rm -f core conftest.err conftest.$ac_objext \
-    conftest$ac_exeext conftest.$ac_ext
-fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $hts_cv_check_cflags_needed_popcnt___mpopcnt" >&5
-$as_echo "$hts_cv_check_cflags_needed_popcnt___mpopcnt" >&6; }
-if test "x$hts_cv_check_cflags_needed_popcnt___mpopcnt" = xunsupported; then :
-
-  :
-
-else
-
-  if test "x$hts_cv_check_cflags_needed_popcnt___mpopcnt" = xnone; then :
-  flags_needed=""
-else
-  flags_needed="$hts_cv_check_cflags_needed_popcnt___mpopcnt"
+ac_fn_check_decl "$LINENO" "_XOPEN_SOURCE" "ac_cv_have_decl__XOPEN_SOURCE" "$ac_includes_default" "$ac_c_undeclared_builtin_options" "CFLAGS"
+if test "x$ac_cv_have_decl__XOPEN_SOURCE" = xyes
+then :
+
+else case e in #(
+  e)
+printf "%s\n" "#define _XOPEN_SOURCE 600" >>confdefs.h
+ ;;
+esac
 fi
 
-  hts_cflags_sse4="$flags_needed $hts_cflags_sse4"
+ac_fn_check_decl "$LINENO" "__get_cpuid_max" "ac_cv_have_decl___get_cpuid_max" "#include <cpuid.h>
+" "$ac_c_undeclared_builtin_options" "CFLAGS"
+if test "x$ac_cv_have_decl___get_cpuid_max" = xyes
+then :
+  ac_have_decl=1
+else case e in #(
+  e) ac_have_decl=0 ;;
+esac
+fi
+printf "%s\n" "#define HAVE_DECL___GET_CPUID_MAX $ac_have_decl" >>confdefs.h
+if test $ac_have_decl = 1
+then :
 
-$as_echo "#define HAVE_POPCNT 1" >>confdefs.h
+   hts_have_cpuid=yes
 
+else case e in #(
+  e)
+   hts_have_cpuid=no
+ ;;
+esac
+fi
+ac_fn_check_decl "$LINENO" "__cpuid_count" "ac_cv_have_decl___cpuid_count" "#include <cpuid.h>
+" "$ac_c_undeclared_builtin_options" "CFLAGS"
+if test "x$ac_cv_have_decl___cpuid_count" = xyes
+then :
+  ac_have_decl=1
+else case e in #(
+  e) ac_have_decl=0 ;;
+esac
+fi
+printf "%s\n" "#define HAVE_DECL___CPUID_COUNT $ac_have_decl" >>confdefs.h
+if test $ac_have_decl = 1
+then :
 
+   hts_have_cpuid=yes
 
+else case e in #(
+  e)
+   hts_have_cpuid=no
+ ;;
+esac
 fi
 
 
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking C compiler flags needed for sse4.1" >&5
-$as_echo_n "checking C compiler flags needed for sse4.1... " >&6; }
-if ${hts_cv_check_cflags_needed_sse4_1___msse4_1+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
+if test "x$hts_have_cpuid" = "xyes"
+then :
 
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking C compiler flags needed for sse4.1" >&5
+printf %s "checking C compiler flags needed for sse4.1... " >&6; }
+if test ${hts_cv_check_cflags_needed_sse4_1___msse4_1__mssse3__mpopcnt+y}
+then :
+  printf %s "(cached) " >&6
+else case e in #(
+  e)
   cat confdefs.h - <<_ACEOF >conftest.$ac_ext
 /* end confdefs.h.  */
 
@@ -3940,24 +4346,25 @@ else
     #endif
 
 int
-main ()
+main (void)
 {
 
     #ifdef __x86_64__
     __m128i a = _mm_set_epi32(1, 2, 3, 4), b = _mm_set_epi32(4, 3, 2, 1);
-    __m128i c = _mm_max_epu32(a, b);
-    return *((char *) &c);
+    __m128i c = _mm_shuffle_epi8(_mm_max_epu32(a, b), b);
+    return _mm_popcnt_u32(*((char *) &c));
     #endif
 
   ;
   return 0;
 }
 _ACEOF
-if ac_fn_c_try_link "$LINENO"; then :
-  hts_cv_check_cflags_needed_sse4_1___msse4_1=none
-else
-  ax_check_save_flags=$CFLAGS
-     CFLAGS="$CFLAGS  -msse4.1"
+if ac_fn_c_try_link "$LINENO"
+then :
+  hts_cv_check_cflags_needed_sse4_1___msse4_1__mssse3__mpopcnt=none
+else case e in #(
+  e) ax_check_save_flags=$CFLAGS
+     CFLAGS="$CFLAGS  -msse4.1 -mssse3 -mpopcnt"
      cat confdefs.h - <<_ACEOF >conftest.$ac_ext
 /* end confdefs.h.  */
 
@@ -3966,64 +4373,80 @@ else
     #endif
 
 int
-main ()
+main (void)
 {
 
     #ifdef __x86_64__
     __m128i a = _mm_set_epi32(1, 2, 3, 4), b = _mm_set_epi32(4, 3, 2, 1);
-    __m128i c = _mm_max_epu32(a, b);
-    return *((char *) &c);
+    __m128i c = _mm_shuffle_epi8(_mm_max_epu32(a, b), b);
+    return _mm_popcnt_u32(*((char *) &c));
     #endif
 
   ;
   return 0;
 }
 _ACEOF
-if ac_fn_c_try_link "$LINENO"; then :
-  hts_cv_check_cflags_needed_sse4_1___msse4_1=-msse4.1
-else
-  hts_cv_check_cflags_needed_sse4_1___msse4_1=unsupported
+if ac_fn_c_try_link "$LINENO"
+then :
+  hts_cv_check_cflags_needed_sse4_1___msse4_1__mssse3__mpopcnt="-msse4.1 -mssse3 -mpopcnt"
+else case e in #(
+  e) hts_cv_check_cflags_needed_sse4_1___msse4_1__mssse3__mpopcnt=unsupported ;;
+esac
 fi
-rm -f core conftest.err conftest.$ac_objext \
+rm -f core conftest.err conftest.$ac_objext conftest.beam \
     conftest$ac_exeext conftest.$ac_ext
-     CFLAGS=$ax_check_save_flags
+     CFLAGS=$ax_check_save_flags ;;
+esac
 fi
-rm -f core conftest.err conftest.$ac_objext \
-    conftest$ac_exeext conftest.$ac_ext
+rm -f core conftest.err conftest.$ac_objext conftest.beam \
+    conftest$ac_exeext conftest.$ac_ext ;;
+esac
 fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $hts_cv_check_cflags_needed_sse4_1___msse4_1" >&5
-$as_echo "$hts_cv_check_cflags_needed_sse4_1___msse4_1" >&6; }
-if test "x$hts_cv_check_cflags_needed_sse4_1___msse4_1" = xunsupported; then :
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $hts_cv_check_cflags_needed_sse4_1___msse4_1__mssse3__mpopcnt" >&5
+printf "%s\n" "$hts_cv_check_cflags_needed_sse4_1___msse4_1__mssse3__mpopcnt" >&6; }
+if test "x$hts_cv_check_cflags_needed_sse4_1___msse4_1__mssse3__mpopcnt" = xunsupported
+then :
 
   :
 
-else
-
-  if test "x$hts_cv_check_cflags_needed_sse4_1___msse4_1" = xnone; then :
+else case e in #(
+  e)
+  if test "x$hts_cv_check_cflags_needed_sse4_1___msse4_1__mssse3__mpopcnt" = xnone
+then :
   flags_needed=""
-else
-  flags_needed="$hts_cv_check_cflags_needed_sse4_1___msse4_1"
+else case e in #(
+  e) flags_needed="$hts_cv_check_cflags_needed_sse4_1___msse4_1__mssse3__mpopcnt" ;;
+esac
 fi
 
-  hts_cflags_sse4="$flags_needed $hts_cflags_sse4"
+  hts_cflags_sse4="$flags_needed"
 
-$as_echo "#define HAVE_SSE4_1 1" >>confdefs.h
+printf "%s\n" "#define HAVE_SSSE3 1" >>confdefs.h
 
 
-  $as_echo "#define UBSAN 1" >>confdefs.h
+printf "%s\n" "#define HAVE_POPCNT 1" >>confdefs.h
 
 
+printf "%s\n" "#define HAVE_SSE4_1 1" >>confdefs.h
 
-fi
 
 
+  printf "%s\n" "#define UBSAN 1" >>confdefs.h
+
+
+ ;;
+esac
+fi
 
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking C compiler flags needed for avx2" >&5
-$as_echo_n "checking C compiler flags needed for avx2... " >&6; }
-if ${hts_cv_check_cflags_needed_avx2___mavx2+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
 
+
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking C compiler flags needed for avx2" >&5
+printf %s "checking C compiler flags needed for avx2... " >&6; }
+if test ${hts_cv_check_cflags_needed_avx2___mavx2__mpopcnt+y}
+then :
+  printf %s "(cached) " >&6
+else case e in #(
+  e)
   cat confdefs.h - <<_ACEOF >conftest.$ac_ext
 /* end confdefs.h.  */
 
@@ -4032,25 +4455,26 @@ else
     #endif
 
 int
-main ()
+main (void)
 {
 
     #ifdef __x86_64__
     __m256i a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
     __m256i b = _mm256_add_epi32(a, a);
     long long c = _mm256_extract_epi64(b, 0);
-    return (int) c;
+    return _mm_popcnt_u32((int) c);
     #endif
 
   ;
   return 0;
 }
 _ACEOF
-if ac_fn_c_try_link "$LINENO"; then :
-  hts_cv_check_cflags_needed_avx2___mavx2=none
-else
-  ax_check_save_flags=$CFLAGS
-     CFLAGS="$CFLAGS  -mavx2"
+if ac_fn_c_try_link "$LINENO"
+then :
+  hts_cv_check_cflags_needed_avx2___mavx2__mpopcnt=none
+else case e in #(
+  e) ax_check_save_flags=$CFLAGS
+     CFLAGS="$CFLAGS  -mavx2 -mpopcnt"
      cat confdefs.h - <<_ACEOF >conftest.$ac_ext
 /* end confdefs.h.  */
 
@@ -4059,62 +4483,74 @@ else
     #endif
 
 int
-main ()
+main (void)
 {
 
     #ifdef __x86_64__
     __m256i a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
     __m256i b = _mm256_add_epi32(a, a);
     long long c = _mm256_extract_epi64(b, 0);
-    return (int) c;
+    return _mm_popcnt_u32((int) c);
     #endif
 
   ;
   return 0;
 }
 _ACEOF
-if ac_fn_c_try_link "$LINENO"; then :
-  hts_cv_check_cflags_needed_avx2___mavx2=-mavx2
-else
-  hts_cv_check_cflags_needed_avx2___mavx2=unsupported
+if ac_fn_c_try_link "$LINENO"
+then :
+  hts_cv_check_cflags_needed_avx2___mavx2__mpopcnt="-mavx2 -mpopcnt"
+else case e in #(
+  e) hts_cv_check_cflags_needed_avx2___mavx2__mpopcnt=unsupported ;;
+esac
 fi
-rm -f core conftest.err conftest.$ac_objext \
+rm -f core conftest.err conftest.$ac_objext conftest.beam \
     conftest$ac_exeext conftest.$ac_ext
-     CFLAGS=$ax_check_save_flags
+     CFLAGS=$ax_check_save_flags ;;
+esac
 fi
-rm -f core conftest.err conftest.$ac_objext \
-    conftest$ac_exeext conftest.$ac_ext
+rm -f core conftest.err conftest.$ac_objext conftest.beam \
+    conftest$ac_exeext conftest.$ac_ext ;;
+esac
 fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $hts_cv_check_cflags_needed_avx2___mavx2" >&5
-$as_echo "$hts_cv_check_cflags_needed_avx2___mavx2" >&6; }
-if test "x$hts_cv_check_cflags_needed_avx2___mavx2" = xunsupported; then :
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $hts_cv_check_cflags_needed_avx2___mavx2__mpopcnt" >&5
+printf "%s\n" "$hts_cv_check_cflags_needed_avx2___mavx2__mpopcnt" >&6; }
+if test "x$hts_cv_check_cflags_needed_avx2___mavx2__mpopcnt" = xunsupported
+then :
 
   :
 
-else
-
-  if test "x$hts_cv_check_cflags_needed_avx2___mavx2" = xnone; then :
+else case e in #(
+  e)
+  if test "x$hts_cv_check_cflags_needed_avx2___mavx2__mpopcnt" = xnone
+then :
   flags_needed=""
-else
-  flags_needed="$hts_cv_check_cflags_needed_avx2___mavx2"
+else case e in #(
+  e) flags_needed="$hts_cv_check_cflags_needed_avx2___mavx2__mpopcnt" ;;
+esac
 fi
 
   hts_cflags_avx2="$flags_needed"
 
 
-$as_echo "#define HAVE_AVX2 1" >>confdefs.h
+printf "%s\n" "#define HAVE_POPCNT 1" >>confdefs.h
 
 
+printf "%s\n" "#define HAVE_AVX2 1" >>confdefs.h
 
-fi
 
+ ;;
+esac
+fi
 
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking C compiler flags needed for avx512f" >&5
-$as_echo_n "checking C compiler flags needed for avx512f... " >&6; }
-if ${hts_cv_check_cflags_needed_avx512f___mavx512f+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
 
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking C compiler flags needed for avx512f" >&5
+printf %s "checking C compiler flags needed for avx512f... " >&6; }
+if test ${hts_cv_check_cflags_needed_avx512f___mavx512f__mpopcnt+y}
+then :
+  printf %s "(cached) " >&6
+else case e in #(
+  e)
   cat confdefs.h - <<_ACEOF >conftest.$ac_ext
 /* end confdefs.h.  */
 
@@ -4123,24 +4559,27 @@ else
     #endif
 
 int
-main ()
+main (void)
 {
 
     #ifdef __x86_64__
     __m512i a = _mm512_set1_epi32(1);
     __m512i b = _mm512_add_epi32(a, a);
-    return *((char *) &b);
+    __m256i c = _mm512_castsi512_si256(b);
+    __m256i d = _mm512_extracti64x4_epi64(a, 1);
+    return _mm_popcnt_u32(*((char *) &c)) + (*(char *) &d);
     #endif
 
   ;
   return 0;
 }
 _ACEOF
-if ac_fn_c_try_link "$LINENO"; then :
-  hts_cv_check_cflags_needed_avx512f___mavx512f=none
-else
-  ax_check_save_flags=$CFLAGS
-     CFLAGS="$CFLAGS  -mavx512f"
+if ac_fn_c_try_link "$LINENO"
+then :
+  hts_cv_check_cflags_needed_avx512f___mavx512f__mpopcnt=none
+else case e in #(
+  e) ax_check_save_flags=$CFLAGS
+     CFLAGS="$CFLAGS  -mavx512f -mpopcnt"
      cat confdefs.h - <<_ACEOF >conftest.$ac_ext
 /* end confdefs.h.  */
 
@@ -4149,55 +4588,150 @@ else
     #endif
 
 int
-main ()
+main (void)
 {
 
     #ifdef __x86_64__
     __m512i a = _mm512_set1_epi32(1);
     __m512i b = _mm512_add_epi32(a, a);
-    return *((char *) &b);
+    __m256i c = _mm512_castsi512_si256(b);
+    __m256i d = _mm512_extracti64x4_epi64(a, 1);
+    return _mm_popcnt_u32(*((char *) &c)) + (*(char *) &d);
     #endif
 
   ;
   return 0;
 }
 _ACEOF
-if ac_fn_c_try_link "$LINENO"; then :
-  hts_cv_check_cflags_needed_avx512f___mavx512f=-mavx512f
-else
-  hts_cv_check_cflags_needed_avx512f___mavx512f=unsupported
+if ac_fn_c_try_link "$LINENO"
+then :
+  hts_cv_check_cflags_needed_avx512f___mavx512f__mpopcnt="-mavx512f -mpopcnt"
+else case e in #(
+  e) hts_cv_check_cflags_needed_avx512f___mavx512f__mpopcnt=unsupported ;;
+esac
 fi
-rm -f core conftest.err conftest.$ac_objext \
+rm -f core conftest.err conftest.$ac_objext conftest.beam \
     conftest$ac_exeext conftest.$ac_ext
-     CFLAGS=$ax_check_save_flags
+     CFLAGS=$ax_check_save_flags ;;
+esac
 fi
-rm -f core conftest.err conftest.$ac_objext \
-    conftest$ac_exeext conftest.$ac_ext
+rm -f core conftest.err conftest.$ac_objext conftest.beam \
+    conftest$ac_exeext conftest.$ac_ext ;;
+esac
 fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $hts_cv_check_cflags_needed_avx512f___mavx512f" >&5
-$as_echo "$hts_cv_check_cflags_needed_avx512f___mavx512f" >&6; }
-if test "x$hts_cv_check_cflags_needed_avx512f___mavx512f" = xunsupported; then :
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $hts_cv_check_cflags_needed_avx512f___mavx512f__mpopcnt" >&5
+printf "%s\n" "$hts_cv_check_cflags_needed_avx512f___mavx512f__mpopcnt" >&6; }
+if test "x$hts_cv_check_cflags_needed_avx512f___mavx512f__mpopcnt" = xunsupported
+then :
 
   :
 
-else
-
-  if test "x$hts_cv_check_cflags_needed_avx512f___mavx512f" = xnone; then :
+else case e in #(
+  e)
+  if test "x$hts_cv_check_cflags_needed_avx512f___mavx512f__mpopcnt" = xnone
+then :
   flags_needed=""
-else
-  flags_needed="$hts_cv_check_cflags_needed_avx512f___mavx512f"
+else case e in #(
+  e) flags_needed="$hts_cv_check_cflags_needed_avx512f___mavx512f__mpopcnt" ;;
+esac
 fi
 
   hts_cflags_avx512="$flags_needed"
 
 
-$as_echo "#define HAVE_AVX512 1" >>confdefs.h
+printf "%s\n" "#define HAVE_POPCNT 1" >>confdefs.h
+
+
+printf "%s\n" "#define HAVE_AVX512 1" >>confdefs.h
+
+
+ ;;
+esac
+fi
+
+
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for working __builtin_cpu_supports(\"ssse3\")" >&5
+printf %s "checking for working __builtin_cpu_supports(\"ssse3\")... " >&6; }
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+int
+main (void)
+{
+
+  if (__builtin_cpu_supports("ssse3")) {
+    return 0;
+  }
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"
+then :
+
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+printf "%s\n" "yes" >&6; }
+
+printf "%s\n" "#define HAVE_BUILTIN_CPU_SUPPORT_SSSE3 1" >>confdefs.h
+
+
+else case e in #(
+  e)
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
+printf "%s\n" "no" >&6; }
+ ;;
+esac
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.beam \
+    conftest$ac_exeext conftest.$ac_ext
+
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for working __attribute__((target(\"ssse3\")))" >&5
+printf %s "checking for working __attribute__((target(\"ssse3\")))... " >&6; }
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+  #ifdef __x86_64__
+  #include "x86intrin.h"
+
+  __attribute__((target("ssse3")))
+  void shuffle(char *aptr, char *bptr) {
+    __m128i a = _mm_lddqu_si128((__m128i *)aptr);
+    __m128i b = _mm_shuffle_epi8(a, a);
+    _mm_storeu_si128((__m128i *)bptr, b);
+  }
+  #else
+  void shuffle(char *aptr, char *bptr) { }
+  #endif
+
+int
+main (void)
+{
+shuffle(0, 0);
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"
+then :
+
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+printf "%s\n" "yes" >&6; }
 
+printf "%s\n" "#define HAVE_ATTRIBUTE_TARGET_SSSE3 1" >>confdefs.h
 
 
+else case e in #(
+  e)
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
+printf "%s\n" "no" >&6; }
+ ;;
+esac
 fi
+rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext
 
 
+fi
 
 
 
@@ -4209,12 +4743,13 @@ if test "x$ac_cv_env_PKG_CONFIG_set" != "xset"; then
 	if test -n "$ac_tool_prefix"; then
   # Extract the first word of "${ac_tool_prefix}pkg-config", so it can be a program name with args.
 set dummy ${ac_tool_prefix}pkg-config; ac_word=$2
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
-$as_echo_n "checking for $ac_word... " >&6; }
-if ${ac_cv_path_PKG_CONFIG+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  case $PKG_CONFIG in
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+printf %s "checking for $ac_word... " >&6; }
+if test ${ac_cv_path_PKG_CONFIG+y}
+then :
+  printf %s "(cached) " >&6
+else case e in #(
+  e) case $PKG_CONFIG in
   [\\/]* | ?:[\\/]*)
   ac_cv_path_PKG_CONFIG="$PKG_CONFIG" # Let the user override the test with a path.
   ;;
@@ -4223,11 +4758,15 @@ else
 for as_dir in $PATH
 do
   IFS=$as_save_IFS
-  test -z "$as_dir" && as_dir=.
+  case $as_dir in #(((
+    '') as_dir=./ ;;
+    */) ;;
+    *) as_dir=$as_dir/ ;;
+  esac
     for ac_exec_ext in '' $ac_executable_extensions; do
-  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
-    ac_cv_path_PKG_CONFIG="$as_dir/$ac_word$ac_exec_ext"
-    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+  if as_fn_executable_p "$as_dir$ac_word$ac_exec_ext"; then
+    ac_cv_path_PKG_CONFIG="$as_dir$ac_word$ac_exec_ext"
+    printf "%s\n" "$as_me:${as_lineno-$LINENO}: found $as_dir$ac_word$ac_exec_ext" >&5
     break 2
   fi
 done
@@ -4235,15 +4774,16 @@ done
 IFS=$as_save_IFS
 
   ;;
+esac ;;
 esac
 fi
 PKG_CONFIG=$ac_cv_path_PKG_CONFIG
 if test -n "$PKG_CONFIG"; then
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $PKG_CONFIG" >&5
-$as_echo "$PKG_CONFIG" >&6; }
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $PKG_CONFIG" >&5
+printf "%s\n" "$PKG_CONFIG" >&6; }
 else
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
+printf "%s\n" "no" >&6; }
 fi
 
 
@@ -4252,12 +4792,13 @@ if test -z "$ac_cv_path_PKG_CONFIG"; then
   ac_pt_PKG_CONFIG=$PKG_CONFIG
   # Extract the first word of "pkg-config", so it can be a program name with args.
 set dummy pkg-config; ac_word=$2
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
-$as_echo_n "checking for $ac_word... " >&6; }
-if ${ac_cv_path_ac_pt_PKG_CONFIG+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  case $ac_pt_PKG_CONFIG in
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+printf %s "checking for $ac_word... " >&6; }
+if test ${ac_cv_path_ac_pt_PKG_CONFIG+y}
+then :
+  printf %s "(cached) " >&6
+else case e in #(
+  e) case $ac_pt_PKG_CONFIG in
   [\\/]* | ?:[\\/]*)
   ac_cv_path_ac_pt_PKG_CONFIG="$ac_pt_PKG_CONFIG" # Let the user override the test with a path.
   ;;
@@ -4266,11 +4807,15 @@ else
 for as_dir in $PATH
 do
   IFS=$as_save_IFS
-  test -z "$as_dir" && as_dir=.
+  case $as_dir in #(((
+    '') as_dir=./ ;;
+    */) ;;
+    *) as_dir=$as_dir/ ;;
+  esac
     for ac_exec_ext in '' $ac_executable_extensions; do
-  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
-    ac_cv_path_ac_pt_PKG_CONFIG="$as_dir/$ac_word$ac_exec_ext"
-    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+  if as_fn_executable_p "$as_dir$ac_word$ac_exec_ext"; then
+    ac_cv_path_ac_pt_PKG_CONFIG="$as_dir$ac_word$ac_exec_ext"
+    printf "%s\n" "$as_me:${as_lineno-$LINENO}: found $as_dir$ac_word$ac_exec_ext" >&5
     break 2
   fi
 done
@@ -4278,15 +4823,16 @@ done
 IFS=$as_save_IFS
 
   ;;
+esac ;;
 esac
 fi
 ac_pt_PKG_CONFIG=$ac_cv_path_ac_pt_PKG_CONFIG
 if test -n "$ac_pt_PKG_CONFIG"; then
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_pt_PKG_CONFIG" >&5
-$as_echo "$ac_pt_PKG_CONFIG" >&6; }
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_pt_PKG_CONFIG" >&5
+printf "%s\n" "$ac_pt_PKG_CONFIG" >&6; }
 else
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
+printf "%s\n" "no" >&6; }
 fi
 
   if test "x$ac_pt_PKG_CONFIG" = x; then
@@ -4294,8 +4840,8 @@ fi
   else
     case $cross_compiling:$ac_tool_warned in
 yes:)
-{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5
-$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;}
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5
+printf "%s\n" "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;}
 ac_tool_warned=yes ;;
 esac
     PKG_CONFIG=$ac_pt_PKG_CONFIG
@@ -4307,14 +4853,14 @@ fi
 fi
 if test -n "$PKG_CONFIG"; then
 	_pkg_min_version=0.9.0
-	{ $as_echo "$as_me:${as_lineno-$LINENO}: checking pkg-config is at least version $_pkg_min_version" >&5
-$as_echo_n "checking pkg-config is at least version $_pkg_min_version... " >&6; }
+	{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking pkg-config is at least version $_pkg_min_version" >&5
+printf %s "checking pkg-config is at least version $_pkg_min_version... " >&6; }
 	if $PKG_CONFIG --atleast-pkgconfig-version $_pkg_min_version; then
-		{ $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
-$as_echo "yes" >&6; }
+		{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+printf "%s\n" "yes" >&6; }
 	else
-		{ $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
+		{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
+printf "%s\n" "no" >&6; }
 		PKG_CONFIG=""
 	fi
 fi
@@ -4326,277 +4872,269 @@ static_LIBS='-lpthread -lz -lm'
 private_LIBS=$LDFLAGS
 
 # Check whether --enable-versioned-symbols was given.
-if test "${enable_versioned_symbols+set}" = set; then :
+if test ${enable_versioned_symbols+y}
+then :
   enableval=$enable_versioned_symbols;
-else
-  enable_versioned_symbols=yes
+else case e in #(
+  e) enable_versioned_symbols=yes ;;
+esac
 fi
 
 
 # Check whether --enable-bz2 was given.
-if test "${enable_bz2+set}" = set; then :
+if test ${enable_bz2+y}
+then :
   enableval=$enable_bz2;
-else
-  enable_bz2=yes
+else case e in #(
+  e) enable_bz2=yes ;;
+esac
 fi
 
 
 # Check whether --enable-gcs was given.
-if test "${enable_gcs+set}" = set; then :
+if test ${enable_gcs+y}
+then :
   enableval=$enable_gcs;
-else
-  enable_gcs=check
+else case e in #(
+  e) enable_gcs=check ;;
+esac
 fi
 
 
 # Check whether --enable-largefile was given.
-if test "${enable_largefile+set}" = set; then :
+if test ${enable_largefile+y}
+then :
   enableval=$enable_largefile;
 fi
-
-if test "$enable_largefile" != no; then
-
-  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for special C compiler options needed for large files" >&5
-$as_echo_n "checking for special C compiler options needed for large files... " >&6; }
-if ${ac_cv_sys_largefile_CC+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  ac_cv_sys_largefile_CC=no
-     if test "$GCC" != yes; then
-       ac_save_CC=$CC
-       while :; do
-	 # IRIX 6.2 and later do not support large files by default,
-	 # so use the C compiler's -n32 option if that helps.
-	 cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+if test "$enable_largefile,$enable_year2038" != no,no
+then :
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for $CC option to enable large file support" >&5
+printf %s "checking for $CC option to enable large file support... " >&6; }
+if test ${ac_cv_sys_largefile_opts+y}
+then :
+  printf %s "(cached) " >&6
+else case e in #(
+  e) ac_save_CC="$CC"
+  ac_opt_found=no
+  for ac_opt in "none needed" "-D_FILE_OFFSET_BITS=64" "-D_LARGE_FILES=1" "-n32"; do
+    if test x"$ac_opt" != x"none needed"
+then :
+  CC="$ac_save_CC $ac_opt"
+fi
+    cat confdefs.h - <<_ACEOF >conftest.$ac_ext
 /* end confdefs.h.  */
 #include <sys/types.h>
- /* Check that off_t can represent 2**63 - 1 correctly.
-    We can't simply define LARGE_OFF_T to be 9223372036854775807,
+#ifndef FTYPE
+# define FTYPE off_t
+#endif
+ /* Check that FTYPE can represent 2**63 - 1 correctly.
+    We can't simply define LARGE_FTYPE to be 9223372036854775807,
     since some C++ compilers masquerading as C compilers
     incorrectly reject 9223372036854775807.  */
-#define LARGE_OFF_T ((((off_t) 1 << 31) << 31) - 1 + (((off_t) 1 << 31) << 31))
-  int off_t_is_large[(LARGE_OFF_T % 2147483629 == 721
-		       && LARGE_OFF_T % 2147483647 == 1)
+#define LARGE_FTYPE (((FTYPE) 1 << 31 << 31) - 1 + ((FTYPE) 1 << 31 << 31))
+  int FTYPE_is_large[(LARGE_FTYPE % 2147483629 == 721
+		       && LARGE_FTYPE % 2147483647 == 1)
 		      ? 1 : -1];
 int
-main ()
+main (void)
 {
 
   ;
   return 0;
 }
 _ACEOF
-	 if ac_fn_c_try_compile "$LINENO"; then :
-  break
+if ac_fn_c_try_compile "$LINENO"
+then :
+  if test x"$ac_opt" = x"none needed"
+then :
+  # GNU/Linux s390x and alpha need _FILE_OFFSET_BITS=64 for wide ino_t.
+	 CC="$CC -DFTYPE=ino_t"
+	 if ac_fn_c_try_compile "$LINENO"
+then :
+
+else case e in #(
+  e) CC="$CC -D_FILE_OFFSET_BITS=64"
+	    if ac_fn_c_try_compile "$LINENO"
+then :
+  ac_opt='-D_FILE_OFFSET_BITS=64'
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.beam ;;
+esac
 fi
-rm -f core conftest.err conftest.$ac_objext
-	 CC="$CC -n32"
-	 if ac_fn_c_try_compile "$LINENO"; then :
-  ac_cv_sys_largefile_CC=' -n32'; break
+rm -f core conftest.err conftest.$ac_objext conftest.beam
 fi
-rm -f core conftest.err conftest.$ac_objext
-	 break
-       done
-       CC=$ac_save_CC
-       rm -f conftest.$ac_ext
-    fi
+      ac_cv_sys_largefile_opts=$ac_opt
+      ac_opt_found=yes
 fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_sys_largefile_CC" >&5
-$as_echo "$ac_cv_sys_largefile_CC" >&6; }
-  if test "$ac_cv_sys_largefile_CC" != no; then
-    CC=$CC$ac_cv_sys_largefile_CC
-  fi
-
-  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _FILE_OFFSET_BITS value needed for large files" >&5
-$as_echo_n "checking for _FILE_OFFSET_BITS value needed for large files... " >&6; }
-if ${ac_cv_sys_file_offset_bits+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  while :; do
-  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
-/* end confdefs.h.  */
-#include <sys/types.h>
- /* Check that off_t can represent 2**63 - 1 correctly.
-    We can't simply define LARGE_OFF_T to be 9223372036854775807,
-    since some C++ compilers masquerading as C compilers
-    incorrectly reject 9223372036854775807.  */
-#define LARGE_OFF_T ((((off_t) 1 << 31) << 31) - 1 + (((off_t) 1 << 31) << 31))
-  int off_t_is_large[(LARGE_OFF_T % 2147483629 == 721
-		       && LARGE_OFF_T % 2147483647 == 1)
-		      ? 1 : -1];
-int
-main ()
-{
+rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext
+    test $ac_opt_found = no || break
+  done
+  CC="$ac_save_CC"
 
-  ;
-  return 0;
-}
-_ACEOF
-if ac_fn_c_try_compile "$LINENO"; then :
-  ac_cv_sys_file_offset_bits=no; break
+  test $ac_opt_found = yes || ac_cv_sys_largefile_opts="support not detected" ;;
+esac
 fi
-rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
-  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
-/* end confdefs.h.  */
-#define _FILE_OFFSET_BITS 64
-#include <sys/types.h>
- /* Check that off_t can represent 2**63 - 1 correctly.
-    We can't simply define LARGE_OFF_T to be 9223372036854775807,
-    since some C++ compilers masquerading as C compilers
-    incorrectly reject 9223372036854775807.  */
-#define LARGE_OFF_T ((((off_t) 1 << 31) << 31) - 1 + (((off_t) 1 << 31) << 31))
-  int off_t_is_large[(LARGE_OFF_T % 2147483629 == 721
-		       && LARGE_OFF_T % 2147483647 == 1)
-		      ? 1 : -1];
-int
-main ()
-{
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_sys_largefile_opts" >&5
+printf "%s\n" "$ac_cv_sys_largefile_opts" >&6; }
 
-  ;
-  return 0;
-}
-_ACEOF
-if ac_fn_c_try_compile "$LINENO"; then :
-  ac_cv_sys_file_offset_bits=64; break
-fi
-rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
-  ac_cv_sys_file_offset_bits=unknown
-  break
-done
-fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_sys_file_offset_bits" >&5
-$as_echo "$ac_cv_sys_file_offset_bits" >&6; }
-case $ac_cv_sys_file_offset_bits in #(
-  no | unknown) ;;
-  *)
-cat >>confdefs.h <<_ACEOF
-#define _FILE_OFFSET_BITS $ac_cv_sys_file_offset_bits
-_ACEOF
-;;
-esac
-rm -rf conftest*
-  if test $ac_cv_sys_file_offset_bits = unknown; then
-    { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _LARGE_FILES value needed for large files" >&5
-$as_echo_n "checking for _LARGE_FILES value needed for large files... " >&6; }
-if ${ac_cv_sys_large_files+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  while :; do
-  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
-/* end confdefs.h.  */
-#include <sys/types.h>
- /* Check that off_t can represent 2**63 - 1 correctly.
-    We can't simply define LARGE_OFF_T to be 9223372036854775807,
-    since some C++ compilers masquerading as C compilers
-    incorrectly reject 9223372036854775807.  */
-#define LARGE_OFF_T ((((off_t) 1 << 31) << 31) - 1 + (((off_t) 1 << 31) << 31))
-  int off_t_is_large[(LARGE_OFF_T % 2147483629 == 721
-		       && LARGE_OFF_T % 2147483647 == 1)
-		      ? 1 : -1];
-int
-main ()
-{
+ac_have_largefile=yes
+case $ac_cv_sys_largefile_opts in #(
+  "none needed") :
+     ;; #(
+  "supported through gnulib") :
+     ;; #(
+  "support not detected") :
+    ac_have_largefile=no ;; #(
+  "-D_FILE_OFFSET_BITS=64") :
+
+printf "%s\n" "#define _FILE_OFFSET_BITS 64" >>confdefs.h
+ ;; #(
+  "-D_LARGE_FILES=1") :
+
+printf "%s\n" "#define _LARGE_FILES 1" >>confdefs.h
+ ;; #(
+  "-n32") :
+    CC="$CC -n32" ;; #(
+  *) :
+    as_fn_error $? "internal error: bad value for \$ac_cv_sys_largefile_opts" "$LINENO" 5 ;;
+esac
 
-  ;
-  return 0;
-}
-_ACEOF
-if ac_fn_c_try_compile "$LINENO"; then :
-  ac_cv_sys_large_files=no; break
+if test "$enable_year2038" != no
+then :
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for $CC option for timestamps after 2038" >&5
+printf %s "checking for $CC option for timestamps after 2038... " >&6; }
+if test ${ac_cv_sys_year2038_opts+y}
+then :
+  printf %s "(cached) " >&6
+else case e in #(
+  e) ac_save_CPPFLAGS="$CPPFLAGS"
+  ac_opt_found=no
+  for ac_opt in "none needed" "-D_TIME_BITS=64" "-D__MINGW_USE_VC2005_COMPAT" "-U_USE_32_BIT_TIME_T -D__MINGW_USE_VC2005_COMPAT"; do
+    if test x"$ac_opt" != x"none needed"
+then :
+  CPPFLAGS="$ac_save_CPPFLAGS $ac_opt"
 fi
-rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
-  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+    cat confdefs.h - <<_ACEOF >conftest.$ac_ext
 /* end confdefs.h.  */
-#define _LARGE_FILES 1
-#include <sys/types.h>
- /* Check that off_t can represent 2**63 - 1 correctly.
-    We can't simply define LARGE_OFF_T to be 9223372036854775807,
-    since some C++ compilers masquerading as C compilers
-    incorrectly reject 9223372036854775807.  */
-#define LARGE_OFF_T ((((off_t) 1 << 31) << 31) - 1 + (((off_t) 1 << 31) << 31))
-  int off_t_is_large[(LARGE_OFF_T % 2147483629 == 721
-		       && LARGE_OFF_T % 2147483647 == 1)
-		      ? 1 : -1];
+
+  #include <time.h>
+  /* Check that time_t can represent 2**32 - 1 correctly.  */
+  #define LARGE_TIME_T \\
+    ((time_t) (((time_t) 1 << 30) - 1 + 3 * ((time_t) 1 << 30)))
+  int verify_time_t_range[(LARGE_TIME_T / 65537 == 65535
+                           && LARGE_TIME_T % 65537 == 0)
+                          ? 1 : -1];
+
 int
-main ()
+main (void)
 {
 
   ;
   return 0;
 }
 _ACEOF
-if ac_fn_c_try_compile "$LINENO"; then :
-  ac_cv_sys_large_files=1; break
+if ac_fn_c_try_compile "$LINENO"
+then :
+  ac_cv_sys_year2038_opts="$ac_opt"
+      ac_opt_found=yes
 fi
-rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
-  ac_cv_sys_large_files=unknown
-  break
-done
-fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_sys_large_files" >&5
-$as_echo "$ac_cv_sys_large_files" >&6; }
-case $ac_cv_sys_large_files in #(
-  no | unknown) ;;
-  *)
-cat >>confdefs.h <<_ACEOF
-#define _LARGE_FILES $ac_cv_sys_large_files
-_ACEOF
-;;
+rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext
+    test $ac_opt_found = no || break
+  done
+  CPPFLAGS="$ac_save_CPPFLAGS"
+  test $ac_opt_found = yes || ac_cv_sys_year2038_opts="support not detected" ;;
 esac
-rm -rf conftest*
-  fi
+fi
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_sys_year2038_opts" >&5
+printf "%s\n" "$ac_cv_sys_year2038_opts" >&6; }
 
+ac_have_year2038=yes
+case $ac_cv_sys_year2038_opts in #(
+  "none needed") :
+     ;; #(
+  "support not detected") :
+    ac_have_year2038=no ;; #(
+  "-D_TIME_BITS=64") :
+
+printf "%s\n" "#define _TIME_BITS 64" >>confdefs.h
+ ;; #(
+  "-D__MINGW_USE_VC2005_COMPAT") :
+
+printf "%s\n" "#define __MINGW_USE_VC2005_COMPAT 1" >>confdefs.h
+ ;; #(
+  "-U_USE_32_BIT_TIME_T"*) :
+    { { printf "%s\n" "$as_me:${as_lineno-$LINENO}: error: in '$ac_pwd':" >&5
+printf "%s\n" "$as_me: error: in '$ac_pwd':" >&2;}
+as_fn_error $? "the 'time_t' type is currently forced to be 32-bit. It
+will stop working after mid-January 2038. Remove
+_USE_32BIT_TIME_T from the compiler flags.
+See 'config.log' for more details" "$LINENO" 5; } ;; #(
+  *) :
+    as_fn_error $? "internal error: bad value for \$ac_cv_sys_year2038_opts" "$LINENO" 5 ;;
+esac
 
 fi
 
+fi
 
 # Check whether --enable-libcurl was given.
-if test "${enable_libcurl+set}" = set; then :
+if test ${enable_libcurl+y}
+then :
   enableval=$enable_libcurl;
-else
-  enable_libcurl=check
+else case e in #(
+  e) enable_libcurl=check ;;
+esac
 fi
 
 
 # Check whether --enable-lzma was given.
-if test "${enable_lzma+set}" = set; then :
+if test ${enable_lzma+y}
+then :
   enableval=$enable_lzma;
-else
-  enable_lzma=yes
+else case e in #(
+  e) enable_lzma=yes ;;
+esac
 fi
 
 
 # Check whether --enable-plugins was given.
-if test "${enable_plugins+set}" = set; then :
+if test ${enable_plugins+y}
+then :
   enableval=$enable_plugins;
-else
-  enable_plugins=no
+else case e in #(
+  e) enable_plugins=no ;;
+esac
 fi
 
 
 
 
 # Check whether --with-external-htscodecs was given.
-if test "${with_external_htscodecs+set}" = set; then :
+if test ${with_external_htscodecs+y}
+then :
   withval=$with_external_htscodecs;
-else
-  with_external_htscodecs=no
+else case e in #(
+  e) with_external_htscodecs=no ;;
+esac
 fi
 
 
 
 
 # Check whether --with-libdeflate was given.
-if test "${with_libdeflate+set}" = set; then :
+if test ${with_libdeflate+y}
+then :
   withval=$with_libdeflate;
-else
-  with_libdeflate=check
+else case e in #(
+  e) with_libdeflate=check ;;
+esac
 fi
 
 
 
 # Check whether --with-plugin-dir was given.
-if test "${with_plugin_dir+set}" = set; then :
+if test ${with_plugin_dir+y}
+then :
   withval=$with_plugin_dir; case $withval in
      yes|no) cat > config.mk <<'EOF'
 ifneq ($(MAKECMDGOALS),distclean)
@@ -4605,8 +5143,9 @@ endif
 EOF
    as_fn_error $? "no directory specified for --with-plugin-dir" "$LINENO" 5 ;;
    esac
-else
-  with_plugin_dir='$(libexecdir)/htslib'
+else case e in #(
+  e) with_plugin_dir='$(libexecdir)/htslib' ;;
+esac
 fi
 
 plugindir=$with_plugin_dir
@@ -4614,7 +5153,8 @@ plugindir=$with_plugin_dir
 
 
 # Check whether --with-plugin-path was given.
-if test "${with_plugin_path+set}" = set; then :
+if test ${with_plugin_path+y}
+then :
   withval=$with_plugin_path; case $withval in
      yes) cat > config.mk <<'EOF'
 ifneq ($(MAKECMDGOALS),distclean)
@@ -4624,24 +5164,27 @@ EOF
    as_fn_error $? "no path specified for --with-plugin-path" "$LINENO" 5 ;;
      no)  with_plugin_path= ;;
    esac
-else
-  with_plugin_path=$with_plugin_dir
+else case e in #(
+  e) with_plugin_path=$with_plugin_dir ;;
+esac
 fi
 
 pluginpath=$with_plugin_path
 
 
 # Check whether --enable-s3 was given.
-if test "${enable_s3+set}" = set; then :
+if test ${enable_s3+y}
+then :
   enableval=$enable_s3;
-else
-  enable_s3=check
+else case e in #(
+  e) enable_s3=check ;;
+esac
 fi
 
 
 basic_host=${host_alias:-unknown-`uname -s`}
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking shared library type for $basic_host" >&5
-$as_echo_n "checking shared library type for $basic_host... " >&6; }
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking shared library type for $basic_host" >&5
+printf %s "checking shared library type for $basic_host... " >&6; }
 case $basic_host in
   *-cygwin* | *-CYGWIN*)
     host_result="Cygwin DLL"
@@ -4670,43 +5213,49 @@ case $basic_host in
     PLUGIN_EXT=.so
     ;;
 esac
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $host_result" >&5
-$as_echo "$host_result" >&6; }
-
-
-if test x"$PLATFORM" = xdefault && test x"$enable_versioned_symbols" = xyes; then :
-  { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether the linker supports versioned symbols" >&5
-$as_echo_n "checking whether the linker supports versioned symbols... " >&6; }
-if ${hts_cv_have_versioned_symbols+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $host_result" >&5
+printf "%s\n" "$host_result" >&6; }
+
+
+if test x"$PLATFORM" = xdefault && test x"$enable_versioned_symbols" = xyes
+then :
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether the linker supports versioned symbols" >&5
+printf %s "checking whether the linker supports versioned symbols... " >&6; }
+if test ${hts_cv_have_versioned_symbols+y}
+then :
+  printf %s "(cached) " >&6
+else case e in #(
+  e)
       save_LDFLAGS=$LDFLAGS
       LDFLAGS="-Wl,-version-script,$srcdir/htslib.map $LDFLAGS"
       cat confdefs.h - <<_ACEOF >conftest.$ac_ext
 /* end confdefs.h.  */
 
 int
-main ()
+main (void)
 {
 
   ;
   return 0;
 }
 _ACEOF
-if ac_fn_c_try_link "$LINENO"; then :
+if ac_fn_c_try_link "$LINENO"
+then :
   hts_cv_have_versioned_symbols=yes
-else
-  hts_cv_have_versioned_symbols=no
+else case e in #(
+  e) hts_cv_have_versioned_symbols=no ;;
+esac
 fi
-rm -f core conftest.err conftest.$ac_objext \
+rm -f core conftest.err conftest.$ac_objext conftest.beam \
     conftest$ac_exeext conftest.$ac_ext
       LDFLAGS=$save_LDFLAGS
-
+     ;;
+esac
 fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $hts_cv_have_versioned_symbols" >&5
-$as_echo "$hts_cv_have_versioned_symbols" >&6; }
-   if test "x$hts_cv_have_versioned_symbols" = xyes; then :
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $hts_cv_have_versioned_symbols" >&5
+printf "%s\n" "$hts_cv_have_versioned_symbols" >&6; }
+   if test "x$hts_cv_have_versioned_symbols" = xyes
+then :
 
      VERSION_SCRIPT_LDFLAGS='-Wl,-version-script,$(srcprefix)htslib.map'
 
@@ -4720,13 +5269,15 @@ fi
   # -fvisibility=hidden : GCC compatible
   # -xldscope=hidden    : SunStudio
   ac_opt_found=no
-  if test "x$ac_opt_found" = "xno"; then :
-    { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether the compiler accepts -fvisibility=hidden" >&5
-$as_echo_n "checking whether the compiler accepts -fvisibility=hidden... " >&6; }
-if ${hts_cv_check__fvisibility_hidden+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  ac_check_save_cflags=$CFLAGS
+  if test "x$ac_opt_found" = "xno"
+then :
+    { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether the compiler accepts -fvisibility=hidden" >&5
+printf %s "checking whether the compiler accepts -fvisibility=hidden... " >&6; }
+if test ${hts_cv_check__fvisibility_hidden+y}
+then :
+  printf %s "(cached) " >&6
+else case e in #(
+  e) ac_check_save_cflags=$CFLAGS
     ac_check_save_ldflags=$LDFLAGS
     CFLAGS="$CFLAGS -fvisibility=hidden"
     LDFLAGS="$LDFLAGS -fvisibility=hidden"
@@ -4734,37 +5285,43 @@ else
 /* end confdefs.h.  */
 
 int
-main ()
+main (void)
 {
 
   ;
   return 0;
 }
 _ACEOF
-if ac_fn_c_try_link "$LINENO"; then :
+if ac_fn_c_try_link "$LINENO"
+then :
   hts_cv_check__fvisibility_hidden=yes
-       if test "xac_opt_found" != x; then :
+       if test "xac_opt_found" != x
+then :
   eval ac_opt_found="-fvisibility=hidden"
 fi
-else
-  hts_cv_check__fvisibility_hidden=no
+else case e in #(
+  e) hts_cv_check__fvisibility_hidden=no ;;
+esac
 fi
-rm -f core conftest.err conftest.$ac_objext \
+rm -f core conftest.err conftest.$ac_objext conftest.beam \
     conftest$ac_exeext conftest.$ac_ext
     CFLAGS=$ac_check_save_cflags
-    LDFLAGS=$ac_check_save_ldflags
+    LDFLAGS=$ac_check_save_ldflags ;;
+esac
 fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $hts_cv_check__fvisibility_hidden" >&5
-$as_echo "$hts_cv_check__fvisibility_hidden" >&6; }
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $hts_cv_check__fvisibility_hidden" >&5
+printf "%s\n" "$hts_cv_check__fvisibility_hidden" >&6; }
 
 fi
-   if test "x$ac_opt_found" = "xno"; then :
-    { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether the compiler accepts -xldscope=hidden" >&5
-$as_echo_n "checking whether the compiler accepts -xldscope=hidden... " >&6; }
-if ${hts_cv_check__xldscope_hidden+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  ac_check_save_cflags=$CFLAGS
+   if test "x$ac_opt_found" = "xno"
+then :
+    { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether the compiler accepts -xldscope=hidden" >&5
+printf %s "checking whether the compiler accepts -xldscope=hidden... " >&6; }
+if test ${hts_cv_check__xldscope_hidden+y}
+then :
+  printf %s "(cached) " >&6
+else case e in #(
+  e) ac_check_save_cflags=$CFLAGS
     ac_check_save_ldflags=$LDFLAGS
     CFLAGS="$CFLAGS -xldscope=hidden"
     LDFLAGS="$LDFLAGS -xldscope=hidden"
@@ -4772,32 +5329,37 @@ else
 /* end confdefs.h.  */
 
 int
-main ()
+main (void)
 {
 
   ;
   return 0;
 }
 _ACEOF
-if ac_fn_c_try_link "$LINENO"; then :
+if ac_fn_c_try_link "$LINENO"
+then :
   hts_cv_check__xldscope_hidden=yes
-       if test "xac_opt_found" != x; then :
+       if test "xac_opt_found" != x
+then :
   eval ac_opt_found="-xldscope=hidden"
 fi
-else
-  hts_cv_check__xldscope_hidden=no
+else case e in #(
+  e) hts_cv_check__xldscope_hidden=no ;;
+esac
 fi
-rm -f core conftest.err conftest.$ac_objext \
+rm -f core conftest.err conftest.$ac_objext conftest.beam \
     conftest$ac_exeext conftest.$ac_ext
     CFLAGS=$ac_check_save_cflags
-    LDFLAGS=$ac_check_save_ldflags
+    LDFLAGS=$ac_check_save_ldflags ;;
+esac
 fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $hts_cv_check__xldscope_hidden" >&5
-$as_echo "$hts_cv_check__xldscope_hidden" >&6; }
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $hts_cv_check__xldscope_hidden" >&5
+printf "%s\n" "$hts_cv_check__xldscope_hidden" >&6; }
 
 fi
 
-  if test "x$ac_opt_found" != "xno"; then :
+  if test "x$ac_opt_found" != "xno"
+then :
   CFLAGS="$CFLAGS $ac_opt_found"
     LDFLAGS="$LDFLAGS $ac_opt_found"
 fi
@@ -4806,47 +5368,113 @@ fi
 
 
 
-  for ac_header in $ac_header_list
-do :
-  as_ac_Header=`$as_echo "ac_cv_header_$ac_header" | $as_tr_sh`
-ac_fn_c_check_header_compile "$LINENO" "$ac_header" "$as_ac_Header" "$ac_includes_default
-"
-if eval test \"x\$"$as_ac_Header"\" = x"yes"; then :
-  cat >>confdefs.h <<_ACEOF
-#define `$as_echo "HAVE_$ac_header" | $as_tr_cpp` 1
-_ACEOF
+  # Make sure we can run config.sub.
+$SHELL "${ac_aux_dir}config.sub" sun4 >/dev/null 2>&1 ||
+  as_fn_error $? "cannot run $SHELL ${ac_aux_dir}config.sub" "$LINENO" 5
 
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking build system type" >&5
+printf %s "checking build system type... " >&6; }
+if test ${ac_cv_build+y}
+then :
+  printf %s "(cached) " >&6
+else case e in #(
+  e) ac_build_alias=$build_alias
+test "x$ac_build_alias" = x &&
+  ac_build_alias=`$SHELL "${ac_aux_dir}config.guess"`
+test "x$ac_build_alias" = x &&
+  as_fn_error $? "cannot guess build type; you must specify one" "$LINENO" 5
+ac_cv_build=`$SHELL "${ac_aux_dir}config.sub" $ac_build_alias` ||
+  as_fn_error $? "$SHELL ${ac_aux_dir}config.sub $ac_build_alias failed" "$LINENO" 5
+ ;;
+esac
 fi
-
-done
-
-
-
-
-
-
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_build" >&5
+printf "%s\n" "$ac_cv_build" >&6; }
+case $ac_cv_build in
+*-*-*) ;;
+*) as_fn_error $? "invalid value of canonical build" "$LINENO" 5;;
+esac
+build=$ac_cv_build
+ac_save_IFS=$IFS; IFS='-'
+set x $ac_cv_build
+shift
+build_cpu=$1
+build_vendor=$2
+shift; shift
+# Remember, the first character of IFS is used to create $*,
+# except with old shells:
+build_os=$*
+IFS=$ac_save_IFS
+case $build_os in *\ *) build_os=`echo "$build_os" | sed 's/ /-/g'`;; esac
+
+
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking host system type" >&5
+printf %s "checking host system type... " >&6; }
+if test ${ac_cv_host+y}
+then :
+  printf %s "(cached) " >&6
+else case e in #(
+  e) if test "x$host_alias" = x; then
+  ac_cv_host=$ac_cv_build
+else
+  ac_cv_host=`$SHELL "${ac_aux_dir}config.sub" $host_alias` ||
+    as_fn_error $? "$SHELL ${ac_aux_dir}config.sub $host_alias failed" "$LINENO" 5
+fi
+ ;;
+esac
+fi
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_host" >&5
+printf "%s\n" "$ac_cv_host" >&6; }
+case $ac_cv_host in
+*-*-*) ;;
+*) as_fn_error $? "invalid value of canonical host" "$LINENO" 5;;
+esac
+host=$ac_cv_host
+ac_save_IFS=$IFS; IFS='-'
+set x $ac_cv_host
+shift
+host_cpu=$1
+host_vendor=$2
+shift; shift
+# Remember, the first character of IFS is used to create $*,
+# except with old shells:
+host_os=$*
+IFS=$ac_save_IFS
+case $host_os in *\ *) host_os=`echo "$host_os" | sed 's/ /-/g'`;; esac
 
 
-for ac_func in getpagesize
-do :
-  ac_fn_c_check_func "$LINENO" "getpagesize" "ac_cv_func_getpagesize"
-if test "x$ac_cv_func_getpagesize" = xyes; then :
-  cat >>confdefs.h <<_ACEOF
-#define HAVE_GETPAGESIZE 1
-_ACEOF
 
-fi
+ac_func=
+for ac_item in $ac_func_c_list
+do
+  if test $ac_func; then
+    ac_fn_c_check_func "$LINENO" $ac_func ac_cv_func_$ac_func
+    if eval test \"x\$ac_cv_func_$ac_func\" = xyes; then
+      echo "#define $ac_item 1" >> confdefs.h
+    fi
+    ac_func=
+  else
+    ac_func=$ac_item
+  fi
 done
 
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for working mmap" >&5
-$as_echo_n "checking for working mmap... " >&6; }
-if ${ac_cv_func_mmap_fixed_mapped+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  if test "$cross_compiling" = yes; then :
-  ac_cv_func_mmap_fixed_mapped=no
-else
-  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for working mmap" >&5
+printf %s "checking for working mmap... " >&6; }
+if test ${ac_cv_func_mmap_fixed_mapped+y}
+then :
+  printf %s "(cached) " >&6
+else case e in #(
+  e) if test "$cross_compiling" = yes
+then :
+  case "$host_os" in # ((
+			  # Guess yes on platforms where we know the result.
+		  linux*) ac_cv_func_mmap_fixed_mapped=yes ;;
+			  # If we don't know, assume the worst.
+		  *)      ac_cv_func_mmap_fixed_mapped=no ;;
+		esac
+else case e in #(
+  e) cat confdefs.h - <<_ACEOF >conftest.$ac_ext
 /* end confdefs.h.  */
 $ac_includes_default
 /* malloc might have been renamed as rpl_malloc. */
@@ -4867,25 +5495,21 @@ $ac_includes_default
    VM page cache was not coherent with the file system buffer cache
    like early versions of FreeBSD and possibly contemporary NetBSD.)
    For shared mappings, we should conversely verify that changes get
-   propagated back to all the places they're supposed to be.
-
-   Grep wants private fixed already mapped.
-   The main things grep needs to know about mmap are:
-   * does it exist and is it safe to write into the mmap'd area
-   * how to use it (BSD variants)  */
+   propagated back to all the places they're supposed to be.  */
 
 #include <fcntl.h>
 #include <sys/mman.h>
 
-#if !defined STDC_HEADERS && !defined HAVE_STDLIB_H
-char *malloc ();
-#endif
-
-/* This mess was copied from the GNU getpagesize.h.  */
-#ifndef HAVE_GETPAGESIZE
+#ifndef getpagesize
+/* Prefer sysconf to the legacy getpagesize function, as getpagesize has
+   been removed from POSIX and is limited to page sizes that fit in 'int'.  */
 # ifdef _SC_PAGESIZE
-#  define getpagesize() sysconf(_SC_PAGESIZE)
-# else /* no _SC_PAGESIZE */
+#  define getpagesize() sysconf (_SC_PAGESIZE)
+# elif defined _SC_PAGE_SIZE
+#  define getpagesize() sysconf (_SC_PAGE_SIZE)
+# elif HAVE_GETPAGESIZE
+int getpagesize ();
+# else
 #  ifdef HAVE_SYS_PARAM_H
 #   include <sys/param.h>
 #   ifdef EXEC_PAGESIZE
@@ -4909,16 +5533,15 @@ char *malloc ();
 #  else /* no HAVE_SYS_PARAM_H */
 #   define getpagesize() 8192	/* punt totally */
 #  endif /* no HAVE_SYS_PARAM_H */
-# endif /* no _SC_PAGESIZE */
-
-#endif /* no HAVE_GETPAGESIZE */
+# endif
+#endif
 
 int
-main ()
+main (void)
 {
   char *data, *data2, *data3;
   const char *cdata2;
-  int i, pagesize;
+  long i, pagesize;
   int fd, fd2;
 
   pagesize = getpagesize ();
@@ -4952,8 +5575,7 @@ main ()
     if (*(data2 + i))
       return 7;
   close (fd2);
-  if (munmap (data2, pagesize))
-    return 8;
+  /* 'return 8;' not currently used.  */
 
   /* Next, try to mmap the file at a fixed address which already has
      something else allocated at it.  If we can, also make sure that
@@ -4987,113 +5609,202 @@ main ()
   return 0;
 }
 _ACEOF
-if ac_fn_c_try_run "$LINENO"; then :
+if ac_fn_c_try_run "$LINENO"
+then :
   ac_cv_func_mmap_fixed_mapped=yes
-else
-  ac_cv_func_mmap_fixed_mapped=no
+else case e in #(
+  e) ac_cv_func_mmap_fixed_mapped=no ;;
+esac
 fi
 rm -f core *.core core.conftest.* gmon.out bb.out conftest$ac_exeext \
-  conftest.$ac_objext conftest.beam conftest.$ac_ext
+  conftest.$ac_objext conftest.beam conftest.$ac_ext ;;
+esac
 fi
-
+ ;;
+esac
 fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_func_mmap_fixed_mapped" >&5
-$as_echo "$ac_cv_func_mmap_fixed_mapped" >&6; }
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_func_mmap_fixed_mapped" >&5
+printf "%s\n" "$ac_cv_func_mmap_fixed_mapped" >&6; }
 if test $ac_cv_func_mmap_fixed_mapped = yes; then
 
-$as_echo "#define HAVE_MMAP 1" >>confdefs.h
+printf "%s\n" "#define HAVE_MMAP 1" >>confdefs.h
 
 fi
 rm -f conftest.mmap conftest.txt
 
-for ac_func in gmtime_r fsync drand48 srand48_deterministic
-do :
-  as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh`
-ac_fn_c_check_func "$LINENO" "$ac_func" "$as_ac_var"
-if eval test \"x\$"$as_ac_var"\" = x"yes"; then :
-  cat >>confdefs.h <<_ACEOF
-#define `$as_echo "HAVE_$ac_func" | $as_tr_cpp` 1
-_ACEOF
+ac_fn_c_check_func "$LINENO" "gmtime_r" "ac_cv_func_gmtime_r"
+if test "x$ac_cv_func_gmtime_r" = xyes
+then :
+  printf "%s\n" "#define HAVE_GMTIME_R 1" >>confdefs.h
+
+fi
+ac_fn_c_check_func "$LINENO" "fsync" "ac_cv_func_fsync"
+if test "x$ac_cv_func_fsync" = xyes
+then :
+  printf "%s\n" "#define HAVE_FSYNC 1" >>confdefs.h
+
+fi
+ac_fn_c_check_func "$LINENO" "drand48" "ac_cv_func_drand48"
+if test "x$ac_cv_func_drand48" = xyes
+then :
+  printf "%s\n" "#define HAVE_DRAND48 1" >>confdefs.h
+
+fi
+ac_fn_c_check_func "$LINENO" "srand48_deterministic" "ac_cv_func_srand48_deterministic"
+if test "x$ac_cv_func_srand48_deterministic" = xyes
+then :
+  printf "%s\n" "#define HAVE_SRAND48_DETERMINISTIC 1" >>confdefs.h
 
 fi
-done
 
 
 # Darwin has a dubious fdatasync() symbol, but no declaration in <unistd.h>
-as_ac_Symbol=`$as_echo "ac_cv_have_decl_fdatasync(int)" | $as_tr_sh`
-ac_fn_c_check_decl "$LINENO" "fdatasync(int)" "$as_ac_Symbol" "$ac_includes_default"
-if eval test \"x\$"$as_ac_Symbol"\" = x"yes"; then :
-  for ac_func in fdatasync
-do :
+as_ac_Symbol=`printf "%s\n" "ac_cv_have_decl_fdatasync(int)" | sed "$as_sed_sh"`
+ac_fn_check_decl "$LINENO" "fdatasync(int)" "$as_ac_Symbol" "$ac_includes_default" "$ac_c_undeclared_builtin_options" "CFLAGS"
+if eval test \"x\$"$as_ac_Symbol"\" = x"yes"
+then :
   ac_fn_c_check_func "$LINENO" "fdatasync" "ac_cv_func_fdatasync"
-if test "x$ac_cv_func_fdatasync" = xyes; then :
-  cat >>confdefs.h <<_ACEOF
-#define HAVE_FDATASYNC 1
-_ACEOF
+if test "x$ac_cv_func_fdatasync" = xyes
+then :
+  printf "%s\n" "#define HAVE_FDATASYNC 1" >>confdefs.h
 
 fi
-done
 
 fi
 
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for __attribute__((constructor))" >&5
+printf %s "checking for __attribute__((constructor))... " >&6; }
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+  static __attribute__((constructor)) void noop(void) {}
+
+int
+main (void)
+{
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"
+then :
+
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+printf "%s\n" "yes" >&6; }
+
+printf "%s\n" "#define HAVE_ATTRIBUTE_CONSTRUCTOR 1" >>confdefs.h
+
+
+else case e in #(
+  e) { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
+printf "%s\n" "no" >&6; } ;;
+esac
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.beam \
+    conftest$ac_exeext conftest.$ac_ext
+
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for clock_gettime with CLOCK_PROCESS_CPUTIME_ID" >&5
+printf %s "checking for clock_gettime with CLOCK_PROCESS_CPUTIME_ID... " >&6; }
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <time.h>
+int
+main (void)
+{
+
+  struct timespec ts;
+  clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts);
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"
+then :
+
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+printf "%s\n" "yes" >&6; }
+
+printf "%s\n" "#define HAVE_CLOCK_GETTIME_CPUTIME 1" >>confdefs.h
+
+
+else case e in #(
+  e) { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
+printf "%s\n" "no" >&6; } ;;
+esac
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.beam \
+    conftest$ac_exeext conftest.$ac_ext
 
 if test $enable_plugins != no; then
-  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for library containing dlsym" >&5
-$as_echo_n "checking for library containing dlsym... " >&6; }
-if ${ac_cv_search_dlsym+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  ac_func_search_save_LIBS=$LIBS
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for library containing dlsym" >&5
+printf %s "checking for library containing dlsym... " >&6; }
+if test ${ac_cv_search_dlsym+y}
+then :
+  printf %s "(cached) " >&6
+else case e in #(
+  e) ac_func_search_save_LIBS=$LIBS
 cat confdefs.h - <<_ACEOF >conftest.$ac_ext
 /* end confdefs.h.  */
 
 /* Override any GCC internal prototype to avoid an error.
    Use char because int might match the return type of a GCC
-   builtin and then its argument prototype would still apply.  */
+   builtin and then its argument prototype would still apply.
+   The 'extern "C"' is for builds by C++ compilers;
+   although this is not generally supported in C code supporting it here
+   has little cost and some practical benefit (sr 110532).  */
 #ifdef __cplusplus
 extern "C"
 #endif
-char dlsym ();
+char dlsym (void);
 int
-main ()
+main (void)
 {
 return dlsym ();
   ;
   return 0;
 }
 _ACEOF
-for ac_lib in '' dl; do
+for ac_lib in '' dl
+do
   if test -z "$ac_lib"; then
     ac_res="none required"
   else
     ac_res=-l$ac_lib
     LIBS="-l$ac_lib  $ac_func_search_save_LIBS"
   fi
-  if ac_fn_c_try_link "$LINENO"; then :
+  if ac_fn_c_try_link "$LINENO"
+then :
   ac_cv_search_dlsym=$ac_res
 fi
-rm -f core conftest.err conftest.$ac_objext \
+rm -f core conftest.err conftest.$ac_objext conftest.beam \
     conftest$ac_exeext
-  if ${ac_cv_search_dlsym+:} false; then :
+  if test ${ac_cv_search_dlsym+y}
+then :
   break
 fi
 done
-if ${ac_cv_search_dlsym+:} false; then :
+if test ${ac_cv_search_dlsym+y}
+then :
 
-else
-  ac_cv_search_dlsym=no
+else case e in #(
+  e) ac_cv_search_dlsym=no ;;
+esac
 fi
 rm conftest.$ac_ext
-LIBS=$ac_func_search_save_LIBS
+LIBS=$ac_func_search_save_LIBS ;;
+esac
 fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_search_dlsym" >&5
-$as_echo "$ac_cv_search_dlsym" >&6; }
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_search_dlsym" >&5
+printf "%s\n" "$ac_cv_search_dlsym" >&6; }
 ac_res=$ac_cv_search_dlsym
-if test "$ac_res" != no; then :
+if test "$ac_res" != no
+then :
   test "$ac_res" = "none required" || LIBS="$ac_res $LIBS"
 
-else
-  cat > config.mk <<'EOF'
+else case e in #(
+  e) cat > config.mk <<'EOF'
 ifneq ($(MAKECMDGOALS),distclean)
 $(error Resolve configure error first)
 endif
@@ -5101,17 +5812,19 @@ EOF
    as_fn_error $? "dlsym() not found
 
 Plugin support requires dynamic linking facilities from the operating system.
-Either configure with --disable-plugins or resolve this error to build HTSlib." "$LINENO" 5
+Either configure with --disable-plugins or resolve this error to build HTSlib." "$LINENO" 5 ;;
+esac
 fi
 
   # Check if the compiler understands -rdynamic
   # TODO Test whether this is required and/or needs tweaking per-platform
-    { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether the compiler accepts -rdynamic" >&5
-$as_echo_n "checking whether the compiler accepts -rdynamic... " >&6; }
-if ${hts_cv_check__rdynamic+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  ac_check_save_cflags=$CFLAGS
+    { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether the compiler accepts -rdynamic" >&5
+printf %s "checking whether the compiler accepts -rdynamic... " >&6; }
+if test ${hts_cv_check__rdynamic+y}
+then :
+  printf %s "(cached) " >&6
+else case e in #(
+  e) ac_check_save_cflags=$CFLAGS
     ac_check_save_ldflags=$LDFLAGS
     CFLAGS="$CFLAGS -rdynamic"
     LDFLAGS="$LDFLAGS -rdynamic"
@@ -5119,30 +5832,35 @@ else
 /* end confdefs.h.  */
 
 int
-main ()
+main (void)
 {
 
   ;
   return 0;
 }
 _ACEOF
-if ac_fn_c_try_link "$LINENO"; then :
+if ac_fn_c_try_link "$LINENO"
+then :
   hts_cv_check__rdynamic=yes
-       if test "xrdynamic_flag" != x; then :
+       if test "xrdynamic_flag" != x
+then :
   eval rdynamic_flag="-rdynamic"
 fi
-else
-  hts_cv_check__rdynamic=no
+else case e in #(
+  e) hts_cv_check__rdynamic=no ;;
+esac
 fi
-rm -f core conftest.err conftest.$ac_objext \
+rm -f core conftest.err conftest.$ac_objext conftest.beam \
     conftest$ac_exeext conftest.$ac_ext
     CFLAGS=$ac_check_save_cflags
-    LDFLAGS=$ac_check_save_ldflags
+    LDFLAGS=$ac_check_save_ldflags ;;
+esac
 fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $hts_cv_check__rdynamic" >&5
-$as_echo "$hts_cv_check__rdynamic" >&6; }
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $hts_cv_check__rdynamic" >&5
+printf "%s\n" "$hts_cv_check__rdynamic" >&6; }
 
-  if test x"$rdynamic_flag" != "xno"; then :
+  if test x"$rdynamic_flag" != "xno"
+then :
   LDFLAGS="$LDFLAGS $rdynamic_flag"
      static_LDFLAGS="$static_LDFLAGS $rdynamic_flag"
 fi
@@ -5150,72 +5868,81 @@ fi
     -l*) static_LIBS="$static_LIBS $ac_cv_search_dlsym" ;;
   esac
 
-$as_echo "#define ENABLE_PLUGINS 1" >>confdefs.h
+printf "%s\n" "#define ENABLE_PLUGINS 1" >>confdefs.h
 
 
 
-cat >>confdefs.h <<_ACEOF
-#define PLUGIN_EXT "$PLUGIN_EXT"
-_ACEOF
+printf "%s\n" "#define PLUGIN_EXT \"$PLUGIN_EXT\"" >>confdefs.h
 
 fi
 
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for library containing log" >&5
-$as_echo_n "checking for library containing log... " >&6; }
-if ${ac_cv_search_log+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  ac_func_search_save_LIBS=$LIBS
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for library containing log" >&5
+printf %s "checking for library containing log... " >&6; }
+if test ${ac_cv_search_log+y}
+then :
+  printf %s "(cached) " >&6
+else case e in #(
+  e) ac_func_search_save_LIBS=$LIBS
 cat confdefs.h - <<_ACEOF >conftest.$ac_ext
 /* end confdefs.h.  */
 
 /* Override any GCC internal prototype to avoid an error.
    Use char because int might match the return type of a GCC
-   builtin and then its argument prototype would still apply.  */
+   builtin and then its argument prototype would still apply.
+   The 'extern "C"' is for builds by C++ compilers;
+   although this is not generally supported in C code supporting it here
+   has little cost and some practical benefit (sr 110532).  */
 #ifdef __cplusplus
 extern "C"
 #endif
-char log ();
+char log (void);
 int
-main ()
+main (void)
 {
 return log ();
   ;
   return 0;
 }
 _ACEOF
-for ac_lib in '' m; do
+for ac_lib in '' m
+do
   if test -z "$ac_lib"; then
     ac_res="none required"
   else
     ac_res=-l$ac_lib
     LIBS="-l$ac_lib  $ac_func_search_save_LIBS"
   fi
-  if ac_fn_c_try_link "$LINENO"; then :
+  if ac_fn_c_try_link "$LINENO"
+then :
   ac_cv_search_log=$ac_res
 fi
-rm -f core conftest.err conftest.$ac_objext \
+rm -f core conftest.err conftest.$ac_objext conftest.beam \
     conftest$ac_exeext
-  if ${ac_cv_search_log+:} false; then :
+  if test ${ac_cv_search_log+y}
+then :
   break
 fi
 done
-if ${ac_cv_search_log+:} false; then :
+if test ${ac_cv_search_log+y}
+then :
 
-else
-  ac_cv_search_log=no
+else case e in #(
+  e) ac_cv_search_log=no ;;
+esac
 fi
 rm conftest.$ac_ext
-LIBS=$ac_func_search_save_LIBS
+LIBS=$ac_func_search_save_LIBS ;;
+esac
 fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_search_log" >&5
-$as_echo "$ac_cv_search_log" >&6; }
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_search_log" >&5
+printf "%s\n" "$ac_cv_search_log" >&6; }
 ac_res=$ac_cv_search_log
-if test "$ac_res" != no; then :
+if test "$ac_res" != no
+then :
   test "$ac_res" = "none required" || LIBS="$ac_res $LIBS"
 
-else
-  cat > config.mk <<'EOF'
+else case e in #(
+  e) cat > config.mk <<'EOF'
 ifneq ($(MAKECMDGOALS),distclean)
 $(error Resolve configure error first)
 endif
@@ -5223,65 +5950,74 @@ EOF
    as_fn_error $? "log() not found
 
 HTSLIB requires a working floating-point math library.
-FAILED.  This error must be resolved in order to build HTSlib successfully." "$LINENO" 5
+FAILED.  This error must be resolved in order to build HTSlib successfully." "$LINENO" 5 ;;
+esac
 fi
 
 
 zlib_devel=ok
 ac_fn_c_check_header_compile "$LINENO" "zlib.h" "ac_cv_header_zlib_h" ";
 "
-if test "x$ac_cv_header_zlib_h" = xyes; then :
+if test "x$ac_cv_header_zlib_h" = xyes
+then :
 
-else
-  zlib_devel=missing
+else case e in #(
+  e) zlib_devel=missing ;;
+esac
 fi
 
-
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for inflate in -lz" >&5
-$as_echo_n "checking for inflate in -lz... " >&6; }
-if ${ac_cv_lib_z_inflate+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  ac_check_lib_save_LIBS=$LIBS
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for inflate in -lz" >&5
+printf %s "checking for inflate in -lz... " >&6; }
+if test ${ac_cv_lib_z_inflate+y}
+then :
+  printf %s "(cached) " >&6
+else case e in #(
+  e) ac_check_lib_save_LIBS=$LIBS
 LIBS="-lz  $LIBS"
 cat confdefs.h - <<_ACEOF >conftest.$ac_ext
 /* end confdefs.h.  */
 
 /* Override any GCC internal prototype to avoid an error.
    Use char because int might match the return type of a GCC
-   builtin and then its argument prototype would still apply.  */
+   builtin and then its argument prototype would still apply.
+   The 'extern "C"' is for builds by C++ compilers;
+   although this is not generally supported in C code supporting it here
+   has little cost and some practical benefit (sr 110532).  */
 #ifdef __cplusplus
 extern "C"
 #endif
-char inflate ();
+char inflate (void);
 int
-main ()
+main (void)
 {
 return inflate ();
   ;
   return 0;
 }
 _ACEOF
-if ac_fn_c_try_link "$LINENO"; then :
+if ac_fn_c_try_link "$LINENO"
+then :
   ac_cv_lib_z_inflate=yes
-else
-  ac_cv_lib_z_inflate=no
+else case e in #(
+  e) ac_cv_lib_z_inflate=no ;;
+esac
 fi
-rm -f core conftest.err conftest.$ac_objext \
+rm -f core conftest.err conftest.$ac_objext conftest.beam \
     conftest$ac_exeext conftest.$ac_ext
-LIBS=$ac_check_lib_save_LIBS
+LIBS=$ac_check_lib_save_LIBS ;;
+esac
 fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_z_inflate" >&5
-$as_echo "$ac_cv_lib_z_inflate" >&6; }
-if test "x$ac_cv_lib_z_inflate" = xyes; then :
-  cat >>confdefs.h <<_ACEOF
-#define HAVE_LIBZ 1
-_ACEOF
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_z_inflate" >&5
+printf "%s\n" "$ac_cv_lib_z_inflate" >&6; }
+if test "x$ac_cv_lib_z_inflate" = xyes
+then :
+  printf "%s\n" "#define HAVE_LIBZ 1" >>confdefs.h
 
   LIBS="-lz $LIBS"
 
-else
-  zlib_devel=missing
+else case e in #(
+  e) zlib_devel=missing ;;
+esac
 fi
 
 
@@ -5302,95 +6038,109 @@ is installed.
 FAILED.  This error must be resolved in order to build HTSlib successfully." "$LINENO" 5
 fi
 
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for library containing recv" >&5
-$as_echo_n "checking for library containing recv... " >&6; }
-if ${ac_cv_search_recv+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  ac_func_search_save_LIBS=$LIBS
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for library containing recv" >&5
+printf %s "checking for library containing recv... " >&6; }
+if test ${ac_cv_search_recv+y}
+then :
+  printf %s "(cached) " >&6
+else case e in #(
+  e) ac_func_search_save_LIBS=$LIBS
 cat confdefs.h - <<_ACEOF >conftest.$ac_ext
 /* end confdefs.h.  */
 
 /* Override any GCC internal prototype to avoid an error.
    Use char because int might match the return type of a GCC
-   builtin and then its argument prototype would still apply.  */
+   builtin and then its argument prototype would still apply.
+   The 'extern "C"' is for builds by C++ compilers;
+   although this is not generally supported in C code supporting it here
+   has little cost and some practical benefit (sr 110532).  */
 #ifdef __cplusplus
 extern "C"
 #endif
-char recv ();
+char recv (void);
 int
-main ()
+main (void)
 {
 return recv ();
   ;
   return 0;
 }
 _ACEOF
-for ac_lib in '' socket ws2_32; do
+for ac_lib in '' socket ws2_32
+do
   if test -z "$ac_lib"; then
     ac_res="none required"
   else
     ac_res=-l$ac_lib
     LIBS="-l$ac_lib  $ac_func_search_save_LIBS"
   fi
-  if ac_fn_c_try_link "$LINENO"; then :
+  if ac_fn_c_try_link "$LINENO"
+then :
   ac_cv_search_recv=$ac_res
 fi
-rm -f core conftest.err conftest.$ac_objext \
+rm -f core conftest.err conftest.$ac_objext conftest.beam \
     conftest$ac_exeext
-  if ${ac_cv_search_recv+:} false; then :
+  if test ${ac_cv_search_recv+y}
+then :
   break
 fi
 done
-if ${ac_cv_search_recv+:} false; then :
+if test ${ac_cv_search_recv+y}
+then :
 
-else
-  ac_cv_search_recv=no
+else case e in #(
+  e) ac_cv_search_recv=no ;;
+esac
 fi
 rm conftest.$ac_ext
-LIBS=$ac_func_search_save_LIBS
+LIBS=$ac_func_search_save_LIBS ;;
+esac
 fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_search_recv" >&5
-$as_echo "$ac_cv_search_recv" >&6; }
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_search_recv" >&5
+printf "%s\n" "$ac_cv_search_recv" >&6; }
 ac_res=$ac_cv_search_recv
-if test "$ac_res" != no; then :
+if test "$ac_res" != no
+then :
   test "$ac_res" = "none required" || LIBS="$ac_res $LIBS"
 
 if test "$ac_cv_search_recv" != "none required"
 then
   static_LIBS="$static_LIBS $ac_cv_search_recv"
 fi
-else
-    { $as_echo "$as_me:${as_lineno-$LINENO}: checking for library containing recv using declaration" >&5
-$as_echo_n "checking for library containing recv using declaration... " >&6; }
+else case e in #(
+  e)   { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for library containing recv using declaration" >&5
+printf %s "checking for library containing recv using declaration... " >&6; }
    LIBS="-lws2_32 $LIBS"
    cat confdefs.h - <<_ACEOF >conftest.$ac_ext
 /* end confdefs.h.  */
 #include <winsock2.h>
 int
-main ()
+main (void)
 {
 recv(0, 0, 0, 0);
   ;
   return 0;
 }
 _ACEOF
-if ac_fn_c_try_link "$LINENO"; then :
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: -lws2_32" >&5
-$as_echo "-lws2_32" >&6; }
+if ac_fn_c_try_link "$LINENO"
+then :
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: -lws2_32" >&5
+printf "%s\n" "-lws2_32" >&6; }
       static_LIBS="$static_LIBS -lws2_32"
-else
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
+else case e in #(
+  e) { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
+printf "%s\n" "no" >&6; }
       cat > config.mk <<'EOF'
 ifneq ($(MAKECMDGOALS),distclean)
 $(error Resolve configure error first)
 endif
 EOF
-   as_fn_error $? "unable to find the recv() function" "$LINENO" 5
+   as_fn_error $? "unable to find the recv() function" "$LINENO" 5 ;;
+esac
 fi
-rm -f core conftest.err conftest.$ac_objext \
-    conftest$ac_exeext conftest.$ac_ext
+rm -f core conftest.err conftest.$ac_objext conftest.beam \
+    conftest$ac_exeext conftest.$ac_ext ;;
+esac
 fi
 
 
@@ -5398,58 +6148,66 @@ if test "$enable_bz2" != no; then
   bz2_devel=ok
   ac_fn_c_check_header_compile "$LINENO" "bzlib.h" "ac_cv_header_bzlib_h" ";
 "
-if test "x$ac_cv_header_bzlib_h" = xyes; then :
+if test "x$ac_cv_header_bzlib_h" = xyes
+then :
 
-else
-  bz2_devel=missing
+else case e in #(
+  e) bz2_devel=missing ;;
+esac
 fi
 
-
-  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for BZ2_bzBuffToBuffCompress in -lbz2" >&5
-$as_echo_n "checking for BZ2_bzBuffToBuffCompress in -lbz2... " >&6; }
-if ${ac_cv_lib_bz2_BZ2_bzBuffToBuffCompress+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  ac_check_lib_save_LIBS=$LIBS
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for BZ2_bzBuffToBuffCompress in -lbz2" >&5
+printf %s "checking for BZ2_bzBuffToBuffCompress in -lbz2... " >&6; }
+if test ${ac_cv_lib_bz2_BZ2_bzBuffToBuffCompress+y}
+then :
+  printf %s "(cached) " >&6
+else case e in #(
+  e) ac_check_lib_save_LIBS=$LIBS
 LIBS="-lbz2  $LIBS"
 cat confdefs.h - <<_ACEOF >conftest.$ac_ext
 /* end confdefs.h.  */
 
 /* Override any GCC internal prototype to avoid an error.
    Use char because int might match the return type of a GCC
-   builtin and then its argument prototype would still apply.  */
+   builtin and then its argument prototype would still apply.
+   The 'extern "C"' is for builds by C++ compilers;
+   although this is not generally supported in C code supporting it here
+   has little cost and some practical benefit (sr 110532).  */
 #ifdef __cplusplus
 extern "C"
 #endif
-char BZ2_bzBuffToBuffCompress ();
+char BZ2_bzBuffToBuffCompress (void);
 int
-main ()
+main (void)
 {
 return BZ2_bzBuffToBuffCompress ();
   ;
   return 0;
 }
 _ACEOF
-if ac_fn_c_try_link "$LINENO"; then :
+if ac_fn_c_try_link "$LINENO"
+then :
   ac_cv_lib_bz2_BZ2_bzBuffToBuffCompress=yes
-else
-  ac_cv_lib_bz2_BZ2_bzBuffToBuffCompress=no
+else case e in #(
+  e) ac_cv_lib_bz2_BZ2_bzBuffToBuffCompress=no ;;
+esac
 fi
-rm -f core conftest.err conftest.$ac_objext \
+rm -f core conftest.err conftest.$ac_objext conftest.beam \
     conftest$ac_exeext conftest.$ac_ext
-LIBS=$ac_check_lib_save_LIBS
+LIBS=$ac_check_lib_save_LIBS ;;
+esac
 fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_bz2_BZ2_bzBuffToBuffCompress" >&5
-$as_echo "$ac_cv_lib_bz2_BZ2_bzBuffToBuffCompress" >&6; }
-if test "x$ac_cv_lib_bz2_BZ2_bzBuffToBuffCompress" = xyes; then :
-  cat >>confdefs.h <<_ACEOF
-#define HAVE_LIBBZ2 1
-_ACEOF
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_bz2_BZ2_bzBuffToBuffCompress" >&5
+printf "%s\n" "$ac_cv_lib_bz2_BZ2_bzBuffToBuffCompress" >&6; }
+if test "x$ac_cv_lib_bz2_BZ2_bzBuffToBuffCompress" = xyes
+then :
+  printf "%s\n" "#define HAVE_LIBBZ2 1" >>confdefs.h
 
   LIBS="-lbz2 $LIBS"
 
-else
-  bz2_devel=missing
+else case e in #(
+  e) bz2_devel=missing ;;
+esac
 fi
 
   if test $bz2_devel != ok; then
@@ -5481,66 +6239,72 @@ fi
 
 if test "$enable_lzma" != no; then
   lzma_devel=ok
-  for ac_header in lzma.h
+         for ac_header in lzma.h
 do :
   ac_fn_c_check_header_compile "$LINENO" "lzma.h" "ac_cv_header_lzma_h" ";
 "
-if test "x$ac_cv_header_lzma_h" = xyes; then :
-  cat >>confdefs.h <<_ACEOF
-#define HAVE_LZMA_H 1
-_ACEOF
+if test "x$ac_cv_header_lzma_h" = xyes
+then :
+  printf "%s\n" "#define HAVE_LZMA_H 1" >>confdefs.h
 
-else
-  lzma_devel=header-missing
+else case e in #(
+  e) lzma_devel=header-missing ;;
+esac
 fi
 
 done
-
-  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for lzma_easy_buffer_encode in -llzma" >&5
-$as_echo_n "checking for lzma_easy_buffer_encode in -llzma... " >&6; }
-if ${ac_cv_lib_lzma_lzma_easy_buffer_encode+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  ac_check_lib_save_LIBS=$LIBS
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for lzma_easy_buffer_encode in -llzma" >&5
+printf %s "checking for lzma_easy_buffer_encode in -llzma... " >&6; }
+if test ${ac_cv_lib_lzma_lzma_easy_buffer_encode+y}
+then :
+  printf %s "(cached) " >&6
+else case e in #(
+  e) ac_check_lib_save_LIBS=$LIBS
 LIBS="-llzma  $LIBS"
 cat confdefs.h - <<_ACEOF >conftest.$ac_ext
 /* end confdefs.h.  */
 
 /* Override any GCC internal prototype to avoid an error.
    Use char because int might match the return type of a GCC
-   builtin and then its argument prototype would still apply.  */
+   builtin and then its argument prototype would still apply.
+   The 'extern "C"' is for builds by C++ compilers;
+   although this is not generally supported in C code supporting it here
+   has little cost and some practical benefit (sr 110532).  */
 #ifdef __cplusplus
 extern "C"
 #endif
-char lzma_easy_buffer_encode ();
+char lzma_easy_buffer_encode (void);
 int
-main ()
+main (void)
 {
 return lzma_easy_buffer_encode ();
   ;
   return 0;
 }
 _ACEOF
-if ac_fn_c_try_link "$LINENO"; then :
+if ac_fn_c_try_link "$LINENO"
+then :
   ac_cv_lib_lzma_lzma_easy_buffer_encode=yes
-else
-  ac_cv_lib_lzma_lzma_easy_buffer_encode=no
+else case e in #(
+  e) ac_cv_lib_lzma_lzma_easy_buffer_encode=no ;;
+esac
 fi
-rm -f core conftest.err conftest.$ac_objext \
+rm -f core conftest.err conftest.$ac_objext conftest.beam \
     conftest$ac_exeext conftest.$ac_ext
-LIBS=$ac_check_lib_save_LIBS
+LIBS=$ac_check_lib_save_LIBS ;;
+esac
 fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_lzma_lzma_easy_buffer_encode" >&5
-$as_echo "$ac_cv_lib_lzma_lzma_easy_buffer_encode" >&6; }
-if test "x$ac_cv_lib_lzma_lzma_easy_buffer_encode" = xyes; then :
-  cat >>confdefs.h <<_ACEOF
-#define HAVE_LIBLZMA 1
-_ACEOF
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_lzma_lzma_easy_buffer_encode" >&5
+printf "%s\n" "$ac_cv_lib_lzma_lzma_easy_buffer_encode" >&6; }
+if test "x$ac_cv_lib_lzma_lzma_easy_buffer_encode" = xyes
+then :
+  printf "%s\n" "#define HAVE_LIBLZMA 1" >>confdefs.h
 
   LIBS="-llzma $LIBS"
 
-else
-  lzma_devel=missing
+else case e in #(
+  e) lzma_devel=missing ;;
+esac
 fi
 
   if test $lzma_devel = missing; then
@@ -5566,69 +6330,81 @@ produced elsewhere unreadable) or resolve this error to build HTSlib." "$LINENO"
   static_LIBS="$static_LIBS -llzma"
 fi
 
-if test "x$with_external_htscodecs" != "xno"; then :
+if test "x$with_external_htscodecs" != "xno"
+then :
   libhtscodecs=ok
    ac_fn_c_check_header_compile "$LINENO" "htscodecs/rANS_static4x16.h" "ac_cv_header_htscodecs_rANS_static4x16_h" ";
 "
-if test "x$ac_cv_header_htscodecs_rANS_static4x16_h" = xyes; then :
+if test "x$ac_cv_header_htscodecs_rANS_static4x16_h" = xyes
+then :
 
-else
-  libhtscodecs='missing header'
+else case e in #(
+  e) libhtscodecs='missing header' ;;
+esac
 fi
 
-
-   { $as_echo "$as_me:${as_lineno-$LINENO}: checking for rans_compress_bound_4x16 in -lhtscodecs" >&5
-$as_echo_n "checking for rans_compress_bound_4x16 in -lhtscodecs... " >&6; }
-if ${ac_cv_lib_htscodecs_rans_compress_bound_4x16+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  ac_check_lib_save_LIBS=$LIBS
+   { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for rans_compress_bound_4x16 in -lhtscodecs" >&5
+printf %s "checking for rans_compress_bound_4x16 in -lhtscodecs... " >&6; }
+if test ${ac_cv_lib_htscodecs_rans_compress_bound_4x16+y}
+then :
+  printf %s "(cached) " >&6
+else case e in #(
+  e) ac_check_lib_save_LIBS=$LIBS
 LIBS="-lhtscodecs  $LIBS"
 cat confdefs.h - <<_ACEOF >conftest.$ac_ext
 /* end confdefs.h.  */
 
 /* Override any GCC internal prototype to avoid an error.
    Use char because int might match the return type of a GCC
-   builtin and then its argument prototype would still apply.  */
+   builtin and then its argument prototype would still apply.
+   The 'extern "C"' is for builds by C++ compilers;
+   although this is not generally supported in C code supporting it here
+   has little cost and some practical benefit (sr 110532).  */
 #ifdef __cplusplus
 extern "C"
 #endif
-char rans_compress_bound_4x16 ();
+char rans_compress_bound_4x16 (void);
 int
-main ()
+main (void)
 {
 return rans_compress_bound_4x16 ();
   ;
   return 0;
 }
 _ACEOF
-if ac_fn_c_try_link "$LINENO"; then :
+if ac_fn_c_try_link "$LINENO"
+then :
   ac_cv_lib_htscodecs_rans_compress_bound_4x16=yes
-else
-  ac_cv_lib_htscodecs_rans_compress_bound_4x16=no
+else case e in #(
+  e) ac_cv_lib_htscodecs_rans_compress_bound_4x16=no ;;
+esac
 fi
-rm -f core conftest.err conftest.$ac_objext \
+rm -f core conftest.err conftest.$ac_objext conftest.beam \
     conftest$ac_exeext conftest.$ac_ext
-LIBS=$ac_check_lib_save_LIBS
+LIBS=$ac_check_lib_save_LIBS ;;
+esac
 fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_htscodecs_rans_compress_bound_4x16" >&5
-$as_echo "$ac_cv_lib_htscodecs_rans_compress_bound_4x16" >&6; }
-if test "x$ac_cv_lib_htscodecs_rans_compress_bound_4x16" = xyes; then :
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_htscodecs_rans_compress_bound_4x16" >&5
+printf "%s\n" "$ac_cv_lib_htscodecs_rans_compress_bound_4x16" >&6; }
+if test "x$ac_cv_lib_htscodecs_rans_compress_bound_4x16" = xyes
+then :
   :
-else
-  libhtscodecs='missing library'
+else case e in #(
+  e) libhtscodecs='missing library' ;;
+esac
 fi
 
-   if test "$libhtscodecs" = "ok"; then :
+   if test "$libhtscodecs" = "ok"
+then :
 
-$as_echo "#define HAVE_EXTERNAL_LIBHTSCODECS 1" >>confdefs.h
+printf "%s\n" "#define HAVE_EXTERNAL_LIBHTSCODECS 1" >>confdefs.h
 
       LIBS="-lhtscodecs $LIBS"
       private_LIBS="-lhtscodecs $private_LIBS"
       static_LIBS="-lhtscodecs $static_LIBS"
       selected_htscodecs_mk="htscodecs_external.mk"
-else
-  cat > config.mk <<'EOF'
+else case e in #(
+  e) cat > config.mk <<'EOF'
 ifneq ($(MAKECMDGOALS),distclean)
 $(error Resolve configure error first)
 endif
@@ -5640,19 +6416,22 @@ required header / library files.  You either need to supply these and
 if necessary set CPPFLAGS and LDFLAGS so the compiler can find them;
 or configure using --without-external-htscodecs to build the required
 functions from the htscodecs submodule.
-" "$LINENO" 5
+" "$LINENO" 5 ;;
+esac
 fi
-else
-  { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether htscodecs files are present" >&5
-$as_echo_n "checking whether htscodecs files are present... " >&6; }
-   if test -e "$srcdir/htscodecs/htscodecs/rANS_static4x16.h"; then :
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
-$as_echo "yes" >&6; }
+else case e in #(
+  e) { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether htscodecs files are present" >&5
+printf %s "checking whether htscodecs files are present... " >&6; }
+   if test -e "$srcdir/htscodecs/htscodecs/rANS_static4x16.h"
+then :
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+printf "%s\n" "yes" >&6; }
       selected_htscodecs_mk="htscodecs_bundled.mk"
-else
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
-      if test -e "$srcdir/.git"; then :
+else case e in #(
+  e) { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
+printf "%s\n" "no" >&6; }
+      if test -e "$srcdir/.git"
+then :
   cat > config.mk <<'EOF'
 ifneq ($(MAKECMDGOALS),distclean)
 $(error Resolve configure error first)
@@ -5667,8 +6446,8 @@ included as a submodule.  Try running:
 
 in  the top-level htslib directory to update it, and then re-run configure.
 " "$LINENO" 5
-else
-  cat > config.mk <<'EOF'
+else case e in #(
+  e) cat > config.mk <<'EOF'
 ifneq ($(MAKECMDGOALS),distclean)
 $(error Resolve configure error first)
 endif
@@ -5677,73 +6456,89 @@ EOF
 
 You have an incomplete distribution.  Please try downloading one of the
 official releases from https://www.htslib.org
-" "$LINENO" 5
-fi
-fi
+" "$LINENO" 5 ;;
+esac
+fi ;;
+esac
+fi ;;
+esac
 fi
 
-if test "x$with_libdeflate" != "xno"; then :
+if test "x$with_libdeflate" != "xno"
+then :
   libdeflate=ok
    ac_fn_c_check_header_compile "$LINENO" "libdeflate.h" "ac_cv_header_libdeflate_h" ";
 "
-if test "x$ac_cv_header_libdeflate_h" = xyes; then :
+if test "x$ac_cv_header_libdeflate_h" = xyes
+then :
 
-else
-  libdeflate='missing header'
+else case e in #(
+  e) libdeflate='missing header' ;;
+esac
 fi
 
-
-   { $as_echo "$as_me:${as_lineno-$LINENO}: checking for libdeflate_deflate_compress in -ldeflate" >&5
-$as_echo_n "checking for libdeflate_deflate_compress in -ldeflate... " >&6; }
-if ${ac_cv_lib_deflate_libdeflate_deflate_compress+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  ac_check_lib_save_LIBS=$LIBS
+   { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for libdeflate_deflate_compress in -ldeflate" >&5
+printf %s "checking for libdeflate_deflate_compress in -ldeflate... " >&6; }
+if test ${ac_cv_lib_deflate_libdeflate_deflate_compress+y}
+then :
+  printf %s "(cached) " >&6
+else case e in #(
+  e) ac_check_lib_save_LIBS=$LIBS
 LIBS="-ldeflate  $LIBS"
 cat confdefs.h - <<_ACEOF >conftest.$ac_ext
 /* end confdefs.h.  */
 
 /* Override any GCC internal prototype to avoid an error.
    Use char because int might match the return type of a GCC
-   builtin and then its argument prototype would still apply.  */
+   builtin and then its argument prototype would still apply.
+   The 'extern "C"' is for builds by C++ compilers;
+   although this is not generally supported in C code supporting it here
+   has little cost and some practical benefit (sr 110532).  */
 #ifdef __cplusplus
 extern "C"
 #endif
-char libdeflate_deflate_compress ();
+char libdeflate_deflate_compress (void);
 int
-main ()
+main (void)
 {
 return libdeflate_deflate_compress ();
   ;
   return 0;
 }
 _ACEOF
-if ac_fn_c_try_link "$LINENO"; then :
+if ac_fn_c_try_link "$LINENO"
+then :
   ac_cv_lib_deflate_libdeflate_deflate_compress=yes
-else
-  ac_cv_lib_deflate_libdeflate_deflate_compress=no
+else case e in #(
+  e) ac_cv_lib_deflate_libdeflate_deflate_compress=no ;;
+esac
 fi
-rm -f core conftest.err conftest.$ac_objext \
+rm -f core conftest.err conftest.$ac_objext conftest.beam \
     conftest$ac_exeext conftest.$ac_ext
-LIBS=$ac_check_lib_save_LIBS
+LIBS=$ac_check_lib_save_LIBS ;;
+esac
 fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_deflate_libdeflate_deflate_compress" >&5
-$as_echo "$ac_cv_lib_deflate_libdeflate_deflate_compress" >&6; }
-if test "x$ac_cv_lib_deflate_libdeflate_deflate_compress" = xyes; then :
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_deflate_libdeflate_deflate_compress" >&5
+printf "%s\n" "$ac_cv_lib_deflate_libdeflate_deflate_compress" >&6; }
+if test "x$ac_cv_lib_deflate_libdeflate_deflate_compress" = xyes
+then :
   :
-else
-  libdeflate='missing library'
+else case e in #(
+  e) libdeflate='missing library' ;;
+esac
 fi
 
-   if test "$libdeflate" = "ok"; then :
+   if test "$libdeflate" = "ok"
+then :
 
-$as_echo "#define HAVE_LIBDEFLATE 1" >>confdefs.h
+printf "%s\n" "#define HAVE_LIBDEFLATE 1" >>confdefs.h
 
      LIBS="-ldeflate $LIBS"
      private_LIBS="$private_LIBS -ldeflate"
      static_LIBS="$static_LIBS -ldeflate"
-else
-  if test "x$with_libdeflate" != "xcheck"; then :
+else case e in #(
+  e) if test "x$with_libdeflate" != "xcheck"
+then :
   cat > config.mk <<'EOF'
 ifneq ($(MAKECMDGOALS),distclean)
 $(error Resolve configure error first)
@@ -5759,7 +6554,8 @@ are not currently on them.
 
 Either configure with --without-libdeflate or resolve this error to build
 HTSlib." "$LINENO" 5
-fi
+fi ;;
+esac
 fi
 fi
 
@@ -5768,105 +6564,124 @@ if test "$enable_libcurl" != no; then
   libcurl_devel=ok
   ac_fn_c_check_header_compile "$LINENO" "curl/curl.h" "ac_cv_header_curl_curl_h" ";
 "
-if test "x$ac_cv_header_curl_curl_h" = xyes; then :
+if test "x$ac_cv_header_curl_curl_h" = xyes
+then :
 
-else
-  libcurl_devel="headers not found"
+else case e in #(
+  e) libcurl_devel="headers not found" ;;
+esac
 fi
 
-
-  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for curl_easy_pause in -lcurl" >&5
-$as_echo_n "checking for curl_easy_pause in -lcurl... " >&6; }
-if ${ac_cv_lib_curl_curl_easy_pause+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  ac_check_lib_save_LIBS=$LIBS
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for curl_easy_pause in -lcurl" >&5
+printf %s "checking for curl_easy_pause in -lcurl... " >&6; }
+if test ${ac_cv_lib_curl_curl_easy_pause+y}
+then :
+  printf %s "(cached) " >&6
+else case e in #(
+  e) ac_check_lib_save_LIBS=$LIBS
 LIBS="-lcurl  $LIBS"
 cat confdefs.h - <<_ACEOF >conftest.$ac_ext
 /* end confdefs.h.  */
 
 /* Override any GCC internal prototype to avoid an error.
    Use char because int might match the return type of a GCC
-   builtin and then its argument prototype would still apply.  */
+   builtin and then its argument prototype would still apply.
+   The 'extern "C"' is for builds by C++ compilers;
+   although this is not generally supported in C code supporting it here
+   has little cost and some practical benefit (sr 110532).  */
 #ifdef __cplusplus
 extern "C"
 #endif
-char curl_easy_pause ();
+char curl_easy_pause (void);
 int
-main ()
+main (void)
 {
 return curl_easy_pause ();
   ;
   return 0;
 }
 _ACEOF
-if ac_fn_c_try_link "$LINENO"; then :
+if ac_fn_c_try_link "$LINENO"
+then :
   ac_cv_lib_curl_curl_easy_pause=yes
-else
-  ac_cv_lib_curl_curl_easy_pause=no
+else case e in #(
+  e) ac_cv_lib_curl_curl_easy_pause=no ;;
+esac
 fi
-rm -f core conftest.err conftest.$ac_objext \
+rm -f core conftest.err conftest.$ac_objext conftest.beam \
     conftest$ac_exeext conftest.$ac_ext
-LIBS=$ac_check_lib_save_LIBS
+LIBS=$ac_check_lib_save_LIBS ;;
+esac
 fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_curl_curl_easy_pause" >&5
-$as_echo "$ac_cv_lib_curl_curl_easy_pause" >&6; }
-if test "x$ac_cv_lib_curl_curl_easy_pause" = xyes; then :
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_curl_curl_easy_pause" >&5
+printf "%s\n" "$ac_cv_lib_curl_curl_easy_pause" >&6; }
+if test "x$ac_cv_lib_curl_curl_easy_pause" = xyes
+then :
   :
-else
-  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for curl_easy_init in -lcurl" >&5
-$as_echo_n "checking for curl_easy_init in -lcurl... " >&6; }
-if ${ac_cv_lib_curl_curl_easy_init+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  ac_check_lib_save_LIBS=$LIBS
+else case e in #(
+  e) { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for curl_easy_init in -lcurl" >&5
+printf %s "checking for curl_easy_init in -lcurl... " >&6; }
+if test ${ac_cv_lib_curl_curl_easy_init+y}
+then :
+  printf %s "(cached) " >&6
+else case e in #(
+  e) ac_check_lib_save_LIBS=$LIBS
 LIBS="-lcurl  $LIBS"
 cat confdefs.h - <<_ACEOF >conftest.$ac_ext
 /* end confdefs.h.  */
 
 /* Override any GCC internal prototype to avoid an error.
    Use char because int might match the return type of a GCC
-   builtin and then its argument prototype would still apply.  */
+   builtin and then its argument prototype would still apply.
+   The 'extern "C"' is for builds by C++ compilers;
+   although this is not generally supported in C code supporting it here
+   has little cost and some practical benefit (sr 110532).  */
 #ifdef __cplusplus
 extern "C"
 #endif
-char curl_easy_init ();
+char curl_easy_init (void);
 int
-main ()
+main (void)
 {
 return curl_easy_init ();
   ;
   return 0;
 }
 _ACEOF
-if ac_fn_c_try_link "$LINENO"; then :
+if ac_fn_c_try_link "$LINENO"
+then :
   ac_cv_lib_curl_curl_easy_init=yes
-else
-  ac_cv_lib_curl_curl_easy_init=no
+else case e in #(
+  e) ac_cv_lib_curl_curl_easy_init=no ;;
+esac
 fi
-rm -f core conftest.err conftest.$ac_objext \
+rm -f core conftest.err conftest.$ac_objext conftest.beam \
     conftest$ac_exeext conftest.$ac_ext
-LIBS=$ac_check_lib_save_LIBS
+LIBS=$ac_check_lib_save_LIBS ;;
+esac
 fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_curl_curl_easy_init" >&5
-$as_echo "$ac_cv_lib_curl_curl_easy_init" >&6; }
-if test "x$ac_cv_lib_curl_curl_easy_init" = xyes; then :
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_curl_curl_easy_init" >&5
+printf "%s\n" "$ac_cv_lib_curl_curl_easy_init" >&6; }
+if test "x$ac_cv_lib_curl_curl_easy_init" = xyes
+then :
   libcurl_devel="library is too old (7.18+ required)"
-else
-  libcurl_devel="library not found"
+else case e in #(
+  e) libcurl_devel="library not found" ;;
+esac
 fi
-
+ ;;
+esac
 fi
 
 
   if test "$libcurl_devel" = ok; then
 
-$as_echo "#define HAVE_LIBCURL 1" >>confdefs.h
+printf "%s\n" "#define HAVE_LIBCURL 1" >>confdefs.h
 
     libcurl=enabled
   elif test "$enable_libcurl" = check; then
-    { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: libcurl not enabled: $libcurl_devel" >&5
-$as_echo "$as_me: WARNING: libcurl not enabled: $libcurl_devel" >&2;}
+    { printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: libcurl not enabled: $libcurl_devel" >&5
+printf "%s\n" "$as_me: WARNING: libcurl not enabled: $libcurl_devel" >&2;}
   else
     cat > config.mk <<'EOF'
 ifneq ($(MAKECMDGOALS),distclean)
@@ -5897,13 +6712,13 @@ gcs=disabled
 if test "$enable_gcs" != no; then
   if test $libcurl = enabled; then
 
-$as_echo "#define ENABLE_GCS 1" >>confdefs.h
+printf "%s\n" "#define ENABLE_GCS 1" >>confdefs.h
 
     gcs=enabled
   else
     case "$enable_gcs" in
-      check) { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: GCS support not enabled: requires libcurl support" >&5
-$as_echo "$as_me: WARNING: GCS support not enabled: requires libcurl support" >&2;} ;;
+      check) { printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: GCS support not enabled: requires libcurl support" >&5
+printf "%s\n" "$as_me: WARNING: GCS support not enabled: requires libcurl support" >&2;} ;;
       *) cat > config.mk <<'EOF'
 ifneq ($(MAKECMDGOALS),distclean)
 $(error Resolve configure error first)
@@ -5926,8 +6741,8 @@ if test "$enable_s3" != no; then
     need_crypto="$enable_s3"
   else
     case "$enable_s3" in
-      check) { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: S3 support not enabled: requires libcurl support" >&5
-$as_echo "$as_me: WARNING: S3 support not enabled: requires libcurl support" >&2;} ;;
+      check) { printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: S3 support not enabled: requires libcurl support" >&5
+printf "%s\n" "$as_me: WARNING: S3 support not enabled: requires libcurl support" >&2;} ;;
       *) cat > config.mk <<'EOF'
 ifneq ($(MAKECMDGOALS),distclean)
 $(error Resolve configure error first)
@@ -5945,75 +6760,87 @@ fi
 CRYPTO_LIBS=
 if test $need_crypto != no; then
   ac_fn_c_check_func "$LINENO" "CCHmac" "ac_cv_func_CCHmac"
-if test "x$ac_cv_func_CCHmac" = xyes; then :
-
-$as_echo "#define HAVE_COMMONCRYPTO 1" >>confdefs.h
-
-else
-  save_LIBS=$LIBS
-     { $as_echo "$as_me:${as_lineno-$LINENO}: checking for library containing HMAC" >&5
-$as_echo_n "checking for library containing HMAC... " >&6; }
-if ${ac_cv_search_HMAC+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  ac_func_search_save_LIBS=$LIBS
+if test "x$ac_cv_func_CCHmac" = xyes
+then :
+
+printf "%s\n" "#define HAVE_COMMONCRYPTO 1" >>confdefs.h
+
+else case e in #(
+  e) save_LIBS=$LIBS
+     { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for library containing HMAC" >&5
+printf %s "checking for library containing HMAC... " >&6; }
+if test ${ac_cv_search_HMAC+y}
+then :
+  printf %s "(cached) " >&6
+else case e in #(
+  e) ac_func_search_save_LIBS=$LIBS
 cat confdefs.h - <<_ACEOF >conftest.$ac_ext
 /* end confdefs.h.  */
 
 /* Override any GCC internal prototype to avoid an error.
    Use char because int might match the return type of a GCC
-   builtin and then its argument prototype would still apply.  */
+   builtin and then its argument prototype would still apply.
+   The 'extern "C"' is for builds by C++ compilers;
+   although this is not generally supported in C code supporting it here
+   has little cost and some practical benefit (sr 110532).  */
 #ifdef __cplusplus
 extern "C"
 #endif
-char HMAC ();
+char HMAC (void);
 int
-main ()
+main (void)
 {
 return HMAC ();
   ;
   return 0;
 }
 _ACEOF
-for ac_lib in '' crypto; do
+for ac_lib in '' crypto
+do
   if test -z "$ac_lib"; then
     ac_res="none required"
   else
     ac_res=-l$ac_lib
     LIBS="-l$ac_lib  $ac_func_search_save_LIBS"
   fi
-  if ac_fn_c_try_link "$LINENO"; then :
+  if ac_fn_c_try_link "$LINENO"
+then :
   ac_cv_search_HMAC=$ac_res
 fi
-rm -f core conftest.err conftest.$ac_objext \
+rm -f core conftest.err conftest.$ac_objext conftest.beam \
     conftest$ac_exeext
-  if ${ac_cv_search_HMAC+:} false; then :
+  if test ${ac_cv_search_HMAC+y}
+then :
   break
 fi
 done
-if ${ac_cv_search_HMAC+:} false; then :
+if test ${ac_cv_search_HMAC+y}
+then :
 
-else
-  ac_cv_search_HMAC=no
+else case e in #(
+  e) ac_cv_search_HMAC=no ;;
+esac
 fi
 rm conftest.$ac_ext
-LIBS=$ac_func_search_save_LIBS
+LIBS=$ac_func_search_save_LIBS ;;
+esac
 fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_search_HMAC" >&5
-$as_echo "$ac_cv_search_HMAC" >&6; }
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_search_HMAC" >&5
+printf "%s\n" "$ac_cv_search_HMAC" >&6; }
 ac_res=$ac_cv_search_HMAC
-if test "$ac_res" != no; then :
+if test "$ac_res" != no
+then :
   test "$ac_res" = "none required" || LIBS="$ac_res $LIBS"
 
-$as_echo "#define HAVE_HMAC 1" >>confdefs.h
+printf "%s\n" "#define HAVE_HMAC 1" >>confdefs.h
 
         case "$ac_cv_search_HMAC" in
           -l*) CRYPTO_LIBS=$ac_cv_search_HMAC ;;
         esac
-else
-  case "$need_crypto" in
-     check) { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: S3 support not enabled: requires SSL development files" >&5
-$as_echo "$as_me: WARNING: S3 support not enabled: requires SSL development files" >&2;}
+else case e in #(
+  e) case "$need_crypto" in
+     check) { printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: S3 support not enabled: requires SSL development files" >&5
+printf "%s\n" "$as_me: WARNING: S3 support not enabled: requires SSL development files" >&2;}
          s3=disabled ;;
      *) cat > config.mk <<'EOF'
 ifneq ($(MAKECMDGOALS),distclean)
@@ -6030,10 +6857,12 @@ libcurl4-*-dev package installed), or openssl-devel (on RPM-based Linux
 distributions or Cygwin) is installed.
 
 Either configure with --disable-s3 or resolve this error to build HTSlib." "$LINENO" 5 ;;
-       esac
+       esac ;;
+esac
 fi
 
-     LIBS=$save_LIBS
+     LIBS=$save_LIBS ;;
+esac
 fi
 
   if test "$enable_plugins" != yes ; then
@@ -6041,58 +6870,69 @@ fi
   fi
 fi
 
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for library containing regcomp" >&5
-$as_echo_n "checking for library containing regcomp... " >&6; }
-if ${ac_cv_search_regcomp+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  ac_func_search_save_LIBS=$LIBS
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for library containing regcomp" >&5
+printf %s "checking for library containing regcomp... " >&6; }
+if test ${ac_cv_search_regcomp+y}
+then :
+  printf %s "(cached) " >&6
+else case e in #(
+  e) ac_func_search_save_LIBS=$LIBS
 cat confdefs.h - <<_ACEOF >conftest.$ac_ext
 /* end confdefs.h.  */
 
 /* Override any GCC internal prototype to avoid an error.
    Use char because int might match the return type of a GCC
-   builtin and then its argument prototype would still apply.  */
+   builtin and then its argument prototype would still apply.
+   The 'extern "C"' is for builds by C++ compilers;
+   although this is not generally supported in C code supporting it here
+   has little cost and some practical benefit (sr 110532).  */
 #ifdef __cplusplus
 extern "C"
 #endif
-char regcomp ();
+char regcomp (void);
 int
-main ()
+main (void)
 {
 return regcomp ();
   ;
   return 0;
 }
 _ACEOF
-for ac_lib in '' regex; do
+for ac_lib in '' regex
+do
   if test -z "$ac_lib"; then
     ac_res="none required"
   else
     ac_res=-l$ac_lib
     LIBS="-l$ac_lib  $ac_func_search_save_LIBS"
   fi
-  if ac_fn_c_try_link "$LINENO"; then :
+  if ac_fn_c_try_link "$LINENO"
+then :
   ac_cv_search_regcomp=$ac_res
 fi
-rm -f core conftest.err conftest.$ac_objext \
+rm -f core conftest.err conftest.$ac_objext conftest.beam \
     conftest$ac_exeext
-  if ${ac_cv_search_regcomp+:} false; then :
+  if test ${ac_cv_search_regcomp+y}
+then :
   break
 fi
 done
-if ${ac_cv_search_regcomp+:} false; then :
+if test ${ac_cv_search_regcomp+y}
+then :
 
-else
-  ac_cv_search_regcomp=no
+else case e in #(
+  e) ac_cv_search_regcomp=no ;;
+esac
 fi
 rm conftest.$ac_ext
-LIBS=$ac_func_search_save_LIBS
+LIBS=$ac_func_search_save_LIBS ;;
+esac
 fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_search_regcomp" >&5
-$as_echo "$ac_cv_search_regcomp" >&6; }
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_search_regcomp" >&5
+printf "%s\n" "$ac_cv_search_regcomp" >&6; }
 ac_res=$ac_cv_search_regcomp
-if test "$ac_res" != no; then :
+if test "$ac_res" != no
+then :
   test "$ac_res" = "none required" || LIBS="$ac_res $LIBS"
   libregex=needed
 fi
@@ -6101,11 +6941,12 @@ fi
 
 if test "$s3" = enabled ; then
 
-$as_echo "#define ENABLE_S3 1" >>confdefs.h
+printf "%s\n" "#define ENABLE_S3 1" >>confdefs.h
 
 fi
 
-if test "x$hts_late_cflags" != x; then :
+if test "x$hts_late_cflags" != x
+then :
   CFLAGS="$CFLAGS $hts_late_cflags"
 fi
 
@@ -6150,8 +6991,8 @@ cat >confcache <<\_ACEOF
 # config.status only pays attention to the cache file if you give it
 # the --recheck option to rerun configure.
 #
-# `ac_cv_env_foo' variables (set or unset) will be overridden when
-# loading this file, other *unset* `ac_cv_foo' will be assigned the
+# 'ac_cv_env_foo' variables (set or unset) will be overridden when
+# loading this file, other *unset* 'ac_cv_foo' will be assigned the
 # following values.
 
 _ACEOF
@@ -6167,8 +7008,8 @@ _ACEOF
     case $ac_val in #(
     *${as_nl}*)
       case $ac_var in #(
-      *_cv_*) { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: cache variable $ac_var contains a newline" >&5
-$as_echo "$as_me: WARNING: cache variable $ac_var contains a newline" >&2;} ;;
+      *_cv_*) { printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: cache variable $ac_var contains a newline" >&5
+printf "%s\n" "$as_me: WARNING: cache variable $ac_var contains a newline" >&2;} ;;
       esac
       case $ac_var in #(
       _ | IFS | as_nl) ;; #(
@@ -6181,14 +7022,14 @@ $as_echo "$as_me: WARNING: cache variable $ac_var contains a newline" >&2;} ;;
   (set) 2>&1 |
     case $as_nl`(ac_space=' '; set) 2>&1` in #(
     *${as_nl}ac_space=\ *)
-      # `set' does not quote correctly, so add quotes: double-quote
+      # 'set' does not quote correctly, so add quotes: double-quote
       # substitution turns \\\\ into \\, and sed turns \\ into \.
       sed -n \
 	"s/'/'\\\\''/g;
 	  s/^\\([_$as_cr_alnum]*_cv_[_$as_cr_alnum]*\\)=\\(.*\\)/\\1='\\2'/p"
       ;; #(
     *)
-      # `set' quotes correctly as required by POSIX, so do not add quotes.
+      # 'set' quotes correctly as required by POSIX, so do not add quotes.
       sed -n "/^[_$as_cr_alnum]*_cv_[_$as_cr_alnum]*=/p"
       ;;
     esac |
@@ -6198,15 +7039,15 @@ $as_echo "$as_me: WARNING: cache variable $ac_var contains a newline" >&2;} ;;
      /^ac_cv_env_/b end
      t clear
      :clear
-     s/^\([^=]*\)=\(.*[{}].*\)$/test "${\1+set}" = set || &/
+     s/^\([^=]*\)=\(.*[{}].*\)$/test ${\1+y} || &/
      t end
      s/^\([^=]*\)=\(.*\)$/\1=${\1=\2}/
      :end' >>confcache
 if diff "$cache_file" confcache >/dev/null 2>&1; then :; else
   if test -w "$cache_file"; then
     if test "x$cache_file" != "x/dev/null"; then
-      { $as_echo "$as_me:${as_lineno-$LINENO}: updating cache $cache_file" >&5
-$as_echo "$as_me: updating cache $cache_file" >&6;}
+      { printf "%s\n" "$as_me:${as_lineno-$LINENO}: updating cache $cache_file" >&5
+printf "%s\n" "$as_me: updating cache $cache_file" >&6;}
       if test ! -f "$cache_file" || test -h "$cache_file"; then
 	cat confcache >"$cache_file"
       else
@@ -6220,8 +7061,8 @@ $as_echo "$as_me: updating cache $cache_file" >&6;}
       fi
     fi
   else
-    { $as_echo "$as_me:${as_lineno-$LINENO}: not updating unwritable cache $cache_file" >&5
-$as_echo "$as_me: not updating unwritable cache $cache_file" >&6;}
+    { printf "%s\n" "$as_me:${as_lineno-$LINENO}: not updating unwritable cache $cache_file" >&5
+printf "%s\n" "$as_me: not updating unwritable cache $cache_file" >&6;}
   fi
 fi
 rm -f confcache
@@ -6238,7 +7079,7 @@ U=
 for ac_i in : $LIBOBJS; do test "x$ac_i" = x: && continue
   # 1. Remove the extension, and $U if already installed.
   ac_script='s/\$U\././;s/\.o$//;s/\.obj$//'
-  ac_i=`$as_echo "$ac_i" | sed "$ac_script"`
+  ac_i=`printf "%s\n" "$ac_i" | sed "$ac_script"`
   # 2. Prepend LIBOBJDIR.  When used with automake>=1.10 LIBOBJDIR
   #    will be set to the directory where LIBOBJS objects are built.
   as_fn_append ac_libobjs " \${LIBOBJDIR}$ac_i\$U.$ac_objext"
@@ -6249,13 +7090,19 @@ LIBOBJS=$ac_libobjs
 LTLIBOBJS=$ac_ltlibobjs
 
 
+# Check whether --enable-year2038 was given.
+if test ${enable_year2038+y}
+then :
+  enableval=$enable_year2038;
+fi
+
 
 : "${CONFIG_STATUS=./config.status}"
 ac_write_fail=0
 ac_clean_files_save=$ac_clean_files
 ac_clean_files="$ac_clean_files $CONFIG_STATUS"
-{ $as_echo "$as_me:${as_lineno-$LINENO}: creating $CONFIG_STATUS" >&5
-$as_echo "$as_me: creating $CONFIG_STATUS" >&6;}
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: creating $CONFIG_STATUS" >&5
+printf "%s\n" "$as_me: creating $CONFIG_STATUS" >&6;}
 as_write_fail=0
 cat >$CONFIG_STATUS <<_ASEOF || as_write_fail=1
 #! $SHELL
@@ -6278,63 +7125,65 @@ cat >>$CONFIG_STATUS <<\_ASEOF || as_write_fail=1
 
 # Be more Bourne compatible
 DUALCASE=1; export DUALCASE # for MKS sh
-if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then :
+if test ${ZSH_VERSION+y} && (emulate sh) >/dev/null 2>&1
+then :
   emulate sh
   NULLCMD=:
   # Pre-4.2 versions of Zsh do word splitting on ${1+"$@"}, which
   # is contrary to our usage.  Disable this feature.
   alias -g '${1+"$@"}'='"$@"'
   setopt NO_GLOB_SUBST
-else
-  case `(set -o) 2>/dev/null` in #(
+else case e in #(
+  e) case `(set -o) 2>/dev/null` in #(
   *posix*) :
     set -o posix ;; #(
   *) :
      ;;
+esac ;;
 esac
 fi
 
 
+
+# Reset variables that may have inherited troublesome values from
+# the environment.
+
+# IFS needs to be set, to space, tab, and newline, in precisely that order.
+# (If _AS_PATH_WALK were called with IFS unset, it would have the
+# side effect of setting IFS to empty, thus disabling word splitting.)
+# Quoting is to prevent editors from complaining about space-tab.
 as_nl='
 '
 export as_nl
-# Printing a long string crashes Solaris 7 /usr/bin/printf.
-as_echo='\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\'
-as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo
-as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo$as_echo
-# Prefer a ksh shell builtin over an external printf program on Solaris,
-# but without wasting forks for bash or zsh.
-if test -z "$BASH_VERSION$ZSH_VERSION" \
-    && (test "X`print -r -- $as_echo`" = "X$as_echo") 2>/dev/null; then
-  as_echo='print -r --'
-  as_echo_n='print -rn --'
-elif (test "X`printf %s $as_echo`" = "X$as_echo") 2>/dev/null; then
-  as_echo='printf %s\n'
-  as_echo_n='printf %s'
-else
-  if test "X`(/usr/ucb/echo -n -n $as_echo) 2>/dev/null`" = "X-n $as_echo"; then
-    as_echo_body='eval /usr/ucb/echo -n "$1$as_nl"'
-    as_echo_n='/usr/ucb/echo -n'
-  else
-    as_echo_body='eval expr "X$1" : "X\\(.*\\)"'
-    as_echo_n_body='eval
-      arg=$1;
-      case $arg in #(
-      *"$as_nl"*)
-	expr "X$arg" : "X\\(.*\\)$as_nl";
-	arg=`expr "X$arg" : ".*$as_nl\\(.*\\)"`;;
-      esac;
-      expr "X$arg" : "X\\(.*\\)" | tr -d "$as_nl"
-    '
-    export as_echo_n_body
-    as_echo_n='sh -c $as_echo_n_body as_echo'
-  fi
-  export as_echo_body
-  as_echo='sh -c $as_echo_body as_echo'
-fi
+IFS=" ""	$as_nl"
+
+PS1='$ '
+PS2='> '
+PS4='+ '
+
+# Ensure predictable behavior from utilities with locale-dependent output.
+LC_ALL=C
+export LC_ALL
+LANGUAGE=C
+export LANGUAGE
+
+# We cannot yet rely on "unset" to work, but we need these variables
+# to be unset--not just set to an empty or harmless value--now, to
+# avoid bugs in old shells (e.g. pre-3.0 UWIN ksh).  This construct
+# also avoids known problems related to "unset" and subshell syntax
+# in other old shells (e.g. bash 2.01 and pdksh 5.2.14).
+for as_var in BASH_ENV ENV MAIL MAILPATH CDPATH
+do eval test \${$as_var+y} \
+  && ( (unset $as_var) || exit 1) >/dev/null 2>&1 && unset $as_var || :
+done
+
+# Ensure that fds 0, 1, and 2 are open.
+if (exec 3>&0) 2>/dev/null; then :; else exec 0</dev/null; fi
+if (exec 3>&1) 2>/dev/null; then :; else exec 1>/dev/null; fi
+if (exec 3>&2)            ; then :; else exec 2>/dev/null; fi
 
 # The user is always right.
-if test "${PATH_SEPARATOR+set}" != set; then
+if ${PATH_SEPARATOR+false} :; then
   PATH_SEPARATOR=:
   (PATH='/bin;/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 && {
     (PATH='/bin:/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 ||
@@ -6343,13 +7192,6 @@ if test "${PATH_SEPARATOR+set}" != set; then
 fi
 
 
-# IFS
-# We need space, tab and new line, in precisely that order.  Quoting is
-# there to prevent editors from complaining about space-tab.
-# (If _AS_PATH_WALK were called with IFS unset, it would disable word
-# splitting by setting IFS to empty value.)
-IFS=" ""	$as_nl"
-
 # Find who we are.  Look in the path if we contain no directory separator.
 as_myself=
 case $0 in #((
@@ -6358,43 +7200,27 @@ case $0 in #((
 for as_dir in $PATH
 do
   IFS=$as_save_IFS
-  test -z "$as_dir" && as_dir=.
-    test -r "$as_dir/$0" && as_myself=$as_dir/$0 && break
+  case $as_dir in #(((
+    '') as_dir=./ ;;
+    */) ;;
+    *) as_dir=$as_dir/ ;;
+  esac
+    test -r "$as_dir$0" && as_myself=$as_dir$0 && break
   done
 IFS=$as_save_IFS
 
      ;;
 esac
-# We did not find ourselves, most probably we were run as `sh COMMAND'
+# We did not find ourselves, most probably we were run as 'sh COMMAND'
 # in which case we are not to be found in the path.
 if test "x$as_myself" = x; then
   as_myself=$0
 fi
 if test ! -f "$as_myself"; then
-  $as_echo "$as_myself: error: cannot find myself; rerun with an absolute file name" >&2
+  printf "%s\n" "$as_myself: error: cannot find myself; rerun with an absolute file name" >&2
   exit 1
 fi
 
-# Unset variables that we do not need and which cause bugs (e.g. in
-# pre-3.0 UWIN ksh).  But do not cause bugs in bash 2.01; the "|| exit 1"
-# suppresses any "Segmentation fault" message there.  '((' could
-# trigger a bug in pdksh 5.2.14.
-for as_var in BASH_ENV ENV MAIL MAILPATH
-do eval test x\${$as_var+set} = xset \
-  && ( (unset $as_var) || exit 1) >/dev/null 2>&1 && unset $as_var || :
-done
-PS1='$ '
-PS2='> '
-PS4='+ '
-
-# NLS nuisances.
-LC_ALL=C
-export LC_ALL
-LANGUAGE=C
-export LANGUAGE
-
-# CDPATH.
-(unset CDPATH) >/dev/null 2>&1 && unset CDPATH
 
 
 # as_fn_error STATUS ERROR [LINENO LOG_FD]
@@ -6407,9 +7233,9 @@ as_fn_error ()
   as_status=$1; test $as_status -eq 0 && as_status=1
   if test "$4"; then
     as_lineno=${as_lineno-"$3"} as_lineno_stack=as_lineno_stack=$as_lineno_stack
-    $as_echo "$as_me:${as_lineno-$LINENO}: error: $2" >&$4
+    printf "%s\n" "$as_me:${as_lineno-$LINENO}: error: $2" >&$4
   fi
-  $as_echo "$as_me: error: $2" >&2
+  printf "%s\n" "$as_me: error: $2" >&2
   as_fn_exit $as_status
 } # as_fn_error
 
@@ -6440,22 +7266,25 @@ as_fn_unset ()
   { eval $1=; unset $1;}
 }
 as_unset=as_fn_unset
+
 # as_fn_append VAR VALUE
 # ----------------------
 # Append the text in VALUE to the end of the definition contained in VAR. Take
 # advantage of any shell optimizations that allow amortized linear growth over
 # repeated appends, instead of the typical quadratic growth present in naive
 # implementations.
-if (eval "as_var=1; as_var+=2; test x\$as_var = x12") 2>/dev/null; then :
+if (eval "as_var=1; as_var+=2; test x\$as_var = x12") 2>/dev/null
+then :
   eval 'as_fn_append ()
   {
     eval $1+=\$2
   }'
-else
-  as_fn_append ()
+else case e in #(
+  e) as_fn_append ()
   {
     eval $1=\$$1\$2
-  }
+  } ;;
+esac
 fi # as_fn_append
 
 # as_fn_arith ARG...
@@ -6463,16 +7292,18 @@ fi # as_fn_append
 # Perform arithmetic evaluation on the ARGs, and store the result in the
 # global $as_val. Take advantage of shells that can avoid forks. The arguments
 # must be portable across $(()) and expr.
-if (eval "test \$(( 1 + 1 )) = 2") 2>/dev/null; then :
+if (eval "test \$(( 1 + 1 )) = 2") 2>/dev/null
+then :
   eval 'as_fn_arith ()
   {
     as_val=$(( $* ))
   }'
-else
-  as_fn_arith ()
+else case e in #(
+  e) as_fn_arith ()
   {
     as_val=`expr "$@" || test $? -eq 1`
-  }
+  } ;;
+esac
 fi # as_fn_arith
 
 
@@ -6499,7 +7330,7 @@ as_me=`$as_basename -- "$0" ||
 $as_expr X/"$0" : '.*/\([^/][^/]*\)/*$' \| \
 	 X"$0" : 'X\(//\)$' \| \
 	 X"$0" : 'X\(/\)' \| . 2>/dev/null ||
-$as_echo X/"$0" |
+printf "%s\n" X/"$0" |
     sed '/^.*\/\([^/][^/]*\)\/*$/{
 	    s//\1/
 	    q
@@ -6521,6 +7352,10 @@ as_cr_Letters=$as_cr_letters$as_cr_LETTERS
 as_cr_digits='0123456789'
 as_cr_alnum=$as_cr_Letters$as_cr_digits
 
+
+# Determine whether it's possible to make 'echo' print without a newline.
+# These variables are no longer used directly by Autoconf, but are AC_SUBSTed
+# for compatibility with existing Makefiles.
 ECHO_C= ECHO_N= ECHO_T=
 case `echo -n x` in #(((((
 -n*)
@@ -6534,6 +7369,12 @@ case `echo -n x` in #(((((
   ECHO_N='-n';;
 esac
 
+# For backward compatibility with old third-party macros, we provide
+# the shell variables $as_echo and $as_echo_n.  New code should use
+# AS_ECHO(["message"]) and AS_ECHO_N(["message"]), respectively.
+as_echo='printf %s\n'
+as_echo_n='printf %s'
+
 rm -f conf$$ conf$$.exe conf$$.file
 if test -d conf$$.dir; then
   rm -f conf$$.dir/conf$$.file
@@ -6545,9 +7386,9 @@ if (echo >conf$$.file) 2>/dev/null; then
   if ln -s conf$$.file conf$$ 2>/dev/null; then
     as_ln_s='ln -s'
     # ... but there are two gotchas:
-    # 1) On MSYS, both `ln -s file dir' and `ln file dir' fail.
-    # 2) DJGPP < 2.04 has no symlinks; `ln -s' creates a wrapper executable.
-    # In both cases, we have to default to `cp -pR'.
+    # 1) On MSYS, both 'ln -s file dir' and 'ln file dir' fail.
+    # 2) DJGPP < 2.04 has no symlinks; 'ln -s' creates a wrapper executable.
+    # In both cases, we have to default to 'cp -pR'.
     ln -s conf$$.file conf$$.dir 2>/dev/null && test ! -f conf$$.exe ||
       as_ln_s='cp -pR'
   elif ln conf$$.file conf$$ 2>/dev/null; then
@@ -6575,7 +7416,7 @@ as_fn_mkdir_p ()
     as_dirs=
     while :; do
       case $as_dir in #(
-      *\'*) as_qdir=`$as_echo "$as_dir" | sed "s/'/'\\\\\\\\''/g"`;; #'(
+      *\'*) as_qdir=`printf "%s\n" "$as_dir" | sed "s/'/'\\\\\\\\''/g"`;; #'(
       *) as_qdir=$as_dir;;
       esac
       as_dirs="'$as_qdir' $as_dirs"
@@ -6584,7 +7425,7 @@ $as_expr X"$as_dir" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \
 	 X"$as_dir" : 'X\(//\)[^/]' \| \
 	 X"$as_dir" : 'X\(//\)$' \| \
 	 X"$as_dir" : 'X\(/\)' \| . 2>/dev/null ||
-$as_echo X"$as_dir" |
+printf "%s\n" X"$as_dir" |
     sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{
 	    s//\1/
 	    q
@@ -6628,10 +7469,12 @@ as_test_x='test -x'
 as_executable_p=as_fn_executable_p
 
 # Sed expression to map a string onto a valid CPP name.
-as_tr_cpp="eval sed 'y%*$as_cr_letters%P$as_cr_LETTERS%;s%[^_$as_cr_alnum]%_%g'"
+as_sed_cpp="y%*$as_cr_letters%P$as_cr_LETTERS%;s%[^_$as_cr_alnum]%_%g"
+as_tr_cpp="eval sed '$as_sed_cpp'" # deprecated
 
 # Sed expression to map a string onto a valid variable name.
-as_tr_sh="eval sed 'y%*+%pp%;s%[^_$as_cr_alnum]%_%g'"
+as_sed_sh="y%*+%pp%;s%[^_$as_cr_alnum]%_%g"
+as_tr_sh="eval sed '$as_sed_sh'" # deprecated
 
 
 exec 6>&1
@@ -6646,8 +7489,8 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by HTSlib $as_me 1.18, which was
-generated by GNU Autoconf 2.69.  Invocation command line was
+This file was extended by HTSlib $as_me 1.21, which was
+generated by GNU Autoconf 2.72.  Invocation command line was
 
   CONFIG_FILES    = $CONFIG_FILES
   CONFIG_HEADERS  = $CONFIG_HEADERS
@@ -6680,7 +7523,7 @@ _ACEOF
 
 cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 ac_cs_usage="\
-\`$as_me' instantiates files and other configuration actions
+'$as_me' instantiates files and other configuration actions
 from templates according to the current configuration.  Unless the files
 and actions are specified as TAGs, all are instantiated by default.
 
@@ -6714,14 +7557,16 @@ Report bugs to <samtools-help@lists.sourceforge.net>.
 HTSlib home page: <http://www.htslib.org/>."
 
 _ACEOF
+ac_cs_config=`printf "%s\n" "$ac_configure_args" | sed "$ac_safe_unquote"`
+ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\''/g"`
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
-ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
+ac_cs_config='$ac_cs_config_escaped'
 ac_cs_version="\\
-HTSlib config.status 1.18
-configured by $0, generated by GNU Autoconf 2.69,
+HTSlib config.status 1.21
+configured by $0, generated by GNU Autoconf 2.72,
   with options \\"\$ac_cs_config\\"
 
-Copyright (C) 2012 Free Software Foundation, Inc.
+Copyright (C) 2023 Free Software Foundation, Inc.
 This config.status script is free software; the Free Software Foundation
 gives unlimited permission to copy, distribute and modify it."
 
@@ -6758,15 +7603,15 @@ do
   -recheck | --recheck | --rechec | --reche | --rech | --rec | --re | --r)
     ac_cs_recheck=: ;;
   --version | --versio | --versi | --vers | --ver | --ve | --v | -V )
-    $as_echo "$ac_cs_version"; exit ;;
+    printf "%s\n" "$ac_cs_version"; exit ;;
   --config | --confi | --conf | --con | --co | --c )
-    $as_echo "$ac_cs_config"; exit ;;
+    printf "%s\n" "$ac_cs_config"; exit ;;
   --debug | --debu | --deb | --de | --d | -d )
     debug=: ;;
   --file | --fil | --fi | --f )
     $ac_shift
     case $ac_optarg in
-    *\'*) ac_optarg=`$as_echo "$ac_optarg" | sed "s/'/'\\\\\\\\''/g"` ;;
+    *\'*) ac_optarg=`printf "%s\n" "$ac_optarg" | sed "s/'/'\\\\\\\\''/g"` ;;
     '') as_fn_error $? "missing file argument" ;;
     esac
     as_fn_append CONFIG_FILES " '$ac_optarg'"
@@ -6774,23 +7619,23 @@ do
   --header | --heade | --head | --hea )
     $ac_shift
     case $ac_optarg in
-    *\'*) ac_optarg=`$as_echo "$ac_optarg" | sed "s/'/'\\\\\\\\''/g"` ;;
+    *\'*) ac_optarg=`printf "%s\n" "$ac_optarg" | sed "s/'/'\\\\\\\\''/g"` ;;
     esac
     as_fn_append CONFIG_HEADERS " '$ac_optarg'"
     ac_need_defaults=false;;
   --he | --h)
     # Conflict between --help and --header
-    as_fn_error $? "ambiguous option: \`$1'
-Try \`$0 --help' for more information.";;
+    as_fn_error $? "ambiguous option: '$1'
+Try '$0 --help' for more information.";;
   --help | --hel | -h )
-    $as_echo "$ac_cs_usage"; exit ;;
+    printf "%s\n" "$ac_cs_usage"; exit ;;
   -q | -quiet | --quiet | --quie | --qui | --qu | --q \
   | -silent | --silent | --silen | --sile | --sil | --si | --s)
     ac_cs_silent=: ;;
 
   # This is an error.
-  -*) as_fn_error $? "unrecognized option: \`$1'
-Try \`$0 --help' for more information." ;;
+  -*) as_fn_error $? "unrecognized option: '$1'
+Try '$0 --help' for more information." ;;
 
   *) as_fn_append ac_config_targets " $1"
      ac_need_defaults=false ;;
@@ -6811,7 +7656,7 @@ cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 if \$ac_cs_recheck; then
   set X $SHELL '$0' $ac_configure_args \$ac_configure_extra_args --no-create --no-recursion
   shift
-  \$as_echo "running CONFIG_SHELL=$SHELL \$*" >&6
+  \printf "%s\n" "running CONFIG_SHELL=$SHELL \$*" >&6
   CONFIG_SHELL='$SHELL'
   export CONFIG_SHELL
   exec "\$@"
@@ -6825,7 +7670,7 @@ exec 5>>config.log
   sed 'h;s/./-/g;s/^.../## /;s/...$/ ##/;p;x;p;x' <<_ASBOX
 ## Running $as_me. ##
 _ASBOX
-  $as_echo "$ac_log"
+  printf "%s\n" "$ac_log"
 } >&5
 
 _ACEOF
@@ -6847,7 +7692,7 @@ do
     "htslib_vars.mk") CONFIG_FILES="$CONFIG_FILES htslib_vars.mk:builddir_vars.mk.in" ;;
     "mkdir") CONFIG_COMMANDS="$CONFIG_COMMANDS mkdir" ;;
 
-  *) as_fn_error $? "invalid argument: \`$ac_config_target'" "$LINENO" 5;;
+  *) as_fn_error $? "invalid argument: '$ac_config_target'" "$LINENO" 5;;
   esac
 done
 
@@ -6857,10 +7702,10 @@ done
 # We use the long form for the default assignment because of an extremely
 # bizarre bug on SunOS 4.1.3.
 if $ac_need_defaults; then
-  test "${CONFIG_FILES+set}" = set || CONFIG_FILES=$config_files
-  test "${CONFIG_HEADERS+set}" = set || CONFIG_HEADERS=$config_headers
-  test "${CONFIG_LINKS+set}" = set || CONFIG_LINKS=$config_links
-  test "${CONFIG_COMMANDS+set}" = set || CONFIG_COMMANDS=$config_commands
+  test ${CONFIG_FILES+y} || CONFIG_FILES=$config_files
+  test ${CONFIG_HEADERS+y} || CONFIG_HEADERS=$config_headers
+  test ${CONFIG_LINKS+y} || CONFIG_LINKS=$config_links
+  test ${CONFIG_COMMANDS+y} || CONFIG_COMMANDS=$config_commands
 fi
 
 # Have a temporary directory for convenience.  Make it in the build tree
@@ -6868,7 +7713,7 @@ fi
 # creating and moving files from /tmp can sometimes cause problems.
 # Hook for its removal unless debugging.
 # Note that there is a small window in which the directory will not be cleaned:
-# after its creation but before its name has been assigned to `$tmp'.
+# after its creation but before its name has been assigned to '$tmp'.
 $debug ||
 {
   tmp= ac_tmp=
@@ -6892,7 +7737,7 @@ ac_tmp=$tmp
 
 # Set up the scripts for CONFIG_FILES section.
 # No need to generate them if there are no CONFIG_FILES.
-# This happens for instance with `./config.status config.h'.
+# This happens for instance with './config.status config.h'.
 if test -n "$CONFIG_FILES"; then
 
 
@@ -7050,13 +7895,13 @@ fi # test -n "$CONFIG_FILES"
 
 # Set up the scripts for CONFIG_HEADERS section.
 # No need to generate them if there are no CONFIG_HEADERS.
-# This happens for instance with `./config.status Makefile'.
+# This happens for instance with './config.status Makefile'.
 if test -n "$CONFIG_HEADERS"; then
 cat >"$ac_tmp/defines.awk" <<\_ACAWK ||
 BEGIN {
 _ACEOF
 
-# Transform confdefs.h into an awk script `defines.awk', embedded as
+# Transform confdefs.h into an awk script 'defines.awk', embedded as
 # here-document in config.status, that substitutes the proper values into
 # config.h.in to produce config.h.
 
@@ -7166,7 +8011,7 @@ do
   esac
   case $ac_mode$ac_tag in
   :[FHL]*:*);;
-  :L* | :C*:*) as_fn_error $? "invalid tag \`$ac_tag'" "$LINENO" 5;;
+  :L* | :C*:*) as_fn_error $? "invalid tag '$ac_tag'" "$LINENO" 5;;
   :[FH]-) ac_tag=-:-;;
   :[FH]*) ac_tag=$ac_tag:$ac_tag.in;;
   esac
@@ -7188,33 +8033,33 @@ do
       -) ac_f="$ac_tmp/stdin";;
       *) # Look for the file first in the build tree, then in the source tree
 	 # (if the path is not absolute).  The absolute path cannot be DOS-style,
-	 # because $ac_f cannot contain `:'.
+	 # because $ac_f cannot contain ':'.
 	 test -f "$ac_f" ||
 	   case $ac_f in
 	   [\\/$]*) false;;
 	   *) test -f "$srcdir/$ac_f" && ac_f="$srcdir/$ac_f";;
 	   esac ||
-	   as_fn_error 1 "cannot find input file: \`$ac_f'" "$LINENO" 5;;
+	   as_fn_error 1 "cannot find input file: '$ac_f'" "$LINENO" 5;;
       esac
-      case $ac_f in *\'*) ac_f=`$as_echo "$ac_f" | sed "s/'/'\\\\\\\\''/g"`;; esac
+      case $ac_f in *\'*) ac_f=`printf "%s\n" "$ac_f" | sed "s/'/'\\\\\\\\''/g"`;; esac
       as_fn_append ac_file_inputs " '$ac_f'"
     done
 
-    # Let's still pretend it is `configure' which instantiates (i.e., don't
+    # Let's still pretend it is 'configure' which instantiates (i.e., don't
     # use $as_me), people would be surprised to read:
     #    /* config.h.  Generated by config.status.  */
     configure_input='Generated from '`
-	  $as_echo "$*" | sed 's|^[^:]*/||;s|:[^:]*/|, |g'
+	  printf "%s\n" "$*" | sed 's|^[^:]*/||;s|:[^:]*/|, |g'
 	`' by configure.'
     if test x"$ac_file" != x-; then
       configure_input="$ac_file.  $configure_input"
-      { $as_echo "$as_me:${as_lineno-$LINENO}: creating $ac_file" >&5
-$as_echo "$as_me: creating $ac_file" >&6;}
+      { printf "%s\n" "$as_me:${as_lineno-$LINENO}: creating $ac_file" >&5
+printf "%s\n" "$as_me: creating $ac_file" >&6;}
     fi
     # Neutralize special characters interpreted by sed in replacement strings.
     case $configure_input in #(
     *\&* | *\|* | *\\* )
-       ac_sed_conf_input=`$as_echo "$configure_input" |
+       ac_sed_conf_input=`printf "%s\n" "$configure_input" |
        sed 's/[\\\\&|]/\\\\&/g'`;; #(
     *) ac_sed_conf_input=$configure_input;;
     esac
@@ -7231,7 +8076,7 @@ $as_expr X"$ac_file" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \
 	 X"$ac_file" : 'X\(//\)[^/]' \| \
 	 X"$ac_file" : 'X\(//\)$' \| \
 	 X"$ac_file" : 'X\(/\)' \| . 2>/dev/null ||
-$as_echo X"$ac_file" |
+printf "%s\n" X"$ac_file" |
     sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{
 	    s//\1/
 	    q
@@ -7255,9 +8100,9 @@ $as_echo X"$ac_file" |
 case "$ac_dir" in
 .) ac_dir_suffix= ac_top_builddir_sub=. ac_top_build_prefix= ;;
 *)
-  ac_dir_suffix=/`$as_echo "$ac_dir" | sed 's|^\.[\\/]||'`
+  ac_dir_suffix=/`printf "%s\n" "$ac_dir" | sed 's|^\.[\\/]||'`
   # A ".." for each directory in $ac_dir_suffix.
-  ac_top_builddir_sub=`$as_echo "$ac_dir_suffix" | sed 's|/[^\\/]*|/..|g;s|/||'`
+  ac_top_builddir_sub=`printf "%s\n" "$ac_dir_suffix" | sed 's|/[^\\/]*|/..|g;s|/||'`
   case $ac_top_builddir_sub in
   "") ac_top_builddir_sub=. ac_top_build_prefix= ;;
   *)  ac_top_build_prefix=$ac_top_builddir_sub/ ;;
@@ -7310,8 +8155,8 @@ ac_sed_dataroot='
 case `eval "sed -n \"\$ac_sed_dataroot\" $ac_file_inputs"` in
 *datarootdir*) ac_datarootdir_seen=yes;;
 *@datadir@*|*@docdir@*|*@infodir@*|*@localedir@*|*@mandir@*)
-  { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $ac_file_inputs seems to ignore the --datarootdir setting" >&5
-$as_echo "$as_me: WARNING: $ac_file_inputs seems to ignore the --datarootdir setting" >&2;}
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: $ac_file_inputs seems to ignore the --datarootdir setting" >&5
+printf "%s\n" "$as_me: WARNING: $ac_file_inputs seems to ignore the --datarootdir setting" >&2;}
 _ACEOF
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
   ac_datarootdir_hack='
@@ -7324,7 +8169,7 @@ cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 esac
 _ACEOF
 
-# Neutralize VPATH when `$srcdir' = `.'.
+# Neutralize VPATH when '$srcdir' = '.'.
 # Shell code in configure.ac might set extrasub.
 # FIXME: do we really want to maintain this feature?
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
@@ -7353,9 +8198,9 @@ test -z "$ac_datarootdir_hack$ac_datarootdir_seen" &&
   { ac_out=`sed -n '/\${datarootdir}/p' "$ac_tmp/out"`; test -n "$ac_out"; } &&
   { ac_out=`sed -n '/^[	 ]*datarootdir[	 ]*:*=/p' \
       "$ac_tmp/out"`; test -z "$ac_out"; } &&
-  { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $ac_file contains a reference to the variable \`datarootdir'
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: $ac_file contains a reference to the variable 'datarootdir'
 which seems to be undefined.  Please make sure it is defined" >&5
-$as_echo "$as_me: WARNING: $ac_file contains a reference to the variable \`datarootdir'
+printf "%s\n" "$as_me: WARNING: $ac_file contains a reference to the variable 'datarootdir'
 which seems to be undefined.  Please make sure it is defined" >&2;}
 
   rm -f "$ac_tmp/stdin"
@@ -7371,20 +8216,20 @@ which seems to be undefined.  Please make sure it is defined" >&2;}
   #
   if test x"$ac_file" != x-; then
     {
-      $as_echo "/* $configure_input  */" \
+      printf "%s\n" "/* $configure_input  */" >&1 \
       && eval '$AWK -f "$ac_tmp/defines.awk"' "$ac_file_inputs"
     } >"$ac_tmp/config.h" \
       || as_fn_error $? "could not create $ac_file" "$LINENO" 5
     if diff "$ac_file" "$ac_tmp/config.h" >/dev/null 2>&1; then
-      { $as_echo "$as_me:${as_lineno-$LINENO}: $ac_file is unchanged" >&5
-$as_echo "$as_me: $ac_file is unchanged" >&6;}
+      { printf "%s\n" "$as_me:${as_lineno-$LINENO}: $ac_file is unchanged" >&5
+printf "%s\n" "$as_me: $ac_file is unchanged" >&6;}
     else
       rm -f "$ac_file"
       mv "$ac_tmp/config.h" "$ac_file" \
 	|| as_fn_error $? "could not create $ac_file" "$LINENO" 5
     fi
   else
-    $as_echo "/* $configure_input  */" \
+    printf "%s\n" "/* $configure_input  */" >&1 \
       && eval '$AWK -f "$ac_tmp/defines.awk"' "$ac_file_inputs" \
       || as_fn_error $? "could not create -" "$LINENO" 5
   fi
@@ -7402,8 +8247,8 @@ $as_echo "$as_me: $ac_file is unchanged" >&6;}
       ac_source=$srcdir/$ac_source
     fi
 
-    { $as_echo "$as_me:${as_lineno-$LINENO}: linking $ac_source to $ac_file" >&5
-$as_echo "$as_me: linking $ac_source to $ac_file" >&6;}
+    { printf "%s\n" "$as_me:${as_lineno-$LINENO}: linking $ac_source to $ac_file" >&5
+printf "%s\n" "$as_me: linking $ac_source to $ac_file" >&6;}
 
     if test ! -r "$ac_source"; then
       as_fn_error $? "$ac_source: file not found" "$LINENO" 5
@@ -7421,8 +8266,8 @@ $as_echo "$as_me: linking $ac_source to $ac_file" >&6;}
       as_fn_error $? "cannot link or copy $ac_source to $ac_file" "$LINENO" 5
   fi
  ;;
-  :C)  { $as_echo "$as_me:${as_lineno-$LINENO}: executing $ac_file commands" >&5
-$as_echo "$as_me: executing $ac_file commands" >&6;}
+  :C)  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: executing $ac_file commands" >&5
+printf "%s\n" "$as_me: executing $ac_file commands" >&6;}
  ;;
   esac
 
@@ -7468,7 +8313,8 @@ if test "$no_create" != yes; then
   $ac_cs_success || as_fn_exit 1
 fi
 if test -n "$ac_unrecognized_opts" && test "$enable_option_checking" != no; then
-  { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: unrecognized options: $ac_unrecognized_opts" >&5
-$as_echo "$as_me: WARNING: unrecognized options: $ac_unrecognized_opts" >&2;}
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: unrecognized options: $ac_unrecognized_opts" >&5
+printf "%s\n" "$as_me: WARNING: unrecognized options: $ac_unrecognized_opts" >&2;}
 fi
 
+
diff --git a/htslib/configure.ac b/htslib/configure.ac
index c1afb38e7..cdb8391ff 100644
--- a/htslib/configure.ac
+++ b/htslib/configure.ac
@@ -1,6 +1,6 @@
 # Configure script for htslib, a C library for high-throughput sequencing data.
 #
-#    Copyright (C) 2015-2023 Genome Research Ltd.
+#    Copyright (C) 2015-2024 Genome Research Ltd.
 #
 #    Author: John Marshall <jm18@sanger.ac.uk>
 #
@@ -35,7 +35,7 @@ m4_include([m4/hts_hide_dynamic_syms.m4])
 m4_include([m4/pkg.m4])
 
 dnl Copyright notice to be copied into the generated configure script
-AC_COPYRIGHT([Portions copyright (C) 2020-2023 Genome Research Ltd.
+AC_COPYRIGHT([Portions copyright (C) 2020-2024 Genome Research Ltd.
 
 This configure script is free software: you are free to change and
 redistribute it.  There is NO WARRANTY, to the extent permitted by law.])
@@ -82,55 +82,33 @@ AC_CHECK_DECL([_XOPEN_SOURCE], [],
   [AC_DEFINE([_XOPEN_SOURCE], [600], [Specify X/Open requirements])],
   [])
 
+dnl Check that we have cpuid, and if so run the x86 SIMD checks
+AC_CHECK_DECLS([__get_cpuid_max, __cpuid_count], [
+   hts_have_cpuid=yes
+], [
+   hts_have_cpuid=no
+], [[#include <cpuid.h>]])
 
-dnl Check for various compiler flags to enable SIMD features
-dnl Options for rANS32x16 sse4.1 version - ssse3
-hts_cflags_sse4=""
-HTS_CHECK_COMPILE_FLAGS_NEEDED([ssse3], [-mssse3], [AC_LANG_PROGRAM([[
+AS_IF(test "x$hts_have_cpuid" = "xyes", [
+dnl Options for rANS32x16 sse4.1 version - sse4.1
+HTS_CHECK_COMPILE_FLAGS_NEEDED([sse4.1], [-msse4.1 -mssse3 -mpopcnt],
+ [AC_LANG_PROGRAM([[
     #ifdef __x86_64__
     #include "x86intrin.h"
     #endif
   ]],[[
     #ifdef __x86_64__
     __m128i a = _mm_set_epi32(1, 2, 3, 4), b = _mm_set_epi32(4, 3, 2, 1);
-    __m128i c = _mm_shuffle_epi8(a, b);
-    return *((char *) &c);
+    __m128i c = _mm_shuffle_epi8(_mm_max_epu32(a, b), b);
+    return _mm_popcnt_u32(*((char *) &c));
     #endif
   ]])], [
-  hts_cflags_sse4="$flags_needed $hts_cflags_sse4"
+  hts_cflags_sse4="$flags_needed"
   AC_DEFINE([HAVE_SSSE3],1,[Defined to 1 if rANS source using SSSE3 can be compiled.])
-])
-
-dnl Options for rANS32x16 sse4.1 version - popcnt
-HTS_CHECK_COMPILE_FLAGS_NEEDED([popcnt], [-mpopcnt], [AC_LANG_PROGRAM([[
-    #ifdef __x86_64__
-    #include "x86intrin.h"
-    #endif
-  ]],[[
-    #ifdef __x86_64__
-    unsigned int i = _mm_popcnt_u32(1);
-    return i != 1;
-    #endif
-  ]])], [
-  hts_cflags_sse4="$flags_needed $hts_cflags_sse4"
   AC_DEFINE([HAVE_POPCNT],1,[Defined to 1 if rANS source using popcnt can be compiled.])
-])
-
-dnl Options for rANS32x16 sse4.1 version - sse4.1
-HTS_CHECK_COMPILE_FLAGS_NEEDED([sse4.1], [-msse4.1], [AC_LANG_PROGRAM([[
-    #ifdef __x86_64__
-    #include "x86intrin.h"
-    #endif
-  ]],[[
-    #ifdef __x86_64__
-    __m128i a = _mm_set_epi32(1, 2, 3, 4), b = _mm_set_epi32(4, 3, 2, 1);
-    __m128i c = _mm_max_epu32(a, b);
-    return *((char *) &c);
-    #endif
-  ]])], [
-  hts_cflags_sse4="$flags_needed $hts_cflags_sse4"
   AC_DEFINE([HAVE_SSE4_1],1,[Defined to 1 if rANS source using SSE4.1 can be compiled.
 ])
+
 dnl Propagate HTSlib's unaligned access preference to htscodecs
   AH_VERBATIM([UBSAN],[
 /* Prevent unaligned access in htscodecs SSE4 rANS codec */
@@ -142,7 +120,7 @@ dnl Propagate HTSlib's unaligned access preference to htscodecs
 AC_SUBST([hts_cflags_sse4])
 
 dnl Options for rANS32x16 avx2 version
-HTS_CHECK_COMPILE_FLAGS_NEEDED([avx2], [-mavx2], [AC_LANG_PROGRAM([[
+HTS_CHECK_COMPILE_FLAGS_NEEDED([avx2], [-mavx2 -mpopcnt], [AC_LANG_PROGRAM([[
     #ifdef __x86_64__
     #include "x86intrin.h"
     #endif
@@ -151,16 +129,18 @@ HTS_CHECK_COMPILE_FLAGS_NEEDED([avx2], [-mavx2], [AC_LANG_PROGRAM([[
     __m256i a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
     __m256i b = _mm256_add_epi32(a, a);
     long long c = _mm256_extract_epi64(b, 0);
-    return (int) c;
+    return _mm_popcnt_u32((int) c);
     #endif
   ]])], [
   hts_cflags_avx2="$flags_needed"
   AC_SUBST([hts_cflags_avx2])
+  AC_DEFINE([HAVE_POPCNT],1,[Defined to 1 if rANS source using popcnt can be compiled.])
   AC_DEFINE([HAVE_AVX2],1,[Defined to 1 if rANS source using AVX2 can be compiled.])
 ])
 
 dnl Options for rANS32x16 avx512 version
-HTS_CHECK_COMPILE_FLAGS_NEEDED([avx512f], [-mavx512f], [AC_LANG_PROGRAM([[
+HTS_CHECK_COMPILE_FLAGS_NEEDED([avx512f], [-mavx512f -mpopcnt],
+ [AC_LANG_PROGRAM([[
     #ifdef __x86_64__
     #include "x86intrin.h"
     #endif
@@ -168,14 +148,57 @@ HTS_CHECK_COMPILE_FLAGS_NEEDED([avx512f], [-mavx512f], [AC_LANG_PROGRAM([[
     #ifdef __x86_64__
     __m512i a = _mm512_set1_epi32(1);
     __m512i b = _mm512_add_epi32(a, a);
-    return *((char *) &b);
+    __m256i c = _mm512_castsi512_si256(b);
+    __m256i d = _mm512_extracti64x4_epi64(a, 1);
+    return _mm_popcnt_u32(*((char *) &c)) + (*(char *) &d);
     #endif
   ]])], [
   hts_cflags_avx512="$flags_needed"
   AC_SUBST([hts_cflags_avx512])
+  AC_DEFINE([HAVE_POPCNT],1,[Defined to 1 if rANS source using popcnt can be compiled.])
   AC_DEFINE([HAVE_AVX512],1,[Defined to 1 if rANS source using AVX512F can be compiled.])
 ])
 
+dnl Check for working __builtin_cpu_supports (ssse3 is broken on some clangs)
+AC_MSG_CHECKING([for working __builtin_cpu_supports("ssse3")])
+AC_LINK_IFELSE([AC_LANG_PROGRAM([],[
+  if (__builtin_cpu_supports("ssse3")) {
+    return 0;
+  }
+])], [
+  AC_MSG_RESULT([yes])
+  AC_DEFINE([HAVE_BUILTIN_CPU_SUPPORT_SSSE3], 1,
+            [Defined to 1 if __builtin_cpu_supports("ssse3") works])
+], [
+  AC_MSG_RESULT([no])
+])
+
+dnl Check for function attribute used in conjunction with __builtin_cpu_supports
+dnl and that it does enable the corresponding intrinsics (which is broken on ancient GCCs)
+AC_MSG_CHECKING([for working __attribute__((target("ssse3")))])
+AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
+  #ifdef __x86_64__
+  #include "x86intrin.h"
+
+  __attribute__((target("ssse3")))
+  void shuffle(char *aptr, char *bptr) {
+    __m128i a = _mm_lddqu_si128((__m128i *)aptr);
+    __m128i b = _mm_shuffle_epi8(a, a);
+    _mm_storeu_si128((__m128i *)bptr, b);
+  }
+  #else
+  void shuffle(char *aptr, char *bptr) { }
+  #endif
+]], [[shuffle(0, 0);]])], [
+  AC_MSG_RESULT([yes])
+  AC_DEFINE([HAVE_ATTRIBUTE_TARGET_SSSE3], 1,
+            [Define if __attribute__((target("ssse3"))) works.])
+], [
+  AC_MSG_RESULT([no])
+])
+
+]) dnl End of AS_IF(hts_have_cpuid)
+
 dnl Avoid chicken-and-egg problem where pkg-config supplies the
 dnl PKG_PROG_PKG_CONFIG macro, but we want to use it to check
 dnl for pkg-config...
@@ -317,6 +340,25 @@ AC_CHECK_FUNCS([gmtime_r fsync drand48 srand48_deterministic])
 # Darwin has a dubious fdatasync() symbol, but no declaration in <unistd.h>
 AC_CHECK_DECL([fdatasync(int)], [AC_CHECK_FUNCS(fdatasync)])
 
+AC_MSG_CHECKING([for __attribute__((constructor))])
+AC_LINK_IFELSE([AC_LANG_PROGRAM([[
+  static __attribute__((constructor)) void noop(void) {}
+]], [])], [
+  AC_MSG_RESULT([yes])
+  AC_DEFINE([HAVE_ATTRIBUTE_CONSTRUCTOR], 1,
+            [Define if __attribute__((constructor)) is available.])
+], [AC_MSG_RESULT([no])])
+
+AC_MSG_CHECKING([for clock_gettime with CLOCK_PROCESS_CPUTIME_ID])
+AC_LINK_IFELSE([AC_LANG_PROGRAM([[#include <time.h>]], [[
+  struct timespec ts;
+  clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts);
+]])], [
+  AC_MSG_RESULT([yes])
+  AC_DEFINE([HAVE_CLOCK_GETTIME_CPUTIME], 1,
+            [Define if clock_gettime exists and accepts CLOCK_PROCESS_CPUTIME_ID.])
+], [AC_MSG_RESULT([no])])
+
 if test $enable_plugins != no; then
   AC_SEARCH_LIBS([dlsym], [dl], [],
     [MSG_ERROR([dlsym() not found
diff --git a/htslib/cram/cram_codecs.c b/htslib/cram/cram_codecs.c
index cc5e52b2c..a72419e1c 100644
--- a/htslib/cram/cram_codecs.c
+++ b/htslib/cram/cram_codecs.c
@@ -44,6 +44,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <errno.h>
 #include <stddef.h>
 
+#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+#include "../fuzz_settings.h"
+#endif
+
 #include "../htslib/hts_endian.h"
 
 #if defined(HAVE_EXTERNAL_LIBHTSCODECS)
@@ -478,10 +482,10 @@ cram_codec *cram_external_decode_init(cram_block_compression_hdr *hdr,
             else if (option == E_BYTE || option == E_BYTE_ARRAY)
                 c->decode = cram_external_decode_char;
             else
-                return NULL;
+                goto malformed;
             break;
         default:
-            return NULL;
+            goto malformed;
         }
     } else {
         // CRAM 3 and earlier encodes integers as EXTERNAL.  We need
@@ -1229,7 +1233,8 @@ cram_codec *cram_beta_encode_init(cram_stats *st,
                                   void *dat,
                                   int version, varint_vec *vv) {
     cram_codec *c;
-    int min_val, max_val, len = 0;
+    hts_pos_t min_val, max_val;
+    int len = 0;
     int64_t range;
 
     c = malloc(sizeof(*c));
@@ -1247,8 +1252,8 @@ cram_codec *cram_beta_encode_init(cram_stats *st,
     c->flush = NULL;
 
     if (dat) {
-        min_val = ((int *)dat)[0];
-        max_val = ((int *)dat)[1];
+        min_val = ((hts_pos_t *)dat)[0];
+        max_val = ((hts_pos_t *)dat)[1];
     } else {
         min_val = INT_MAX;
         max_val = INT_MIN;
@@ -1276,9 +1281,26 @@ cram_codec *cram_beta_encode_init(cram_stats *st,
         }
     }
 
-    assert(max_val >= min_val);
-    c->u.e_beta.offset = -min_val;
+    if (max_val < min_val)
+        goto err;
+
     range = (int64_t) max_val - min_val;
+    switch (option) {
+    case E_SINT:
+        if (min_val < INT_MIN || range > INT_MAX)
+            goto err;
+        break;
+
+    case E_INT:
+        if (max_val > UINT_MAX || range > UINT_MAX)
+            goto err;
+        break;
+
+    default:
+        break;
+    }
+
+    c->u.e_beta.offset = -min_val;
     while (range) {
         len++;
         range >>= 1;
@@ -1286,6 +1308,10 @@ cram_codec *cram_beta_encode_init(cram_stats *st,
     c->u.e_beta.nbits = len;
 
     return c;
+
+ err:
+    free(c);
+    return NULL;
 }
 
 /*
@@ -2795,7 +2821,12 @@ cram_codec *cram_huffman_decode_init(cram_block_compression_hdr *hdr,
         errno = ENOMEM;
         return NULL;
     }
-
+#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+    if (ncodes > FUZZ_ALLOC_LIMIT / sizeof(*codes)) {
+        errno = ENOMEM;
+        return NULL;
+    }
+#endif
     h = calloc(1, sizeof(*h));
     if (!h)
         return NULL;
diff --git a/htslib/cram/cram_decode.c b/htslib/cram/cram_decode.c
index 26c7c1fac..2b2ad6029 100644
--- a/htslib/cram/cram_decode.c
+++ b/htslib/cram/cram_decode.c
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2012-2020, 2022-2023 Genome Research Ltd.
+Copyright (c) 2012-2020, 2022-2024 Genome Research Ltd.
 Author: James Bonfield <jkb@sanger.ac.uk>
 
 Redistribution and use in source and binary forms, with or without
@@ -1000,9 +1000,9 @@ cram_block_slice_hdr *cram_decode_slice_header(cram_fd *fd, cram_block *b) {
     hdr->num_blocks      = fd->vv.varint_get32((char **)&cp, (char *)cp_end, &err);
     hdr->num_content_ids = fd->vv.varint_get32((char **)&cp, (char *)cp_end, &err);
     if (hdr->num_content_ids < 1 ||
-        hdr->num_content_ids >= SIZE_MAX / sizeof(int32_t)) {
-        /* Slice must have at least one data block,
-           and malloc'd size shouldn't wrap. */
+        hdr->num_content_ids >= 10000) {
+        // Slice must have at least one data block, and there is no need
+        // for more than 2 per possible aux-tag plus ancillary.
         free(hdr);
         return NULL;
     }
@@ -2351,7 +2351,7 @@ int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s,
             s->ref_start = s->hdr->ref_seq_start;
             s->ref_end   = s->hdr->ref_seq_start + s->hdr->ref_seq_span-1;
             if (s->hdr->ref_seq_span > b->uncomp_size) {
-                hts_log_error("Embedded reference is too small at #%d:%d-%d",
+                hts_log_error("Embedded reference is too small at #%d:%"PRIhts_pos"-%"PRIhts_pos,
                               ref_id, s->ref_start, s->ref_end);
                 return -1;
             }
@@ -2410,7 +2410,7 @@ int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s,
             if (s->hdr->ref_seq_start >= s->ref_start) {
                 start = s->hdr->ref_seq_start - s->ref_start;
             } else {
-                hts_log_warning("Slice starts before base 1 at #%d:%d-%d",
+                hts_log_warning("Slice starts before base 1 at #%d:%"PRIhts_pos"-%"PRIhts_pos,
                                 ref_id, s->ref_start, s->ref_end);
                 start = 0;
             }
@@ -2418,7 +2418,7 @@ int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s,
             if (s->hdr->ref_seq_span <= s->ref_end - s->ref_start + 1) {
                 len = s->hdr->ref_seq_span;
             } else {
-                hts_log_warning("Slice ends beyond reference end at #%d:%d-%d",
+                hts_log_warning("Slice ends beyond reference end at #%d:%"PRIhts_pos"-%"PRIhts_pos,
                                 ref_id, s->ref_start, s->ref_end);
                 len = s->ref_end - s->ref_start + 1;
             }
@@ -2448,7 +2448,7 @@ int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s,
             char M[33];
             const char *rname = sam_hdr_tid2name(sh, ref_id);
             if (!rname) rname="?"; // cannot happen normally
-            hts_log_error("MD5 checksum reference mismatch at %s:%d-%d",
+            hts_log_error("MD5 checksum reference mismatch at %s:%"PRIhts_pos"-%"PRIhts_pos,
                           rname, s->ref_start, s->ref_end);
             hts_log_error("CRAM  : %s", md5_print(s->hdr->md5, M));
             hts_log_error("Ref   : %s", md5_print(digest, M));
@@ -3004,8 +3004,8 @@ int cram_decode_slice_mt(cram_fd *fd, cram_container *c, cram_slice *s,
  * Returns the used size of the bam record on success
  *         -1 on failure.
  */
-static int cram_to_bam(sam_hdr_t *sh, cram_fd *fd, cram_slice *s,
-                       cram_record *cr, int rec, bam_seq_t **bam) {
+int cram_to_bam(sam_hdr_t *sh, cram_fd *fd, cram_slice *s,
+                cram_record *cr, int rec, bam_seq_t **bam) {
     int ret, rg_len;
     char name_a[1024], *name;
     int name_len;
@@ -3172,7 +3172,7 @@ static cram_container *cram_first_slice(cram_fd *fd) {
     return c;
 }
 
-static cram_slice *cram_next_slice(cram_fd *fd, cram_container **cp) {
+cram_slice *cram_next_slice(cram_fd *fd, cram_container **cp) {
     cram_container *c_curr;  // container being consumed via cram_get_seq()
     cram_slice *s_curr = NULL;
 
diff --git a/htslib/cram/cram_decode.h b/htslib/cram/cram_decode.h
index 400eb6beb..16d87a073 100644
--- a/htslib/cram/cram_decode.h
+++ b/htslib/cram/cram_decode.h
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2012-2013, 2018 Genome Research Ltd.
+Copyright (c) 2012-2013, 2018, 2024 Genome Research Ltd.
 Author: James Bonfield <jkb@sanger.ac.uk>
 
 Redistribution and use in source and binary forms, with or without
@@ -94,6 +94,15 @@ cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd,
 cram_block_slice_hdr *cram_decode_slice_header(cram_fd *fd, cram_block *b);
 
 
+/*! INTERNAL:
+ * Loads and decodes the next slice worth of data.
+ *
+ * @return
+ * Returns cram slice pointer on success;
+ *         NULL on failure
+ */
+cram_slice *cram_next_slice(cram_fd *fd, cram_container **cp);
+
 /*! INTERNAL:
  * Decode an entire slice from container blocks. Fills out s->crecs[] array.
  *
@@ -105,6 +114,22 @@ int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s,
                       sam_hdr_t *hdr);
 
 
+/*! INTERNAL:
+ * Converts a cram in-memory record into a bam in-memory record. We
+ * pass a pointer to a bam_seq_t pointer along with the a pointer to
+ * the allocated size. These can initially be pointers to NULL and zero.
+ *
+ * This function will reallocate the bam buffer as required and update
+ * (*bam)->alloc accordingly, allowing it to be used within a loop
+ * efficiently without needing to allocate new bam objects over and
+ * over again.
+ *
+ * Returns the used size of the bam record on success
+ *         -1 on failure.
+ */
+int cram_to_bam(sam_hdr_t *sh, cram_fd *fd, cram_slice *s,
+                cram_record *cr, int rec, bam_seq_t **bam);
+
 /*
  * Drains and frees the decode read-queue for a multi-threaded reader.
  */
diff --git a/htslib/cram/cram_encode.c b/htslib/cram/cram_encode.c
index 9797fa7a8..5d22db54d 100644
--- a/htslib/cram/cram_encode.c
+++ b/htslib/cram/cram_encode.c
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2012-2020, 2022-2023 Genome Research Ltd.
+Copyright (c) 2012-2020, 2022-2024 Genome Research Ltd.
 Author: James Bonfield <jkb@sanger.ac.uk>
 
 Redistribution and use in source and binary forms, with or without
@@ -528,6 +528,12 @@ cram_block *cram_encode_slice_header(cram_fd *fd, cram_slice *s) {
         cp += fd->vv.varint_put64(cp, NULL, s->hdr->ref_seq_start);
         cp += fd->vv.varint_put64(cp, NULL, s->hdr->ref_seq_span);
     } else {
+        if (s->hdr->ref_seq_start < 0 || s->hdr->ref_seq_start > INT_MAX) {
+            hts_log_error("Reference position too large for CRAM 3");
+            cram_free_block(b);
+            free(buf);
+            return NULL;
+        }
         cp += fd->vv.varint_put32(cp, NULL, s->hdr->ref_seq_start);
         cp += fd->vv.varint_put32(cp, NULL, s->hdr->ref_seq_span);
     }
@@ -947,14 +953,14 @@ static int cram_compress_slice(cram_fd *fd, cram_container *c, cram_slice *s) {
      */
     {
         int i;
-        for (i = 0; i < s->naux_block; i++) {
-            if (!s->aux_block[i] || s->aux_block[i] == s->block[0])
+        for (i = DS_END /*num_blk - naux_blk*/; i < s->hdr->num_blocks; i++) {
+            if (!s->block[i] || s->block[i] == s->block[0])
                 continue;
 
-            if (s->aux_block[i]->method != RAW)
+            if (s->block[i]->method != RAW)
                 continue;
 
-            if (cram_compress_block2(fd, s, s->aux_block[i], s->aux_block[i]->m,
+            if (cram_compress_block2(fd, s, s->block[i], s->block[i]->m,
                                      method, level))
                 return -1;
         }
@@ -1157,8 +1163,10 @@ static int cram_encode_slice(cram_fd *fd, cram_container *c,
     if (c->tags_used) {
         int n;
         s->hdr->num_blocks = DS_END;
-        for (n = 0; n < s->naux_block; n++)
+        for (n = 0; n < s->naux_block; n++) {
             s->block[s->hdr->num_blocks++] = s->aux_block[n];
+            s->aux_block[n] = NULL;
+        }
     }
 
     /* Encode reads */
@@ -1235,6 +1243,58 @@ static int cram_encode_slice(cram_fd *fd, cram_container *c,
     return -1;
 }
 
+static inline const char *bam_data_end(bam1_t *b) {
+    return (const char *)b->data + b->l_data;
+}
+
+/*
+ * A bounds checking version of bam_aux2i.
+ */
+static inline int bam_aux2i_end(const uint8_t *aux, const uint8_t *aux_end) {
+    int type = *aux++;
+    switch (type) {
+        case 'c':
+            if (aux_end - aux < 1) {
+                errno = EINVAL;
+                return 0;
+            }
+            return *(int8_t *)aux;
+        case 'C':
+            if (aux_end - aux < 1) {
+                errno = EINVAL;
+                return 0;
+            }
+            return *aux;
+        case 's':
+            if (aux_end - aux < 2) {
+                errno = EINVAL;
+                return 0;
+            }
+            return le_to_i16(aux);
+        case 'S':
+            if (aux_end - aux < 2) {
+                errno = EINVAL;
+                return 0;
+            }
+            return le_to_u16(aux);
+        case 'i':
+            if (aux_end - aux < 4) {
+                errno = EINVAL;
+                return 0;
+            }
+            return le_to_i32(aux);
+        case 'I':
+            if (aux_end - aux < 4) {
+                errno = EINVAL;
+                return 0;
+            }
+            return le_to_u32(aux);
+        default:
+            errno = EINVAL;
+    }
+    return 0;
+}
+
 /*
  * Returns the number of expected read names for this record.
  */
@@ -1243,7 +1303,7 @@ static int expected_template_count(bam_seq_t *b) {
 
     uint8_t *TC = (uint8_t *)bam_aux_get(b, "TC");
     if (TC) {
-        int n = bam_aux2i(TC);
+        int n = bam_aux2i_end(TC, (uint8_t *)bam_data_end(b));
         if (expected < n)
             expected = n;
     }
@@ -1449,16 +1509,25 @@ static inline int extend_ref(char **ref, uint32_t (**hist)[5], hts_pos_t pos,
         return 0;
 
     // realloc
+    if (pos - ref_start > UINT_MAX)
+        return -2; // protect overflow in new_end calculation
+
     hts_pos_t old_end = *ref_end ? *ref_end : ref_start;
-    hts_pos_t new_end = *ref_end = ref_start + 1000 + (pos-ref_start)*1.5;
+    hts_pos_t new_end = ref_start + 1000 + (pos-ref_start)*1.5;
+
+    // Refuse to work on excessively large blocks.
+    // We'll just switch to referenceless encoding, which is probably better
+    // here as this must be very sparse data anyway.
+    if (new_end - ref_start > UINT_MAX/sizeof(**hist)/2)
+        return -2;
 
-    char *tmp = realloc(*ref, *ref_end-ref_start);
+    char *tmp = realloc(*ref, new_end-ref_start+1);
     if (!tmp)
         return -1;
     *ref = tmp;
 
     uint32_t (*tmp5)[5] = realloc(**hist,
-                                  (*ref_end - ref_start)*sizeof(**hist));
+                                  (new_end - ref_start)*sizeof(**hist));
     if (!tmp5)
         return -1;
     *hist = tmp5;
@@ -1474,6 +1543,7 @@ static inline int extend_ref(char **ref, uint32_t (**hist)[5], hts_pos_t pos,
 }
 
 // Walk through MD + seq to generate ref
+// Returns 1 on success, <0 on failure
 static int cram_add_to_ref_MD(bam1_t *b, char **ref, uint32_t (**hist)[5],
                               hts_pos_t ref_start, hts_pos_t *ref_end,
                               const uint8_t *MD) {
@@ -1648,6 +1718,8 @@ static int cram_generate_reference(cram_container *c, cram_slice *s, int r1) {
     char *ref = NULL;
     uint32_t (*hist)[5] = NULL;
     hts_pos_t ref_start = c->bams[r1]->core.pos, ref_end = 0;
+    if (ref_start < 0)
+        return -1; // cannot build consensus from unmapped data
 
     // initial allocation
     if (extend_ref(&ref, &hist,
@@ -1688,6 +1760,7 @@ static int cram_generate_reference(cram_container *c, cram_slice *s, int r1) {
     c->ref       = ref;
     c->ref_start = ref_start+1;
     c->ref_end   = ref_end+1;
+    c->ref_free  = 1;
 
     return 0;
 
@@ -1758,12 +1831,29 @@ int cram_encode_container(cram_fd *fd, cram_container *c) {
     int r1, r2, sn, nref, embed_ref, no_ref;
     spare_bams *spares;
 
+    if (!c->bams)
+        goto err;
+
     if (CRAM_MAJOR_VERS(fd->version) == 1)
         goto err;
 
 //#define goto_err {fprintf(stderr, "ERR at %s:%d\n", __FILE__, __LINE__);goto err;}
 #define goto_err goto err
 
+    // Don't try embed ref if we repeatedly fail
+    pthread_mutex_lock(&fd->ref_lock);
+    int failed_embed = (fd->no_ref_counter >= 5); // maximum 5 tries
+    if (!failed_embed && c->embed_ref == -2) {
+        hts_log_warning("Retrying embed_ref=2 mode for #%d/5", fd->no_ref_counter);
+        fd->no_ref = c->no_ref = 0;
+        fd->embed_ref = c->embed_ref = 2;
+    } else if (failed_embed && c->embed_ref == -2) {
+        // We've tried several times, so this time give up for good
+        hts_log_warning("Keeping non-ref mode from now on");
+        fd->embed_ref = c->embed_ref = 0;
+    }
+    pthread_mutex_unlock(&fd->ref_lock);
+
  restart:
     /* Cache references up-front if we have unsorted access patterns */
     pthread_mutex_lock(&fd->ref_lock);
@@ -1775,7 +1865,7 @@ int cram_encode_container(cram_fd *fd, cram_container *c) {
     /* To create M5 strings */
     /* Fetch reference sequence */
     if (!no_ref) {
-        if (!c->bams || !c->bams[0])
+        if (!c->bams || !c->curr_c_rec || !c->bams[0])
             goto_err;
         bam_seq_t *b = c->bams[0];
 
@@ -1825,8 +1915,13 @@ int cram_encode_container(cram_fd *fd, cram_container *c) {
             // This starts as 'N' and is amended on-the-fly as we go
             // based on MD:Z tags.
             if ((c->ref_id = bam_ref(b)) >= 0) {
-                c->ref_free = 1;
                 c->ref = NULL;
+                // c->ref_free is boolean; whether to free c->ref.  In this
+                // case c->ref will be our auto-embedded sequence instead of
+                // a "global" portion of reference from fd->refs.
+                // Do not confuse with fd->ref_free which is a pointer to a
+                // reference string to free.
+                c->ref_free = 1;
             }
         }
         c->ref_seq_id = c->ref_id;
@@ -1891,10 +1986,16 @@ int cram_encode_container(cram_fd *fd, cram_container *c) {
                                     "switching to non-ref mode");
                 }
                 pthread_mutex_lock(&fd->ref_lock);
-                c->embed_ref = fd->embed_ref = 0;
+                c->embed_ref = fd->embed_ref = -2; // was previously embed_ref
                 c->no_ref = fd->no_ref = 1;
+                fd->no_ref_counter++; // more likely to keep permanent action
                 pthread_mutex_unlock(&fd->ref_lock);
+                failed_embed = 1;
                 goto restart;
+            } else {
+                pthread_mutex_lock(&fd->ref_lock);
+                fd->no_ref_counter -= (fd->no_ref_counter > 0);
+                pthread_mutex_unlock(&fd->ref_lock);
             }
         }
 
@@ -2087,7 +2188,7 @@ int cram_encode_container(cram_fd *fd, cram_container *c) {
     } else {
         // Removed BETA in v4.0.
         // Should we consider dropping use of it for 3.0 too?
-        int p[2] = {0, c->max_apos};
+        hts_pos_t p[2] = {0, c->max_apos};
         h->codecs[DS_AP] = cram_encoder_init(E_BETA, NULL,
                                              is_v4 ? E_LONG : E_INT,
                                              p, fd->version, &fd->vv);
@@ -2407,7 +2508,12 @@ int cram_encode_container(cram_fd *fd, cram_container *c) {
     c->comp_hdr_block = c_hdr;
 
     if (c->ref_seq_id >= 0) {
-        cram_ref_decr(fd->refs, c->ref_seq_id);
+        if (c->ref_free) {
+            free(c->ref);
+            c->ref = NULL;
+        } else {
+            cram_ref_decr(fd->refs, c->ref_seq_id);
+        }
     }
 
     /* Cache references up-front if we have unsorted access patterns */
@@ -2653,6 +2759,7 @@ static sam_hrec_rg_t *cram_encode_aux(cram_fd *fd, bam_seq_t *b,
     char *aux, *orig;
     sam_hrec_rg_t *brg = NULL;
     int aux_size = bam_get_l_aux(b);
+    const char *aux_end = bam_data_end(b);
     cram_block *td_b = c->comp_hdr->TD_blk;
     int TD_blk_size = BLOCK_SIZE(td_b), new;
     char *key;
@@ -2678,24 +2785,36 @@ static sam_hrec_rg_t *cram_encode_aux(cram_fd *fd, bam_seq_t *b,
         aux[aux_size++] = 'C';
         aux[aux_size++] = cf_tag;
         orig = aux;
+        aux_end = aux + aux_size;
     }
 
     // Copy aux keys to td_b and aux values to slice aux blocks
-    while (aux - orig < aux_size && aux[0] != 0) {
+    while (aux_end - aux >= 1 && aux[0] != 0) {
         int r;
 
+        // Room for code + type + at least 1 byte of data
+        if (aux - orig >= aux_size - 3)
+            goto err;
+
         // RG:Z
         if (aux[0] == 'R' && aux[1] == 'G' && aux[2] == 'Z') {
             char *rg = &aux[3];
+            aux = rg;
+            while (aux < aux_end && *aux++);
+            if (aux == aux_end && aux[-1] != '\0') {
+                hts_log_error("Unterminated RG:Z tag for read \"%s\"",
+                              bam_get_qname(b));
+                goto err;
+            }
             brg = sam_hrecs_find_rg(fd->header->hrecs, rg);
             if (brg) {
-                while (*aux++);
                 if (CRAM_MAJOR_VERS(fd->version) >= 4)
                     BLOCK_APPEND(td_b, "RG*", 3);
                 continue;
             } else {
                 // RG:Z tag will be stored verbatim
                 hts_log_warning("Missing @RG header for RG \"%s\"", rg);
+                aux = rg - 3;
             }
         }
 
@@ -2703,7 +2822,12 @@ static sam_hrec_rg_t *cram_encode_aux(cram_fd *fd, bam_seq_t *b,
         if (aux[0] == 'M' && aux[1] == 'D' && aux[2] == 'Z') {
             if (cr->len && !no_ref && !(cr->flags & BAM_FUNMAP) && !verbatim_MD) {
                 if (MD && MD->s && strncasecmp(MD->s, aux+3, orig + aux_size - (aux+3)) == 0) {
-                    while (*aux++);
+                    while (aux < aux_end && *aux++);
+                    if (aux == aux_end && aux[-1] != '\0') {
+                        hts_log_error("Unterminated MD:Z tag for read \"%s\"",
+                                      bam_get_qname(b));
+                        goto err;
+                    }
                     if (CRAM_MAJOR_VERS(fd->version) >= 4)
                         BLOCK_APPEND(td_b, "MD*", 3);
                     continue;
@@ -2714,7 +2838,7 @@ static sam_hrec_rg_t *cram_encode_aux(cram_fd *fd, bam_seq_t *b,
         // NM:i
         if (aux[0] == 'N' && aux[1] == 'M') {
             if (cr->len && !no_ref && !(cr->flags & BAM_FUNMAP) && !verbatim_NM) {
-                int NM_ = bam_aux2i((uint8_t *)aux+2);
+                int NM_ = bam_aux2i_end((uint8_t *)aux+2, (uint8_t *)aux_end);
                 if (NM_ == NM) {
                     switch(aux[2]) {
                     case 'A': case 'C': case 'c': aux+=4; break;
@@ -2722,7 +2846,7 @@ static sam_hrec_rg_t *cram_encode_aux(cram_fd *fd, bam_seq_t *b,
                     case 'I': case 'i': case 'f': aux+=7; break;
                     default:
                         hts_log_error("Unhandled type code for NM tag");
-                        return NULL;
+                        goto err;
                     }
                     if (CRAM_MAJOR_VERS(fd->version) >= 4)
                         BLOCK_APPEND(td_b, "NM*", 3);
@@ -2735,10 +2859,12 @@ static sam_hrec_rg_t *cram_encode_aux(cram_fd *fd, bam_seq_t *b,
 
         // Container level tags_used, for TD series
         // Maps integer key ('X0i') to cram_tag_map struct.
-        int key = (aux[0]<<16)|(aux[1]<<8)|aux[2];
+        int key = (((unsigned char *) aux)[0]<<16 |
+                   ((unsigned char *) aux)[1]<<8  |
+                   ((unsigned char *) aux)[2]);
         k = kh_put(m_tagmap, c->tags_used, key, &r);
         if (-1 == r)
-            return NULL;
+            goto err;
         else if (r != 0)
             kh_val(c->tags_used, k) = NULL;
 
@@ -2750,7 +2876,7 @@ static sam_hrec_rg_t *cram_encode_aux(cram_fd *fd, bam_seq_t *b,
             k_global = kh_put(m_metrics, fd->tags_used, key, &r);
             if (-1 == r) {
                 pthread_mutex_unlock(&fd->metrics_lock);
-                return NULL;
+                goto err;
             }
             if (r >= 1) {
                 kh_val(fd->tags_used, k_global) = cram_new_metrics();
@@ -2901,9 +3027,12 @@ static sam_hrec_rg_t *cram_encode_aux(cram_fd *fd, bam_seq_t *b,
 
         switch(aux[2]) {
         case 'A': case 'C': case 'c':
+            if (aux_end - aux < 3+1)
+                goto err;
+
             if (!tm->blk) {
                 if (!(tm->blk = cram_new_block(EXTERNAL, key)))
-                    return NULL;
+                    goto err;
                 codec->u.e_byte_array_len.val_codec->out = tm->blk;
             }
 
@@ -2915,9 +3044,12 @@ static sam_hrec_rg_t *cram_encode_aux(cram_fd *fd, bam_seq_t *b,
             break;
 
         case 'S': case 's':
+            if (aux_end - aux < 3+2)
+                goto err;
+
             if (!tm->blk) {
                 if (!(tm->blk = cram_new_block(EXTERNAL, key)))
-                    return NULL;
+                    goto err;
                 codec->u.e_byte_array_len.val_codec->out = tm->blk;
             }
 
@@ -2928,9 +3060,12 @@ static sam_hrec_rg_t *cram_encode_aux(cram_fd *fd, bam_seq_t *b,
             break;
 
         case 'I': case 'i': case 'f':
+            if (aux_end - aux < 3+4)
+                goto err;
+
             if (!tm->blk) {
                 if (!(tm->blk = cram_new_block(EXTERNAL, key)))
-                    return NULL;
+                    goto err;
                 codec->u.e_byte_array_len.val_codec->out = tm->blk;
             }
 
@@ -2941,9 +3076,12 @@ static sam_hrec_rg_t *cram_encode_aux(cram_fd *fd, bam_seq_t *b,
             break;
 
         case 'd':
+            if (aux_end - aux < 3+8)
+                goto err;
+
             if (!tm->blk) {
                 if (!(tm->blk = cram_new_block(EXTERNAL, key)))
-                    return NULL;
+                    goto err;
                 codec->u.e_byte_array_len.val_codec->out = tm->blk;
             }
 
@@ -2953,35 +3091,47 @@ static sam_hrec_rg_t *cram_encode_aux(cram_fd *fd, bam_seq_t *b,
             aux+=8;
             break;
 
-        case 'Z': case 'H':
-            {
-                if (!tm->blk) {
-                    if (!(tm->blk = cram_new_block(EXTERNAL, key)))
-                        return NULL;
-                    codec->out = tm->blk;
-                }
+        case 'Z': case 'H': {
+            if (aux_end - aux < 3)
+                goto err;
+
+            if (!tm->blk) {
+                if (!(tm->blk = cram_new_block(EXTERNAL, key)))
+                    goto err;
+                codec->out = tm->blk;
+            }
 
-                char *aux_s;
-                aux += 3;
-                aux_s = aux;
-                while (*aux++);
-                if (codec->encode(s, codec, aux_s, aux - aux_s) < 0)
-                    return NULL;
+            char *aux_s;
+            aux += 3;
+            aux_s = aux;
+            while (aux < aux_end && *aux++);
+            if (aux == aux_end && aux[-1] != '\0') {
+                hts_log_error("Unterminated %c%c:%c tag for read \"%s\"",
+                              aux_s[-3], aux_s[-2], aux_s[-1],
+                              bam_get_qname(b));
+                goto err;
             }
+            if (codec->encode(s, codec, aux_s, aux - aux_s) < 0)
+                goto err;
             break;
+        }
 
         case 'B': {
-            int type = aux[3], blen;
-            uint32_t count = (uint32_t)((((unsigned char *)aux)[4]<< 0) +
-                                        (((unsigned char *)aux)[5]<< 8) +
-                                        (((unsigned char *)aux)[6]<<16) +
-                                        (((unsigned char *)aux)[7]<<24));
+            if (aux_end - aux < 4+4)
+                goto err;
+
+            int type = aux[3];
+            uint64_t count = (((uint64_t)((unsigned char *)aux)[4]) << 0 |
+                              ((uint64_t)((unsigned char *)aux)[5]) << 8 |
+                              ((uint64_t)((unsigned char *)aux)[6]) <<16 |
+                              ((uint64_t)((unsigned char *)aux)[7]) <<24);
+            uint64_t blen;
             if (!tm->blk) {
                 if (!(tm->blk = cram_new_block(EXTERNAL, key)))
-                    return NULL;
+                    goto err;
                 if (codec->u.e_byte_array_len.val_codec->codec == E_XDELTA) {
                     if (!(tm->blk2 = cram_new_block(EXTERNAL, key+128)))
-                        return NULL;
+                        goto err;
                     codec->u.e_byte_array_len.len_codec->out = tm->blk2;
                     codec->u.e_byte_array_len.val_codec->u.e_xdelta.sub_codec->out = tm->blk;
                 } else {
@@ -3006,19 +3156,21 @@ static sam_hrec_rg_t *cram_encode_aux(cram_fd *fd, bam_seq_t *b,
                 break;
             default:
                 hts_log_error("Unknown sub-type '%c' for aux type 'B'", type);
-                return NULL;
+                goto err;
             }
 
             blen += 5; // sub-type & length
+            if (aux_end - aux < blen || blen > INT_MAX)
+                goto err;
 
-            if (codec->encode(s, codec, aux, blen) < 0)
-                return NULL;
+            if (codec->encode(s, codec, aux, (int) blen) < 0)
+                goto err;
             aux += blen;
             break;
         }
         default:
-            hts_log_error("Unknown aux type '%c'", aux[2]);
-            return NULL;
+            hts_log_error("Unknown aux type '%c'", aux_end - aux < 2 ? '?' : aux[2]);
+            goto err;
         }
         tm->blk->m = tm->m;
     }
@@ -3036,7 +3188,7 @@ static sam_hrec_rg_t *cram_encode_aux(cram_fd *fd, bam_seq_t *b,
         goto block_err;
     k = kh_put(m_s2i, c->comp_hdr->TD_hash, key, &new);
     if (new < 0) {
-        return NULL;
+        goto err;
     } else if (new == 0) {
         BLOCK_SIZE(td_b) = TD_blk_size;
     } else {
@@ -3249,6 +3401,8 @@ static int process_one_read(cram_fd *fd, cram_container *c,
 
     c->num_bases   += cr->len;
     cr->apos        = bam_pos(b)+1;
+    if (cr->apos < 0 || cr->apos > INT64_MAX/2)
+        goto err;
     if (c->pos_sorted) {
         if (cr->apos < s->last_apos && !fd->ap_delta) {
             c->pos_sorted = 0;
@@ -3287,6 +3441,11 @@ static int process_one_read(cram_fd *fd, cram_container *c,
         int64_t apos = cr->apos-1, spos = 0;
         int64_t MD_last = apos; // last position of edit in MD tag
 
+        if (apos < 0) {
+            hts_log_error("Mapped read with position <= 0 is disallowed");
+            return -1;
+        }
+
         cr->cigar       = s->ncigar;
         cr->ncigar      = bam_cigar_len(b);
         while (cr->cigar + cr->ncigar >= s->cigar_alloc) {
@@ -3536,7 +3695,7 @@ static int process_one_read(cram_fd *fd, cram_container *c,
         cr->cigar  = 0;
         cr->ncigar = 0;
         cr->nfeature = 0;
-        cr->aend = cr->apos;
+        cr->aend = MIN(cr->apos, c->ref_end);
         for (i = 0; i < cr->len; i++)
             if (cram_stats_add(c->stats[DS_BA], seq[i]) < 0)
                 goto block_err;
@@ -3994,9 +4153,25 @@ int cram_put_bam_seq(cram_fd *fd, bam_seq_t *b) {
         if (c->bams[c->curr_c_rec] == NULL)
             return -1;
     }
+    if (bam_seq_len(b)) {
+        c->s_num_bases += bam_seq_len(b);
+    } else {
+        // No sequence in BAM record.  CRAM doesn't directly support this
+        // case, it ends up being stored as a string of N's for each query
+        // consuming CIGAR operation.  As this can become very inefficient
+        // in time and memory, data where the query length is excessively
+        // long are rejected.
+        hts_pos_t qlen = bam_cigar2qlen(b->core.n_cigar, bam_get_cigar(b));
+        if (qlen > 100000000) {
+            hts_log_error("CIGAR query length %"PRIhts_pos
+                          " for read \"%s\" is too long",
+                          qlen, bam_get_qname(b));
+            return -1;
+        }
+        c->s_num_bases += qlen;
+    }
     c->curr_rec++;
     c->curr_c_rec++;
-    c->s_num_bases += bam_seq_len(b);
     c->s_aux_bytes += bam_get_l_aux(b);
     c->n_mapped += (bam_flag(b) & BAM_FUNMAP) ? 0 : 1;
     fd->record_counter++;
diff --git a/htslib/cram/cram_external.c b/htslib/cram/cram_external.c
index 7455185ad..4943750dd 100644
--- a/htslib/cram/cram_external.c
+++ b/htslib/cram/cram_external.c
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2015, 2018-2020, 2022-2023 Genome Research Ltd.
+Copyright (c) 2015, 2018-2020, 2022-2024 Genome Research Ltd.
 Author: James Bonfield <jkb@sanger.ac.uk>
 
 Redistribution and use in source and binary forms, with or without
@@ -121,6 +121,16 @@ int cram_container_is_empty(cram_fd *fd) {
     return fd->empty_container;
 }
 
+void cram_container_get_coords(cram_container *c,
+                               int *refid, hts_pos_t *start, hts_pos_t *span) {
+    if (refid)
+        *refid = c->ref_seq_id;
+    if (start)
+        *start = c->ref_seq_start;
+    if (span)
+        *span  = c->ref_seq_span;
+}
+
 
 /*
  *-----------------------------------------------------------------------------
@@ -281,7 +291,7 @@ static cram_codec *cram_codec_iter_next(cram_codec_iter *iter,
             iter->curr_map = iter->curr_map->next;
             return cc;
         }
-    } while (iter->idx <= CRAM_MAP_HASH);
+    } while (iter->idx < CRAM_MAP_HASH);
 
     // End of codecs
     return NULL;
@@ -683,6 +693,7 @@ int cram_copy_slice(cram_fd *in, cram_fd *out, int32_t num_slice) {
             cram_free_block(blk);
             return -1;
         }
+
         if (cram_write_block(out, blk) != 0) {
             cram_free_block(blk);
             return -1;
@@ -704,6 +715,192 @@ int cram_copy_slice(cram_fd *in, cram_fd *out, int32_t num_slice) {
     return 0;
 }
 
+/*
+ * Discards the next containers worth of data.
+ * Only the cram structure has been read so far.
+ *
+ * Returns 0 on success,
+ *        -1 on failure
+ */
+static int cram_skip_container(cram_fd *in, cram_container *c) {
+    // Compression header
+    cram_block *blk;
+    if (!(blk = cram_read_block(in)))
+        return -1;
+    cram_free_block(blk);
+
+    int i;
+    for (i = 0; i < c->num_landmarks; i++) {
+        cram_block_slice_hdr *hdr;
+
+        if (!(blk = cram_read_block(in)))
+            return -1;
+        if (!(hdr = cram_decode_slice_header(in, blk))) {
+            cram_free_block(blk);
+            return -1;
+        }
+        cram_free_block(blk);
+
+        int num_blocks = cram_slice_hdr_get_num_blocks(hdr), j;
+        for (j = 0; j < num_blocks; j++) {
+            blk = cram_read_block(in);
+            if (!blk) {
+                cram_free_slice_header(hdr);
+                return -1;
+            }
+            cram_free_block(blk);
+        }
+        cram_free_slice_header(hdr);
+    }
+
+    return 0;
+}
+
+
+/*
+ * Copies a container, but filtering it down to a specific region,
+ * which has already been set on the 'in' fd.
+ *
+ * This is used in e.g. samtools cat where we specified a region and discover
+ * that a region doesn't entirely span the container, so we have to select
+ * which reads we need to copy out of it.
+ *
+ * If ref_id is non-NULL we also return the last ref_id we filtered.
+ * This can be -2 if it's multi-ref and we observe more than one reference,
+ * and actual ref_id >= -1 if it's multi-ref and we observe just one ref or
+ * it's fixed reference.
+ *
+ * Returns 0 on success
+ *        -1 on error
+ */
+int cram_filter_container(cram_fd *in, cram_fd *out, cram_container *c,
+                          int *ref_id) {
+    int err = 0, fixed_ref = -3;
+
+    if (ref_id)
+        *ref_id = c->ref_seq_id;
+
+    int rid = in->range.refid == -2 ? -1 : in->range.refid;
+    if (rid != c->ref_seq_id ||
+        in->range.start > c->ref_seq_start + c->ref_seq_span-1)
+        // Except for multi-ref cases
+        if (c->ref_seq_id != -2)
+            return cram_skip_container(in, c);
+
+    // Container compression header
+    cram_block *blk = cram_read_block(in);
+    if (!blk)
+        return -1;
+    c->comp_hdr = cram_decode_compression_header(in, blk);
+    in->ctr = c;
+
+    // If it's multi-ref but a constant ref-id, then we can still do
+    // basic level chromosome filtering.  Similarly multi-ref where we're
+    // _already_ in ref "*" (unmapped) means we can just copy the container
+    // as there are no positions to filter on and "*" sorts to the end.
+    // TODO: how to tell "already in" though?
+    if (c->ref_seq_id == -2) {
+        cram_codec *cd = c->comp_hdr->codecs[DS_RI];
+        if (cd && cd->codec == E_HUFFMAN && cd->u.huffman.ncodes == 1 &&
+            // this check should be always true anyway
+            rid == cd->u.huffman.codes[0].symbol)
+            // We're in multi-ref mode, but actually the entire container
+            // matches.  So if we're in whole-chromosome mode we can just
+            // copy.
+            if (in->range.start <= 1 &&
+                in->range.end >= (INT64_MAX&(0xffffffffULL<<32))) {
+                if (ref_id)
+                    *ref_id = rid;
+                err |= cram_write_container(out, c) < 0;
+                err |= cram_write_block(out, blk);
+                return cram_copy_slice(in, out, c->num_landmarks) | -err;
+            }
+    }
+
+    // A simple read-write loop with region filtering automatically due to
+    // an earlier CRAM_OPT_RANGE request.
+    //
+    // We can hit EOF when reaching the end of the range, but we still need
+    // to manually check we don't attempt to read beyond this single container.
+
+    cram_range rng_copy = in->range;
+    in->range.start = INT64_MIN;
+    in->range.end = INT64_MAX;
+
+    bam1_t *b = bam_init1();
+    while ((c->curr_slice < c->max_slice ||
+            c->slice->curr_rec < c->slice->max_rec)) {
+        cram_slice *s;
+        if (c->slice && c->slice->curr_rec < c->slice->max_rec)
+            s = c->slice;
+        else if (c->curr_slice < c->max_slice)
+            s = cram_next_slice(in, &c);
+        else
+            break; // end of container
+        c->slice = s;
+
+        // This is more efficient if we check as a cram record instead of a
+        // bam record as we don't have to parse CIGAR end.
+        cram_record *cr = &c->slice->crecs[c->slice->curr_rec];
+        if (fixed_ref == -3)
+            fixed_ref = cr->ref_id;
+        else if (fixed_ref != cr->ref_id)
+            fixed_ref = -2;
+
+        if (rng_copy.refid != cr->ref_id) {
+            if (rng_copy.refid == -2) {
+                if (cr->ref_id > -1) {
+                    // Want unmapped, but have mapped
+                    c->slice->curr_rec++;
+                    continue;
+                }
+            } else {
+                if (rng_copy.refid > cr->ref_id || rng_copy.refid == -1) {
+                    // multi-ref and not at the correct ref yet
+                    c->slice->curr_rec++;
+                    continue;
+                } else {
+                    // multi-ref and beyond the desired ref
+                    break;
+                }
+            }
+        }
+
+        // Correct ref, but check the desired region
+        if (cr->aend < rng_copy.start) {
+            c->slice->curr_rec++;
+            continue;
+        }
+        if (cr->apos > rng_copy.end)
+            break;
+
+        // Broadly rquivalent to cram_get_bam_seq, but starting from 'cr'
+        err |= cram_to_bam(in->header, in, s, cr, s->curr_rec++, &b) < 0;
+
+        if (cram_put_bam_seq(out, b) < 0) {
+            err |= 1;
+            break;
+        }
+    }
+    bam_destroy1(b);
+
+    if (ref_id)
+        *ref_id = fixed_ref;
+
+    in->range = rng_copy;
+
+    // Avoids double frees as we stole the container from our other
+    // file descriptor.
+    in->ctr    = NULL;
+    in->ctr_mt = NULL;
+
+    err |= cram_flush(out);
+    cram_free_block(blk);
+
+    return -err;
+}
+
+
 /*
  * Renumbers RG numbers in a cram compression header.
  *
diff --git a/htslib/cram/cram_index.c b/htslib/cram/cram_index.c
index b775e9431..77c953d6c 100644
--- a/htslib/cram/cram_index.c
+++ b/htslib/cram/cram_index.c
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2013-2020, 2023 Genome Research Ltd.
+Copyright (c) 2013-2020, 2023-2024 Genome Research Ltd.
 Author: James Bonfield <jkb@sanger.ac.uk>
 
 Redistribution and use in source and binary forms, with or without
@@ -410,6 +410,9 @@ cram_index *cram_index_query(cram_fd *fd, int refid, hts_pos_t pos,
         // Continue from a previous search.
         // We switch to just scanning the linked list, as the nested
         // lists are typically short.
+        if (refid == HTS_IDX_NOCOOR)
+            refid = -1;
+
         e = from->e_next;
         if (e && e->refid == refid && e->start <= pos)
             return e;
@@ -423,6 +426,7 @@ cram_index *cram_index_query(cram_fd *fd, int refid, hts_pos_t pos,
         // fail, or already there, dealt with elsewhere.
         return NULL;
 
+    case -1:
     case HTS_IDX_NOCOOR:
         refid = -1;
         pos = 0;
@@ -737,10 +741,11 @@ int cram_index_container(cram_fd *fd,
         int ret;
 
         spos = htell(fd->fp);
-        if (spos - cpos - c->offset != c->landmark[j]) {
+        if (spos - cpos - (off_t) c->offset != c->landmark[j]) {
             hts_log_error("CRAM slice offset %"PRId64" does not match"
-                          " landmark %d in container header (%d)",
-                          spos - cpos - c->offset, j, c->landmark[j]);
+                          " landmark %d in container header (%"PRId32")",
+                          (int64_t) (spos - cpos - (off_t) c->offset),
+                          j, c->landmark[j]);
             return -1;
         }
 
@@ -826,8 +831,13 @@ int cram_index_build(cram_fd *fd, const char *fn_base, const char *fn_idx) {
             return -1;
         }
 
-        cpos = htell(fd->fp);
-        assert(cpos == hpos + c->length);
+        off_t next_cpos = htell(fd->fp);
+        if (next_cpos != hpos + c->length) {
+            hts_log_error("Length %"PRId32" in container header at offset %lld does not match block lengths (%lld)",
+                          c->length, (long long) cpos, (long long) next_cpos - hpos);
+            return -1;
+        }
+        cpos = next_cpos;
 
         cram_free_container(c);
     }
@@ -838,3 +848,193 @@ int cram_index_build(cram_fd *fd, const char *fn_base, const char *fn_idx) {
 
     return (bgzf_close(fp) >= 0)? 0 : -4;
 }
+
+// internal recursive step
+static int64_t cram_num_containers_between_(cram_index *e, int64_t *last_pos,
+                                            int64_t nct,
+                                            off_t cstart, off_t cend,
+                                            int64_t *first, int64_t *last) {
+    int64_t nc = 0, i;
+
+    if (e->offset) {
+        if (e->offset != *last_pos) {
+            if (e->offset >= cstart && (!cend || e->offset <= cend)) {
+                if (first && *first < 0)
+                    *first = nct;
+                if (last)
+                    *last = nct;
+            }
+            nc++;
+        }
+        // else a new multi-ref in same container
+        *last_pos = e->offset;
+    }
+
+    for (i = 0; i < e->nslice; i++)
+        nc += cram_num_containers_between_(&e->e[i], last_pos, nc + nct,
+                                           cstart, cend, first, last);
+
+    return nc;
+}
+
+/*! Returns the number of containers in the CRAM file within given offsets.
+ *
+ * The cstart and cend offsets are the locations of the start of containers
+ * as returned by index_container_offset.
+ *
+ * If non-NULL, first and last will hold the inclusive range of container
+ * numbers, counting from zero.
+ *
+ * @return
+ * Returns the number of containers, equivalent to *last-*first+1.
+ */
+int64_t cram_num_containers_between(cram_fd *fd,
+                                    off_t cstart, off_t cend,
+                                    int64_t *first, int64_t *last) {
+    int64_t nc = 0, i;
+    int64_t last_pos = -99;
+    int64_t l_first = -1, l_last = -1;
+
+    for (i = 0; i < fd->index_sz; i++) {
+        int j = i+1 == fd->index_sz ? 0 : i+1; // maps "*" to end
+        nc += cram_num_containers_between_(&fd->index[j], &last_pos, nc,
+                                           cstart, cend, &l_first, &l_last);
+    }
+
+    if (first)
+        *first = l_first;
+    if (last)
+        *last = l_last;
+
+    return l_last - l_first + 1;
+}
+
+/*
+ * Queries the total number of distinct containers in the index.
+ * Note there may be more containers in the file than in the index, as we
+ * are not required to have an index entry for every one.
+ */
+int64_t cram_num_containers(cram_fd *fd) {
+    return cram_num_containers_between(fd, 0, 0, NULL, NULL);
+}
+
+
+/*! Returns the byte offset for the start of the n^th container.
+ *
+ * The index must have previously been loaded, otherwise <0 is returned.
+ */
+static cram_index *cram_container_num2offset_(cram_index *e, int num,
+                                              int64_t *last_pos, int *nc) {
+    if (e->offset) {
+        if (e->offset != *last_pos) {
+            if (*nc == num)
+                return e;
+            (*nc)++;
+        }
+        // else a new multi-ref in same container
+        *last_pos = e->offset;
+    }
+
+    int i;
+    for (i = 0; i < e->nslice; i++) {
+        cram_index *tmp = cram_container_num2offset_(&e->e[i], num,
+                                                     last_pos, nc);
+        if (tmp)
+            return tmp;
+    }
+
+
+    return NULL;
+}
+
+off_t cram_container_num2offset(cram_fd *fd, int64_t num) {
+    int nc = 0, i;
+    int64_t last_pos = -9;
+    cram_index *e = NULL;
+
+    for (i = 0; i < fd->index_sz; i++) {
+        int j = i+1 == fd->index_sz ? 0 : i+1; // maps "*" to end
+        if (!fd->index[j].nslice)
+            continue;
+        if ((e = cram_container_num2offset_(&fd->index[j], num,
+                                            &last_pos, &nc)))
+            break;
+    }
+
+    return e ? e->offset : -1;
+}
+
+
+/*! Returns the container number for the first container at offset >= pos.
+ *
+ * The index must have previously been loaded, otherwise <0 is returned.
+ */
+static cram_index *cram_container_offset2num_(cram_index *e, off_t pos,
+                                              int64_t *last_pos, int *nc) {
+    if (e->offset) {
+        if (e->offset != *last_pos) {
+            if (e->offset >= pos)
+                return e;
+            (*nc)++;
+        }
+        // else a new multi-ref in same container
+        *last_pos = e->offset;
+    }
+
+    int i;
+    for (i = 0; i < e->nslice; i++) {
+        cram_index *tmp = cram_container_offset2num_(&e->e[i], pos,
+                                                     last_pos, nc);
+        if (tmp)
+            return tmp;
+    }
+
+
+    return NULL;
+}
+
+int64_t cram_container_offset2num(cram_fd *fd, off_t pos) {
+    int nc = 0, i;
+    int64_t last_pos = -9;
+    cram_index *e = NULL;
+
+    for (i = 0; i < fd->index_sz; i++) {
+        int j = i+1 == fd->index_sz ? 0 : i+1; // maps "*" to end
+        if (!fd->index[j].nslice)
+            continue;
+        if ((e = cram_container_offset2num_(&fd->index[j], pos,
+                                            &last_pos, &nc)))
+            break;
+    }
+
+    return e ? nc : -1;
+}
+
+/*!
+ * Returns the file offsets of CRAM containers covering a specific region
+ * query.  Note both offsets are the START of the container.
+ *
+ * first will point to the start of the first overlapping container
+ * last will point to the start of the last overlapping container
+ *
+ * Returns 0 on success
+ *        <0 on failure
+ */
+int cram_index_extents(cram_fd *fd, int refid, hts_pos_t start, hts_pos_t end,
+                       off_t *first, off_t *last) {
+    cram_index *ci;
+
+    if (first) {
+        if (!(ci = cram_index_query(fd, refid, start, NULL)))
+            return -1;
+        *first = ci->offset;
+    }
+
+    if (last) {
+        if (!(ci = cram_index_query_last(fd, refid, end)))
+            return -1;
+        *last = ci->offset;
+    }
+
+    return 0;
+}
diff --git a/htslib/cram/cram_io.c b/htslib/cram/cram_io.c
index ca226e29c..7f7ffca49 100644
--- a/htslib/cram/cram_io.c
+++ b/htslib/cram/cram_io.c
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2012-2023 Genome Research Ltd.
+Copyright (c) 2012-2024 Genome Research Ltd.
 Author: James Bonfield <jkb@sanger.ac.uk>
 
 Redistribution and use in source and binary forms, with or without
@@ -69,6 +69,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define crc32(a,b,c) libdeflate_crc32((a),(b),(c))
 #endif
 
+#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+#include "../fuzz_settings.h"
+#endif
+
 #include "cram.h"
 #include "os.h"
 #include "../htslib/hts.h"
@@ -1568,6 +1572,11 @@ int cram_uncompress_block(cram_block *b) {
     char *uncomp;
     size_t uncomp_size = 0;
 
+#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+    // Pretend the CRC was OK so the fuzzer doesn't have to get it right
+    b->crc32_checked = 1;
+#endif
+
     if (b->crc32_checked == 0) {
         uint32_t crc = crc32(b->crc_part, b->data ? b->data : (uc *)"", b->alloc);
         b->crc32_checked = 1;
@@ -1843,8 +1852,9 @@ static char *cram_compress_by_method(cram_slice *s, char *in, size_t in_size,
         // see enum cram_block. We map RANS_* methods to order bit-fields
         static int methmap[] = { 1, 64,9, 128,129, 192,193 };
 
+        int m = method == RANS_PR0 ? 0 : methmap[method - RANS_PR1];
         cp = rans_compress_4x16((unsigned char *)in, in_size, &out_size_i,
-                                method == RANS_PR0 ? 0 : methmap[method - RANS_PR1]);
+                                m | RANS_ORDER_SIMD_AUTO);
         *out_size = out_size_i;
         return (char *)cp;
     }
@@ -1974,11 +1984,15 @@ int cram_compress_block2(cram_fd *fd, cram_slice *s,
         // We also get large fluctuations based on genome coordinate for
         // e.g. SA:Z and SC series, but we consider the typical scale of
         // delta between blocks and use this to look for abnormality.
+
+        // Equivalent to (but minus possible integer overflow)
+        //   (b->uncomp_size + 1000)/4 > metrics->input_avg_sz+1000 ||
+        //    b->uncomp_size + 1000    < (metrics->input_avg_sz+1000)/4)
         if (metrics->input_avg_sz &&
-            (b->uncomp_size + 1000 > 4*(metrics->input_avg_sz+1000) ||
-             b->uncomp_size + 1000 < (metrics->input_avg_sz+1000)/4) &&
-            ABS(b->uncomp_size-metrics->input_avg_sz)
-                > 10*metrics->input_avg_delta) {
+            (b->uncomp_size/4 - 750 > metrics->input_avg_sz ||
+             b->uncomp_size         < metrics->input_avg_sz/4 - 750) &&
+            ABS(b->uncomp_size-metrics->input_avg_sz)/10
+                > metrics->input_avg_delta) {
             metrics->next_trial = 0;
         }
 
@@ -2069,10 +2083,10 @@ int cram_compress_block2(cram_fd *fd, cram_slice *s,
                     } else if (c) {
                         free(c);
                     } else {
-                        sz[m] = b->uncomp_size*2+1000; // arbitrarily worse than raw
+                        sz[m] = UINT_MAX; // arbitrarily worse than raw
                     }
                 } else {
-                    sz[m] = b->uncomp_size*2+1000; // arbitrarily worse than raw
+                    sz[m] = UINT_MAX; // arbitrarily worse than raw
                 }
             }
 
@@ -3211,7 +3225,8 @@ void cram_ref_decr(refs_t *r, int id) {
  * Returns all or part of a reference sequence on success (malloced);
  *         NULL on failure.
  */
-static char *load_ref_portion(BGZF *fp, ref_entry *e, int start, int end) {
+static char *load_ref_portion(BGZF *fp, ref_entry *e,
+                              hts_pos_t start, hts_pos_t end) {
     off_t offset, len;
     char *seq;
 
@@ -3307,7 +3322,7 @@ static char *load_ref_portion(BGZF *fp, ref_entry *e, int start, int end) {
  */
 ref_entry *cram_ref_load(refs_t *r, int id, int is_md5) {
     ref_entry *e = r->ref_id[id];
-    int start = 1, end = e->length;
+    hts_pos_t start = 1, end = e->length;
     char *seq;
 
     if (e->seq) {
@@ -3391,7 +3406,7 @@ ref_entry *cram_ref_load(refs_t *r, int id, int is_md5) {
  * Returns reference on success,
  *         NULL on failure
  */
-char *cram_get_ref(cram_fd *fd, int id, int start, int end) {
+char *cram_get_ref(cram_fd *fd, int id, hts_pos_t start, hts_pos_t end) {
     ref_entry *r;
     char *seq;
     int ostart = start;
@@ -3686,6 +3701,14 @@ cram_container *cram_new_container(int nrec, int nslice) {
     return NULL;
 }
 
+static void free_bam_list(bam_seq_t **bams, int max_rec) {
+    int i;
+    for (i = 0; i < max_rec; i++)
+        bam_free(bams[i]);
+
+    free(bams);
+}
+
 void cram_free_container(cram_container *c) {
     enum cram_DS_ID id;
     int i;
@@ -3739,6 +3762,14 @@ void cram_free_container(cram_container *c) {
                 cram_codec *c = tm->codec;
 
                 if (c) c->free(c);
+
+                // If tm->blk or tm->blk2 is set, then we haven't yet got to
+                // cram_encode_container which copies the blocks to s->aux_block
+                // and NULLifies tm->blk*.  In this case we failed to complete
+                // the container construction, so we have to free up our partially
+                // converted CRAM.
+                cram_free_block(tm->blk);
+                cram_free_block(tm->blk2);
                 free(tm);
             }
         }
@@ -3749,6 +3780,9 @@ void cram_free_container(cram_container *c) {
     if (c->ref_free)
         free(c->ref);
 
+    if (c->bams)
+        free_bam_list(c->bams, c->max_c_rec);
+
     free(c);
 }
 
@@ -3852,7 +3886,13 @@ cram_container *cram_read_container(cram_fd *fd) {
         return NULL;
 
     *c = c2;
-
+#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+    if (c->num_landmarks > FUZZ_ALLOC_LIMIT/sizeof(int32_t)) {
+        fd->err = errno = ENOMEM;
+        cram_free_container(c);
+        return NULL;
+    }
+#endif
     if (c->num_landmarks && !(c->landmark = malloc(c->num_landmarks * sizeof(int32_t)))) {
         fd->err = errno;
         cram_free_container(c);
@@ -3875,6 +3915,11 @@ cram_container *cram_read_container(cram_fd *fd) {
             rd+=4;
         }
 
+#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+        // Pretend the CRC was OK so the fuzzer doesn't have to get it right
+        crc = c->crc32;
+#endif
+
         if (crc != c->crc32) {
             hts_log_error("Container header CRC32 failure");
             cram_free_container(c);
@@ -4400,6 +4445,14 @@ void cram_free_slice(cram_slice *s) {
         free(s->block);
     }
 
+    {
+        // Normally already copied into s->block[], but potentially still
+        // here if we error part way through cram_encode_slice.
+        int i;
+        for (i = 0; i < s->naux_block; i++)
+            cram_free_block(s->aux_block[i]);
+    }
+
     if (s->block_by_id)
         free(s->block_by_id);
 
@@ -4679,6 +4732,11 @@ sam_hdr_t *cram_read_SAM_hdr(cram_fd *fd) {
         if (-1 == int32_decode(fd, &header_len))
             return NULL;
 
+#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+        if (header_len > FUZZ_ALLOC_LIMIT)
+            return NULL;
+#endif
+
         /* Alloc and read */
         if (header_len < 0 || NULL == (header = malloc((size_t) header_len+1)))
             return NULL;
@@ -4875,7 +4933,7 @@ int cram_write_SAM_hdr(cram_fd *fd, sam_hdr_t *hdr) {
             if (!sam_hrecs_find_key(ty, "M5", NULL)) {
                 char unsigned buf[16];
                 char buf2[33];
-                int rlen;
+                hts_pos_t rlen;
                 hts_md5_context *md5;
 
                 if (!fd->refs ||
@@ -4903,7 +4961,19 @@ int cram_write_SAM_hdr(cram_fd *fd, sam_hdr_t *hdr) {
                 rlen = fd->refs->ref_id[i]->length; /* In case it just loaded */
                 if (!(md5 = hts_md5_init()))
                     return -1;
-                hts_md5_update(md5, ref, rlen);
+                if (HTS_POS_MAX <= ULONG_MAX) {
+                    // Platforms with 64-bit unsigned long update in one go
+                    hts_md5_update(md5, ref, rlen);
+                } else {
+                    // Those with 32-bit ulong (Windows) may have to loop
+                    // over epic references
+                    hts_pos_t pos = 0;
+                    while (rlen - pos > ULONG_MAX) {
+                        hts_md5_update(md5, ref + pos, ULONG_MAX);
+                        pos += ULONG_MAX;
+                    }
+                    hts_md5_update(md5, ref + pos, (unsigned long)(rlen - pos));
+                }
                 hts_md5_final(buf, md5);
                 hts_md5_destroy(md5);
                 cram_ref_decr(fd->refs, i);
@@ -5300,6 +5370,7 @@ cram_fd *cram_dopen(hFILE *fp, const char *filename, const char *mode) {
     fd->slices_per_container = SLICE_PER_CNT;
     fd->embed_ref = -1; // automatic selection
     fd->no_ref = 0;
+    fd->no_ref_counter = 0;
     fd->ap_delta = 0;
     fd->ignore_md5 = 0;
     fd->lossy_read_names = 0;
@@ -5323,6 +5394,11 @@ cram_fd *cram_dopen(hFILE *fp, const char *filename, const char *mode) {
     fd->ooc         = 0;
     fd->required_fields = INT_MAX;
 
+    pthread_mutex_init(&fd->metrics_lock, NULL);
+    pthread_mutex_init(&fd->ref_lock, NULL);
+    pthread_mutex_init(&fd->range_lock, NULL);
+    pthread_mutex_init(&fd->bam_list_lock, NULL);
+
     for (i = 0; i < DS_END; i++) {
         fd->m[i] = cram_new_metrics();
         if (!fd->m[i])
@@ -5393,15 +5469,22 @@ int cram_flush(cram_fd *fd) {
     if (!fd)
         return -1;
 
+    int ret = 0;
+
     if (fd->mode == 'w' && fd->ctr) {
         if(fd->ctr->slice)
             cram_update_curr_slice(fd->ctr, fd->version);
 
         if (-1 == cram_flush_container_mt(fd, fd->ctr))
-            return -1;
+            ret = -1;
+
+        cram_free_container(fd->ctr);
+        if (fd->ctr_mt == fd->ctr)
+            fd->ctr_mt = NULL;
+        fd->ctr = NULL;
     }
 
-    return 0;
+    return ret;
 }
 
 /*
@@ -5488,6 +5571,7 @@ int cram_write_eof_block(cram_fd *fd) {
 
     return 0;
 }
+
 /*
  * Closes a CRAM file.
  * Returns 0 on success
@@ -5495,7 +5579,7 @@ int cram_write_eof_block(cram_fd *fd) {
  */
 int cram_close(cram_fd *fd) {
     spare_bams *bl, *next;
-    int i;
+    int i, ret = 0;
 
     if (!fd)
         return -1;
@@ -5505,7 +5589,7 @@ int cram_close(cram_fd *fd) {
             cram_update_curr_slice(fd->ctr, fd->version);
 
         if (-1 == cram_flush_container_mt(fd, fd->ctr))
-            return -1;
+            ret = -1;
     }
 
     if (fd->mode != 'w')
@@ -5515,40 +5599,37 @@ int cram_close(cram_fd *fd) {
         hts_tpool_process_flush(fd->rqueue);
 
         if (0 != cram_flush_result(fd))
-            return -1;
+            ret = -1;
 
         if (fd->mode == 'w')
             fd->ctr = NULL; // prevent double freeing
 
-        pthread_mutex_destroy(&fd->metrics_lock);
-        pthread_mutex_destroy(&fd->ref_lock);
-        pthread_mutex_destroy(&fd->bam_list_lock);
-
         //fprintf(stderr, "CRAM: destroy queue %p\n", fd->rqueue);
 
         hts_tpool_process_destroy(fd->rqueue);
     }
 
-    if (fd->mode == 'w') {
+    pthread_mutex_destroy(&fd->metrics_lock);
+    pthread_mutex_destroy(&fd->ref_lock);
+    pthread_mutex_destroy(&fd->range_lock);
+    pthread_mutex_destroy(&fd->bam_list_lock);
+
+    if (ret == 0 && fd->mode == 'w') {
         /* Write EOF block */
         if (0 != cram_write_eof_block(fd))
-            return -1;
+            ret = -1;
     }
 
     for (bl = fd->bl; bl; bl = next) {
-        int i, max_rec = fd->seqs_per_slice * fd->slices_per_container;
+        int max_rec = fd->seqs_per_slice * fd->slices_per_container;
 
         next = bl->next;
-        for (i = 0; i < max_rec; i++) {
-            if (bl->bams[i])
-                bam_free(bl->bams[i]);
-        }
-        free(bl->bams);
+        free_bam_list(bl->bams, max_rec);
         free(bl);
     }
 
     if (hclose(fd->fp) != 0)
-        return -1;
+        ret = -1;
 
     if (fd->file_def)
         cram_free_file_def(fd->file_def);
@@ -5592,10 +5673,11 @@ int cram_close(cram_fd *fd) {
 
     if (fd->idxfp)
         if (bgzf_close(fd->idxfp) < 0)
-            return -1;
+            ret = -1;
 
     free(fd);
-    return 0;
+
+    return ret;
 }
 
 /*
@@ -5806,10 +5888,6 @@ int cram_set_voption(cram_fd *fd, enum hts_fmt_option opt, va_list args) {
                 return -1;
 
             fd->rqueue = hts_tpool_process_init(fd->pool, nthreads*2, 0);
-            pthread_mutex_init(&fd->metrics_lock, NULL);
-            pthread_mutex_init(&fd->ref_lock, NULL);
-            pthread_mutex_init(&fd->range_lock, NULL);
-            pthread_mutex_init(&fd->bam_list_lock, NULL);
             fd->shared_ref = 1;
             fd->own_pool = 1;
         }
@@ -5823,10 +5901,6 @@ int cram_set_voption(cram_fd *fd, enum hts_fmt_option opt, va_list args) {
             fd->rqueue = hts_tpool_process_init(fd->pool,
                                                 p->qsize ? p->qsize : hts_tpool_size(fd->pool)*2,
                                                 0);
-            pthread_mutex_init(&fd->metrics_lock, NULL);
-            pthread_mutex_init(&fd->ref_lock, NULL);
-            pthread_mutex_init(&fd->range_lock, NULL);
-            pthread_mutex_init(&fd->bam_list_lock, NULL);
         }
         fd->shared_ref = 1; // Needed to avoid clobbering ref between threads
         fd->own_pool = 0;
diff --git a/htslib/cram/cram_io.h b/htslib/cram/cram_io.h
index 53ae30f59..d2d583df7 100644
--- a/htslib/cram/cram_io.h
+++ b/htslib/cram/cram_io.h
@@ -227,10 +227,8 @@ static inline int block_resize(cram_block *b, size_t len) {
     if (b->alloc > len)
         return 0;
 
-    size_t alloc = b->alloc;
-    while (alloc <= len)
-        alloc = alloc ? alloc + (alloc>>2) : 1024;
-
+    size_t alloc = b->alloc+800;
+    alloc = MAX(alloc + (alloc>>2), len);
     return block_resize_exact(b, alloc);
 }
 
@@ -393,7 +391,7 @@ void refs_free(refs_t *r);
  * Returns reference on success;
  *         NULL on failure
  */
-char *cram_get_ref(cram_fd *fd, int id, int start, int end);
+char *cram_get_ref(cram_fd *fd, int id, hts_pos_t start, hts_pos_t end);
 void cram_ref_incr(refs_t *r, int id);
 void cram_ref_decr(refs_t *r, int id);
 /**@}*/
diff --git a/htslib/cram/cram_stats.c b/htslib/cram/cram_stats.c
index 3ceda0db1..d06b8ffb9 100644
--- a/htslib/cram/cram_stats.c
+++ b/htslib/cram/cram_stats.c
@@ -132,8 +132,9 @@ void cram_stats_dump(cram_stats *st) {
  * Returns the best codec to use.
  */
 enum cram_encoding cram_stats_encoding(cram_fd *fd, cram_stats *st) {
-    int nvals, i, ntot = 0, max_val = 0, min_val = INT_MAX;
+    int nvals, i, max_val = 0, min_val = INT_MAX;
     int *vals = NULL, *freqs = NULL, vals_alloc = 0;
+    int ntot HTS_UNUSED = 0;
 
 #if DEBUG_CRAM_STATS
     cram_stats_dump(st);
diff --git a/htslib/cram/cram_structs.h b/htslib/cram/cram_structs.h
index 160663392..9540b5618 100644
--- a/htslib/cram/cram_structs.h
+++ b/htslib/cram/cram_structs.h
@@ -455,7 +455,8 @@ struct cram_container {
     int qs_seq_orient;           // 1 => same as seq. 0 => original orientation
 
     /* Copied from fd before encoding, to allow multi-threading */
-    int ref_start, first_base, last_base, ref_id, ref_end;
+    int ref_id;
+    hts_pos_t ref_start, first_base, last_base, ref_end;
     char *ref;
     int embed_ref;               // 1 if embedding ref, 2 if embedding cons
     int no_ref;                  // true if referenceless
@@ -648,8 +649,8 @@ struct cram_slice {
     khash_t(m_s2i) *pair[2];   // for identifying read-pairs in this slice.
 
     char *ref;                 // slice of current reference
-    int ref_start;             // start position of current reference;
-    int ref_end;               // end position of current reference;
+    hts_pos_t ref_start;       // start position of current reference;
+    hts_pos_t ref_end;         // end position of current reference;
     int ref_id;
 
     // For going from BAM to CRAM; an array of auxiliary blocks per type
@@ -802,12 +803,12 @@ struct cram_fd {
     int first_base, last_base; // copied to container
 
     // cached reference portion
-    refs_t *refs;              // ref meta-data structure
-    char *ref, *ref_free;      // current portion held in memory
-    int   ref_id;              // copied to container
-    int   ref_start;           // copied to container
-    int   ref_end;             // copied to container
-    char *ref_fn;   // reference fasta filename
+    refs_t   *refs;                // ref meta-data structure
+    char     *ref, *ref_free;      // current portion held in memory
+    int       ref_id;              // copied to container
+    hts_pos_t ref_start;           // copied to container
+    hts_pos_t ref_end;             // copied to container
+    char     *ref_fn;              // reference fasta filename
 
     // compression level and metrics
     int level;
@@ -821,6 +822,7 @@ struct cram_fd {
     int slices_per_container;
     int embed_ref; // copied to container
     int no_ref;    // copied to container
+    int no_ref_counter; // decide if permanent switch
     int ignore_md5;
     int use_bz2;
     int use_rans;
diff --git a/htslib/faidx.c b/htslib/faidx.c
index 5dd4bf1c0..ed39c0ca0 100644
--- a/htslib/faidx.c
+++ b/htslib/faidx.c
@@ -1,6 +1,6 @@
 /*  faidx.c -- FASTA and FASTQ random access.
 
-    Copyright (C) 2008, 2009, 2013-2020, 2022 Genome Research Ltd.
+    Copyright (C) 2008, 2009, 2013-2020, 2022, 2024 Genome Research Ltd.
     Portions copyright (C) 2011 Broad Institute.
 
     Author: Heng Li <lh3@sanger.ac.uk>
@@ -43,6 +43,29 @@ DEALINGS IN THE SOFTWARE.  */
 #include "htslib/kstring.h"
 #include "hts_internal.h"
 
+// Faster isgraph; assumes ASCII
+static inline int isgraph_(unsigned char c) {
+    return c > ' ' && c <= '~';
+}
+
+#ifdef isgraph
+#  undef isgraph
+#endif
+#define isgraph isgraph_
+
+// An optimised bgzf_getc.
+// We could consider moving this to bgzf.h, but our own code uses it here only.
+static inline int bgzf_getc_(BGZF *fp) {
+    if (fp->block_offset+1 < fp->block_length) {
+        int c = ((unsigned char*)fp->uncompressed_block)[fp->block_offset++];
+        fp->uncompressed_address++;
+        return c;
+    }
+
+    return bgzf_getc(fp);
+}
+#define bgzf_getc bgzf_getc_
+
 typedef struct {
     int id; // faidx_t->name[id] is for this struct.
     uint32_t line_len, line_blen;
@@ -190,7 +213,7 @@ static faidx_t *fai_build_core(BGZF *bgzf) {
                 kputsn("", 0, &name);
 
                 if (c < 0) {
-                    hts_log_error("The last entry '%s' has no sequence", name.s);
+                    hts_log_error("The last entry '%s' has no sequence at line %d", name.s, line_num);
                     goto fail;
                 }
 
@@ -247,7 +270,7 @@ static faidx_t *fai_build_core(BGZF *bgzf) {
                         state = SEQ_END;
 
                 } else if (line_len < ll) {
-                    hts_log_error("Different line length in sequence '%s'", name.s);
+                    hts_log_error("Different line length in sequence '%s' at line %d", name.s, line_num);
                     goto fail;
                 }
 
@@ -269,7 +292,7 @@ static faidx_t *fai_build_core(BGZF *bgzf) {
             case IN_QUAL:
                 if (c == '\n') {
                     if (!read_done) {
-                        hts_log_error("Inlined empty line is not allowed in quality of sequence '%s'", name.s);
+                        hts_log_error("Inlined empty line is not allowed in quality of sequence '%s' at line %d", name.s, line_num);
                         goto fail;
                     }
 
@@ -312,6 +335,7 @@ static faidx_t *fai_build_core(BGZF *bgzf) {
         if (fai_insert_index(idx, name.s, seq_len, line_len, char_len, seq_offset, qual_offset) != 0)
             goto fail;
     } else {
+        hts_log_error("File truncated at line %d", line_num);
         goto fail;
     }
 
@@ -446,7 +470,7 @@ static int fai_build3_core(const char *fn, const char *fnfai, const char *fngzi)
     bgzf = bgzf_open(fn, "r");
 
     if ( !bgzf ) {
-        hts_log_error("Failed to open the file %s", fn);
+        hts_log_error("Failed to open the file %s : %s", fn, strerror(errno));
         goto fail;
     }
 
@@ -691,9 +715,8 @@ faidx_t *fai_load_format(const char *fn, enum fai_format_options format) {
 
 static char *fai_retrieve(const faidx_t *fai, const faidx1_t *val,
                           uint64_t offset, hts_pos_t beg, hts_pos_t end, hts_pos_t *len) {
-    char *s;
-    size_t l;
-    int c = 0;
+    char *buffer, *s;
+    ssize_t nread, remaining, firstline_len, firstline_blen;
     int ret;
 
     if ((uint64_t) end - (uint64_t) beg >= SIZE_MAX - 2) {
@@ -719,26 +742,57 @@ static char *fai_retrieve(const faidx_t *fai, const faidx1_t *val,
         return NULL;
     }
 
-    l = 0;
-    s = (char*)malloc((size_t) end - beg + 2);
-    if (!s) {
+    // Over-allocate so there is extra space for one end-of-line sequence
+    buffer = (char*)malloc((size_t) end - beg + val->line_len - val->line_blen + 1);
+    if (!buffer) {
         *len = -1;
         return NULL;
     }
 
-    while ( l < end - beg && (c=bgzf_getc(fai->bgzf))>=0 )
-        if (isgraph(c)) s[l++] = c;
-    if (c < 0) {
-        hts_log_error("Failed to retrieve block: %s",
-            c == -1 ? "unexpected end of file" : "error reading file");
-        free(s);
-        *len = -1;
-        return NULL;
+    remaining = *len = end - beg;
+    firstline_blen = val->line_blen - beg % val->line_blen;
+
+    // Special case when the entire interval requested is within a single FASTA/Q line
+    if (remaining <= firstline_blen) {
+        nread = bgzf_read_small(fai->bgzf, buffer, remaining);
+        if (nread < remaining) goto error;
+        buffer[nread] = '\0';
+        return buffer;
+    }
+
+    s = buffer;
+    firstline_len = val->line_len - beg % val->line_blen;
+
+    // Read the (partial) first line and its line terminator, but increment  s  past the
+    // line contents only, so the terminator characters will be overwritten by the next line.
+    nread = bgzf_read_small(fai->bgzf, s, firstline_len);
+    if (nread < firstline_len) goto error;
+    s += firstline_blen;
+    remaining -= firstline_blen;
+
+    // Similarly read complete lines and their line terminator characters, but overwrite the latter.
+    while (remaining > val->line_blen) {
+        nread = bgzf_read_small(fai->bgzf, s, val->line_len);
+        if (nread < (ssize_t) val->line_len) goto error;
+        s += val->line_blen;
+        remaining -= val->line_blen;
     }
 
-    s[l] = '\0';
-    *len = l;
-    return s;
+    if (remaining > 0) {
+        nread = bgzf_read_small(fai->bgzf, s, remaining);
+        if (nread < remaining) goto error;
+        s += remaining;
+    }
+
+    *s = '\0';
+    return buffer;
+
+error:
+    hts_log_error("Failed to retrieve block: %s",
+                  (nread == 0)? "unexpected end of file" : "error reading file");
+    free(buffer);
+    *len = -1;
+    return NULL;
 }
 
 static int fai_get_val(const faidx_t *fai, const char *str,
diff --git a/htslib/fuzz_settings.h b/htslib/fuzz_settings.h
new file mode 100644
index 000000000..821581927
--- /dev/null
+++ b/htslib/fuzz_settings.h
@@ -0,0 +1,35 @@
+/*  fuzz_settings.h -- fuzz-tester specific definitions
+
+    Copyright (C) 2023 Genome Research Ltd.
+
+    Author: Rob Davies <rmd@sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.  */
+
+#ifndef HTSLIB_FUZZ_SETTINGS_H
+#define HTSLIB_FUZZ_SETTINGS_H
+#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+
+#ifndef FUZZ_ALLOC_LIMIT
+// By default libfuzzer reports out-of-memory on allocations > 2 Gbytes
+#define FUZZ_ALLOC_LIMIT 2000000000ULL
+#endif
+
+#endif
+#endif
diff --git a/htslib/header.c b/htslib/header.c
index 1d2fee491..7f62074f0 100644
--- a/htslib/header.c
+++ b/htslib/header.c
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2018-2020 Genome Research Ltd.
+Copyright (c) 2018-2020, 2023 Genome Research Ltd.
 Authors: James Bonfield <jkb@sanger.ac.uk>, Valeriu Ohan <vo2@sanger.ac.uk>
 
 Redistribution and use in source and binary forms, with or without
@@ -687,7 +687,7 @@ static void sam_hrecs_free_tags(sam_hrecs_t *hrecs, sam_hrec_tag_t *tag) {
     pool_free(hrecs->tag_pool, tag);
 }
 
-static int sam_hrecs_remove_line(sam_hrecs_t *hrecs, const char *type_name, sam_hrec_type_t *type_found) {
+static int sam_hrecs_remove_line(sam_hrecs_t *hrecs, const char *type_name, sam_hrec_type_t *type_found, int remove_hash) {
     if (!hrecs || !type_name || !type_found)
         return -1;
 
@@ -715,7 +715,7 @@ static int sam_hrecs_remove_line(sam_hrecs_t *hrecs, const char *type_name, sam_
         }
     }
 
-    if (!strncmp(type_name, "SQ", 2) || !strncmp(type_name, "RG", 2))
+    if (remove_hash && (!strncmp(type_name, "SQ", 2) || !strncmp(type_name, "RG", 2)))
         sam_hrecs_remove_hash_entry(hrecs, itype, type_found);
 
     sam_hrecs_free_tags(hrecs, type_found->tag);
@@ -1429,7 +1429,7 @@ int sam_hdr_remove_line_id(sam_hdr_t *bh, const char *type, const char *ID_key,
     if (!type_found)
         return 0;
 
-    int ret = sam_hrecs_remove_line(hrecs, type, type_found);
+    int ret = sam_hrecs_remove_line(hrecs, type, type_found, 1);
     if (ret == 0) {
         if (hrecs->refs_changed >= 0 && rebuild_target_arrays(bh) != 0)
             return -1;
@@ -1469,7 +1469,7 @@ int sam_hdr_remove_line_pos(sam_hdr_t *bh, const char *type, int position) {
     if (!type_found)
         return -1;
 
-    int ret = sam_hrecs_remove_line(hrecs, type, type_found);
+    int ret = sam_hrecs_remove_line(hrecs, type, type_found, 1);
     if (ret == 0) {
         if (hrecs->refs_changed >= 0 && rebuild_target_arrays(bh) != 0)
             return -1;
@@ -1609,6 +1609,37 @@ int sam_hdr_update_line(sam_hdr_t *bh, const char *type,
     return ret;
 }
 
+static int rebuild_hash(sam_hrecs_t *hrecs, const char *type) {
+    sam_hrec_type_t *head, *step;
+    khiter_t k;
+
+    if (strncmp(type, "SQ", 2) == 0) {
+        hrecs->nref = 0;
+        kh_clear(m_s2i, hrecs->ref_hash);
+    } else if (strncmp(type, "RG", 2) == 0) {
+        hrecs->nrg = 0;
+        kh_clear(m_s2i, hrecs->rg_hash);
+    }
+
+    k = kh_get(sam_hrecs_t, hrecs->h, TYPEKEY(type));
+
+    if (k != kh_end(hrecs->h)) { // something to rebuild
+        head = kh_val(hrecs->h, k);
+        step = head;
+
+        do {
+            if (sam_hrecs_update_hashes(hrecs, TYPEKEY(type), step) == -1) {
+                hts_log_error("Unable to rebuild hashes");
+                return -1;
+            }
+
+            step = step->next;
+        } while (step != head);
+    }
+
+    return 0;
+}
+
 int sam_hdr_remove_except(sam_hdr_t *bh, const char *type, const char *ID_key, const char *ID_value) {
     sam_hrecs_t *hrecs;
     if (!bh || !type)
@@ -1643,11 +1674,21 @@ int sam_hdr_remove_except(sam_hdr_t *bh, const char *type, const char *ID_key, c
     while (step != type_found) {
         sam_hrec_type_t *to_remove = step;
         step = step->next;
-        ret &= sam_hrecs_remove_line(hrecs, type, to_remove);
+        ret &= sam_hrecs_remove_line(hrecs, type, to_remove, 0);
     }
 
     if (remove_all)
-        ret &= sam_hrecs_remove_line(hrecs, type, type_found);
+        ret &= sam_hrecs_remove_line(hrecs, type, type_found, 0);
+
+    /* if RG or SQ, delete then rebuild the hashes (as it is faster
+       to rebuild than delete one by one).
+    */
+
+    if ((strncmp(type, "SQ", 2) == 0) || (strncmp(type, "RG", 2) == 0)) {
+        if (rebuild_hash(hrecs, type)) {
+            return -1;
+        }
+    }
 
     if (!ret && hrecs->dirty)
         redact_header_text(bh);
@@ -1691,7 +1732,7 @@ int sam_hdr_remove_lines(sam_hdr_t *bh, const char *type, const char *id, void *
            if (k == kh_end(rh)) { // value is not in the hash table, so remove
                sam_hrec_type_t *to_remove = step;
                step = step->next;
-               ret |= sam_hrecs_remove_line(hrecs, type, to_remove);
+               ret |= sam_hrecs_remove_line(hrecs, type, to_remove, 0);
            } else {
                step = step->next;
            }
@@ -1707,10 +1748,20 @@ int sam_hdr_remove_lines(sam_hdr_t *bh, const char *type, const char *id, void *
        if (k == kh_end(rh)) { // value is not in the hash table, so remove
            sam_hrec_type_t *to_remove = head;
            head = head->next;
-           ret |= sam_hrecs_remove_line(hrecs, type, to_remove);
+           ret |= sam_hrecs_remove_line(hrecs, type, to_remove, 0);
        }
     }
 
+    /* if RG or SQ, delete then rebuild the hashes (as it is faster
+       to rebuild than delete one by one).
+    */
+
+    if ((strncmp(type, "SQ", 2) == 0) || (strncmp(type, "RG", 2) == 0)) {
+        if (rebuild_hash(hrecs, type)) {
+            return -1;
+        }
+    }
+
     if (!ret && hrecs->dirty)
         redact_header_text(bh);
 
@@ -2088,23 +2139,34 @@ static int sam_hdr_link_pg(sam_hdr_t *bh) {
         k = kh_get(m_s2i, hrecs->pg_hash, tag->str+3);
 
         if (k == kh_end(hrecs->pg_hash)) {
-            hts_log_warning("PG line with PN:%s has a PP link to missing program '%s'",
+            hts_log_warning("PG line with ID:%s has a PP link to missing program '%s'",
                     hrecs->pg[i].name, tag->str+3);
             continue;
         }
 
-        hrecs->pg[i].prev_id = hrecs->pg[kh_val(hrecs->pg_hash, k)].id;
-        hrecs->pg_end[kh_val(hrecs->pg_hash, k)] = -1;
-        chain_size[i] = chain_size[kh_val(hrecs->pg_hash, k)]+1;
+        int pp_idx = kh_val(hrecs->pg_hash, k);
+        if (pp_idx == i) {
+            hts_log_warning("PG line with ID:%s has a PP link to itself",
+                            hrecs->pg[i].name);
+            continue;
+        }
+
+        hrecs->pg[i].prev_id = hrecs->pg[pp_idx].id;
+        hrecs->pg_end[pp_idx] = -1;
+        chain_size[i] = chain_size[pp_idx]+1;
     }
 
+    int last_end = -1;
     for (i = j = 0; i < hrecs->npg; i++) {
-        if (hrecs->pg_end[i] != -1 && chain_size[i] > 0)
-            hrecs->pg_end[j++] = hrecs->pg_end[i];
+        if (hrecs->pg_end[i] != -1) {
+            last_end = hrecs->pg_end[i];
+            if (chain_size[i] > 0)
+                hrecs->pg_end[j++] = hrecs->pg_end[i];
+        }
     }
     /* Only leafs? Choose the last one! */
-    if (!j && hrecs->npg_end > 0) {
-        hrecs->pg_end[0] = hrecs->pg_end[hrecs->npg_end-1];
+    if (!j && hrecs->npg_end > 0 && last_end >= 0) {
+        hrecs->pg_end[0] = last_end;
         j = 1;
     }
 
@@ -2243,6 +2305,7 @@ int sam_hdr_add_pg(sam_hdr_t *bh, const char *name, ...) {
                 free(end);
                 return -1;
             }
+            assert(end[i] >= 0 && end[i] < hrecs->npg);
             va_start(args, name);
             if (-1 == sam_hrecs_vadd(hrecs, "PG", args,
                                      "ID", id,
@@ -2295,7 +2358,7 @@ void sam_hdr_incr_ref(sam_hdr_t *bh) {
  * Returns a sam_hrecs_t struct on success (free with sam_hrecs_free())
  *         NULL on failure
  */
-sam_hrecs_t *sam_hrecs_new() {
+sam_hrecs_t *sam_hrecs_new(void) {
     sam_hrecs_t *hrecs = calloc(1, sizeof(*hrecs));
 
     if (!hrecs)
diff --git a/htslib/hfile.c b/htslib/hfile.c
index 78533dd56..552b71774 100644
--- a/htslib/hfile.c
+++ b/htslib/hfile.c
@@ -1,6 +1,6 @@
 /*  hfile.c -- buffered low-level input/output streams.
 
-    Copyright (C) 2013-2021 Genome Research Ltd.
+    Copyright (C) 2013-2021, 2023-2024 Genome Research Ltd.
 
     Author: John Marshall <jm18@sanger.ac.uk>
 
@@ -121,6 +121,7 @@ hFILE *hfile_init(size_t struct_size, const char *mode, size_t capacity)
     fp->at_eof = 0;
     fp->mobile = 1;
     fp->readonly = (strchr(mode, 'r') && ! strchr(mode, '+'));
+    fp->preserve = 0;
     fp->has_errno = 0;
     return fp;
 
@@ -143,6 +144,7 @@ hFILE *hfile_init_fixed(size_t struct_size, const char *mode,
     fp->at_eof = 1;
     fp->mobile = 0;
     fp->readonly = (strchr(mode, 'r') && ! strchr(mode, '+'));
+    fp->preserve = 0;
     fp->has_errno = 0;
     return fp;
 }
@@ -482,8 +484,10 @@ int hclose(hFILE *fp)
     int err = fp->has_errno;
 
     if (writebuffer_is_nonempty(fp) && hflush(fp) < 0) err = fp->has_errno;
-    if (fp->backend->close(fp) < 0) err = errno;
-    hfile_destroy(fp);
+    if (!fp->preserve) {
+        if (fp->backend->close(fp) < 0) err = errno;
+        hfile_destroy(fp);
+    }
 
     if (err) {
         errno = err;
@@ -495,6 +499,8 @@ int hclose(hFILE *fp)
 void hclose_abruptly(hFILE *fp)
 {
     int save = errno;
+    if (fp->preserve)
+        return;
     if (fp->backend->close(fp) < 0) { /* Ignore subsequent errors */ }
     hfile_destroy(fp);
     errno = save;
@@ -697,7 +703,7 @@ static int is_preload_url_remote(const char *url){
 
 static hFILE *hopen_preload(const char *url, const char *mode){
     hFILE* fp = hopen(url + 8, mode);
-    return hpreload(fp);
+    return fp ? hpreload(fp) : NULL;
 }
 
 hFILE *hdopen(int fd, const char *mode)
@@ -878,11 +884,18 @@ char *hfile_mem_steal_buffer(hFILE *file, size_t *length) {
     return buf;
 }
 
+// open() stub for mem: which only works with the vopen() interface
+// Use 'data:,' for data encoded in the URL
+static hFILE *hopen_not_supported(const char *fname, const char *mode) {
+    errno = EINVAL;
+    return NULL;
+}
+
 int hfile_plugin_init_mem(struct hFILE_plugin *self)
 {
     // mem files are declared remote so they work with a tabix index
     static const struct hFILE_scheme_handler handler =
-            {NULL, hfile_always_remote, "mem", 2000 + 50, hopenv_mem};
+            {hopen_not_supported, hfile_always_remote, "mem", 2000 + 50, hopenv_mem};
     self->name = "mem";
     hfile_add_scheme_handler("mem", &handler);
     return 0;
@@ -917,7 +930,7 @@ static hFILE *crypt4gh_needed(const char *url, const char *mode)
 int hfile_plugin_init_crypt4gh_needed(struct hFILE_plugin *self)
 {
     static const struct hFILE_scheme_handler handler =
-        { crypt4gh_needed, NULL, "crypt4gh-needed", 0, NULL };
+        { crypt4gh_needed, hfile_always_local, "crypt4gh-needed", 0, NULL };
     self->name = "crypt4gh-needed";
     hfile_add_scheme_handler("crypt4gh", &handler);
     return 0;
@@ -963,7 +976,7 @@ void hfile_shutdown(int do_close_plugin)
     pthread_mutex_unlock(&plugins_lock);
 }
 
-static void hfile_exit()
+static void hfile_exit(void)
 {
     hfile_shutdown(0);
     pthread_mutex_destroy(&plugins_lock);
@@ -1016,6 +1029,10 @@ void hfile_add_scheme_handler(const char *scheme,
                               const struct hFILE_scheme_handler *handler)
 {
     int absent;
+    if (handler->open == NULL || handler->isremote == NULL) {
+        hts_log_warning("Couldn't register scheme handler for %s: missing method", scheme);
+        return;
+    }
     if (!schemes) {
         if (try_exe_add_scheme_handler(scheme, handler) != 0) {
             hts_log_warning("Couldn't register scheme handler for %s", scheme);
@@ -1065,7 +1082,7 @@ static int init_add_plugin(void *obj, int (*init)(struct hFILE_plugin *),
  * Returns 0 on success,
  *        <0 on failure
  */
-static int load_hfile_plugins()
+static int load_hfile_plugins(void)
 {
     static const struct hFILE_scheme_handler
         data = { hopen_mem, hfile_always_local, "built-in", 80 },
diff --git a/htslib/hfile_libcurl.c b/htslib/hfile_libcurl.c
index 1e4a4486f..6bbd88fe9 100644
--- a/htslib/hfile_libcurl.c
+++ b/htslib/hfile_libcurl.c
@@ -34,6 +34,7 @@ DEALINGS IN THE SOFTWARE.  */
 #ifndef _WIN32
 # include <sys/select.h>
 #endif
+#include <sys/stat.h>
 #include <assert.h>
 
 #include "hfile_internal.h"
@@ -277,7 +278,7 @@ static void free_auth(auth_token *tok) {
     free(tok);
 }
 
-static void libcurl_exit()
+static void libcurl_exit(void)
 {
     if (curl_share_cleanup(curl.share) == CURLSHE_OK)
         curl.share = NULL;
@@ -837,7 +838,7 @@ static ssize_t libcurl_read(hFILE *fpv, void *bufferv, size_t nbytes)
         got = fp->buffer.ptr.rd - buffer;
 
         if (to_skip >= 0) { // Skipping over a small seek
-            if (got < to_skip) { // Need to skip more data
+            if (got <= to_skip) { // Need to skip more data
                 to_skip -= got;
             } else {
                 got -= to_skip;
@@ -1246,6 +1247,19 @@ libcurl_open(const char *url, const char *modes, http_headers *headers)
         if (env_curl_ca_bundle) {
             err |= curl_easy_setopt(fp->easy, CURLOPT_CAINFO, env_curl_ca_bundle);
         }
+#if defined __linux__ && defined BUILDING_WHEEL
+        else {
+            // Linux wheels are (currently) built on AlmaLinux, so the libcurl.so bundled
+            // into the wheel follows Alma/Red Hat/Fedora conventions for the location of
+            // its certificate bundle. This fails when the wheel is used on a Debian/Ubuntu
+            // platform with a different convention for this location. When not overridden
+            // by $CURL_CA_BUNDLE, work around this by specifying the expected Debian bundle
+            // location if the Red Hat one isn't present.
+            struct stat st;
+            if (stat("/etc/pki", &st) < 0 && errno == ENOENT)
+                err |= curl_easy_setopt(fp->easy, CURLOPT_CAINFO, "/etc/ssl/certs/ca-certificates.crt");
+        }
+#endif
     }
     err |= curl_easy_setopt(fp->easy, CURLOPT_USERAGENT, curl.useragent.s);
     if (fp->headers.callback) {
diff --git a/htslib/hfile_s3.c b/htslib/hfile_s3.c
index e2718f656..c7c52e617 100644
--- a/htslib/hfile_s3.c
+++ b/htslib/hfile_s3.c
@@ -1,6 +1,6 @@
 /*  hfile_s3.c -- Amazon S3 backend for low-level file streams.
 
-    Copyright (C) 2015-2017, 2019-2023 Genome Research Ltd.
+    Copyright (C) 2015-2017, 2019-2024 Genome Research Ltd.
 
     Author: John Marshall <jm18@sanger.ac.uk>
 
@@ -51,6 +51,7 @@ typedef struct s3_auth_data {
     kstring_t user_query_string;
     kstring_t host;
     kstring_t profile;
+    enum {s3_auto, s3_virtual, s3_path} url_style;
     time_t creds_expiry_time;
     char *bucket;
     kstring_t auth_hdr;
@@ -563,17 +564,32 @@ static int redirect_endpoint_callback(void *auth, long response,
             kputs(new_region, &ad->region);
 
             ad->host.l = 0;
-            ksprintf(&ad->host, "s3.%s.amazonaws.com", new_region);
 
+            if (ad->url_style == s3_path) {
+                // Path style https://s3.{region-code}.amazonaws.com/{bucket-name}/{key-name}
+                ksprintf(&ad->host, "s3.%s.amazonaws.com", new_region);
+            } else {
+                // Virtual https://{bucket-name}.s3.{region-code}.amazonaws.com/{key-name}
+                // Extract the {bucket-name} from {ad->host} to include in subdomain
+                kstring_t url_prefix = KS_INITIALIZE;
+                kputsn(ad->host.s, strcspn(ad->host.s, "."), &url_prefix);
+
+                ksprintf(&ad->host, "%s.s3.%s.amazonaws.com", url_prefix.s, new_region);
+                free(url_prefix.s);
+            }
             if (ad->region.l && ad->host.l) {
+               int e = 0;
                url->l = 0;
-               kputs(ad->host.s, url);
-               kputsn(ad->bucket, strlen(ad->bucket), url);
-               if (ad->user_query_string.l) {
-                   kputc('?', url);
-                   kputsn(ad->user_query_string.s, ad->user_query_string.l, url);
-               }
-               ret = 0;
+               e |= kputs("https://", url) < 0;
+               e |= kputs(ad->host.s, url) < 0;
+               e |= kputsn(ad->bucket, strlen(ad->bucket), url) < 0;
+
+               if (!e)
+                   ret = 0;
+            }
+            if (ad->user_query_string.l) {
+                kputc('?', url);
+                kputsn(ad->user_query_string.s, ad->user_query_string.l, url);
             }
         }
     }
@@ -591,11 +607,11 @@ static s3_auth_data * setup_auth_data(const char *s3url, const char *mode,
     ptrdiff_t bucket_len;
     int is_https = 1, dns_compliant;
     char *query_start;
-    enum {s3_auto, s3_virtual, s3_path} address_style = s3_auto;
 
     if (!ad)
         return NULL;
     ad->mode = strchr(mode, 'r') ? 'r' : 'w';
+    ad->url_style = s3_auto;
 
     // Our S3 URL format is s3[+SCHEME]://[ID[:SECRET[:TOKEN]]@]BUCKET/PATH
 
@@ -647,9 +663,9 @@ static s3_auth_data * setup_auth_data(const char *s3url, const char *mode,
 
         if ((v = getenv("HTS_S3_ADDRESS_STYLE")) != NULL) {
             if (strcasecmp(v, "virtual") == 0) {
-                address_style = s3_virtual;
+                ad->url_style = s3_virtual;
             } else if (strcasecmp(v, "path") == 0) {
-                address_style = s3_path;
+                ad->url_style = s3_path;
             }
         }
     }
@@ -669,11 +685,11 @@ static s3_auth_data * setup_auth_data(const char *s3url, const char *mode,
 
         if (url_style.l) {
             if (strcmp(url_style.s, "virtual") == 0) {
-                address_style = s3_virtual;
+                ad->url_style = s3_virtual;
             } else if (strcmp(url_style.s, "path") == 0) {
-                address_style = s3_path;
+                ad->url_style = s3_path;
             } else {
-                address_style = s3_auto;
+                ad->url_style = s3_auto;
             }
         }
         if (expiry_time.l) {
@@ -703,9 +719,9 @@ static s3_auth_data * setup_auth_data(const char *s3url, const char *mode,
             // Conforming to s3cmd's GitHub PR#416, host_bucket without the "%(bucket)s" string
             // indicates use of path style adressing.
             if (strstr(url_style.s, "%(bucket)s") == NULL) {
-                address_style = s3_path;
+                ad->url_style = s3_path;
             } else {
-                address_style = s3_auto;
+                ad->url_style = s3_auto;
             }
         }
 
@@ -717,9 +733,9 @@ static s3_auth_data * setup_auth_data(const char *s3url, const char *mode,
 
 
     // if address_style is set, force the dns_compliant setting
-    if (address_style == s3_virtual) {
+    if (ad->url_style == s3_virtual) {
         dns_compliant = 1;
-    } else if (address_style == s3_path) {
+    } else if (ad->url_style == s3_path) {
         dns_compliant = 0;
     } else {
         dns_compliant = is_dns_compliant(bucket, path, is_https);
@@ -872,7 +888,7 @@ static int make_signature(s3_auth_data *ad, kstring_t *string_to_sign, char *sig
     const unsigned char service[] = "s3";
     const unsigned char request[] = "aws4_request";
 
-    kstring_t secret_access_key = {0, 0, NULL};
+    kstring_t secret_access_key = KS_INITIALIZE;
     unsigned int len;
     unsigned int i, j;
 
@@ -899,11 +915,11 @@ static int make_signature(s3_auth_data *ad, kstring_t *string_to_sign, char *sig
 
 
 static int make_authorisation(s3_auth_data *ad, char *http_request, char *content, kstring_t *auth) {
-    kstring_t signed_headers = {0, 0, NULL};
-    kstring_t canonical_headers = {0, 0, NULL};
-    kstring_t canonical_request = {0, 0, NULL};
-    kstring_t scope = {0, 0, NULL};
-    kstring_t string_to_sign = {0, 0, NULL};
+    kstring_t signed_headers = KS_INITIALIZE;
+    kstring_t canonical_headers = KS_INITIALIZE;
+    kstring_t canonical_request = KS_INITIALIZE;
+    kstring_t scope = KS_INITIALIZE;
+    kstring_t string_to_sign = KS_INITIALIZE;
     char cr_hash[HASH_LENGTH_SHA256];
     char signature_string[HASH_LENGTH_SHA256];
     int ret = -1;
@@ -1024,7 +1040,7 @@ static int order_query_string(kstring_t *qs) {
     int *query_offset = NULL;
     int num_queries, i;
     char **queries = NULL;
-    kstring_t ordered = {0, 0, NULL};
+    kstring_t ordered = KS_INITIALIZE;
     char *escaped = NULL;
     int ret = -1;
 
@@ -1298,6 +1314,24 @@ static hFILE *s3_open_v4(const char *s3url, const char *mode, va_list *argsp) {
 
         if (fp == NULL) goto error;
 
+        if (http_response == 307) {
+            // Follow additional redirect.
+            ad->refcount = 1;
+            hclose_abruptly(fp);
+
+            url.l  = 0;
+            ksprintf(&url, "https://%s%s", ad->host.s, ad->bucket);
+
+            fp = hopen(url.s, mode, "va_list", argsp,
+                   "httphdr_callback", v4_auth_header_callback,
+                   "httphdr_callback_data", ad,
+                   "redirect_callback", redirect_endpoint_callback,
+                   "redirect_callback_data", ad,
+                   "http_response_ptr", &http_response,
+                   "fail_on_error", 0,
+                   NULL);
+        }
+
         if (http_response == 400) {
             ad->refcount = 1;
             if (handle_400_response(fp, ad) != 0) {
@@ -1318,7 +1352,7 @@ static hFILE *s3_open_v4(const char *s3url, const char *mode, va_list *argsp) {
 
         if (fp == NULL) goto error;
     } else {
-        kstring_t final_url = {0, 0, NULL};
+        kstring_t final_url = KS_INITIALIZE;
 
          // add the scheme marker
         ksprintf(&final_url, "s3w+%s", url.s);
diff --git a/htslib/hfile_s3_write.c b/htslib/hfile_s3_write.c
index d54945839..a501645ca 100644
--- a/htslib/hfile_s3_write.c
+++ b/htslib/hfile_s3_write.c
@@ -822,7 +822,7 @@ static hFILE *vhopen_s3_write(const char *url, const char *mode, va_list args) {
 }
 
 
-static void s3_write_exit() {
+static void s3_write_exit(void) {
     if (curl_share_cleanup(curl.share) == CURLSHE_OK)
         curl.share = NULL;
 
diff --git a/htslib/hts.c b/htslib/hts.c
index b7b528a61..a8a8bead2 100644
--- a/htslib/hts.c
+++ b/htslib/hts.c
@@ -1,6 +1,6 @@
 /*  hts.c -- format-neutral I/O, indexing, and iterator API functions.
 
-    Copyright (C) 2008, 2009, 2012-2023 Genome Research Ltd.
+    Copyright (C) 2008, 2009, 2012-2024 Genome Research Ltd.
     Copyright (C) 2012, 2013 Broad Institute.
 
     Author: Heng Li <lh3@sanger.ac.uk>
@@ -81,7 +81,7 @@ KHASH_INIT2(s2i,, kh_cstr_t, int64_t, 1, kh_str_hash_func, kh_str_hash_equal)
 HTSLIB_EXPORT
 int hts_verbose = HTS_LOG_WARNING;
 
-const char *hts_version()
+const char *hts_version(void)
 {
     return HTS_VERSION_TEXT;
 }
@@ -431,6 +431,27 @@ static int is_text_only(const unsigned char *u, const unsigned char *ulim)
     return 1;
 }
 
+static inline int
+alternate_zeros(const unsigned char *u, const unsigned char *ulim)
+{
+    for (; u < ulim; u += 2)
+        if (*u != '\0') return 0;
+    return 1;
+}
+
+static int is_utf16_text(const unsigned char *u, const unsigned char *ulim)
+{
+    if (ulim - u >= 6 &&
+        ((u[0] == 0xfe && u[1] == 0xff && alternate_zeros(u+2, ulim)) ||
+         (u[0] == 0xff && u[1] == 0xfe && alternate_zeros(u+3, ulim))))
+        return 2;
+    else if (ulim - u >= 8 &&
+             (alternate_zeros(u, ulim) || alternate_zeros(u+1, ulim)))
+        return 1;
+    else
+        return 0;
+}
+
 static int is_fastaq(const unsigned char *u, const unsigned char *ulim)
 {
     const unsigned char *eol = memchr(u, '\n', ulim - u);
@@ -794,6 +815,7 @@ char *hts_format_description(const htsFormat *format)
     case zstd_compression:   kputs(" Zstandard-compressed", &str); break;
     case custom: kputs(" compressed", &str); break;
     case gzip:   kputs(" gzip-compressed", &str); break;
+
     case bgzf:
         switch (format->format) {
         case bam:
@@ -808,6 +830,22 @@ char *hts_format_description(const htsFormat *format)
             break;
         }
         break;
+
+    case no_compression:
+        switch (format->format) {
+        case bam:
+        case bcf:
+        case cram:
+        case csi:
+        case tbi:
+            // These are normally compressed, so emphasise that this one isn't
+            kputs(" uncompressed", &str);
+            break;
+        default:
+            break;
+        }
+        break;
+
     default: break;
     }
 
@@ -921,10 +959,18 @@ htsFile *hts_open_format(const char *fn, const char *mode, const htsFormat *fmt)
          fmt->format == fastq_format))
         fp->format.format = fmt->format;
 
-    if (fmt && fmt->specific)
-        if (hts_opt_apply(fp, fmt->specific) != 0)
+    if (fmt && fmt->specific) {
+        if (hts_opt_apply(fp, fmt->specific) != 0) {
+            if (((hts_opt*)fmt->specific)->opt == CRAM_OPT_REFERENCE &&
+                (errno == ENOENT || errno == EIO || errno == EBADF ||
+                  errno == EACCES || errno == EISDIR)) {
+                /* error during reference file operation
+                 for these specific errors, set the error as EINVAL */
+                errno = EINVAL;
+            }
             goto error;
-
+        }
+    }
     if ( rmme ) free(rmme);
     return fp;
 
@@ -1276,7 +1322,7 @@ int hts_parse_opt_list(htsFormat *fmt, const char *str) {
  *        -1 on failure.
  */
 int hts_parse_format(htsFormat *format, const char *str) {
-    char fmt[8];
+    char fmt[9];
     const char *cp = scan_keyword(str, ',', fmt, sizeof fmt);
 
     format->version.minor = 0; // unknown
@@ -1404,6 +1450,7 @@ static int hts_crypt4gh_redirect(const char *fn, const char *mode,
 htsFile *hts_hopen(hFILE *hfile, const char *fn, const char *mode)
 {
     hFILE *hfile_orig = hfile;
+    hFILE *hfile_cleanup = hfile;
     htsFile *fp = (htsFile*)calloc(1, sizeof(htsFile));
     char simple_mode[101], *cp, *opts;
     simple_mode[100] = '\0';
@@ -1431,6 +1478,7 @@ htsFile *hts_hopen(hFILE *hfile, const char *fn, const char *mode)
         // Deal with formats that re-direct an underlying file via a plug-in.
         // Loops as we may have crypt4gh served via htsget, or
         // crypt4gh-in-crypt4gh.
+
         while (fp->format.format == htsget ||
                fp->format.format == hts_crypt4gh_format) {
             // Ensure we don't get stuck in an endless redirect loop
@@ -1443,11 +1491,30 @@ htsFile *hts_hopen(hFILE *hfile, const char *fn, const char *mode)
                 hFILE *hfile2 = hopen_htsget_redirect(hfile, simple_mode);
                 if (hfile2 == NULL) goto error;
 
+                if (hfile != hfile_cleanup) {
+                    // Close the result of an earlier redirection
+                    hclose_abruptly(hfile);
+                }
+
                 hfile = hfile2;
             }
             else if (fp->format.format == hts_crypt4gh_format) {
+                int should_preserve = (hfile == hfile_orig);
+                int update_cleanup = (hfile == hfile_cleanup);
                 if (hts_crypt4gh_redirect(fn, simple_mode, &hfile, fp) < 0)
                     goto error;
+                if (should_preserve) {
+                    // The original hFILE is now contained in a crypt4gh
+                    // wrapper.  Should we need to close the wrapper due
+                    // to a later error, we need to prevent the wrapped
+                    // handle from being closed as the caller will see
+                    // this function return NULL and try to clean up itself.
+                    hfile_orig->preserve = 1;
+                }
+                if (update_cleanup) {
+                    // Update handle to close at the end if redirected by htsget
+                    hfile_cleanup = hfile;
+                }
             }
 
             // Re-detect format against the result of the redirection
@@ -1529,9 +1596,13 @@ htsFile *hts_hopen(hFILE *hfile, const char *fn, const char *mode)
     if (opts)
         hts_process_opts(fp, opts);
 
-    // If redirecting, close the original hFILE now (pedantically we would
-    // instead close it in hts_close(), but this a simplifying optimisation)
-    if (hfile != hfile_orig) hclose_abruptly(hfile_orig);
+    // Allow original file to close if it was preserved earlier by crypt4gh
+    hfile_orig->preserve = 0;
+
+    // If redirecting via htsget, close the original hFILE now (pedantically
+    // we would instead close it in hts_close(), but this a simplifying
+    // optimisation)
+    if (hfile != hfile_cleanup) hclose_abruptly(hfile_cleanup);
 
     return fp;
 
@@ -1540,6 +1611,7 @@ htsFile *hts_hopen(hFILE *hfile, const char *fn, const char *mode)
 
     // If redirecting, close the failed redirection hFILE that we have opened
     if (hfile != hfile_orig) hclose_abruptly(hfile);
+    hfile_orig->preserve = 0; // Allow caller to close the original hfile
 
     if (fp) {
         free(fp->fn);
@@ -1549,9 +1621,15 @@ htsFile *hts_hopen(hFILE *hfile, const char *fn, const char *mode)
     return NULL;
 }
 
+static int hts_idx_close_otf_fp(hts_idx_t *idx);
+
 int hts_close(htsFile *fp)
 {
     int ret = 0, save;
+    if (!fp) {
+        errno = EINVAL;
+        return -1;
+    }
 
     switch (fp->format.format) {
     case binary_format:
@@ -1598,6 +1676,14 @@ int hts_close(htsFile *fp)
         break;
     }
 
+    if (fp->idx) {
+        // Close deferred index file handle, if present.
+        // Unfortunately this means errors on the index will get mixed with
+        // those on the main file, but as we only have the EOF block left to
+        // write it hopefully won't happen that often.
+        ret |= hts_idx_close_otf_fp(fp->idx);
+    }
+
     save = errno;
     sam_hdr_destroy(fp->bam_header);
     hts_idx_destroy(fp->idx);
@@ -1678,7 +1764,7 @@ static hFILE *hts_hfile(htsFile *fp) {
     case bcf:          // fall through
     case bam:          return bgzf_hfile(fp->fp.bgzf);
     case cram:         return cram_hfile(fp->fp.cram);
-    case text_format:  return fp->fp.hfile;
+    case text_format:  // fall through
     case vcf:          // fall through
     case fastq_format: // fall through
     case fasta_format: // fall through
@@ -1896,6 +1982,12 @@ hFILE *hts_open_tmpfile(const char *fname, const char *mode, kstring_t *tmpname)
     return fp;
 }
 
+int hts_is_utf16_text(const kstring_t *str)
+{
+    const unsigned char *u = (const unsigned char *) (str->s);
+    return (str->l > 0 && str->s)? is_utf16_text(u, u + str->l) : 0;
+}
+
 // For VCF/BCF backward sweeper. Not exposing these functions because their
 // future is uncertain. Things will probably have to change with hFILE...
 BGZF *hts_get_bgzfp(htsFile *fp)
@@ -1965,6 +2057,8 @@ char **hts_readlist(const char *string, int is_file, int *_n)
         while ((ret = bgzf_getline(fp, '\n', &str)) >= 0)
         {
             if (str.l == 0) continue;
+            if (n == 0 && hts_is_utf16_text(&str))
+                hts_log_warning("'%s' appears to be encoded as UTF-16", string);
             if (hts_resize(char*, n + 1, &m, &s, 0) < 0)
                 goto err;
             s[n] = strdup(str.s);
@@ -2024,6 +2118,8 @@ char **hts_readlines(const char *fn, int *_n)
         str.s = 0; str.l = str.m = 0;
         while ((ret = bgzf_getline(fp, '\n', &str)) >= 0) {
             if (str.l == 0) continue;
+            if (n == 0 && hts_is_utf16_text(&str))
+                hts_log_warning("'%s' appears to be encoded as UTF-16", fn);
             if (hts_resize(char *, n + 1, &m, &s, 0) < 0)
                 goto err;
             s[n] = strdup(str.s);
@@ -2148,6 +2244,7 @@ struct hts_idx_t {
         uint64_t off_beg, off_end;
         uint64_t n_mapped, n_unmapped;
     } z; // keep internal states
+    BGZF *otf_fp;  // Index on-the-fly output file
 };
 
 static char * idx_format_name(int fmt) {
@@ -2274,6 +2371,7 @@ hts_idx_t *hts_idx_init(int n, int fmt, uint64_t offset0, int min_shift, int n_l
     }
     idx->tbi_n = -1;
     idx->last_tbi_tid = -1;
+    idx->otf_fp = NULL;
     return idx;
 }
 
@@ -2379,9 +2477,14 @@ int hts_idx_finish(hts_idx_t *idx, uint64_t final_offset)
     return ret;
 }
 
+static inline hts_pos_t hts_idx_maxpos(const hts_idx_t *idx)
+{
+    return hts_bin_maxpos(idx->min_shift, idx->n_lvls);
+}
+
 int hts_idx_check_range(hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end)
 {
-    int64_t maxpos = (int64_t) 1 << (idx->min_shift + idx->n_lvls * 3);
+    hts_pos_t maxpos = hts_idx_maxpos(idx);
     if (tid < 0 || (beg <= maxpos && end <= maxpos))
         return 0;
 
@@ -2589,6 +2692,17 @@ static inline void swap_bins(bins_t *p)
     }
 }
 
+static int need_idx_ugly_delay_hack(const hts_idx_t *idx)
+{
+    // Ugly hack for on-the-fly BAI indexes.  As these are uncompressed,
+    // we need to delay writing a few bytes of data until file close
+    // so that we have something to force a modification time update.
+    //
+    // (For compressed indexes like CSI, the BGZF EOF block serves the same
+    // purpose).
+    return idx->otf_fp && !idx->otf_fp->is_compressed;
+}
+
 static int idx_save_core(const hts_idx_t *idx, BGZF *fp, int fmt)
 {
     int32_t i, j;
@@ -2641,7 +2755,12 @@ static int idx_save_core(const hts_idx_t *idx, BGZF *fp, int fmt)
         }
     }
 
-    check(idx_write_uint64(fp, idx->n_no_coor));
+    if (!need_idx_ugly_delay_hack(idx)) {
+        // Write this for compressed (CSI) indexes, but for BAI we
+        // need to save a bit for later.  See hts_idx_close_otf_fp()
+        check(idx_write_uint64(fp, idx->n_no_coor));
+    }
+
 #ifdef DEBUG_INDEX
     idx_dump(idx);
 #endif
@@ -2672,16 +2791,9 @@ int hts_idx_save(const hts_idx_t *idx, const char *fn, int fmt)
     return ret;
 }
 
-int hts_idx_save_as(const hts_idx_t *idx, const char *fn, const char *fnidx, int fmt)
+static int hts_idx_write_out(const hts_idx_t *idx, BGZF *fp, int fmt)
 {
-    BGZF *fp;
-
-    #define check(ret) if ((ret) < 0) goto fail
-
-    if (fnidx == NULL) return hts_idx_save(idx, fn, fmt);
-
-    fp = bgzf_open(fnidx, (fmt == HTS_FMT_BAI)? "wu" : "w");
-    if (fp == NULL) return -1;
+    #define check(ret) if ((ret) < 0) return -1
 
     if (fmt == HTS_FMT_CSI) {
         check(bgzf_write(fp, "CSI\1", 4));
@@ -2697,12 +2809,64 @@ int hts_idx_save_as(const hts_idx_t *idx, const char *fn, const char *fnidx, int
 
     check(idx_save_core(idx, fp, fmt));
 
-    return bgzf_close(fp);
     #undef check
+    return 0;
+}
 
-fail:
-    bgzf_close(fp);
-    return -1;
+int hts_idx_save_as(const hts_idx_t *idx, const char *fn, const char *fnidx, int fmt)
+{
+    BGZF *fp;
+
+    if (fnidx == NULL)
+        return hts_idx_save(idx, fn, fmt);
+
+    fp = bgzf_open(fnidx, (fmt == HTS_FMT_BAI)? "wu" : "w");
+    if (fp == NULL) return -1;
+
+    if (hts_idx_write_out(idx, fp, fmt) < 0) {
+        int save_errno = errno;
+        bgzf_close(fp);
+        errno = save_errno;
+        return -1;
+    }
+
+    return bgzf_close(fp);
+}
+
+// idx_save for on-the-fly indexes.  Mostly duplicated from above, except
+// idx is not const because we want to store the file handle in it, and
+// the index file handle is not closed.  This allows the index file to be
+// closed after the EOF block on the indexed file has been written out,
+// so the modification times on the two files will be in the correct order.
+int hts_idx_save_but_not_close(hts_idx_t *idx, const char *fnidx, int fmt)
+{
+    idx->otf_fp = bgzf_open(fnidx, (fmt == HTS_FMT_BAI)? "wu" : "w");
+    if (idx->otf_fp == NULL) return -1;
+
+    if (hts_idx_write_out(idx, idx->otf_fp, fmt) < 0) {
+        int save_errno = errno;
+        bgzf_close(idx->otf_fp);
+        idx->otf_fp = NULL;
+        errno = save_errno;
+        return -1;
+    }
+
+    return bgzf_flush(idx->otf_fp);
+}
+
+static int hts_idx_close_otf_fp(hts_idx_t *idx)
+{
+    if (idx && idx->otf_fp) {
+        int ret = 0;
+        if (need_idx_ugly_delay_hack(idx)) {
+            // BAI index - write out the bytes we deferred earlier
+            ret = idx_write_uint64(idx->otf_fp, idx->n_no_coor) < 0;
+        }
+        ret |= bgzf_close(idx->otf_fp) < 0;
+        idx->otf_fp = NULL;
+        return ret == 0 ? 0 : -1;
+    }
+    return 0;
 }
 
 static int idx_read_core(hts_idx_t *idx, BGZF *fp, int fmt)
@@ -3094,7 +3258,7 @@ static inline int reg2intervals(hts_itr_t *iter, const hts_idx_t *idx, int tid,
     size_t reg_bin_count = 0, hash_bin_count;
     int res;
 
-    if (!iter || !idx || (bidx = idx->bidx[tid]) == NULL || beg >= end)
+    if (!iter || !idx || (bidx = idx->bidx[tid]) == NULL || beg > end)
         return -1;
 
     hash_bin_count = kh_n_buckets(bidx);
@@ -3213,6 +3377,7 @@ hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t
     khint_t k;
     bidx_t *bidx;
     uint64_t min_off, max_off;
+    hts_pos_t idx_maxpos;
     hts_itr_t *iter;
     uint32_t unmapped = 0, rel_off;
 
@@ -3257,6 +3422,9 @@ hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t
 
             if ( !kh_size(bidx) ) { iter->finished = 1; return iter; }
 
+            idx_maxpos = hts_idx_maxpos(idx);
+            if (beg >= idx_maxpos) { iter->finished = 1; return iter; }
+
             rel_off = beg>>idx->min_shift;
             // compute min_off
             bin = hts_bin_first(idx->n_lvls) + rel_off;
@@ -3299,7 +3467,7 @@ hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t
             // compute max_off: a virtual offset from a bin to the right of end
             // First check if end lies within the range of the index (it won't
             // if it's HTS_POS_MAX)
-            if (end < 1LL << (idx->min_shift + 3 * idx->n_lvls)) {
+            if (end <= idx_maxpos) {
                 bin = hts_bin_first(idx->n_lvls) + ((end-1) >> idx->min_shift) + 1;
                 if (bin >= idx->n_bins) bin = 0;
                 while (1) {
@@ -3385,7 +3553,7 @@ int hts_itr_multi_bam(const hts_idx_t *idx, hts_itr_t *iter)
     bidx_t *bidx;
     uint64_t min_off, max_off, t_off = (uint64_t)-1;
     int tid;
-    hts_pos_t beg, end;
+    hts_pos_t beg, end, idx_maxpos;
     hts_reglist_t *curr_reg;
     uint32_t unmapped = 0, rel_off;
 
@@ -3427,6 +3595,8 @@ int hts_itr_multi_bam(const hts_idx_t *idx, hts_itr_t *iter)
             else
                 unmapped = 1;
 
+            idx_maxpos = hts_idx_maxpos(idx);
+
             for(j=0; j<curr_reg->count; j++) {
                 hts_pair32_t *curr_intv = &curr_reg->intervals[j];
                 if (curr_intv->end < curr_intv->beg)
@@ -3434,6 +3604,8 @@ int hts_itr_multi_bam(const hts_idx_t *idx, hts_itr_t *iter)
 
                 beg = curr_intv->beg;
                 end = curr_intv->end;
+                if (beg >= idx_maxpos)
+                    continue;
                 rel_off = beg>>idx->min_shift;
 
                 /* Compute 'min_off' by searching the lowest level bin containing 'beg'.
@@ -3478,7 +3650,7 @@ int hts_itr_multi_bam(const hts_idx_t *idx, hts_itr_t *iter)
                 // compute max_off: a virtual offset from a bin to the right of end
                 // First check if end lies within the range of the index (it
                 // won't if it's HTS_POS_MAX)
-                if (end < 1LL << (idx->min_shift + 3 * idx->n_lvls)) {
+                if (end <= idx_maxpos) {
                     bin = hts_bin_first(idx->n_lvls) + ((end-1) >> idx->min_shift) + 1;
                     if (bin >= idx->n_bins) bin = 0;
                     while (1) {
@@ -3654,7 +3826,7 @@ void hts_itr_destroy(hts_itr_t *iter)
     }
 }
 
-static inline long long push_digit(long long i, char c)
+static inline unsigned long long push_digit(unsigned long long i, char c)
 {
     // ensure subtraction occurs first, avoiding overflow for >= MAX-48 or so
     int digit = c - '0';
@@ -3663,7 +3835,7 @@ static inline long long push_digit(long long i, char c)
 
 long long hts_parse_decimal(const char *str, char **strend, int flags)
 {
-    long long n = 0;
+    unsigned long long n = 0;
     int digits = 0, decimals = 0, e = 0, lost = 0;
     char sign = '+', esign = '+';
     const char *s, *str_orig = str;
@@ -4277,11 +4449,12 @@ int hts_itr_multi_next(htsFile *fd, hts_itr_t *iter, void *r)
                                     break;
 
                                 uint64_t max = iter->off[j].max;
-                                if ((max>>32) != tid)
+                                if ((max>>32) != tid) {
                                     tid = HTS_IDX_START; // => no range limit
-
-                                if (end < rl->intervals[max & 0xffffffff].end)
-                                    end = rl->intervals[max & 0xffffffff].end;
+                                } else {
+                                    if (end < rl->intervals[max & 0xffffffff].end)
+                                        end = rl->intervals[max & 0xffffffff].end;
+                                }
                                 if (v < iter->off[j].v)
                                     v = iter->off[j].v;
                                 j++;
@@ -4511,9 +4684,19 @@ static int idx_test_and_fetch(const char *fn, const char **local_fn, int *local_
 }
 
 /*
- * Check the existence of a local index file using part of the alignment file name.
- * The order is alignment.bam.csi, alignment.csi, alignment.bam.bai, alignment.bai
+ * Check the existence of a local index file using part of the alignment
+ * file name.
+ *
+ * For a filename fn of fn.fmt (eg fn.bam or fn.cram) the order of checks is
+ * fn.fmt.csi,  fn.csi,
+ * fn.fmt.bai,  fn.bai  - if fmt is HTS_FMT_BAI
+ * fn.fmt.tbi,  fn.tbi  - if fmt is HTS_FMT_TBI
+ * fn.fmt.crai, fn.crai - if fmt is HTS_FMT_CRAI
+ * fn.fmt.fai           - if fmt is HTS_FMT_FAI
+ *   also .gzi if fmt is ".gz"
+ *
  * @param fn    - pointer to the file name
+ * @param fmt   - one of the HTS_FMT index formats
  * @param fnidx - pointer to the index file name placeholder
  * @return        1 for success, 0 for failure
  */
@@ -4521,11 +4704,12 @@ int hts_idx_check_local(const char *fn, int fmt, char **fnidx) {
     int i, l_fn, l_ext;
     const char *fn_tmp = NULL;
     char *fnidx_tmp;
-    char *csi_ext = ".csi";
-    char *bai_ext = ".bai";
-    char *tbi_ext = ".tbi";
-    char *crai_ext = ".crai";
-    char *fai_ext = ".fai";
+    const char *csi_ext = ".csi";
+    const char *bai_ext = ".bai";
+    const char *tbi_ext = ".tbi";
+    const char *crai_ext = ".crai";
+    const char *fai_ext = ".fai";
+    const char *gzi_ext = ".gzi";
 
     if (!fn)
         return 0;
@@ -4622,10 +4806,21 @@ int hts_idx_check_local(const char *fn, int fmt, char **fnidx) {
                 }
         }
     } else if (fmt == HTS_FMT_FAI) { // Or .fai
-        strcpy(fnidx_tmp, fn_tmp); strcpy(fnidx_tmp + l_fn, fai_ext);
+        // Check .gzi if we have a .gz file
+        strcpy(fnidx_tmp, fn_tmp);
+        int gzi_ok = 1;
+        if ((l_fn > 3 && strcmp(fn_tmp+l_fn-3, ".gz") == 0) ||
+            (l_fn > 5 && strcmp(fn_tmp+l_fn-5, ".bgzf") == 0)) {
+            strcpy(fnidx_tmp + l_fn, gzi_ext);
+            gzi_ok = stat(fnidx_tmp, &sbuf)==0;
+        }
+
+        // Now check for .fai.  Occurs second as we're returning this
+        // in *fnidx irrespective of whether we did gzi check.
+        strcpy(fnidx_tmp + l_fn, fai_ext);
         *fnidx = fnidx_tmp;
-        if(stat(fnidx_tmp, &sbuf) == 0)
-            return 1;
+        if (stat(fnidx_tmp, &sbuf) == 0)
+            return gzi_ok;
         else
             return 0;
     }
@@ -4900,7 +5095,7 @@ int hts_resize_array_(size_t item_size, size_t num, size_t size_sz,
     return 0;
 }
 
-void hts_lib_shutdown()
+void hts_lib_shutdown(void)
 {
     hfile_shutdown(1);
 }
@@ -4914,7 +5109,7 @@ void hts_set_log_level(enum htsLogLevel level)
     hts_verbose = level;
 }
 
-enum htsLogLevel hts_get_log_level()
+enum htsLogLevel hts_get_log_level(void)
 {
     return hts_verbose;
 }
diff --git a/htslib/hts_expr.c b/htslib/hts_expr.c
index 5e5a132ea..dfd15b151 100644
--- a/htslib/hts_expr.c
+++ b/htslib/hts_expr.c
@@ -1,6 +1,6 @@
 /*  hts_expr.c -- filter expression parsing and processing.
 
-    Copyright (C) 2020-2022 Genome Research Ltd.
+    Copyright (C) 2020-2022, 2024 Genome Research Ltd.
 
     Author: James Bonfield <jkb@sanger.ac.uk>
 
@@ -527,8 +527,10 @@ static int bitand_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn,
             } else if (res->is_str || val.is_str) {
                 hts_expr_val_free(&val);
                 return -1;
+            } else {
+                res->is_true =
+                    (res->d = ((int64_t)res->d & (int64_t)val.d)) != 0;
             }
-            res->is_true = (res->d = ((int64_t)res->d & (int64_t)val.d)) != 0;
         } else {
             break;
         }
@@ -560,8 +562,10 @@ static int bitxor_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn,
             } else if (res->is_str || val.is_str) {
                 hts_expr_val_free(&val);
                 return -1;
+            } else {
+                res->is_true =
+                    (res->d = ((int64_t)res->d ^ (int64_t)val.d)) != 0;
             }
-            res->is_true = (res->d = ((int64_t)res->d ^ (int64_t)val.d)) != 0;
         } else {
             break;
         }
@@ -593,8 +597,10 @@ static int bitor_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn,
             } else if (res->is_str || val.is_str) {
                 hts_expr_val_free(&val);
                 return -1;
+            } else {
+                res->is_true =
+                    (res->d = ((int64_t)res->d | (int64_t)val.d)) != 0;
             }
-            res->is_true = (res->d = ((int64_t)res->d | (int64_t)val.d)) != 0;
         } else {
             break;
         }
diff --git a/htslib/hts_internal.h b/htslib/hts_internal.h
index 61956da21..52f29e6c1 100644
--- a/htslib/hts_internal.h
+++ b/htslib/hts_internal.h
@@ -67,6 +67,11 @@ void hts_idx_amend_last(hts_idx_t *idx, uint64_t offset);
 
 int hts_idx_fmt(hts_idx_t *idx);
 
+// Internal interface to save on-the-fly indexes.  The index file handle
+// is kept open so hts_close() can close if after writing out the EOF
+// block for its own file.
+int hts_idx_save_but_not_close(hts_idx_t *idx, const char *fnidx, int fmt);
+
 // Construct a unique filename based on fname and open it.
 struct hFILE *hts_open_tmpfile(const char *fname, const char *mode, kstring_t *tmpname);
 
@@ -82,6 +87,9 @@ typedef struct hts_cram_idx_t {
     struct cram_fd *cram;
 } hts_cram_idx_t;
 
+// Determine whether the string's contents appear to be UTF-16-encoded text.
+// Returns 1 if they are, 2 if there is also a BOM, or 0 otherwise.
+int hts_is_utf16_text(const kstring_t *str);
 
 // Entry point to hFILE_multipart backend.
 struct hFILE *hopen_htsget_redirect(struct hFILE *hfile, const char *mode);
@@ -115,18 +123,6 @@ const char *hts_plugin_path(void);
  */
 int bgzf_idx_push(BGZF *fp, hts_idx_t *hidx, int tid, hts_pos_t beg, hts_pos_t end, uint64_t offset, int is_mapped);
 
-/*
- * bgzf analogue to hts_idx_amend_last.
- *
- * This is needed when multi-threading and writing indices on the fly.
- * At the point of writing a record we know the virtual offset for start
- * and end, but that end virtual offset may be the end of the current
- * block.  In standard indexing our end virtual offset becomes the start
- * of the next block.  Thus to ensure bit for bit compatibility we
- * detect this boundary case and fix it up here.
- */
-void bgzf_idx_amend_last(BGZF *fp, hts_idx_t *hidx, uint64_t offset);
-
 static inline int find_file_extension(const char *fn, char ext_out[static HTS_MAX_EXT_LEN])
 {
     const char *delim = fn ? strstr(fn, HTS_IDX_DELIM) : NULL, *ext;
diff --git a/htslib/htscodecs/htscodecs/arith_dynamic.c b/htslib/htscodecs/htscodecs/arith_dynamic.c
index 93b4eeb35..37aca77b2 100644
--- a/htslib/htscodecs/htscodecs/arith_dynamic.c
+++ b/htslib/htscodecs/htscodecs/arith_dynamic.c
@@ -98,10 +98,11 @@ static
 unsigned char *arith_compress_O0(unsigned char *in, unsigned int in_size,
                                  unsigned char *out, unsigned int *out_size) {
     int i, bound = arith_compress_bound(in_size,0)-5; // -5 for order/size
+    unsigned char *out_free = NULL;
 
     if (!out) {
         *out_size = bound;
-        out = malloc(*out_size);
+        out_free = out = malloc(*out_size);
     }
     if (!out || bound > *out_size)
         return NULL;
@@ -118,12 +119,16 @@ unsigned char *arith_compress_O0(unsigned char *in, unsigned int in_size,
 
     RangeCoder rc;
     RC_SetOutput(&rc, (char *)out+1);
+    RC_SetOutputEnd(&rc, (char *)out + *out_size);
     RC_StartEncode(&rc);
 
     for (i = 0; i < in_size; i++)
         SIMPLE_MODEL(256, _encodeSymbol)(&byte_model, &rc, in[i]);
 
-    RC_FinishEncode(&rc);
+    if (RC_FinishEncode(&rc) < 0) {
+        free(out_free);
+        return NULL;
+    }
 
     // Finalise block size and return it
     *out_size = RC_OutSize(&rc)+1;
@@ -141,8 +146,9 @@ unsigned char *arith_uncompress_O0(unsigned char *in, unsigned int in_size,
     SIMPLE_MODEL(256,_) byte_model;
     SIMPLE_MODEL(256,_init)(&byte_model, m);
 
+    unsigned char *out_free = NULL;
     if (!out)
-        out = malloc(out_sz);
+        out_free = out = malloc(out_sz);
     if (!out)
         return NULL;
 
@@ -152,7 +158,10 @@ unsigned char *arith_uncompress_O0(unsigned char *in, unsigned int in_size,
     for (i = 0; i < out_sz; i++)
         out[i] = SIMPLE_MODEL(256, _decodeSymbol)(&byte_model, &rc);
 
-    RC_FinishDecode(&rc);
+    if (RC_FinishDecode(&rc) < 0) {
+        free(out_free);
+        return NULL;
+    }
     
     return out;
 }
@@ -192,6 +201,7 @@ unsigned char *arith_compress_O1(unsigned char *in, unsigned int in_size,
 
     RangeCoder rc;
     RC_SetOutput(&rc, (char *)out+1);
+    RC_SetOutputEnd(&rc, (char *)out + *out_size);
     RC_StartEncode(&rc);
 
     uint8_t last = 0;
@@ -200,7 +210,11 @@ unsigned char *arith_compress_O1(unsigned char *in, unsigned int in_size,
         last = in[i];
     }
 
-    RC_FinishEncode(&rc);
+    if (RC_FinishEncode(&rc) < 0) {
+        free(out_free);
+        htscodecs_tls_free(byte_model);
+        return NULL;
+    }
 
     // Finalise block size and return it
     *out_size = RC_OutSize(&rc)+1;
@@ -241,7 +255,11 @@ unsigned char *arith_uncompress_O1(unsigned char *in, unsigned int in_size,
         last = out[i];
     }
 
-    RC_FinishDecode(&rc);
+    if (RC_FinishDecode(&rc) < 0) {
+        htscodecs_tls_free(byte_model);
+        free(out_free);
+        return NULL;
+    }
     
     htscodecs_tls_free(byte_model);
     return out;
@@ -259,10 +277,11 @@ unsigned char *arith_compress_O2(unsigned char *in, unsigned int in_size,
 
     int i, j;
     int bound = arith_compress_bound(in_size,0)-5; // -5 for order/size
+    unsigned char *out_free = NULL;
 
     if (!out) {
         *out_size = bound;
-        out = malloc(*out_size);
+        out_free = out = malloc(*out_size);
     }
     if (!out || bound > *out_size)
         return NULL;
@@ -285,6 +304,7 @@ unsigned char *arith_compress_O2(unsigned char *in, unsigned int in_size,
 
     RangeCoder rc;
     RC_SetOutput(&rc, (char *)out+1);
+    RC_SetOutputEnd(&rc, (char *)out + *out_size);
     RC_StartEncode(&rc);
 
     unsigned char last1 = 0, last2 = 0;
@@ -295,7 +315,10 @@ unsigned char *arith_compress_O2(unsigned char *in, unsigned int in_size,
     }
 
     free(byte_model);
-    RC_FinishEncode(&rc);
+    if (RC_FinishEncode(&rc) < 0) {
+        free(out_free);
+        return NULL;
+    }
 
     // Finalise block size and return it
     *out_size = RC_OutSize(&rc)+1;
@@ -310,9 +333,10 @@ unsigned char *arith_compress_O2(unsigned char *in, unsigned int in_size,
     int i, j;
     int bound = arith_compress_bound(in_size,0)-5; // -5 for order/size
 
+    unsigned char *out_free = NULL;
     if (!out) {
         *out_size = bound;
-        out = malloc(*out_size);
+        out_free = out = malloc(*out_size);
     }
     if (!out || bound > *out_size)
         return NULL;
@@ -338,6 +362,7 @@ unsigned char *arith_compress_O2(unsigned char *in, unsigned int in_size,
 
     RangeCoder rc;
     RC_SetOutput(&rc, (char *)out+1);
+    RC_SetOutputEnd(&rc, (char *)out + *out_size);
     RC_StartEncode(&rc);
 
     unsigned char last1 = 0, last2 = 0;
@@ -355,7 +380,10 @@ unsigned char *arith_compress_O2(unsigned char *in, unsigned int in_size,
     }
 
     free(byte_model);
-    RC_FinishEncode(&rc);
+    if (RC_FinishEncode(&rc) < 0) {
+        free(out_free);
+        return NULL;
+    }
 
     // Finalise block size and return it
     *out_size = RC_OutSize(&rc)+1;
@@ -375,8 +403,9 @@ unsigned char *arith_uncompress_O2(unsigned char *in, unsigned int in_size,
         for (j = 0; j < 256; j++)
             SIMPLE_MODEL(256,_init)(&byte_model[i*256+j], m);
     
+    unsigned char *out_free = NULL;
     if (!out)
-        out = malloc(out_sz);
+        out_free = out = malloc(out_sz);
     if (!out)
         return NULL;
 
@@ -391,7 +420,10 @@ unsigned char *arith_uncompress_O2(unsigned char *in, unsigned int in_size,
     }
 
     free(byte_model);
-    RC_FinishDecode(&rc);
+    if (RC_FinishDecode(&rc) < 0) {
+        free(out_free);
+        return NULL;
+    }
     
     return out;
 }
@@ -440,6 +472,7 @@ unsigned char *arith_compress_O0_RLE(unsigned char *in, unsigned int in_size,
 
     RangeCoder rc;
     RC_SetOutput(&rc, (char *)out+1);
+    RC_SetOutputEnd(&rc, (char *)out + *out_size);
     RC_StartEncode(&rc);
 
     unsigned char last = 0;
@@ -466,7 +499,11 @@ unsigned char *arith_compress_O0_RLE(unsigned char *in, unsigned int in_size,
         } while (run);
     }
 
-    RC_FinishEncode(&rc);
+    if (RC_FinishEncode(&rc) < 0) {
+        htscodecs_tls_free(run_model);
+        free(out_free);
+        return NULL;
+    }
 
     // Finalise block size and return it
     *out_size = RC_OutSize(&rc)+1;
@@ -524,7 +561,11 @@ unsigned char *arith_uncompress_O0_RLE(unsigned char *in, unsigned int in_size,
             out[++i] = last;
     }
 
-    RC_FinishDecode(&rc);
+    if (RC_FinishDecode(&rc) < 0) {
+        htscodecs_tls_free(run_model);
+        free(out_free);
+        return NULL;
+    }
 
     htscodecs_tls_free(run_model);
     return out;
@@ -571,6 +612,7 @@ unsigned char *arith_compress_O1_RLE(unsigned char *in, unsigned int in_size,
 
     RangeCoder rc;
     RC_SetOutput(&rc, (char *)out+1);
+    RC_SetOutputEnd(&rc, (char *)out + *out_size);
     RC_StartEncode(&rc);
 
     unsigned char last = 0;
@@ -597,7 +639,12 @@ unsigned char *arith_compress_O1_RLE(unsigned char *in, unsigned int in_size,
         } while (run);
     }
 
-    RC_FinishEncode(&rc);
+    if (RC_FinishEncode(&rc) < 0) {
+        htscodecs_tls_free(byte_model);
+        htscodecs_tls_free(run_model);
+        free(out_free);
+        return NULL;
+    }
 
     // Finalise block size and return it
     *out_size = RC_OutSize(&rc)+1;
@@ -663,7 +710,12 @@ unsigned char *arith_uncompress_O1_RLE(unsigned char *in, unsigned int in_size,
             out[++i] = last;
     }
 
-    RC_FinishDecode(&rc);
+    if (RC_FinishDecode(&rc) < 0) {
+        htscodecs_tls_free(byte_model);
+        htscodecs_tls_free(run_model);
+        free(out_free);
+        return NULL;
+    }
 
     htscodecs_tls_free(byte_model);
     htscodecs_tls_free(run_model);
diff --git a/htslib/htscodecs/htscodecs/c_range_coder.h b/htslib/htscodecs/htscodecs/c_range_coder.h
index 3ee397761..df299689c 100644
--- a/htslib/htscodecs/htscodecs/c_range_coder.h
+++ b/htslib/htscodecs/htscodecs/c_range_coder.h
@@ -31,13 +31,18 @@ typedef struct {
     uc *in_buf;
     uc *out_buf;
     uc *in_end;
+    uc *out_end;
+    int err;
 } RangeCoder;
 
 static inline void RC_SetInput(RangeCoder *rc, char *in, char *in_end) {
     rc->out_buf = rc->in_buf = (uc *)in;
     rc->in_end = (uc *)in_end;
 }
-static inline void RC_SetOutput(RangeCoder *rc, char *out) { rc->in_buf = rc->out_buf = (uc *)out; }
+
+// NB: call RC_SetOutput first, and then RC_SetOutputEnd
+static inline void RC_SetOutput(RangeCoder *rc, char *out) { rc->in_buf = rc->out_buf = (uc *)out; rc->out_end = NULL;}
+static inline void RC_SetOutputEnd(RangeCoder *rc, char *out_end) { rc->out_end = (uc *)out_end; }
 static inline char *RC_GetInput(RangeCoder *rc) { return (char *)rc->in_buf; }
 static inline char *RC_GetOutput(RangeCoder *rc) { return (char *)rc->out_buf; }
 static inline size_t RC_OutSize(RangeCoder *rc) { return rc->out_buf - rc->in_buf; }
@@ -51,6 +56,7 @@ static inline void RC_StartEncode(RangeCoder *rc)
     rc->Carry = 0;
     rc->Cache = 0;
     rc->code  = 0;
+    rc->err   = 0;
 }
 
 static inline void RC_StartDecode(RangeCoder *rc)
@@ -61,6 +67,7 @@ static inline void RC_StartDecode(RangeCoder *rc)
     rc->Carry = 0;
     rc->Cache = 0;
     rc->code  = 0;
+    rc->err   = 0;
     if (rc->in_buf+5 > rc->in_end) {
         rc->in_buf = rc->in_end; // prevent decode
         return;
@@ -68,6 +75,31 @@ static inline void RC_StartDecode(RangeCoder *rc)
     DO(5) rc->code = (rc->code<<8) | *rc->in_buf++;
 }
 
+static inline void RC_ShiftLowCheck(RangeCoder *rc) {
+    if (rc->low < Thres || rc->Carry) {
+        if (rc->out_end && rc->FFNum >= rc->out_end - rc->out_buf) {
+            rc->err = -1;
+            return;
+        }
+
+        *rc->out_buf++ = rc->Cache + rc->Carry;
+
+        // Flush any stored FFs
+        while (rc->FFNum) {
+            *rc->out_buf++ = rc->Carry-1; // (Carry-1)&255;
+            rc->FFNum--;
+        }
+
+        // Take copy of top byte ready for next flush
+        rc->Cache = rc->low >> 24;
+        rc->Carry = 0;
+    } else {
+        // Low if FFxx xxxx.  Bump FF count and shift in as before
+        rc->FFNum++;
+    }
+    rc->low = rc->low<<8;
+}
+
 static inline void RC_ShiftLow(RangeCoder *rc) {
     if (rc->low < Thres || rc->Carry) {
         *rc->out_buf++ = rc->Cache + rc->Carry;
@@ -88,12 +120,15 @@ static inline void RC_ShiftLow(RangeCoder *rc) {
     rc->low = rc->low<<8;
 }
 
-static inline void RC_FinishEncode(RangeCoder *rc) 
+static inline int RC_FinishEncode(RangeCoder *rc)
 { 
-    DO(5) RC_ShiftLow(rc);
+    DO(5) RC_ShiftLowCheck(rc);
+    return rc->err;
 }
 
-static inline void RC_FinishDecode(RangeCoder *rc) {}
+static inline int RC_FinishDecode(RangeCoder *rc) {
+    return rc->err;
+}
 
 static inline void RC_Encode (RangeCoder *rc, uint32_t cumFreq, uint32_t freq, uint32_t totFreq) 
 {
@@ -105,7 +140,7 @@ static inline void RC_Encode (RangeCoder *rc, uint32_t cumFreq, uint32_t freq, u
 
     while (rc->range < TOP) {
         rc->range <<= 8;
-        RC_ShiftLow(rc);
+        RC_ShiftLowCheck(rc);
     }
 }
 
@@ -119,8 +154,10 @@ static inline void RC_Decode (RangeCoder *rc, uint32_t cumFreq, uint32_t freq, u
     rc->code -= cumFreq * rc->range;
     rc->range *= freq;
     while (rc->range < TOP) {
-        if (rc->in_buf >= rc->in_end)
-            return; // FIXME: could signal error, instead of caller just generating nonsense
+        if (rc->in_buf >= rc->in_end) {
+            rc->err = -1;
+            return;
+        }
         rc->code = (rc->code<<8) + *rc->in_buf++;
         rc->range <<= 8;
     }
diff --git a/htslib/htscodecs/htscodecs/fqzcomp_qual.c b/htslib/htscodecs/htscodecs/fqzcomp_qual.c
index 9b3610b45..a5b668774 100644
--- a/htslib/htscodecs/htscodecs/fqzcomp_qual.c
+++ b/htslib/htscodecs/htscodecs/fqzcomp_qual.c
@@ -911,7 +911,7 @@ int fqz_pick_parameters(fqz_gparams *gp,
         // NB: stab is already all zero
     }
 
-    if (gp->max_sel) {
+    if (gp->max_sel && s->num_records) {
         int max = 0;
         for (i = 0; i < s->num_records; i++) {
             if (max < (s->flags[i] >> 16))
@@ -1018,11 +1018,6 @@ unsigned char *compress_block_fqz2f(int vers,
     int comp_idx = 0;
     RangeCoder rc;
 
-    unsigned char *comp = (unsigned char *)malloc(in_size*1.1+100000);
-    unsigned char *compe = comp + (size_t)(in_size*1.1+100000);
-    if (!comp)
-        return NULL;
-
     // Pick and store params
     if (!gp) {
         gp = &local_gp;
@@ -1031,6 +1026,36 @@ unsigned char *compress_block_fqz2f(int vers,
         free_params = 1;
     }
 
+    // Worst case scenario assuming random input data and no way to compress
+    // is NBytes*growth for some small growth factor (arith_dynamic uses 1.05),
+    // plus fixed overheads for the header / params.  Growth can be high
+    // here as we're modelling things and pathological cases may trigger a
+    // bad probability model.
+    //
+    // Per read is 4-byte len if not fixed length (but less if avg smaller)
+    //             up to 1 byte for selection state (log2(max_sel) bits)
+    //             1-bit for reverse flag
+    //             1-bit for dup-last flag (but then no quals)
+    // Per qual is 1-byte (assuming QMAX==256)
+    //
+    // Header size is total guess, as depends on params, but it's almost
+    // always tiny, so a few K extra should be sufficient.
+    //
+    // => Total of (s->num_records*4.25 + in_size)*growth + hdr
+    int sel_bits = 0, sel = gp->max_sel;
+    while (sel) {
+        sel_bits++;
+        sel >>= 1;
+    }
+    double len_sz = gp->p[0].fixed_len ? 0.25 : 4.25;
+    len_sz += sel_bits / 8.0;
+    size_t comp_sz = (s->num_records*len_sz + in_size)*1.1 + 10000;
+
+    unsigned char *comp = (unsigned char *)malloc(comp_sz);
+    unsigned char *compe = comp + (size_t)comp_sz;
+    if (!comp)
+        return NULL;
+
     //dump_params(gp);
     comp_idx = var_put_u32(comp, compe, in_size);
     comp_idx += fqz_store_parameters(gp, comp+comp_idx);
@@ -1054,6 +1079,7 @@ unsigned char *compress_block_fqz2f(int vers,
         return NULL;
 
     RC_SetOutput(&rc, (char *)comp+comp_idx);
+    RC_SetOutputEnd(&rc, (char *)comp+comp_sz);
     RC_StartEncode(&rc);
 
     // For CRAM3.1, reverse upfront if needed
@@ -1091,6 +1117,12 @@ unsigned char *compress_block_fqz2f(int vers,
 
     for (i = 0; i < in_size; i++) {
         if (state.p == 0) {
+            if (state.rec >= s->num_records || s->len[state.rec] <= 0) {
+                free(comp);
+                comp = NULL;
+                goto err;
+            }
+
             if (compress_new_read(s, &state, gp, pm, &model, &rc,
                                   in, &i, /*&rec,*/ &last))
                 continue;
@@ -1151,7 +1183,12 @@ unsigned char *compress_block_fqz2f(int vers,
 #endif
     }
 
-    RC_FinishEncode(&rc);
+    if (RC_FinishEncode(&rc) < 0) {
+        free(comp);
+        comp = NULL;
+        *out_size = 0;
+        goto err;
+    }
 
     // For CRAM3.1, undo our earlier reversal step
     rec = state.rec;
@@ -1186,6 +1223,7 @@ unsigned char *compress_block_fqz2f(int vers,
     *out_size = comp_idx + RC_OutSize(&rc);
     //fprintf(stderr, "%d -> %d\n", (int)in_size, (int)*out_size);
 
+ err:
     fqz_destroy_models(&model);
     if (free_params)
         fqz_free_parameters(gp);
@@ -1550,7 +1588,9 @@ unsigned char *uncompress_block_fqz2f(fqz_slice *s,
         }
     }
 
-    RC_FinishDecode(&rc);
+    if (RC_FinishDecode(&rc) < 0)
+        goto err;
+
     fqz_destroy_models(&model);
     free(rev_a);
     free(len_a);
diff --git a/htslib/htscodecs/htscodecs/htscodecs.c b/htslib/htscodecs/htscodecs/htscodecs.c
index aad2c9cea..35fc6b4dc 100644
--- a/htslib/htscodecs/htscodecs/htscodecs.c
+++ b/htslib/htscodecs/htscodecs/htscodecs.c
@@ -39,6 +39,6 @@
  * NB: This is obtained from the auto-generated version.h, so
  * we can include release number and git hash.
  */
-const char *htscodecs_version() {
+const char *htscodecs_version(void) {
     return HTSCODECS_VERSION_TEXT;
 }
diff --git a/htslib/htscodecs/htscodecs/htscodecs.h b/htslib/htscodecs/htscodecs/htscodecs.h
index 2465aa298..8d67e67a7 100644
--- a/htslib/htscodecs/htscodecs/htscodecs.h
+++ b/htslib/htscodecs/htscodecs/htscodecs.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023 Genome Research Ltd.
+ * Copyright (c) 2021-2024 Genome Research Ltd.
  * Author(s): James Bonfield
  *
  * Redistribution and use in source and binary forms, with or without
@@ -43,13 +43,13 @@
  * Note currently this needs manually editing as it isn't automatically
  * updated by autoconf.
  */
-#define HTSCODECS_VERSION 100501
+#define HTSCODECS_VERSION 100601
 
 /*
  * A const string form of the HTSCODECS_VERSION define.
  * NB: This is obtained from the auto-generated version.h, so
  * we can include release number and git hash.
  */
-const char *htscodecs_version();
+const char *htscodecs_version(void);
 
 #endif /* HTSCODECS_H */
diff --git a/htslib/htscodecs/htscodecs/pack.c b/htslib/htscodecs/htscodecs/pack.c
index 6b73bbc2b..eb8dac476 100644
--- a/htslib/htscodecs/htscodecs/pack.c
+++ b/htslib/htscodecs/htscodecs/pack.c
@@ -109,8 +109,8 @@ uint8_t *hts_pack(uint8_t *data, int64_t len,
         out[j] = 0;
         int s = len-i, x = 0;
         switch (s) {
-        case 3: out[j] |= p[data[i++]] << x; x+=2;
-        case 2: out[j] |= p[data[i++]] << x; x+=2;
+        case 3: out[j] |= p[data[i++]] << x; x+=2; // fall-through
+        case 2: out[j] |= p[data[i++]] << x; x+=2; // fall-through
         case 1: out[j] |= p[data[i++]] << x; x+=2;
             j++;
         }
@@ -125,12 +125,12 @@ uint8_t *hts_pack(uint8_t *data, int64_t len,
         out[j] = 0;
         int s = len-i, x = 0;
         switch (s) {
-        case 7: out[j] |= p[data[i++]] << x++;
-        case 6: out[j] |= p[data[i++]] << x++;
-        case 5: out[j] |= p[data[i++]] << x++;
-        case 4: out[j] |= p[data[i++]] << x++;
-        case 3: out[j] |= p[data[i++]] << x++;
-        case 2: out[j] |= p[data[i++]] << x++;
+        case 7: out[j] |= p[data[i++]] << x++; // fall-through
+        case 6: out[j] |= p[data[i++]] << x++; // fall-through
+        case 5: out[j] |= p[data[i++]] << x++; // fall-through
+        case 4: out[j] |= p[data[i++]] << x++; // fall-through
+        case 3: out[j] |= p[data[i++]] << x++; // fall-through
+        case 2: out[j] |= p[data[i++]] << x++; // fall-through
         case 1: out[j] |= p[data[i++]] << x++;
             j++;
         }
diff --git a/htslib/htscodecs/htscodecs/rANS_static.c b/htslib/htscodecs/htscodecs/rANS_static.c
index e629cb9a6..1399ee723 100644
--- a/htslib/htscodecs/htscodecs/rANS_static.c
+++ b/htslib/htscodecs/htscodecs/rANS_static.c
@@ -96,7 +96,7 @@ unsigned char *rans_compress_O0(unsigned char *in, unsigned int in_size,
         free(out_buf);
         return NULL;
     }
-    tr = ((uint64_t)TOTFREQ<<31)/in_size + (1<<30)/in_size;
+    tr = in_size ? ((uint64_t)TOTFREQ<<31)/in_size + (1<<30)/in_size : 0;
 
  normalise_harder:
     // Normalise so T[i] == TOTFREQ
@@ -167,8 +167,11 @@ unsigned char *rans_compress_O0(unsigned char *in, unsigned int in_size,
 
     switch (i=(in_size&3)) {
     case 3: RansEncPutSymbol(&rans2, &ptr, &syms[in[in_size-(i-2)]]);
+        // fall-through
     case 2: RansEncPutSymbol(&rans1, &ptr, &syms[in[in_size-(i-1)]]);
+        // fall-through
     case 1: RansEncPutSymbol(&rans0, &ptr, &syms[in[in_size-(i-0)]]);
+        // fall-through
     case 0:
         break;
     }
@@ -361,10 +364,13 @@ unsigned char *rans_uncompress_O0(unsigned char *in, unsigned int in_size,
     switch(out_sz&3) {
     case 3:
         out_buf[out_end + 2] = ssym[R[2] & mask];
+        // fall-through
     case 2:
         out_buf[out_end + 1] = ssym[R[1] & mask];
+        // fall-through
     case 1:
         out_buf[out_end] = ssym[R[0] & mask];
+        // fall-through
     default:
         break;
     }
diff --git a/htslib/htscodecs/htscodecs/rANS_static16_int.h b/htslib/htscodecs/htscodecs/rANS_static16_int.h
index 96dc84806..340df881f 100644
--- a/htslib/htscodecs/htscodecs/rANS_static16_int.h
+++ b/htslib/htscodecs/htscodecs/rANS_static16_int.h
@@ -294,8 +294,6 @@ static inline int encode_freq_d(uint8_t *cp, uint32_t *F0, uint32_t *F) {
                 dz++;
                 *cp++ = 0;
             }
-        } else {
-            assert(F[j] == 0);
         }
     }
     
@@ -313,7 +311,7 @@ static inline int encode_freq_d(uint8_t *cp, uint32_t *F0, uint32_t *F) {
 // Returns the desired TF_SHIFT; 10 or 12 bit, or -1 on error.
 static inline int encode_freq1(uint8_t *in, uint32_t in_size, int Nway,
                                RansEncSymbol syms[256][256], uint8_t **cp_p) {
-    int tab_size = 0, i, j, z;
+    int i, j, z;
     uint8_t *out = *cp_p, *cp = out;
 
     // Compute O1 frequency statistics
@@ -413,9 +411,6 @@ static inline int encode_freq1(uint8_t *in, uint32_t in_size, int Nway,
         free(c_freq);
     }
 
-    tab_size = cp - out;
-    assert(tab_size < 257*257*3);
-
     *cp_p = cp;
     htscodecs_tls_free(F);
     return shift;
@@ -542,15 +537,13 @@ static inline int decode_freq1(uint8_t *cp, uint8_t *cp_end, int shift,
 
 // Build s3 symbol lookup table.
 // This is 12 bit freq, 12 bit bias and 8 bit symbol.
-static inline int rans_F_to_s3(uint32_t *F, int shift, uint32_t *s3) {
-    int j, x, y;
+static inline int rans_F_to_s3(const uint32_t *F, int shift, uint32_t *s3) {
+    int j, x;
     for (j = x = 0; j < 256; j++) {
-        if (F[j]) {
-            if (F[j] > (1<<shift) - x)
-                return 1;
-            for (y = 0; y < F[j]; y++)
-                s3[y+x] = (((uint32_t)F[j])<<(shift+8))|(y<<8)|j;
-            x += F[j];
+        if (F[j] && F[j] <= (1<<shift) - x) {
+            uint32_t base = (((uint32_t)F[j])<<(shift+8))|j, y;
+            for (y = 0; y < F[j]; y++, x++)
+                s3[x] = base + (y<<8);
         }
     }
 
diff --git a/htslib/htscodecs/htscodecs/rANS_static32x16pr_avx2.c b/htslib/htscodecs/htscodecs/rANS_static32x16pr_avx2.c
index bb4eae16f..cf0457844 100644
--- a/htslib/htscodecs/htscodecs/rANS_static32x16pr_avx2.c
+++ b/htslib/htscodecs/htscodecs/rANS_static32x16pr_avx2.c
@@ -115,14 +115,41 @@ static inline __m256i _mm256_mulhi_epu32(__m256i a, __m256i b) {
 }
 #endif
 
-#if 0
-// Simulated gather.  This is sometimes faster as it can run on other ports.
+#ifndef USE_GATHER
+// Simulated gather.  This is sometimes faster if gathers are slow, either
+// due to the particular implementation (maybe on Zen4) or because of
+// a microcode patch such as Intel's Downfall fix.
 static inline __m256i _mm256_i32gather_epi32x(int *b, __m256i idx, int size) {
+    volatile // force the store to happen, hence forcing scalar loads
     int c[8] __attribute__((aligned(32)));
     _mm256_store_si256((__m256i *)c, idx);
-    return _mm256_set_epi32(b[c[7]], b[c[6]], b[c[5]], b[c[4]],
-                            b[c[3]], b[c[2]], b[c[1]], b[c[0]]);
+
+    // Fast with modern gccs, and no change with clang.
+    // Equivalent to:
+    //     return _mm256_set_epi32(b[c[7]], b[c[6]], b[c[5]], b[c[4]],
+    //                             b[c[3]], b[c[2]], b[c[1]], b[c[0]]);
+    register int bc1 = b[c[1]];
+    register int bc3 = b[c[3]];
+    register int bc5 = b[c[5]];
+    register int bc7 = b[c[7]];
+
+    __m128i x0a = _mm_cvtsi32_si128(b[c[0]]);
+    __m128i x1a = _mm_cvtsi32_si128(b[c[2]]);
+    __m128i x2a = _mm_cvtsi32_si128(b[c[4]]);
+    __m128i x3a = _mm_cvtsi32_si128(b[c[6]]);
+
+    __m128i x0 = _mm_insert_epi32(x0a, bc1, 1);
+    __m128i x1 = _mm_insert_epi32(x1a, bc3, 1);
+    __m128i x2 = _mm_insert_epi32(x2a, bc5, 1);
+    __m128i x3 = _mm_insert_epi32(x3a, bc7, 1);
+
+    __m128i x01 = _mm_unpacklo_epi64(x0, x1);
+    __m128i x23 = _mm_unpacklo_epi64(x2, x3);
+
+    __m256i y =_mm256_castsi128_si256(x01);
+    return _mm256_inserti128_si256(y, x23, 1);
 }
+
 #else
 #define _mm256_i32gather_epi32x _mm256_i32gather_epi32
 #endif
@@ -501,11 +528,15 @@ unsigned char *rans_uncompress_O0_32x16_avx2(unsigned char *in,
         //for (z = 0; z < NX; z++)
         //  m[z] = R[z] & mask;
         __m256i masked1 = _mm256_and_si256(Rv1, maskv);
-        __m256i masked2 = _mm256_and_si256(Rv2, maskv);
+	__m256i masked2 = _mm256_and_si256(Rv2, maskv);
+        __m256i masked3 = _mm256_and_si256(Rv3, maskv);
+        __m256i masked4 = _mm256_and_si256(Rv4, maskv);
 
-        //  S[z] = s3[m[z]];
-        __m256i Sv1 = _mm256_i32gather_epi32x((int *)s3, masked1, sizeof(*s3));
+	//  S[z] = s3[m[z]];
+	__m256i Sv1 = _mm256_i32gather_epi32x((int *)s3, masked1, sizeof(*s3));
         __m256i Sv2 = _mm256_i32gather_epi32x((int *)s3, masked2, sizeof(*s3));
+        __m256i Sv3 = _mm256_i32gather_epi32x((int *)s3, masked3, sizeof(*s3));
+        __m256i Sv4 = _mm256_i32gather_epi32x((int *)s3, masked4, sizeof(*s3));
 
         //  f[z] = S[z]>>(TF_SHIFT+8);
         __m256i fv1 = _mm256_srli_epi32(Sv1, TF_SHIFT+8);
@@ -527,15 +558,6 @@ unsigned char *rans_uncompress_O0_32x16_avx2(unsigned char *in,
                   _mm256_mullo_epi32(
                       _mm256_srli_epi32(Rv2,TF_SHIFT), fv2), bv2);
 
-#ifdef __clang__
-        // Protect against running off the end of in buffer.
-        // We copy it to a worst-case local buffer when near the end.
-        if ((uint8_t *)sp > cp_end) {
-            memmove(overflow, sp, cp_end+64 - (uint8_t *)sp);
-            sp = (uint16_t *)overflow;
-            cp_end = overflow + sizeof(overflow) - 64;
-        }
-#endif
         // Tricky one:  out[i+z] = s[z];
         //             ---h---g ---f---e  ---d---c ---b---a
         //             ---p---o ---n---m  ---l---k ---j---i
@@ -544,6 +566,15 @@ unsigned char *rans_uncompress_O0_32x16_avx2(unsigned char *in,
         // packs_epi16 ponmlkji ponmlkji  hgfedcba hgfedcba
         sv1 = _mm256_packus_epi32(sv1, sv2);
         sv1 = _mm256_permute4x64_epi64(sv1, 0xd8);
+
+        // Protect against running off the end of in buffer.
+        // We copy it to a worst-case local buffer when near the end.
+        if ((uint8_t *)sp > cp_end) {
+            memmove(overflow, sp, cp_end+64 - (uint8_t *)sp);
+            sp = (uint16_t *)overflow;
+            cp_end = overflow + sizeof(overflow) - 64;
+        }
+
         __m256i Vv1 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)sp));
         sv1 = _mm256_packus_epi16(sv1, sv1);
 
@@ -584,18 +615,6 @@ unsigned char *rans_uncompress_O0_32x16_avx2(unsigned char *in,
         Rv2 = _mm256_blendv_epi8(Rv2, Yv2, renorm_mask2);
 
         // ------------------------------------------------------------
-
-        //  m[z] = R[z] & mask;
-        //  S[z] = s3[m[z]];
-        __m256i masked3 = _mm256_and_si256(Rv3, maskv);
-        __m256i Sv3 = _mm256_i32gather_epi32x((int *)s3, masked3, sizeof(*s3));
-
-        *(uint64_t *)&out[i+0] = _mm256_extract_epi64(sv1, 0);
-        *(uint64_t *)&out[i+8] = _mm256_extract_epi64(sv1, 2);
-
-        __m256i masked4 = _mm256_and_si256(Rv4, maskv);
-        __m256i Sv4 = _mm256_i32gather_epi32x((int *)s3, masked4, sizeof(*s3));
-
         //  f[z] = S[z]>>(TF_SHIFT+8);
         __m256i fv3 = _mm256_srli_epi32(Sv3, TF_SHIFT+8);
         __m256i fv4 = _mm256_srli_epi32(Sv4, TF_SHIFT+8);
@@ -626,12 +645,15 @@ unsigned char *rans_uncompress_O0_32x16_avx2(unsigned char *in,
         renorm_mask3 = _mm256_cmplt_epu32_imm(Rv3, RANS_BYTE_L);
         sv3 = _mm256_packus_epi16(sv3, sv3);
         renorm_mask4 = _mm256_cmplt_epu32_imm(Rv4, RANS_BYTE_L);
-        
-        *(uint64_t *)&out[i+16] = _mm256_extract_epi64(sv3, 0);
-        *(uint64_t *)&out[i+24] = _mm256_extract_epi64(sv3, 2);
 
         // y = (R[z] << 16) | V[z];
         __m256i Vv3 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)sp));
+
+        *(uint64_t *)&out[i+0]  = _mm256_extract_epi64(sv1, 0);
+        *(uint64_t *)&out[i+8]  = _mm256_extract_epi64(sv1, 2);
+        *(uint64_t *)&out[i+16] = _mm256_extract_epi64(sv3, 0);
+        *(uint64_t *)&out[i+24] = _mm256_extract_epi64(sv3, 2);
+
         __m256i Yv3 = _mm256_slli_epi32(Rv3, 16);
         unsigned int imask3 = _mm256_movemask_ps((__m256)renorm_mask3);
         __m256i idx3 = _mm256_load_si256((const __m256i*)permute[imask3]);
@@ -649,20 +671,6 @@ unsigned char *rans_uncompress_O0_32x16_avx2(unsigned char *in,
         Yv3 = _mm256_or_si256(Yv3, Vv3);
         Vv4 = _mm256_permutevar8x32_epi32(Vv4, idx4);
         Yv4 = _mm256_or_si256(Yv4, Vv4);
-
-#ifndef __clang__
-        // 26% faster here than above for gcc10, but former location is
-        // better on clang.
-
-        // Protect against running off the end of in buffer.
-        // We copy it to a worst-case local buffer when near the end.
-        if ((uint8_t *)sp > cp_end) {
-            memmove(overflow, sp, cp_end+64 - (uint8_t *)sp);
-            sp = (uint16_t *)overflow;
-            cp_end = overflow + sizeof(overflow) - 64;
-        }
-#endif
-
         sp += _mm_popcnt_u32(imask4);
 
         // R[z] = c ? Y[z] : R[z];
@@ -751,6 +759,10 @@ unsigned char *rans_compress_O1_32x16_avx2(unsigned char *in, unsigned int in_si
 
     uint16_t *ptr16 = (uint16_t *)ptr;
 
+//       clang16      clang10      gcc7         gcc13
+//       587 435 381  588 438 403  504 386 415  527 381 394
+// simT  611 432 402  475 401 367  472 422 386  486 353 324
+
     LOAD(Rv, ransN);
 
     for (; iN[0] >= 0; ) {
@@ -784,72 +796,72 @@ unsigned char *rans_compress_O1_32x16_avx2(unsigned char *in, unsigned int in_si
         // [2] B2......
         // [3] ......B3  OR to get B2B1B0B3 and shuffle to B3B2B1B0
 
-        __m256i sh[16];
-        for (z = 0; z < 16; z+=4) {
-            int Z = z*2;
+        __m256i xmaxv[4];
+        __m256i rfv[4];
+        __m256i SDv[4];
+        __m256i biasv[4];
+
+        const __m256i xA = _mm256_set_epi32(0,0,0,-1, 0,0,0,-1);
+        const __m256i xB = _mm256_set_epi32(0,0,-1,0, 0,0,-1,0);
+        const __m256i xC = _mm256_set_epi32(0,-1,0,0, 0,-1,0,0);
+        const __m256i xD = _mm256_set_epi32(-1,0,0,0, -1,0,0,0);
 
+        for (z = 0; z < 32; z += 8) {
 #define m128_to_256 _mm256_castsi128_si256
-            __m256i t0, t1, t2, t3;
-            __m128i *s0, *s1, *s2, *s3;
-            s0 = (__m128i *)(&syms[in[iN[Z+0]]][lN[Z+0]]);
-            s1 = (__m128i *)(&syms[in[iN[Z+4]]][lN[Z+4]]);
-            s2 = (__m128i *)(&syms[in[iN[Z+1]]][lN[Z+1]]);
-            s3 = (__m128i *)(&syms[in[iN[Z+5]]][lN[Z+5]]);
+            __m128i *s0 = (__m128i *)(&syms[in[iN[z+0]]][lN[z+0]]);
+            __m128i *s1 = (__m128i *)(&syms[in[iN[z+4]]][lN[z+4]]);
+            __m128i *s2 = (__m128i *)(&syms[in[iN[z+1]]][lN[z+1]]);
+            __m128i *s3 = (__m128i *)(&syms[in[iN[z+5]]][lN[z+5]]);
 
+            __m256i t0, t1, t2, t3;
             t0 = _mm256_shuffle_epi32(m128_to_256(_mm_loadu_si128(s0)), 0xE4);
             t1 = _mm256_shuffle_epi32(m128_to_256(_mm_loadu_si128(s1)), 0xE4);
             t2 = _mm256_shuffle_epi32(m128_to_256(_mm_loadu_si128(s2)), 0x93);
             t3 = _mm256_shuffle_epi32(m128_to_256(_mm_loadu_si128(s3)), 0x93);
 
-            lN[Z+0] = in[iN[Z+0]];
-            lN[Z+4] = in[iN[Z+4]];
-            lN[Z+1] = in[iN[Z+1]];
-            lN[Z+5] = in[iN[Z+5]];
-
-            sh[z+0] = _mm256_permute2x128_si256(t0, t1, 0x20);
-            sh[z+1] = _mm256_permute2x128_si256(t2, t3, 0x20);
-
-            s0 = (__m128i *)(&syms[in[iN[Z+2]]][lN[Z+2]]);
-            s1 = (__m128i *)(&syms[in[iN[Z+6]]][lN[Z+6]]);
-            s2 = (__m128i *)(&syms[in[iN[Z+3]]][lN[Z+3]]);
-            s3 = (__m128i *)(&syms[in[iN[Z+7]]][lN[Z+7]]);
-
-            t0 = _mm256_shuffle_epi32(m128_to_256(_mm_loadu_si128(s0)), 0x4E);
-            t1 = _mm256_shuffle_epi32(m128_to_256(_mm_loadu_si128(s1)), 0x4E);
-            t2 = _mm256_shuffle_epi32(m128_to_256(_mm_loadu_si128(s2)), 0x39);
-            t3 = _mm256_shuffle_epi32(m128_to_256(_mm_loadu_si128(s3)), 0x39);
-
-            lN[Z+2] = in[iN[Z+2]];
-            lN[Z+6] = in[iN[Z+6]];
-            lN[Z+3] = in[iN[Z+3]];
-            lN[Z+7] = in[iN[Z+7]];
-
-            sh[z+2] = _mm256_permute2x128_si256(t0, t1, 0x20);
-            sh[z+3] = _mm256_permute2x128_si256(t2, t3, 0x20);
-
-            // potential to set xmax, rf, bias, and SD in-situ here, removing
-            // the need to hold sh[] in regs.  Doing so doesn't seem to speed
-            // things up though.
+            __m256i sh0 = _mm256_permute2x128_si256(t0, t1, 0x20);
+            __m256i sh1 = _mm256_permute2x128_si256(t2, t3, 0x20);
+
+            lN[z+0] = in[iN[z+0]];
+            lN[z+4] = in[iN[z+4]];
+            lN[z+1] = in[iN[z+1]];
+            lN[z+5] = in[iN[z+5]];
+
+            // Initialise first half of xmax, rf, SD and bias vectors
+            __m128i *s4 = (__m128i *)(&syms[in[iN[z+2]]][lN[z+2]]);
+            __m128i *s5 = (__m128i *)(&syms[in[iN[z+6]]][lN[z+6]]);
+            __m128i *s6 = (__m128i *)(&syms[in[iN[z+3]]][lN[z+3]]);
+            __m128i *s7 = (__m128i *)(&syms[in[iN[z+7]]][lN[z+7]]);
+
+            __m256i t4, t5, t6, t7;
+            t4 = _mm256_shuffle_epi32(m128_to_256(_mm_loadu_si128(s4)), 0x4E);
+            t5 = _mm256_shuffle_epi32(m128_to_256(_mm_loadu_si128(s5)), 0x4E);
+            t6 = _mm256_shuffle_epi32(m128_to_256(_mm_loadu_si128(s6)), 0x39);
+            t7 = _mm256_shuffle_epi32(m128_to_256(_mm_loadu_si128(s7)), 0x39);
+
+            __m256i sh2 = _mm256_permute2x128_si256(t4, t5, 0x20);
+            __m256i sh3 = _mm256_permute2x128_si256(t6, t7, 0x20);
+
+            lN[z+2] = in[iN[z+2]];
+            lN[z+6] = in[iN[z+6]];
+            lN[z+3] = in[iN[z+3]];
+            lN[z+7] = in[iN[z+7]];
+
+#define SH_LOAD(A, B, C, D)                                        \
+            _mm256_or_si256(_mm256_or_si256(_mm256_and_si256(sh0, A), \
+                                            _mm256_and_si256(sh1, B)),\
+                            _mm256_or_si256(_mm256_and_si256(sh2, C), \
+                                            _mm256_and_si256(sh3, D)))
+            xmaxv[z/8] = SH_LOAD(xA, xB, xC, xD);
+            rfv  [z/8] = SH_LOAD(xB, xC, xD, xA);
+            SDv  [z/8] = SH_LOAD(xD, xA, xB, xC);
+            biasv[z/8] = SH_LOAD(xC, xD, xA, xB);
+
+            rfv  [z/8] = _mm256_shuffle_epi32(rfv  [z/8],  0x39);
+            SDv  [z/8] = _mm256_shuffle_epi32(SDv  [z/8],  0x93);
+            biasv[z/8] = _mm256_shuffle_epi32(biasv[z/8],0x4E);
         }
 
-        __m256i xA = _mm256_set_epi32(0,0,0,-1, 0,0,0,-1);
-        __m256i xB = _mm256_set_epi32(0,0,-1,0, 0,0,-1,0);
-        __m256i xC = _mm256_set_epi32(0,-1,0,0, 0,-1,0,0);
-        __m256i xD = _mm256_set_epi32(-1,0,0,0, -1,0,0,0);
-
-        // Extract 32-bit xmax elements from syms[] data (in sh vec array)
-/*
-#define SYM_LOAD(x, A, B, C, D)                                         \
-        _mm256_or_si256(_mm256_or_si256(_mm256_and_si256(sh[x+0], A),   \
-                                        _mm256_and_si256(sh[x+1], B)),  \
-                        _mm256_or_si256(_mm256_and_si256(sh[x+2], C),   \
-                                        _mm256_and_si256(sh[x+3], D)))
-*/
-        __m256i xmax1 = SYM_LOAD( 0, xA, xB, xC, xD);
-        __m256i xmax2 = SYM_LOAD( 4, xA, xB, xC, xD);
-        __m256i xmax3 = SYM_LOAD( 8, xA, xB, xC, xD);
-        __m256i xmax4 = SYM_LOAD(12, xA, xB, xC, xD);
-
         // ------------------------------------------------------------
         //      for (z = NX-1; z >= 0; z--) {
         //          if (ransN[z] >= x_max[z]) {
@@ -857,10 +869,10 @@ unsigned char *rans_compress_O1_32x16_avx2(unsigned char *in, unsigned int in_si
         //              ransN[z] >>= 16;
         //          }
         //      }
-        __m256i cv1 = _mm256_cmpgt_epi32(Rv1, xmax1);
-        __m256i cv2 = _mm256_cmpgt_epi32(Rv2, xmax2);
-        __m256i cv3 = _mm256_cmpgt_epi32(Rv3, xmax3);
-        __m256i cv4 = _mm256_cmpgt_epi32(Rv4, xmax4);
+        __m256i cv1 = _mm256_cmpgt_epi32(Rv1, xmaxv[0]);
+        __m256i cv2 = _mm256_cmpgt_epi32(Rv2, xmaxv[1]);
+        __m256i cv3 = _mm256_cmpgt_epi32(Rv3, xmaxv[2]);
+        __m256i cv4 = _mm256_cmpgt_epi32(Rv4, xmaxv[3]);
 
         // Store bottom 16-bits at ptr16
         //
@@ -896,10 +908,6 @@ unsigned char *rans_compress_O1_32x16_avx2(unsigned char *in, unsigned int in_si
         V12 = _mm256_permute4x64_epi64(V12, 0xd8);
         V34 = _mm256_permute4x64_epi64(V34, 0xd8);
 
-        // Load rcp_freq ready for later
-        __m256i rfv1 = _mm256_shuffle_epi32(SYM_LOAD( 0, xB, xC, xD, xA),0x39);
-        __m256i rfv2 = _mm256_shuffle_epi32(SYM_LOAD( 4, xB, xC, xD, xA),0x39);
-
         // Now we have bottom N 16-bit values in each V12/V34 to flush
         __m128i f =  _mm256_extractf128_si256(V34, 1);
         _mm_storeu_si128((__m128i *)(ptr16-8), f);
@@ -917,9 +925,6 @@ unsigned char *rans_compress_O1_32x16_avx2(unsigned char *in, unsigned int in_si
         _mm_storeu_si128((__m128i *)(ptr16-8), f);
         ptr16 -= _mm_popcnt_u32(imask1);
 
-        __m256i rfv3 = _mm256_shuffle_epi32(SYM_LOAD( 8, xB, xC, xD, xA),0x39);
-        __m256i rfv4 = _mm256_shuffle_epi32(SYM_LOAD(12, xB, xC, xD, xA),0x39);
-
         __m256i Rs1, Rs2, Rs3, Rs4;
         Rs1 = _mm256_srli_epi32(Rv1,16);
         Rs2 = _mm256_srli_epi32(Rv2,16);
@@ -943,54 +948,41 @@ unsigned char *rans_compress_O1_32x16_avx2(unsigned char *in, unsigned int in_si
         // (AVX512 allows us to hold it all in 64-bit lanes and use mullo_epi64
         // plus a shift.  KNC has mulhi_epi32, but not sure if this is
         // available.)
-        rfv1 = _mm256_mulhi_epu32(Rv1, rfv1);
-        rfv2 = _mm256_mulhi_epu32(Rv2, rfv2);
-        rfv3 = _mm256_mulhi_epu32(Rv3, rfv3);
-        rfv4 = _mm256_mulhi_epu32(Rv4, rfv4);
+        rfv[0] = _mm256_mulhi_epu32(Rv1, rfv[0]);
+        rfv[1] = _mm256_mulhi_epu32(Rv2, rfv[1]);
+        rfv[2] = _mm256_mulhi_epu32(Rv3, rfv[2]);
+        rfv[3] = _mm256_mulhi_epu32(Rv4, rfv[3]);
 
-        // Load cmpl_freq / rcp_shift from syms
-        __m256i SDv1 = _mm256_shuffle_epi32(SYM_LOAD( 0, xD, xA, xB, xC),0x93);
-        __m256i SDv2 = _mm256_shuffle_epi32(SYM_LOAD( 4, xD, xA, xB, xC),0x93);
-        // Load bias from syms[]
-        __m256i biasv1=_mm256_shuffle_epi32(SYM_LOAD( 0, xC, xD, xA, xB),0x4E);
-        __m256i biasv2=_mm256_shuffle_epi32(SYM_LOAD( 4, xC, xD, xA, xB),0x4E);
-
-        __m256i shiftv1 = _mm256_srli_epi32(SDv1, 16);
-        __m256i shiftv2 = _mm256_srli_epi32(SDv2, 16);
-
-        __m256i SDv3 = _mm256_shuffle_epi32(SYM_LOAD( 8, xD, xA, xB, xC),0x93);
-        __m256i SDv4 = _mm256_shuffle_epi32(SYM_LOAD(12, xD, xA, xB, xC),0x93);
-        __m256i biasv3=_mm256_shuffle_epi32(SYM_LOAD( 8, xC, xD, xA, xB),0x4E);
-        __m256i biasv4=_mm256_shuffle_epi32(SYM_LOAD(12, xC, xD, xA, xB),0x4E);
-
-        __m256i shiftv3 = _mm256_srli_epi32(SDv3, 16);
-        __m256i shiftv4 = _mm256_srli_epi32(SDv4, 16);
+        __m256i shiftv1 = _mm256_srli_epi32(SDv[0], 16);
+        __m256i shiftv2 = _mm256_srli_epi32(SDv[1], 16);
+        __m256i shiftv3 = _mm256_srli_epi32(SDv[2], 16);
+        __m256i shiftv4 = _mm256_srli_epi32(SDv[3], 16);
 
         shiftv1 = _mm256_sub_epi32(shiftv1, _mm256_set1_epi32(32));
         shiftv2 = _mm256_sub_epi32(shiftv2, _mm256_set1_epi32(32));
         shiftv3 = _mm256_sub_epi32(shiftv3, _mm256_set1_epi32(32));
         shiftv4 = _mm256_sub_epi32(shiftv4, _mm256_set1_epi32(32));
 
-        __m256i qv1 = _mm256_srlv_epi32(rfv1, shiftv1);
-        __m256i qv2 = _mm256_srlv_epi32(rfv2, shiftv2);
+        __m256i qv1 = _mm256_srlv_epi32(rfv[0], shiftv1);
+        __m256i qv2 = _mm256_srlv_epi32(rfv[1], shiftv2);
 
-        __m256i freqv1 = _mm256_and_si256(SDv1, _mm256_set1_epi32(0xffff));
-        __m256i freqv2 = _mm256_and_si256(SDv2, _mm256_set1_epi32(0xffff));
+        __m256i freqv1 = _mm256_and_si256(SDv[0], _mm256_set1_epi32(0xffff));
+        __m256i freqv2 = _mm256_and_si256(SDv[1], _mm256_set1_epi32(0xffff));
         qv1 = _mm256_mullo_epi32(qv1, freqv1);
         qv2 = _mm256_mullo_epi32(qv2, freqv2);
 
-        __m256i qv3 = _mm256_srlv_epi32(rfv3, shiftv3);
-        __m256i qv4 = _mm256_srlv_epi32(rfv4, shiftv4);
+        __m256i qv3 = _mm256_srlv_epi32(rfv[2], shiftv3);
+        __m256i qv4 = _mm256_srlv_epi32(rfv[3], shiftv4);
 
-        __m256i freqv3 = _mm256_and_si256(SDv3, _mm256_set1_epi32(0xffff));
-        __m256i freqv4 = _mm256_and_si256(SDv4, _mm256_set1_epi32(0xffff));
+        __m256i freqv3 = _mm256_and_si256(SDv[2], _mm256_set1_epi32(0xffff));
+        __m256i freqv4 = _mm256_and_si256(SDv[3], _mm256_set1_epi32(0xffff));
         qv3 = _mm256_mullo_epi32(qv3, freqv3);
         qv4 = _mm256_mullo_epi32(qv4, freqv4);
 
-        qv1 = _mm256_add_epi32(qv1, biasv1);
-        qv2 = _mm256_add_epi32(qv2, biasv2);
-        qv3 = _mm256_add_epi32(qv3, biasv3);
-        qv4 = _mm256_add_epi32(qv4, biasv4);
+        qv1 = _mm256_add_epi32(qv1, biasv[0]);
+        qv2 = _mm256_add_epi32(qv2, biasv[1]);
+        qv3 = _mm256_add_epi32(qv3, biasv[2]);
+        qv4 = _mm256_add_epi32(qv4, biasv[3]);
 
         for (z = 0; z < NX; z++)
             iN[z]--;
@@ -1541,27 +1533,11 @@ unsigned char *rans_uncompress_O1_32x16_avx2(unsigned char *in,
             sv3 = _mm256_permute4x64_epi64(sv3, 0xd8); // shuffle;  AaBb
             sv3 = _mm256_packus_epi16(sv3, sv3);       // 16 to 8
 
-            // Method 1
             u.tbuf64[tidx][0] = _mm256_extract_epi64(sv1, 0);
             u.tbuf64[tidx][1] = _mm256_extract_epi64(sv1, 2);
             u.tbuf64[tidx][2] = _mm256_extract_epi64(sv3, 0);
             u.tbuf64[tidx][3] = _mm256_extract_epi64(sv3, 2);
 
-//          // Method 2
-//          sv1 = _mm256_permute4x64_epi64(sv1, 8); // x x 10 00
-//          _mm_storeu_si128((__m128i *)&u.tbuf64[tidx][0],
-//                           _mm256_extractf128_si256(sv1, 0));
-//          sv3 = _mm256_permute4x64_epi64(sv3, 8); // x x 10 00
-//          _mm_storeu_si128((__m128i *)&u.tbuf64[tidx][2],
-//                           _mm256_extractf128_si256(sv3, 0));
-
-//          // Method 3
-//          sv1 = _mm256_and_si256(sv1, _mm256_set_epi64x(0,-1,0,-1)); // AxBx
-//          sv3 = _mm256_and_si256(sv3, _mm256_set_epi64x(-1,0,-1,0)); // xCxD
-//          sv1 = _mm256_or_si256(sv1, sv3);                           // ACBD
-//          sv1 = _mm256_permute4x64_epi64(sv1, 0xD8); //rev 00 10 01 11; ABCD
-//          _mm256_storeu_si256((__m256i *)u.tbuf64[tidx], sv1);
-
             iN[0]++;
             if (++tidx == 32) {
                 iN[0]-=32;
diff --git a/htslib/htscodecs/htscodecs/rANS_static32x16pr_avx512.c b/htslib/htscodecs/htscodecs/rANS_static32x16pr_avx512.c
index 84b4ea016..3563f8345 100644
--- a/htslib/htscodecs/htscodecs/rANS_static32x16pr_avx512.c
+++ b/htslib/htscodecs/htscodecs/rANS_static32x16pr_avx512.c
@@ -55,6 +55,74 @@
 #include "varint.h"
 #include "utils.h"
 
+#ifndef USE_GATHER
+
+// Speds with Downfall mitigation Off/On and Zen4.
+//         <------ AVX512 --------->   <------- AVX2 -------->
+// -o4:    IntelOff IntelOn  AMDZen4   
+// gcc7    544/1673 562/1711 448/1818  649/1515 645/1525 875/1624
+// gcc13   557/1672 576/1711 582/1761  630/1623 629/1652 866/1762
+// clang10 541/1547 564/1630 807/1912  620/1456 637/1481 837/1606
+// clang16 533/1431 555/1510 890/1611  629/1370 627/1406 996/1432
+//
+// Zen4 encode is particularly slow with gcc, but AVX2 encode is
+// faster and we use that already.
+static inline __m512i _mm512_i32gather_epi32x(__m512i idx, void *v, int size) {
+    uint32_t *b = (uint32_t *)v;
+
+#ifndef __clang__
+    volatile
+#endif
+    int c[16] __attribute__((aligned(32)));
+
+    //_mm512_store_si512((__m512i *)c, idx); // equivalent, but slower
+    _mm256_store_si256((__m256i *)(c),   _mm512_castsi512_si256(idx));
+    _mm256_store_si256((__m256i *)(c+8), _mm512_extracti64x4_epi64(idx, 1));
+
+    __m128i x0 = _mm_insert_epi32(_mm_cvtsi32_si128(b[c[0]]), b[c[1]], 1);
+    __m128i x1 = _mm_insert_epi32(_mm_cvtsi32_si128(b[c[2]]), b[c[3]], 1);
+    __m128i x2 = _mm_insert_epi32(_mm_cvtsi32_si128(b[c[4]]), b[c[5]], 1);
+    __m128i x3 = _mm_insert_epi32(_mm_cvtsi32_si128(b[c[6]]), b[c[7]], 1);
+
+    __m128i x01 = _mm_unpacklo_epi64(x0, x1);
+    __m128i x23 = _mm_unpacklo_epi64(x2, x3);
+    __m256i y0 =_mm256_castsi128_si256(x01);
+
+    __m128i x4 = _mm_insert_epi32(_mm_cvtsi32_si128(b[c[ 8]]), b[c[ 9]], 1);
+    __m128i x5 = _mm_insert_epi32(_mm_cvtsi32_si128(b[c[10]]), b[c[11]], 1);
+    __m128i x6 = _mm_insert_epi32(_mm_cvtsi32_si128(b[c[12]]), b[c[13]], 1);
+    __m128i x7 = _mm_insert_epi32(_mm_cvtsi32_si128(b[c[14]]), b[c[15]], 1);
+
+    __m128i x45 = _mm_unpacklo_epi64(x4, x5);
+    __m128i x67 = _mm_unpacklo_epi64(x6, x7);
+    __m256i y1 =_mm256_castsi128_si256(x45);
+
+    y0 = _mm256_inserti128_si256(y0, x23, 1);
+    y1 = _mm256_inserti128_si256(y1, x67, 1);
+
+    return _mm512_inserti64x4(_mm512_castsi256_si512(y0), y1, 1);
+}
+
+// 32-bit indices, 8-bit quantities into 32-bit lanes
+static inline __m512i _mm512_i32gather_epi32x1(__m512i idx, void *v) {
+    uint8_t *b = (uint8_t *)v;
+    volatile int c[16] __attribute__((aligned(32)));
+
+    //_mm512_store_si512((__m512i *)c, idx); // equivalent, but slower
+    _mm256_store_si256((__m256i *)(c),   _mm512_castsi512_si256(idx));
+    _mm256_store_si256((__m256i *)(c+8), _mm512_extracti64x4_epi64(idx, 1));
+
+    return _mm512_set_epi32(b[c[15]], b[c[14]], b[c[13]], b[c[12]],
+                            b[c[11]], b[c[10]], b[c[9]], b[c[8]],
+                            b[c[7]], b[c[6]], b[c[5]], b[c[4]],
+                            b[c[3]], b[c[2]], b[c[1]], b[c[0]]);
+}
+
+#else
+// real gathers
+#define _mm512_i32gather_epi32x _mm512_i32gather_epi32
+#endif
+
 unsigned char *rans_compress_O0_32x16_avx512(unsigned char *in,
                                              unsigned int in_size,
                                              unsigned char *out,
@@ -149,8 +217,8 @@ unsigned char *rans_compress_O0_32x16_avx512(unsigned char *in,
         __m512i c1 = _mm512_cvtepu8_epi32(_mm256_extracti128_si256(c12,0));
         __m512i c2 = _mm512_cvtepu8_epi32(_mm256_extracti128_si256(c12,1));
 #define SET512(a,b) \
-        __m512i a##1 = _mm512_i32gather_epi32(c1, b, 4); \
-        __m512i a##2 = _mm512_i32gather_epi32(c2, b, 4)
+        __m512i a##1 = _mm512_i32gather_epi32x(c1, b, 4); \
+        __m512i a##2 = _mm512_i32gather_epi32x(c2, b, 4)
 
         SET512(xmax, SB);
 
@@ -162,7 +230,6 @@ unsigned char *rans_compress_O0_32x16_avx512(unsigned char *in,
         SET512(SDv,  SD);
         int pc2 = _mm_popcnt_u32(gt_mask2);
 
-        //Rp1 = _mm512_maskz_compress_epi32(gt_mask1, Rp1);
         Rp1 = _mm512_maskz_compress_epi32(gt_mask1, Rp1);
         Rp2 = _mm512_maskz_compress_epi32(gt_mask2, Rp2);
 
@@ -237,7 +304,7 @@ unsigned char *rans_compress_O0_32x16_avx512(unsigned char *in,
 
     for (z = 32-1; z >= 0; z--)
         RansEncFlush(&ransN[z], &ptr);
-    
+
  empty:
     // Finalise block size and return it
     *out_size = (out_end - ptr) + tab_size;
@@ -305,8 +372,8 @@ unsigned char *rans_uncompress_O0_32x16_avx512(unsigned char *in,
     // loop for the next cycle so we can remove some of the instr. latency.
     __m512i masked1 = _mm512_and_epi32(R1, maskv);
     __m512i masked2 = _mm512_and_epi32(R2, maskv);
-    __m512i S1 = _mm512_i32gather_epi32(masked1, (int *)s3, sizeof(*s3));
-    __m512i S2 = _mm512_i32gather_epi32(masked2, (int *)s3, sizeof(*s3));
+    __m512i S1 = _mm512_i32gather_epi32x(masked1, (int *)s3, sizeof(*s3));
+    __m512i S2 = _mm512_i32gather_epi32x(masked2, (int *)s3, sizeof(*s3));
 
     uint8_t overflow[64+64] = {0};
     for (i=0; i < out_end; i+=32) {
@@ -334,13 +401,13 @@ unsigned char *rans_uncompress_O0_32x16_avx512(unsigned char *in,
       R1 = _mm512_add_epi32(
                _mm512_mullo_epi32(
                    _mm512_srli_epi32(R1, TF_SHIFT), f1), b1);
+      __mmask16 renorm_mask1, renorm_mask2;
+      renorm_mask1=_mm512_cmplt_epu32_mask(R1, _mm512_set1_epi32(RANS_BYTE_L));
       R2 = _mm512_add_epi32(
                _mm512_mullo_epi32(
                    _mm512_srli_epi32(R2, TF_SHIFT), f2), b2);
 
       // renorm. this is the interesting part:
-      __mmask16 renorm_mask1, renorm_mask2;
-      renorm_mask1=_mm512_cmplt_epu32_mask(R1, _mm512_set1_epi32(RANS_BYTE_L));
       renorm_mask2=_mm512_cmplt_epu32_mask(R2, _mm512_set1_epi32(RANS_BYTE_L));
       // advance by however many words we actually read
       sp += _mm_popcnt_u32(renorm_mask1);
@@ -349,34 +416,39 @@ unsigned char *rans_uncompress_O0_32x16_avx512(unsigned char *in,
 
       // select masked only
       __m512i renorm_vals1, renorm_vals2;
-      renorm_vals1 = _mm512_maskz_expand_epi32(renorm_mask1, renorm_words1);
-      renorm_vals2 = _mm512_maskz_expand_epi32(renorm_mask2, renorm_words2);
-      // shift & add selected words
-      R1 = _mm512_mask_slli_epi32(R1, renorm_mask1, R1, 16);
-      R2 = _mm512_mask_slli_epi32(R2, renorm_mask2, R2, 16);
-      R1 = _mm512_add_epi32(R1, renorm_vals1);
-      R2 = _mm512_add_epi32(R2, renorm_vals2);
+
+      renorm_vals1 = _mm512_mask_expand_epi32(R1, renorm_mask1, renorm_words1);
+      renorm_vals2 = _mm512_mask_expand_epi32(R2, renorm_mask2, renorm_words2);
 
       // For start of next loop iteration.  This has been moved here
       // (and duplicated to before the loop starts) so we can do something
       // with the latency period of gather, such as finishing up the
-      // renorm offset and writing the results. 
+      // renorm offset and writing the results.
       __m512i S1_ = S1; // temporary copy for use in out[]=S later
       __m512i S2_ = S2;
 
-      masked1 = _mm512_and_epi32(R1, maskv);
-      masked2 = _mm512_and_epi32(R2, maskv);
-      // Gather is slow bit (half total time) - 30 cycle latency.
-      S1 = _mm512_i32gather_epi32(masked1, (int *)s3, sizeof(*s3));
-      S2 = _mm512_i32gather_epi32(masked2, (int *)s3, sizeof(*s3));
+      masked1 = _mm512_and_epi32(renorm_vals1, maskv);
+      S1 = _mm512_i32gather_epi32x(masked1, (int *)s3, sizeof(*s3));
+      masked2 = _mm512_and_epi32(renorm_vals2, maskv);
+      S2 = _mm512_i32gather_epi32x(masked2, (int *)s3, sizeof(*s3));
+
+      R1 = _mm512_mask_slli_epi32(R1, renorm_mask1, R1, 16);
+      R2 = _mm512_mask_slli_epi32(R2, renorm_mask2, R2, 16);
+
+      __m512i m16 = _mm512_set1_epi32(0xffff);
+      renorm_vals1 = _mm512_maskz_and_epi32(renorm_mask1, renorm_vals1, m16);
+      renorm_vals2 = _mm512_maskz_and_epi32(renorm_mask2, renorm_vals2, m16);
 
       // advance by however many words we actually read
       sp += _mm_popcnt_u32(renorm_mask2);
 
+      R1 = _mm512_add_epi32(R1, renorm_vals1);
+      R2 = _mm512_add_epi32(R2, renorm_vals2);
+
       //out[i+z] = S;
       _mm_storeu_si128((__m128i *)(out+i),    _mm512_cvtepi32_epi8(S1_));
       _mm_storeu_si128((__m128i *)(out+i+16), _mm512_cvtepi32_epi8(S2_));
-    }      
+    }
 
     _mm512_store_epi32(&Rv[ 0], R1);
     _mm512_store_epi32(&Rv[16], R2);
@@ -424,14 +496,14 @@ static inline void transpose_and_copy_avx512(uint8_t *out, int iN[32],
 //      iN[z] += 32;
 //  }
 
-    
+
     __m512i v1 = _mm512_set_epi32(15, 14, 13, 12, 11, 10,  9,  8,
                                    7,  6,  5,  4,  3,  2,  1,  0);
     v1 = _mm512_slli_epi32(v1, 5);
-    
+
     for (z = 0; z < 32; z++) {
-        __m512i t1 = _mm512_i32gather_epi32(v1, &t32[ 0][z], 4);
-        __m512i t2 = _mm512_i32gather_epi32(v1, &t32[16][z], 4);
+        __m512i t1 = _mm512_i32gather_epi32x(v1, &t32[ 0][z], 4);
+        __m512i t2 = _mm512_i32gather_epi32x(v1, &t32[16][z], 4);
         _mm_storeu_si128((__m128i*)(&out[iN[z]   ]), _mm512_cvtepi32_epi8(t1));
         _mm_storeu_si128((__m128i*)(&out[iN[z]+16]), _mm512_cvtepi32_epi8(t2));
         iN[z] += 32;
@@ -470,7 +542,7 @@ unsigned char *rans_compress_O1_32x16_avx512(unsigned char *in,
     }
 
     cp = out;
-    int shift = encode_freq1(in, in_size, 32, syms, &cp); 
+    int shift = encode_freq1(in, in_size, 32, syms, &cp);
     if (shift < 0) {
         free(out_free);
         htscodecs_tls_free(syms);
@@ -483,7 +555,8 @@ unsigned char *rans_compress_O1_32x16_avx512(unsigned char *in,
 
     uint8_t* ptr = out_end;
 
-    int iN[32], isz4 = in_size/32;
+    int iN[32] __attribute__((aligned(64)));
+    int isz4 = in_size/32;
     for (z = 0; z < 32; z++)
         iN[z] = (z+1)*isz4-2;
 
@@ -503,26 +576,29 @@ unsigned char *rans_compress_O1_32x16_avx512(unsigned char *in,
     LOAD512(Rv, ransN);
 
     uint16_t *ptr16 = (uint16_t *)ptr;
-    __m512i last2 = _mm512_set_epi32(lN[31], lN[30], lN[29], lN[28],
-                                     lN[27], lN[26], lN[25], lN[24],
-                                     lN[23], lN[22], lN[21], lN[20],
-                                     lN[19], lN[18], lN[17], lN[16]);
-    __m512i last1 = _mm512_set_epi32(lN[15], lN[14], lN[13], lN[12],
-                                     lN[11], lN[10], lN[ 9], lN[ 8],
-                                     lN[ 7], lN[ 6], lN[ 5], lN[ 4],
-                                     lN[ 3], lN[ 2], lN[ 1], lN[ 0]);
-    
-    __m512i iN2 = _mm512_set_epi32(iN[31], iN[30], iN[29], iN[28],
-                                   iN[27], iN[26], iN[25], iN[24],
-                                   iN[23], iN[22], iN[21], iN[20],
-                                   iN[19], iN[18], iN[17], iN[16]);
-    __m512i iN1 = _mm512_set_epi32(iN[15], iN[14], iN[13], iN[12],
-                                   iN[11], iN[10], iN[ 9], iN[ 8],
-                                   iN[ 7], iN[ 6], iN[ 5], iN[ 4],
-                                   iN[ 3], iN[ 2], iN[ 1], iN[ 0]);
-
-    __m512i c1 = _mm512_i32gather_epi32(iN1, in, 1);
-    __m512i c2 = _mm512_i32gather_epi32(iN2, in, 1);
+    LOAD512(iN, iN);
+    LOAD512(last, lN);
+
+    __m512i c1 = _mm512_i32gather_epi32x1(iN1, in);
+    __m512i c2 = _mm512_i32gather_epi32x1(iN2, in);
+
+    // We cache the next 64-bytes locally and transpose.
+    // This means we can load 32 ints from t32[x] with load instructions
+    // instead of gathers.  The copy, transpose and expand is easier done
+    // in scalar code.
+#define BATCH 64
+    uint8_t t32[BATCH][32] __attribute__((aligned(64)));
+    int next_batch;
+    if (iN[0] > BATCH) {
+        int i, j;
+        for (i = 0; i < BATCH; i++)
+            // memcpy(c[i], &in[iN[i]-32], 32); fast mode
+            for (j = 0; j < 32; j++)
+                t32[BATCH-1-i][j] = in[iN[j]-1-i];
+        next_batch = BATCH;
+    } else {
+        next_batch = -1;
+    }
 
     for (; iN[0] >= 0; iN[0]--) {
         // Note, consider doing the same approach for the AVX2 encoder.
@@ -533,8 +609,6 @@ unsigned char *rans_compress_O1_32x16_avx512(unsigned char *in,
         // FIXME: maybe we need to cope with in[31] read over-flow
         // on loop cycles 0, 1, 2 where gather reads 32-bits instead of
         // 8 bits.  Use set instead there on c2?
-        c1 = _mm512_and_si512(c1, _mm512_set1_epi32(0xff));
-        c2 = _mm512_and_si512(c2, _mm512_set1_epi32(0xff));
 
         // index into syms[0][0] array, used for x_max, rcp_freq, and bias
         __m512i vidx1 = _mm512_slli_epi32(c1, 8);
@@ -553,8 +627,8 @@ unsigned char *rans_compress_O1_32x16_avx512(unsigned char *in,
         //      }
 
 #define SET512x(a,x) \
-        __m512i a##1 = _mm512_i32gather_epi32(vidx1, &syms[0][0].x, 4); \
-        __m512i a##2 = _mm512_i32gather_epi32(vidx2, &syms[0][0].x, 4)
+        __m512i a##1 = _mm512_i32gather_epi32x(vidx1, &syms[0][0].x, 4); \
+        __m512i a##2 = _mm512_i32gather_epi32x(vidx2, &syms[0][0].x, 4)
 
         // Start of next loop, moved here to remove latency.
         // last[z] = c[z]
@@ -564,8 +638,54 @@ unsigned char *rans_compress_O1_32x16_avx512(unsigned char *in,
         last2 = c2;
         iN1 = _mm512_sub_epi32(iN1, _mm512_set1_epi32(1));
         iN2 = _mm512_sub_epi32(iN2, _mm512_set1_epi32(1));
-        c1 = _mm512_i32gather_epi32(iN1, in, 1);
-        c2 = _mm512_i32gather_epi32(iN2, in, 1);
+
+        // Code below is equivalent to this:
+        // c1 = _mm512_i32gather_epi32(iN1, in, 1);
+        // c2 = _mm512_i32gather_epi32(iN2, in, 1);
+
+        // Better when we have a power of 2
+        if (next_batch >= 0) {
+            if (--next_batch < 0 && iN[0] > BATCH) {
+                // Load 32 BATCH blocks of data.
+                // Executed once every BATCH cycles
+                int i, j;
+                uint8_t c[32][BATCH];
+                iN[0] += BATCH;
+                for (j = 0; j < 32; j++) {
+                    iN[j] -= BATCH;
+                    memcpy(c[j], &in[iN[j]-BATCH], BATCH);
+                }
+                // transpose matrix from 32xBATCH to BATCHx32
+                for (j = 0; j < 32; j++) {
+                    for (i = 0; i < BATCH; i+=16) {
+                        int z;
+                        for (z = 0; z < 16; z++)
+                            t32[i+z][j] = c[j][i+z];
+                    }
+                }
+                next_batch = BATCH-1;
+            }
+            if (next_batch >= 0) {
+                // Copy from our of our pre-loaded BATCHx32 tables
+                // Executed every cycles
+                __m128i c1_ = _mm_load_si128((__m128i *)&t32[next_batch][0]);
+                __m128i c2_ = _mm_load_si128((__m128i *)&t32[next_batch][16]);
+                c1 = _mm512_cvtepu8_epi32(c1_);
+                c2 = _mm512_cvtepu8_epi32(c2_);
+            }
+        }
+
+        if (next_batch < 0 && iN[0]) {
+            // no pre-loaded data as within BATCHx32 of input end
+            c1 = _mm512_i32gather_epi32x1(iN1, in);
+            c2 = _mm512_i32gather_epi32x1(iN2, in);
+
+            // Speeds up clang, even though not needed any more.
+            // Harmless to leave here.
+            c1 = _mm512_and_si512(c1, _mm512_set1_epi32(0xff));
+            c2 = _mm512_and_si512(c2, _mm512_set1_epi32(0xff));
+        }
+        // End of "equivalent to" code block
 
         SET512x(xmax, x_max); // high latency
 
@@ -758,10 +878,10 @@ unsigned char *rans_uncompress_O1_32x16_avx512(unsigned char *in,
             _masked2 = _mm512_add_epi32(_masked2, _Lv2);
 
             // This is the biggest bottleneck
-            __m512i _Sv1 = _mm512_i32gather_epi32(_masked1, (int *)&s3F[0][0],
-                                                  sizeof(s3F[0][0]));
-            __m512i _Sv2 = _mm512_i32gather_epi32(_masked2, (int *)&s3F[0][0],
-                                                  sizeof(s3F[0][0]));
+            __m512i _Sv1 = _mm512_i32gather_epi32x(_masked1, (int *)&s3F[0][0],
+                                                   sizeof(s3F[0][0]));
+            __m512i _Sv2 = _mm512_i32gather_epi32x(_masked2, (int *)&s3F[0][0],
+                                                   sizeof(s3F[0][0]));
 
             //  f[z] = S[z]>>(TF_SHIFT_O1+8);
             __m512i _fv1 = _mm512_srli_epi32(_Sv1, TF_SHIFT_O1+8);
@@ -927,10 +1047,10 @@ unsigned char *rans_uncompress_O1_32x16_avx512(unsigned char *in,
             _masked2 = _mm512_add_epi32(_masked2, _Lv2);
 
             // This is the biggest bottleneck
-            __m512i _Sv1 = _mm512_i32gather_epi32(_masked1, (int *)&s3F[0][0],
-                                                  sizeof(s3F[0][0]));
-            __m512i _Sv2 = _mm512_i32gather_epi32(_masked2, (int *)&s3F[0][0],
-                                                  sizeof(s3F[0][0]));
+            __m512i _Sv1 = _mm512_i32gather_epi32x(_masked1, (int *)&s3F[0][0],
+                                                   sizeof(s3F[0][0]));
+            __m512i _Sv2 = _mm512_i32gather_epi32x(_masked2, (int *)&s3F[0][0],
+                                                   sizeof(s3F[0][0]));
 
             //  f[z] = S[z]>>(TF_SHIFT_O1+8);
             __m512i _fv1 = _mm512_srli_epi32(_Sv1, TF_SHIFT_O1_FAST+8);
diff --git a/htslib/htscodecs/htscodecs/rANS_static32x16pr_sse4.c b/htslib/htscodecs/htscodecs/rANS_static32x16pr_sse4.c
index 054fd2f28..a9b9c4bd7 100644
--- a/htslib/htscodecs/htscodecs/rANS_static32x16pr_sse4.c
+++ b/htslib/htscodecs/htscodecs/rANS_static32x16pr_sse4.c
@@ -807,19 +807,59 @@ unsigned char *rans_uncompress_O0_32x16_sse4(unsigned char *in,
 static inline void transpose_and_copy(uint8_t *out, int iN[32],
                                       uint8_t t[32][32]) {
     int z;
-#ifdef UBSAN
-    // Simplified version to avoid undefined behaviour sanitiser warnings.
+
+    // Simplified version from below.
+    // This is pretty good with zig cc, but slow on clang and very slow on
+    // gcc, even with -O3
+    /*
     for (z = 0; z < NX; z++) {
         int k;
         for (k = 0; k < 32; k++)
             out[iN[z]+k] = t[k][z];
         iN[z] += 32;
     }
-#else
-    // Unaligned access.  We know we can get away with this as this
-    // code is only ever executed on x86 platforms which permit this.
-    for (z = 0; z < NX; z+=4) {
-        *(uint64_t *)&out[iN[z]] =
+    */
+
+
+    // A better approach for clang and gcc can be had with some manual
+    // restructuring to attempt to do the two loops in explcit blocks.
+    // With gcc -O3 or -O2 -ftree-vectorize this is quite fast, as is clang
+    // and zig but neither beat the version below (or, for zig, the basic
+    // code above).
+    //
+    // It's left here incase we ever want to move to tidier code and
+    // to understand what auto-vectorises and what doesn't.
+    /*
+#define NZ 2
+#define NK 8
+    for (z = 0; z < 32; z+=NZ) {
+        for (int k = 0; k < 32; k+=NK) {
+            for (int z0 = z; z0 < z+NZ; z0++) {
+                uint8_t tmp[NK];// __attribute__((aligned(32)));
+                //uint8_t (*RESTRICT t0)[32] = &t[k];
+                for (int k0 = 0; k0 < NK; k0++)
+                    //tmp[k0] = t0[k0][z0];
+                    tmp[k0] = t[k+k0][z0];
+                memcpy(&out[iN[z0]+k], tmp, NK);
+            }
+        }
+        for (int z0 = z; z0 < z+NZ; z0++)
+            iN[z0] += 32;
+    }
+    */
+
+    // Manually unrolled code.
+    // This is fastest on gcc and clang and not far behind with zig cc.
+    // It also doesn't need aggressive gcc optimisation levels to be
+    // efficient.
+    //
+    // It works by constructing 64-bit ints and copying them with a single
+    // memory write.  The fixed size memcpys just boil down to a memory write,
+    // but unlike the earlier versions that did this direct, this isn't
+    // exploiting undefined behaviour.
+    for (z = 0; z < 32; z+=4) {
+        uint64_t i64;
+        i64 =
             ((uint64_t)(t[0][z])<< 0) +
             ((uint64_t)(t[1][z])<< 8) +
             ((uint64_t)(t[2][z])<<16) +
@@ -828,7 +868,8 @@ static inline void transpose_and_copy(uint8_t *out, int iN[32],
             ((uint64_t)(t[5][z])<<40) +
             ((uint64_t)(t[6][z])<<48) +
             ((uint64_t)(t[7][z])<<56);
-        *(uint64_t *)&out[iN[z+1]] =
+        memcpy(&out[iN[z]], &i64, 8);
+        i64 =
             ((uint64_t)(t[0][z+1])<< 0) +
             ((uint64_t)(t[1][z+1])<< 8) +
             ((uint64_t)(t[2][z+1])<<16) +
@@ -837,7 +878,8 @@ static inline void transpose_and_copy(uint8_t *out, int iN[32],
             ((uint64_t)(t[5][z+1])<<40) +
             ((uint64_t)(t[6][z+1])<<48) +
             ((uint64_t)(t[7][z+1])<<56);
-        *(uint64_t *)&out[iN[z+2]] =
+        memcpy(&out[iN[z+1]], &i64, 8);
+        i64 =
             ((uint64_t)(t[0][z+2])<< 0) +
             ((uint64_t)(t[1][z+2])<< 8) +
             ((uint64_t)(t[2][z+2])<<16) +
@@ -846,7 +888,8 @@ static inline void transpose_and_copy(uint8_t *out, int iN[32],
             ((uint64_t)(t[5][z+2])<<40) +
             ((uint64_t)(t[6][z+2])<<48) +
             ((uint64_t)(t[7][z+2])<<56);
-        *(uint64_t *)&out[iN[z+3]] =
+        memcpy(&out[iN[z+2]], &i64, 8);
+        i64 =
             ((uint64_t)(t[0][z+3])<< 0) +
             ((uint64_t)(t[1][z+3])<< 8) +
             ((uint64_t)(t[2][z+3])<<16) +
@@ -855,8 +898,9 @@ static inline void transpose_and_copy(uint8_t *out, int iN[32],
             ((uint64_t)(t[5][z+3])<<40) +
             ((uint64_t)(t[6][z+3])<<48) +
             ((uint64_t)(t[7][z+3])<<56);
+        memcpy(&out[iN[z+3]], &i64, 8);
 
-        *(uint64_t *)&out[iN[z]+8] =
+        i64 =
             ((uint64_t)(t[8+0][z])<< 0) +
             ((uint64_t)(t[8+1][z])<< 8) +
             ((uint64_t)(t[8+2][z])<<16) +
@@ -865,7 +909,8 @@ static inline void transpose_and_copy(uint8_t *out, int iN[32],
             ((uint64_t)(t[8+5][z])<<40) +
             ((uint64_t)(t[8+6][z])<<48) +
             ((uint64_t)(t[8+7][z])<<56);
-        *(uint64_t *)&out[iN[z+1]+8] =
+        memcpy(&out[iN[z]+8], &i64, 8);
+        i64 =
             ((uint64_t)(t[8+0][z+1])<< 0) +
             ((uint64_t)(t[8+1][z+1])<< 8) +
             ((uint64_t)(t[8+2][z+1])<<16) +
@@ -874,7 +919,8 @@ static inline void transpose_and_copy(uint8_t *out, int iN[32],
             ((uint64_t)(t[8+5][z+1])<<40) +
             ((uint64_t)(t[8+6][z+1])<<48) +
             ((uint64_t)(t[8+7][z+1])<<56);
-        *(uint64_t *)&out[iN[z+2]+8] =
+        memcpy(&out[iN[z+1]+8], &i64, 8);
+        i64 =
             ((uint64_t)(t[8+0][z+2])<< 0) +
             ((uint64_t)(t[8+1][z+2])<< 8) +
             ((uint64_t)(t[8+2][z+2])<<16) +
@@ -883,7 +929,8 @@ static inline void transpose_and_copy(uint8_t *out, int iN[32],
             ((uint64_t)(t[8+5][z+2])<<40) +
             ((uint64_t)(t[8+6][z+2])<<48) +
             ((uint64_t)(t[8+7][z+2])<<56);
-        *(uint64_t *)&out[iN[z+3]+8] =
+        memcpy(&out[iN[z+2]+8], &i64, 8);
+        i64 =
             ((uint64_t)(t[8+0][z+3])<< 0) +
             ((uint64_t)(t[8+1][z+3])<< 8) +
             ((uint64_t)(t[8+2][z+3])<<16) +
@@ -892,8 +939,9 @@ static inline void transpose_and_copy(uint8_t *out, int iN[32],
             ((uint64_t)(t[8+5][z+3])<<40) +
             ((uint64_t)(t[8+6][z+3])<<48) +
             ((uint64_t)(t[8+7][z+3])<<56);
+        memcpy(&out[iN[z+3]+8], &i64, 8);
 
-        *(uint64_t *)&out[iN[z]+16] =
+        i64 =
             ((uint64_t)(t[16+0][z])<< 0) +
             ((uint64_t)(t[16+1][z])<< 8) +
             ((uint64_t)(t[16+2][z])<<16) +
@@ -902,7 +950,8 @@ static inline void transpose_and_copy(uint8_t *out, int iN[32],
             ((uint64_t)(t[16+5][z])<<40) +
             ((uint64_t)(t[16+6][z])<<48) +
             ((uint64_t)(t[16+7][z])<<56);
-        *(uint64_t *)&out[iN[z+1]+16] =
+        memcpy(&out[iN[z]+16], &i64, 8);
+        i64 =
             ((uint64_t)(t[16+0][z+1])<< 0) +
             ((uint64_t)(t[16+1][z+1])<< 8) +
             ((uint64_t)(t[16+2][z+1])<<16) +
@@ -911,7 +960,8 @@ static inline void transpose_and_copy(uint8_t *out, int iN[32],
             ((uint64_t)(t[16+5][z+1])<<40) +
             ((uint64_t)(t[16+6][z+1])<<48) +
             ((uint64_t)(t[16+7][z+1])<<56);
-        *(uint64_t *)&out[iN[z+2]+16] =
+        memcpy(&out[iN[z+1]+16], &i64, 8);
+        i64 =
             ((uint64_t)(t[16+0][z+2])<< 0) +
             ((uint64_t)(t[16+1][z+2])<< 8) +
             ((uint64_t)(t[16+2][z+2])<<16) +
@@ -920,7 +970,8 @@ static inline void transpose_and_copy(uint8_t *out, int iN[32],
             ((uint64_t)(t[16+5][z+2])<<40) +
             ((uint64_t)(t[16+6][z+2])<<48) +
             ((uint64_t)(t[16+7][z+2])<<56);
-        *(uint64_t *)&out[iN[z+3]+16] =
+        memcpy(&out[iN[z+2]+16], &i64, 8);
+        i64 =
             ((uint64_t)(t[16+0][z+3])<< 0) +
             ((uint64_t)(t[16+1][z+3])<< 8) +
             ((uint64_t)(t[16+2][z+3])<<16) +
@@ -929,8 +980,9 @@ static inline void transpose_and_copy(uint8_t *out, int iN[32],
             ((uint64_t)(t[16+5][z+3])<<40) +
             ((uint64_t)(t[16+6][z+3])<<48) +
             ((uint64_t)(t[16+7][z+3])<<56);
+        memcpy(&out[iN[z+3]+16], &i64, 8);
 
-        *(uint64_t *)&out[iN[z]+24] =
+        i64 =
             ((uint64_t)(t[24+0][z])<< 0) +
             ((uint64_t)(t[24+1][z])<< 8) +
             ((uint64_t)(t[24+2][z])<<16) +
@@ -939,7 +991,8 @@ static inline void transpose_and_copy(uint8_t *out, int iN[32],
             ((uint64_t)(t[24+5][z])<<40) +
             ((uint64_t)(t[24+6][z])<<48) +
             ((uint64_t)(t[24+7][z])<<56);
-        *(uint64_t *)&out[iN[z+1]+24] =
+        memcpy(&out[iN[z]+24], &i64, 8);
+        i64 =
             ((uint64_t)(t[24+0][z+1])<< 0) +
             ((uint64_t)(t[24+1][z+1])<< 8) +
             ((uint64_t)(t[24+2][z+1])<<16) +
@@ -948,7 +1001,8 @@ static inline void transpose_and_copy(uint8_t *out, int iN[32],
             ((uint64_t)(t[24+5][z+1])<<40) +
             ((uint64_t)(t[24+6][z+1])<<48) +
             ((uint64_t)(t[24+7][z+1])<<56);
-        *(uint64_t *)&out[iN[z+2]+24] =
+        memcpy(&out[iN[z+1]+24], &i64, 8);
+        i64 =
             ((uint64_t)(t[24+0][z+2])<< 0) +
             ((uint64_t)(t[24+1][z+2])<< 8) +
             ((uint64_t)(t[24+2][z+2])<<16) +
@@ -957,7 +1011,8 @@ static inline void transpose_and_copy(uint8_t *out, int iN[32],
             ((uint64_t)(t[24+5][z+2])<<40) +
             ((uint64_t)(t[24+6][z+2])<<48) +
             ((uint64_t)(t[24+7][z+2])<<56);
-        *(uint64_t *)&out[iN[z+3]+24] =
+        memcpy(&out[iN[z+2]+24], &i64, 8);
+        i64 =
             ((uint64_t)(t[24+0][z+3])<< 0) +
             ((uint64_t)(t[24+1][z+3])<< 8) +
             ((uint64_t)(t[24+2][z+3])<<16) +
@@ -966,13 +1021,13 @@ static inline void transpose_and_copy(uint8_t *out, int iN[32],
             ((uint64_t)(t[24+5][z+3])<<40) +
             ((uint64_t)(t[24+6][z+3])<<48) +
             ((uint64_t)(t[24+7][z+3])<<56);
+        memcpy(&out[iN[z+3]+24], &i64, 8);
 
         iN[z+0] += 32;
         iN[z+1] += 32;
         iN[z+2] += 32;
         iN[z+3] += 32;
     }
-#endif
 }
 
 unsigned char *rans_uncompress_O1_32x16_sse4(unsigned char *in,
@@ -1414,8 +1469,10 @@ unsigned char *rans_uncompress_O1_32x16_sse4(unsigned char *in,
             uint32_t S = s3[l[z]][m];
             unsigned char c = S & 0xff;
             out[i4[z]++] = c;
-            R[z] = (S>>(TF_SHIFT_O1+8)) * (R[z]>>TF_SHIFT_O1) +
-                ((S>>8) & ((1u<<TF_SHIFT_O1)-1));
+            int f = (S>>(TF_SHIFT_O1+8));
+            if (f == 0)
+                f = 4096;
+            R[z] = f * (R[z]>>TF_SHIFT_O1) + ((S>>8) & ((1u<<TF_SHIFT_O1)-1));
             RansDecRenormSafe(&R[z], &ptr, ptr_end);
             l[z] = c;
         }
diff --git a/htslib/htscodecs/htscodecs/rANS_static4x16pr.c b/htslib/htscodecs/htscodecs/rANS_static4x16pr.c
index 332d9e279..8c9a64ad2 100644
--- a/htslib/htscodecs/htscodecs/rANS_static4x16pr.c
+++ b/htslib/htscodecs/htscodecs/rANS_static4x16pr.c
@@ -176,8 +176,11 @@ unsigned char *rans_compress_O0_4x16(unsigned char *in, unsigned int in_size,
 
     switch (i=(in_size&3)) {
     case 3: RansEncPutSymbol(&rans2, &ptr, &syms[in[in_size-(i-2)]]);
+        // fall-through
     case 2: RansEncPutSymbol(&rans1, &ptr, &syms[in[in_size-(i-1)]]);
+        // fall-through
     case 1: RansEncPutSymbol(&rans0, &ptr, &syms[in[in_size-(i-0)]]);
+        // fall-through
     case 0:
         break;
     }
@@ -822,16 +825,11 @@ unsigned char *rans_uncompress_O1_4x16(unsigned char *in, unsigned int in_size,
  */
 #include "rANS_static32x16pr.h"
 
-// Test interface for restricting the auto-detection methods so we
-// can forcibly compare different implementations on the same machine.
-// See RANS_CPU_ defines in rANS_static4x16.h
 static int rans_cpu = 0xFFFF; // all
-void rans_set_cpu(int opts) {
-    rans_cpu = opts;
-}
 
-#if (defined(__GNUC__) || defined(__clang__)) && defined(__x86_64__)
-// Icc and Clang both also set __GNUC__ on linux, but not on Windows.
+#if defined(__x86_64__) && \
+    defined(HAVE_DECL___CPUID_COUNT)   && HAVE_DECL___CPUID_COUNT && \
+    defined(HAVE_DECL___GET_CPUID_MAX) && HAVE_DECL___GET_CPUID_MAX
 #include <cpuid.h>
 
 #if defined(__clang__) && defined(__has_attribute)
@@ -861,6 +859,7 @@ static int have_avx2    UNUSED = 0;
 static int have_avx512f UNUSED = 0;
 static int is_amd       UNUSED = 0;
 
+#define HAVE_HTSCODECS_TLS_CPU_INIT
 static void htscodecs_tls_cpu_init(void) {
     unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;
     // These may be unused, depending on HAVE_* config.h macros
@@ -892,10 +891,6 @@ static void htscodecs_tls_cpu_init(void) {
 
     if (!have_popcnt) have_avx512f = have_avx2 = have_sse4_1 = 0;
     if (!have_ssse3)  have_sse4_1 = 0;
-
-    if (!(rans_cpu & RANS_CPU_ENC_AVX512)) have_avx512f = 0;
-    if (!(rans_cpu & RANS_CPU_ENC_AVX2))   have_avx2 = 0;
-    if (!(rans_cpu & RANS_CPU_ENC_SSE4))   have_sse4_1 = 0;
 }
 
 static inline
@@ -904,6 +899,15 @@ unsigned char *(*rans_enc_func(int do_simd, int order))
      unsigned int in_size,
      unsigned char *out,
      unsigned int *out_size) {
+
+    int have_e_sse4_1  = have_sse4_1;
+    int have_e_avx2    = have_avx2;
+    int have_e_avx512f = have_avx512f;
+
+    if (!(rans_cpu & RANS_CPU_ENC_AVX512)) have_e_avx512f = 0;
+    if (!(rans_cpu & RANS_CPU_ENC_AVX2))   have_e_avx2    = 0;
+    if (!(rans_cpu & RANS_CPU_ENC_SSE4))   have_e_sse4_1  = 0;
+
     if (!do_simd) { // SIMD disabled
         return order & 1
             ? rans_compress_O1_4x16
@@ -922,30 +926,41 @@ unsigned char *(*rans_enc_func(int do_simd, int order))
 #endif
 
     if (order & 1) {
+        // With simulated gathers, the AVX512 is now slower than AVX2, so
+        // we avoid using it unless asking for the real avx512 gather.
+        // Note for testing we do -c 0x0404 to enable AVX512 and disable AVX2.
+        // We then need to call the avx512 func regardless.
+        int use_gather;
+#ifdef USE_GATHER
+        use_gather = 1;
+#else
+        use_gather = !have_e_avx2;
+#endif
+
 #if defined(HAVE_AVX512)
-        if (have_avx512f && (!is_amd || !have_avx2))
+        if (have_e_avx512f && (!is_amd || !have_e_avx2) && use_gather)
             return rans_compress_O1_32x16_avx512;
 #endif
 #if defined(HAVE_AVX2)
-        if (have_avx2)
+        if (have_e_avx2)
             return rans_compress_O1_32x16_avx2;
 #endif
 #if defined(HAVE_SSE4_1) && defined(HAVE_SSSE3) && defined(HAVE_POPCNT)
-        if (have_sse4_1) 
+        if (have_e_sse4_1)
             return rans_compress_O1_32x16;
 #endif
         return rans_compress_O1_32x16;
     } else {
 #if defined(HAVE_AVX512)
-        if (have_avx512f && (!is_amd || !have_avx2))
+        if (have_e_avx512f && (!is_amd || !have_e_avx2))
             return rans_compress_O0_32x16_avx512;
 #endif
 #if defined(HAVE_AVX2)
-        if (have_avx2)
+        if (have_e_avx2)
             return rans_compress_O0_32x16_avx2;
 #endif
 #if defined(HAVE_SSE4_1) && defined(HAVE_SSSE3) && defined(HAVE_POPCNT)
-        if (have_sse4_1)
+        if (have_e_sse4_1)
             return rans_compress_O0_32x16;
 #endif
         return rans_compress_O0_32x16;
@@ -959,6 +974,14 @@ unsigned char *(*rans_dec_func(int do_simd, int order))
      unsigned char *out,
      unsigned int out_size) {
 
+    int have_d_sse4_1  = have_sse4_1;
+    int have_d_avx2    = have_avx2;
+    int have_d_avx512f = have_avx512f;
+
+    if (!(rans_cpu & RANS_CPU_DEC_AVX512)) have_d_avx512f = 0;
+    if (!(rans_cpu & RANS_CPU_DEC_AVX2))   have_d_avx2    = 0;
+    if (!(rans_cpu & RANS_CPU_DEC_SSE4))   have_d_sse4_1  = 0;
+
     if (!do_simd) { // SIMD disabled
         return order & 1
             ? rans_uncompress_O1_4x16
@@ -978,29 +1001,29 @@ unsigned char *(*rans_dec_func(int do_simd, int order))
 
     if (order & 1) {
 #if defined(HAVE_AVX512)
-        if (have_avx512f)
+        if (have_d_avx512f)
             return rans_uncompress_O1_32x16_avx512;
 #endif
 #if defined(HAVE_AVX2)
-        if (have_avx2)
+        if (have_d_avx2)
             return rans_uncompress_O1_32x16_avx2;
 #endif
 #if defined(HAVE_SSE4_1) && defined(HAVE_SSSE3) && defined(HAVE_POPCNT)
-        if (have_sse4_1)
+        if (have_d_sse4_1)
             return rans_uncompress_O1_32x16_sse4;
 #endif
         return rans_uncompress_O1_32x16;
     } else {
 #if defined(HAVE_AVX512)
-        if (have_avx512f && (!is_amd || !have_avx2))
+        if (have_d_avx512f)
             return rans_uncompress_O0_32x16_avx512;
 #endif
 #if defined(HAVE_AVX2)
-        if (have_avx2)
+        if (have_d_avx2)
             return rans_uncompress_O0_32x16_avx2;
 #endif
 #if defined(HAVE_SSE4_1) && defined(HAVE_SSSE3) && defined(HAVE_POPCNT)
-        if (have_sse4_1)
+        if (have_d_sse4_1)
             return rans_uncompress_O0_32x16_sse4;
 #endif
         return rans_uncompress_O0_32x16;
@@ -1015,7 +1038,7 @@ unsigned char *(*rans_dec_func(int do_simd, int order))
 #include <processthreadsapi.h>
 #endif
 
-static inline int have_neon() {
+static inline int have_neon(void) {
 #if defined(__linux__) && defined(__arm__)
     return (getauxval(AT_HWCAP) & HWCAP_NEON) != 0;
 #elif defined(__linux__) && defined(__aarch64__) && defined(HWCAP_ASIMD)
@@ -1023,11 +1046,11 @@ static inline int have_neon() {
 #elif defined(__APPLE__)
     return 1;
 #elif defined(__FreeBSD__) && defined(__arm__)
-    u_long cap;
+    unsigned long cap;
     if (elf_aux_info(AT_HWCAP, &cap, sizeof cap) != 0) return 0;
     return (cap & HWCAP_NEON) != 0;
 #elif defined(__FreeBSD__) && defined(__aarch64__) && defined(HWCAP_ASIMD)
-    u_long cap;
+    unsigned long cap;
     if (elf_aux_info(AT_HWCAP, &cap, sizeof cap) != 0) return 0;
     return (cap & HWCAP_ASIMD) != 0;
 #elif defined(_WIN32)
@@ -1123,6 +1146,16 @@ unsigned char *(*rans_dec_func(int do_simd, int order))
 
 #endif
 
+// Test interface for restricting the auto-detection methods so we
+// can forcibly compare different implementations on the same machine.
+// See RANS_CPU_ defines in rANS_static4x16.h
+void rans_set_cpu(int opts) {
+    rans_cpu = opts;
+#ifdef HAVE_HTSCODECS_TLS_CPU_INIT
+    htscodecs_tls_cpu_init();
+#endif
+}
+
 /*-----------------------------------------------------------------------------
  * Simple interface to the order-0 vs order-1 encoders and decoders.
  *
@@ -1158,9 +1191,10 @@ unsigned char *rans_compress_to_4x16(unsigned char *in, unsigned int in_size,
 
     if (in_size <= 20)
         order &= ~RANS_ORDER_STRIPE;
+#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
     if (in_size <= 1000)
         order &= ~RANS_ORDER_X32;
-
+#endif
     if (order & RANS_ORDER_STRIPE) {
         int N = (order>>8) & 0xff;
         if (N == 0) N = 4; // default for compatibility with old tests
@@ -1267,7 +1301,8 @@ unsigned char *rans_compress_to_4x16(unsigned char *in, unsigned int in_size,
         out[0] = RANS_ORDER_CAT;
         c_meta_len = 1;
         c_meta_len += var_put_u32(&out[1], out_end, in_size);
-        memcpy(out+c_meta_len, in, in_size);
+        if (in_size)
+            memcpy(out+c_meta_len, in, in_size);
         *out_size = c_meta_len + in_size;
         return out;
     }
@@ -1380,7 +1415,8 @@ unsigned char *rans_compress_to_4x16(unsigned char *in, unsigned int in_size,
     if (*out_size >= in_size) {
         out[0] &= ~3;
         out[0] |= RANS_ORDER_CAT | no_size;
-        memcpy(out+c_meta_len, in, in_size);
+        if (in_size)
+            memcpy(out+c_meta_len, in, in_size);
         *out_size = in_size;
     }
 
diff --git a/htslib/htscodecs/htscodecs/tokenise_name3.c b/htslib/htscodecs/htscodecs/tokenise_name3.c
index b92dc7b15..749357905 100644
--- a/htslib/htscodecs/htscodecs/tokenise_name3.c
+++ b/htslib/htscodecs/htscodecs/tokenise_name3.c
@@ -232,15 +232,15 @@ static void free_context(name_context *ctx) {
 // Returns number of bytes written.
 static int append_uint32_fixed(char *cp, uint32_t i, uint8_t l) {
     switch (l) {
-    case 9:*cp++ = i / 100000000 + '0', i %= 100000000;
-    case 8:*cp++ = i / 10000000  + '0', i %= 10000000;
-    case 7:*cp++ = i / 1000000   + '0', i %= 1000000;
-    case 6:*cp++ = i / 100000    + '0', i %= 100000;
-    case 5:*cp++ = i / 10000     + '0', i %= 10000;
-    case 4:*cp++ = i / 1000      + '0', i %= 1000;
-    case 3:*cp++ = i / 100       + '0', i %= 100;
-    case 2:*cp++ = i / 10        + '0', i %= 10;
-    case 1:*cp++ = i             + '0';
+    case 9:*cp++ = i / 100000000 + '0', i %= 100000000; // fall-through
+    case 8:*cp++ = i / 10000000  + '0', i %= 10000000;  // fall-through
+    case 7:*cp++ = i / 1000000   + '0', i %= 1000000;   // fall-through
+    case 6:*cp++ = i / 100000    + '0', i %= 100000;    // fall-through
+    case 5:*cp++ = i / 10000     + '0', i %= 10000;     // fall-through
+    case 4:*cp++ = i / 1000      + '0', i %= 1000;      // fall-through
+    case 3:*cp++ = i / 100       + '0', i %= 100;       // fall-through
+    case 2:*cp++ = i / 10        + '0', i %= 10;        // fall-through
+    case 1:*cp++ = i             + '0';                 // fall-throuhg
     case 0:break;
     }
     return l;
@@ -489,11 +489,11 @@ int build_trie(name_context *ctx, char *data, size_t len, int n) {
     for (nlines = i = 0; i < len; i++, nlines++) {
         t = ctx->t_head;
         t->count++;
-        while (i < len && data[i] > '\n') {
+        while (i < len && (unsigned char)data[i] > '\n') {
             unsigned char c = data[i++];
             if (c & 0x80)
                 //fprintf(stderr, "8-bit ASCII is unsupported\n");
-                abort();
+                return -1;
             c &= 127;
 
 
@@ -653,7 +653,7 @@ int search_trie(name_context *ctx, char *data, size_t len, int n, int *exact, in
             unsigned char c = data[i++];
             if (c & 0x80)
                 //fprintf(stderr, "8-bit ASCII is unsupported\n");
-                abort();
+                return -1;
             c &= 127;
 
             trie_t *x = t->next;
@@ -756,6 +756,8 @@ static int encode_name(name_context *ctx, char *name, int len, int mode) {
 
     for (; i < len; i++) {
         if (ntok >= ctx->max_tok) {
+            if (ctx->max_tok >= MAX_TOKENS)
+                return -1;
             memset(&ctx->desc[ctx->max_tok << 4], 0, 16*sizeof(ctx->desc[0]));
             memset(&ctx->token_dcount[ctx->max_tok], 0, sizeof(int));
             memset(&ctx->token_icount[ctx->max_tok], 0, sizeof(int));
@@ -763,19 +765,20 @@ static int encode_name(name_context *ctx, char *name, int len, int mode) {
         }
 
         /* Determine data type of this segment */
-        if (isalpha(name[i])) {
+        if (isalpha((uint8_t)name[i])) {
             int s = i+1;
 //          int S = i+1;
 
 //          // FIXME: try which of these is best.  alnum is good sometimes.
-//          while (s < len && isalpha(name[s]))
-            while (s < len && (isalpha(name[s]) || ispunct(name[s])))
+//          while (s < len && isalpha((uint8_t)name[s]))
+            while (s < len && (isalpha((uint8_t)name[s]) ||
+                               ispunct((uint8_t)name[s])))
 //          while (s < len && name[s] != ':')
-//          while (s < len && !isdigit(name[s]) && name[s] != ':')
+//          while (s < len && !isdigit((uint8_t)name[s]) && name[s] != ':')
                 s++;
 
 //          if (!is_fixed) {
-//              while (S < len && isalnum(name[S]))
+//              while (S < len && isalnum((uint8_t)name[S]))
 //                  S++;
 //              if (s < S)
 //                  s = S;
@@ -819,7 +822,7 @@ static int encode_name(name_context *ctx, char *name, int len, int mode) {
             uint32_t v = 0;
             int d = 0;
 
-            while (s < len && isdigit(name[s]) && s-i < 9) {
+            while (s < len && isdigit((uint8_t)name[s]) && s-i < 9) {
                 v = v*10 + name[s] - '0';
                 //putchar(name[s]);
                 s++;
@@ -837,14 +840,14 @@ static int encode_name(name_context *ctx, char *name, int len, int mode) {
                     //ctx->lc[pnum].last[ntok].token_delta=0;
                 } else if (mode == 1 && d < 256 && d >= 0 && ctx->lc[pnum].last[ntok].token_str == s-i) {
 #ifdef ENC_DEBUG
-                    fprintf(stderr, "Tok %d (dig-delta, %d / %d)\n", N_DDELTA, ctx->lc[pnum].last[ntok].token_int, v);
+                    fprintf(stderr, "Tok %d (dig0-delta, %d / %d)\n", N_DDELTA0, ctx->lc[pnum].last[ntok].token_int, v);
 #endif
                     //if (encode_token_int1_(ctx, ntok, N_DZLEN, s-i) < 0) return -1;
                     if (encode_token_int1(ctx, ntok, N_DDELTA0, d) < 0) return -1;
                     //ctx->lc[pnum].last[ntok].token_delta=1;
                 } else {
 #ifdef ENC_DEBUG
-                    fprintf(stderr, "Tok %d (dig, %d / %d)\n", N_DIGITS, ctx->lc[pnum].last[ntok].token_int, v);
+                    fprintf(stderr, "Tok %d (dig0, %d / %d len %d)\n", N_DIGITS0, ctx->lc[pnum].last[ntok].token_int, v, s-i);
 #endif
                     if (encode_token_int1_(ctx, ntok, N_DZLEN, s-i) < 0) return -1;
                     if (encode_token_int(ctx, ntok, N_DIGITS0, v) < 0) return -1;
@@ -852,7 +855,7 @@ static int encode_name(name_context *ctx, char *name, int len, int mode) {
                 }
             } else {
 #ifdef ENC_DEBUG
-                fprintf(stderr, "Tok %d (new dig, %d)\n", N_DIGITS, v);
+                fprintf(stderr, "Tok %d (new dig0, %d len %d)\n", N_DIGITS0, v, s-i);
 #endif
                 if (encode_token_int1_(ctx, ntok, N_DZLEN, s-i) < 0) return -1;
                 if (encode_token_int(ctx, ntok, N_DIGITS0, v) < 0) return -1;
@@ -864,13 +867,13 @@ static int encode_name(name_context *ctx, char *name, int len, int mode) {
             ctx->lc[cnum].last[ntok].token_type = N_DIGITS0;
 
             i = s-1;
-        } else if (isdigit(name[i])) {
+        } else if (isdigit((uint8_t)name[i])) {
             // digits starting 1-9; encode value
             uint32_t s = i;
             uint32_t v = 0;
             int d = 0;
 
-            while (s < len && isdigit(name[s]) && s-i < 9) {
+            while (s < len && isdigit((uint8_t)name[s]) && s-i < 9) {
                 v = v*10 + name[s] - '0';
                 //putchar(name[s]);
                 s++;
@@ -936,7 +939,7 @@ static int encode_name(name_context *ctx, char *name, int len, int mode) {
             i = s-1;
         } else {
         n_char:
-            //if (!isalpha(name[i])) putchar(name[i]);
+            //if (!isalpha((uint8_t)name[i])) putchar(name[i]);
             if (pnum < cnum && ntok < ctx->lc[pnum].last_ntok && ctx->lc[pnum].last[ntok].token_type == N_CHAR) {
                 if (name[i] == ctx->lc[pnum].last[ntok].token_int) {
 #ifdef ENC_DEBUG
@@ -968,6 +971,8 @@ static int encode_name(name_context *ctx, char *name, int len, int mode) {
     fprintf(stderr, "Tok %d (end)\n", N_END);
 #endif
     if (ntok >= ctx->max_tok) {
+        if (ctx->max_tok >= MAX_TOKENS)
+            return -1;
         memset(&ctx->desc[ctx->max_tok << 4], 0, 16*sizeof(ctx->desc[0]));
         memset(&ctx->token_dcount[ctx->max_tok], 0, sizeof(int));
         memset(&ctx->token_icount[ctx->max_tok], 0, sizeof(int));
@@ -1464,11 +1469,17 @@ uint8_t *tok3_encode_names(char *blk, int len, int level, int use_arith,
 
     // Encode name
     for (i = j = 0; i < len; j=++i) {
-        while (i < len && blk[i] > '\n')
+        while (i < len && (signed char)blk[i] >= ' ') // non-ASCII check
             i++;
         if (i >= len)
             break;
 
+        if (blk[i] != '\0' && blk[i] != '\n') {
+            // Names must be 7-bit ASCII printable
+            free_context(ctx);
+            return NULL;
+        }
+
         blk[i] = '\0';
         // try both 0 and 1 and pick best?
         if (encode_name(ctx, &blk[j], i-j, 1) < 0) {
@@ -1567,7 +1578,7 @@ uint8_t *tok3_encode_names(char *blk, int len, int level, int use_arith,
             ctx->desc[i].dup_from = j;
             tot_size += 3; // flag, dup_from, ttype
         } else {
-            ctx->desc[i].dup_from = 0;
+            ctx->desc[i].dup_from = -1;
             tot_size += out_len + 1; // ttype
         }
     }
@@ -1575,7 +1586,7 @@ uint8_t *tok3_encode_names(char *blk, int len, int level, int use_arith,
 #if 0
     for (i = 0; i < ctx->max_tok*16; i++) {
         char fn[1024];
-        if (!ctx->desc[i].buf_l && !ctx->desc[i].dup_from) continue;
+        if (!ctx->desc[i].buf_l && ctx->desc[i].dup_from == -1) continue;
         sprintf(fn, "_tok.%02d_%02d.%d.comp", i>>4,i&15,i);
         FILE *fp = fopen(fn, "w");
         fwrite(ctx->desc[i].buf, 1, ctx->desc[i].buf_l, fp);
@@ -1613,7 +1624,7 @@ uint8_t *tok3_encode_names(char *blk, int len, int level, int use_arith,
             ttype8 |= 128;
             last_tnum = ctx->desc[i].tnum;
         }
-        if (ctx->desc[i].dup_from) {
+        if (ctx->desc[i].dup_from >= 0) {
             //fprintf(stderr, "Dup %d from %d, sz %d\n", i, ctx->desc[i].dup_from, ctx->desc[i].buf_l);
             *cp++ = ttype8 | 64;
             *cp++ = ctx->desc[i].dup_from >> 4;
@@ -1675,7 +1686,7 @@ uint8_t *tok3_decode_names(uint8_t *in, uint32_t sz, uint32_t *out_len) {
     while (o < sz) {
         uint8_t ttype = in[o++];
         if (ttype & 64) {
-            if (o+2 >= sz) goto err;
+            if (o+2 > sz) goto err;
             int j = in[o++]<<4;
             j += in[o++];
             if (ttype & 128) {
diff --git a/htslib/htscodecs/htscodecs/varint.h b/htslib/htscodecs/htscodecs/varint.h
index a4b148aec..c4a516824 100644
--- a/htslib/htscodecs/htscodecs/varint.h
+++ b/htslib/htscodecs/htscodecs/varint.h
@@ -115,14 +115,14 @@ int var_put_u64(uint8_t *cp, const uint8_t *endp, uint64_t i) {
         *cp++ = ((i>> 7) & 0x7f) | 128;
         *cp++ =   i      & 0x7f;
         return 4;
-    } else if (i < (1LL<<35)) {
+    } else if (i < (1ULL<<35)) {
         *cp++ = ((i>>28) & 0x7f) | 128;
         *cp++ = ((i>>21) & 0x7f) | 128;
         *cp++ = ((i>>14) & 0x7f) | 128;
         *cp++ = ((i>> 7) & 0x7f) | 128;
         *cp++ =   i      & 0x7f;
         return 5;
-    } else if (i < (1LL<<42)) {
+    } else if (i < (1ULL<<42)) {
         *cp++ = ((i>>35) & 0x7f) | 128;
         *cp++ = ((i>>28) & 0x7f) | 128;
         *cp++ = ((i>>21) & 0x7f) | 128;
@@ -130,7 +130,7 @@ int var_put_u64(uint8_t *cp, const uint8_t *endp, uint64_t i) {
         *cp++ = ((i>> 7) & 0x7f) | 128;
         *cp++ =   i      & 0x7f;
         return 6;
-    } else if (i < (1LL<<49)) {
+    } else if (i < (1ULL<<49)) {
         *cp++ = ((i>>42) & 0x7f) | 128;
         *cp++ = ((i>>35) & 0x7f) | 128;
         *cp++ = ((i>>28) & 0x7f) | 128;
@@ -139,7 +139,7 @@ int var_put_u64(uint8_t *cp, const uint8_t *endp, uint64_t i) {
         *cp++ = ((i>> 7) & 0x7f) | 128;
         *cp++ =   i      & 0x7f;
         return 7;
-    } else if (i < (1LL<<56)) {
+    } else if (i < (1ULL<<56)) {
         *cp++ = ((i>>49) & 0x7f) | 128;
         *cp++ = ((i>>42) & 0x7f) | 128;
         *cp++ = ((i>>35) & 0x7f) | 128;
@@ -149,7 +149,7 @@ int var_put_u64(uint8_t *cp, const uint8_t *endp, uint64_t i) {
         *cp++ = ((i>> 7) & 0x7f) | 128;
         *cp++ =   i      & 0x7f;
         return 8;
-    } else if (i < (1LL<<63)) {
+    } else if (i < (1ULL<<63)) {
         *cp++ = ((i>>56) & 0x7f) | 128;
         *cp++ = ((i>>49) & 0x7f) | 128;
         *cp++ = ((i>>42) & 0x7f) | 128;
@@ -241,7 +241,7 @@ int var_get_u64(uint8_t *cp, const uint8_t *endp, uint64_t *i) {
     uint8_t *op = cp, c;
     uint64_t j = 0;
 
-    if (!endp || endp - cp >= 10) {
+    if (!endp || endp - cp >= 11) {
         int n = 10;
         do {
             c = *cp++;
diff --git a/htslib/htscodecs/htscodecs/version.h b/htslib/htscodecs/htscodecs/version.h
index f56282785..048dcab54 100644
--- a/htslib/htscodecs/htscodecs/version.h
+++ b/htslib/htscodecs/htscodecs/version.h
@@ -1 +1 @@
-#define HTSCODECS_VERSION_TEXT "1.5.1"
+#define HTSCODECS_VERSION_TEXT "1.6.1"
diff --git a/htslib/htsfile.c b/htslib/htsfile.c
deleted file mode 100644
index 9f7bf4531..000000000
--- a/htslib/htsfile.c
+++ /dev/null
@@ -1,329 +0,0 @@
-/*  htsfile.c -- file identifier and minimal viewer.
-
-    Copyright (C) 2014-2019 Genome Research Ltd.
-
-    Author: John Marshall <jm18@sanger.ac.uk>
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-DEALINGS IN THE SOFTWARE.  */
-
-#include <config.h>
-
-#include <ctype.h>
-#include <errno.h>
-#include <stdarg.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <getopt.h>
-#include <unistd.h>
-
-#include "htslib/hfile.h"
-#include "htslib/hts.h"
-#include "htslib/sam.h"
-#include "htslib/vcf.h"
-
-#ifndef EFTYPE
-#define EFTYPE ENOEXEC
-#endif
-
-enum { identify, view_headers, view_all, copy } mode = identify;
-int show_headers = 1;
-int verbose = 0;
-int status = EXIT_SUCCESS;  /* Exit status from main */
-
-void error(const char *format, ...)
-{
-    int err = errno;
-    va_list args;
-    va_start(args, format);
-    fflush(stdout);
-    fprintf(stderr, "htsfile: ");
-    vfprintf(stderr, format, args);
-    if (err) fprintf(stderr, ": %s\n", strerror(err));
-    else fprintf(stderr, "\n");
-    fflush(stderr);
-    va_end(args);
-    status = EXIT_FAILURE;
-}
-
-static htsFile *dup_stdout(const char *mode)
-{
-    int fd = dup(STDOUT_FILENO);
-    hFILE *hfp = (fd >= 0)? hdopen(fd, mode) : NULL;
-    return hfp? hts_hopen(hfp, "-", mode) : NULL;
-}
-
-static void view_sam(samFile *in, const char *filename)
-{
-    bam1_t *b = NULL;
-    sam_hdr_t *hdr = NULL;
-    samFile *out = NULL;
-
-    hdr = sam_hdr_read(in);
-    if (hdr == NULL) {
-        errno = 0; error("reading headers from \"%s\" failed", filename);
-        goto clean;
-    }
-
-    out = dup_stdout("w");
-    if (out == NULL) { error("reopening standard output failed"); goto clean; }
-
-    if (show_headers) {
-        if (sam_hdr_write(out, hdr) != 0) {
-            error("writing headers to standard output failed");
-            goto clean;
-        }
-    }
-
-    if (mode == view_all) {
-        int ret;
-
-        b = bam_init1();
-        if (b == NULL) { error("can't create record"); goto clean; }
-
-        while ((ret = sam_read1(in, hdr, b)) >= 0) {
-            if (sam_write1(out, hdr, b) < 0) {
-                error("writing to standard output failed");
-                goto clean;
-            }
-        }
-
-        if (ret < -1) { error("reading \"%s\" failed", filename); goto clean; }
-    }
-
- clean:
-    sam_hdr_destroy(hdr);
-    bam_destroy1(b);
-    if (out) hts_close(out);
-}
-
-static void view_vcf(vcfFile *in, const char *filename)
-{
-    bcf1_t *rec = NULL;
-    bcf_hdr_t *hdr = NULL;
-    vcfFile *out = NULL;
-
-    hdr = bcf_hdr_read(in);
-    if (hdr == NULL) {
-        errno = 0; error("reading headers from \"%s\" failed", filename);
-        goto clean;
-    }
-
-    out = dup_stdout("w");
-    if (out == NULL) { error("reopening standard output failed"); goto clean; }
-
-    if (show_headers) {
-        if (bcf_hdr_write(out, hdr) != 0) {
-            error("writing headers to standard output failed");
-            goto clean;
-        }
-    }
-
-    if (mode == view_all) {
-        int ret;
-
-        rec = bcf_init();
-        if (rec == NULL) { error("can't create record"); goto clean; }
-
-        while ((ret = bcf_read(in, hdr, rec)) >= 0) {
-            if (bcf_write(out, hdr, rec) < 0) {
-                error("writing to standard output failed");
-                goto clean;
-            }
-        }
-
-        if (ret < -1) { error("reading \"%s\" failed", filename); goto clean; }
-    }
-
- clean:
-    if (hdr) bcf_hdr_destroy(hdr);
-    if (rec) bcf_destroy(rec);
-    if (out) hts_close(out);
-}
-
-static void view_raw(hFILE *fp, const char *filename)
-{
-    int c, prev;
-    for (prev = '\n'; (c = hgetc(fp)) != EOF; prev = c)
-        if (isprint(c) || c == '\n' || c == '\t') putchar(c);
-        else if (c == '\r') fputs("\\r", stdout);
-        else if (c == '\0') fputs("\\0", stdout);
-        else printf("\\x%02x", c);
-
-    if (prev != '\n') putchar('\n');
-
-    if (herrno(fp)) {
-        errno = herrno(fp);
-        error("reading \"%s\" failed", filename);
-    }
-}
-
-static void copy_raw(const char *srcfilename, const char *destfilename)
-{
-    hFILE *src = hopen(srcfilename, "r");
-    if (src == NULL) {
-        error("can't open \"%s\"", srcfilename);
-        return;
-    }
-
-    size_t bufsize = 1048576;
-    char *buffer = malloc(bufsize);
-    if (buffer == NULL) {
-        error("can't allocate copy buffer");
-        hclose_abruptly(src);
-        return;
-    }
-
-    hFILE *dest = hopen(destfilename, "w");
-    if (dest == NULL) {
-        error("can't create \"%s\"", destfilename);
-        hclose_abruptly(src);
-        free(buffer);
-        return;
-    }
-
-    ssize_t n;
-    while ((n = hread(src, buffer, bufsize)) > 0)
-        if (hwrite(dest, buffer, n) != n) {
-            error("writing to \"%s\" failed", destfilename);
-            hclose_abruptly(dest);
-            dest = NULL;
-            break;
-        }
-
-    if (n < 0) {
-        error("reading from \"%s\" failed", srcfilename);
-        hclose_abruptly(src);
-        src = NULL;
-    }
-
-    if (dest && hclose(dest) < 0) error("closing \"%s\" failed", destfilename);
-    if (src && hclose(src) < 0)   error("closing \"%s\" failed", srcfilename);
-    free(buffer);
-}
-
-static void usage(FILE *fp, int status)
-{
-    fprintf(fp,
-"Usage: htsfile [-chHv] FILE...\n"
-"       htsfile --copy [-v] FILE DESTFILE\n"
-"Options:\n"
-"  -c, --view         Write textual form of FILEs to standard output\n"
-"  -C, --copy         Copy the exact contents of FILE to DESTFILE\n"
-"  -h, --header-only  Display only headers in view mode, not records\n"
-"  -H, --no-header    Suppress header display in view mode\n"
-"  -v, --verbose      Increase verbosity of warnings and diagnostics\n");
-    exit(status);
-}
-
-int main(int argc, char **argv)
-{
-    static const struct option options[] = {
-        { "copy", no_argument, NULL, 'C' },
-        { "header-only", no_argument, NULL, 'h' },
-        { "no-header", no_argument, NULL, 'H' },
-        { "view", no_argument, NULL, 'c' },
-        { "verbose", no_argument, NULL, 'v' },
-        { "help", no_argument, NULL, 2 },
-        { "version", no_argument, NULL, 1 },
-        { NULL, 0, NULL, 0 }
-    };
-
-    int c, i;
-
-    status = EXIT_SUCCESS;
-    while ((c = getopt_long(argc, argv, "cChHv", options, NULL)) >= 0)
-        switch (c) {
-        case 'c': mode = view_all; break;
-        case 'C': mode = copy; break;
-        case 'h': mode = view_headers; show_headers = 1; break;
-        case 'H': show_headers = 0; break;
-        case 'v': hts_verbose++; verbose++; break;
-        case 1:
-            printf(
-"htsfile (htslib) %s\n"
-"Copyright (C) 2023 Genome Research Ltd.\n",
-                   hts_version());
-            exit(EXIT_SUCCESS);
-            break;
-        case 2:   usage(stdout, EXIT_SUCCESS); break;
-        default:  usage(stderr, EXIT_FAILURE); break;
-        }
-
-    if (optind == argc) usage(stderr, EXIT_FAILURE);
-
-    if (mode == copy) {
-        if (optind + 2 != argc) usage(stderr, EXIT_FAILURE);
-        copy_raw(argv[optind], argv[optind + 1]);
-        return status;
-    }
-
-    for (i = optind; i < argc; i++) {
-        hFILE *fp = hopen(argv[i], "r");
-        if (fp == NULL) {
-            error("can't open \"%s\"", argv[i]);
-            continue;
-        }
-
-        if (mode == identify) {
-            htsFormat fmt;
-            if (hts_detect_format2(fp, argv[i], &fmt) < 0) {
-                error("detecting \"%s\" format failed", argv[i]);
-                hclose_abruptly(fp);
-                continue;
-            }
-
-            char *description = hts_format_description(&fmt);
-            printf("%s:\t%s\n", argv[i], description);
-            free(description);
-        }
-        else {
-            htsFile *hts = hts_hopen(fp, argv[i], "r");
-            if (hts) {
-                switch (hts_get_format(hts)->category) {
-                case sequence_data:
-                    view_sam(hts, argv[i]);
-                    break;
-                case variant_data:
-                    view_vcf(hts, argv[i]);
-                    break;
-                default:
-                    if (verbose)
-                        view_raw(fp, argv[i]);
-                    else {
-                        errno = 0;
-                        error("can't view \"%s\": unknown format", argv[i]);
-                    }
-                    break;
-                }
-
-                if (hts_close(hts) < 0) error("closing \"%s\" failed", argv[i]);
-                fp = NULL;
-            }
-            else if ((errno == EFTYPE || errno == ENOEXEC) && verbose)
-                view_raw(fp, argv[i]);
-            else
-                error("can't view \"%s\"", argv[i]);
-        }
-
-        if (fp && hclose(fp) < 0) error("closing \"%s\" failed", argv[i]);
-    }
-
-    return status;
-}
diff --git a/htslib/htslib.mk b/htslib/htslib.mk
index 9c60ffc2b..57dffae29 100644
--- a/htslib/htslib.mk
+++ b/htslib/htslib.mk
@@ -176,6 +176,9 @@ $(HTSDIR)/htsfile: $(HTSSRCDIR)/htsfile.c $(HTSLIB_PUBLIC_HEADERS) $(HTSDIR)/lib
 $(HTSDIR)/tabix: $(HTSSRCDIR)/tabix.c $(HTSLIB_PUBLIC_HEADERS) $(HTSDIR)/libhts.a
 	+cd $(HTSDIR) && $(MAKE) tabix
 
+$(HTSDIR)/annot-tsv: $(HTSSRCDIR)/annot-tsv.c $(HTSLIB_PUBLIC_HEADERS) $(HTSDIR)/libhts.a
+	+cd $(HTSDIR) && $(MAKE) annot-tsv
+
 $(HTSDIR)/htslib_static.mk: $(HTSDIR)/htslib.pc.tmp
 	+cd $(HTSDIR) && $(MAKE) htslib_static.mk
 
diff --git a/htslib/htslib/bgzf.h b/htslib/htslib/bgzf.h
index ea4ec3ece..87d4c6a3b 100644
--- a/htslib/htslib/bgzf.h
+++ b/htslib/htslib/bgzf.h
@@ -3,7 +3,7 @@
 /*
    Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology
                  2011, 2012 Attractive Chaos <attractor@live.co.uk>
-   Copyright (C) 2009, 2013, 2014, 2017, 2018-2019, 2022-2023 Genome Research Ltd
+   Copyright (C) 2009, 2013, 2014, 2017, 2018-2019, 2022-2024 Genome Research Ltd
 
    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to deal
@@ -31,6 +31,7 @@
 #define HTSLIB_BGZF_H
 
 #include <stdint.h>
+#include <string.h>
 #include <sys/types.h>
 
 #include "hts_defs.h"
@@ -143,6 +144,26 @@ typedef struct BGZF BGZF;
     HTSLIB_EXPORT
     ssize_t bgzf_read(BGZF *fp, void *data, size_t length) HTS_RESULT_USED;
 
+/**
+ * bgzf_read optimised for small quantities, as a static inline
+ * See bgzf_read() normal function for return values.
+ */
+static inline ssize_t bgzf_read_small(BGZF *fp, void *data, size_t length) {
+    // A block length of 0 implies current block isn't loaded (see
+    // bgzf_seek_common).  That gives negative available so careful on types
+    if ((ssize_t)length < fp->block_length - fp->block_offset) {
+        // Short cut the common and easy mode
+        memcpy((uint8_t *)data,
+               (uint8_t *)fp->uncompressed_block + fp->block_offset,
+               length);
+        fp->block_offset += length;
+        fp->uncompressed_address += length;
+        return length;
+    } else {
+        return bgzf_read(fp, data, length);
+    }
+}
+
     /**
      * Write _length_ bytes from _data_ to the file.  If no I/O errors occur,
      * the complete _length_ bytes will be written (or queued for writing).
@@ -155,6 +176,24 @@ typedef struct BGZF BGZF;
     HTSLIB_EXPORT
     ssize_t bgzf_write(BGZF *fp, const void *data, size_t length) HTS_RESULT_USED;
 
+/**
+ * bgzf_write optimised for small quantities, as a static inline
+ * See bgzf_write() normal function for return values.
+ */
+static inline
+ssize_t bgzf_write_small(BGZF *fp, const void *data, size_t length) {
+    if (fp->is_compressed
+        && (size_t) (BGZF_BLOCK_SIZE - fp->block_offset) > length) {
+        // Short cut the common and easy mode
+        memcpy((uint8_t *)fp->uncompressed_block + fp->block_offset,
+               data, length);
+        fp->block_offset += length;
+        return length;
+    } else {
+        return bgzf_write(fp, data, length);
+    }
+}
+
     /**
      * Write _length_ bytes from _data_ to the file, the index will be used to
      * decide the amount of uncompressed data to be written to each bgzip block.
diff --git a/htslib/htslib/cram.h b/htslib/htslib/cram.h
index e0b51839c..ddc44bbba 100644
--- a/htslib/htslib/cram.h
+++ b/htslib/htslib/cram.h
@@ -1,7 +1,7 @@
 /// @file htslib/cram.h
 /// CRAM format-specific API functions.
 /*
-    Copyright (C) 2015, 2016, 2018-2020, 2022-2023 Genome Research Ltd.
+    Copyright (C) 2015, 2016, 2018-2020, 2022-2024 Genome Research Ltd.
 
     Author: James Bonfield <jkb@sanger.ac.uk>
 
@@ -209,6 +209,11 @@ HTSLIB_EXPORT
 int cram_container_is_empty(cram_fd *fd);
 
 
+/* Returns chromosome and start/span from container struct */
+HTSLIB_EXPORT
+void cram_container_get_coords(cram_container *c,
+                               int *refid, hts_pos_t *start, hts_pos_t *span);
+
 /*
  *-----------------------------------------------------------------------------
  * cram_block
@@ -329,6 +334,18 @@ int cram_transcode_rg(cram_fd *in, cram_fd *out,
 HTSLIB_EXPORT
 int cram_copy_slice(cram_fd *in, cram_fd *out, int32_t num_slice);
 
+/*
+ * Copies a container, but filtering it down to a specific region (as
+ * already specified in 'in'
+ *
+ * Returns 0 on success
+ *        -1 on EOF
+ *        -2 on error
+ */
+HTSLIB_EXPORT
+int cram_filter_container(cram_fd *in, cram_fd *out, cram_container *c,
+                          int *ref_id);
+
 /*
  * Decodes a CRAM block compression header.
  * Returns header ptr on success
@@ -744,6 +761,62 @@ static inline void sam_hdr_free(SAM_hdr *hdr) { sam_hdr_destroy(hdr); }
 HTSLIB_EXPORT
 refs_t *cram_get_refs(htsFile *fd);
 
+/*!
+ * Returns the file offsets of CRAM slices covering a specific region
+ * query.  Note both offsets are the START of the slice.
+ *
+ * first will point to the start of the first overlapping slice
+ * last will point to the start of the last overlapping slice
+ *
+ * @return
+ * Returns 0 on success
+ *        <0 on failure
+ */
+HTSLIB_EXPORT
+int cram_index_extents(cram_fd *fd, int refid, hts_pos_t start, hts_pos_t end,
+                       off_t *first, off_t *last);
+
+/*! Returns the total number of containers in the CRAM index.
+ *
+ * Note the index is not required to have an entry for every container, but it
+ * will always have an index entry for the start of each chromosome.
+ * (Although in practice our indices do container one entry per container.)
+ *
+ * This is equivalent to cram_num_containers_between(fd, 0, 0, NULL, NULL)
+ */
+HTSLIB_EXPORT
+int64_t cram_num_containers(cram_fd *fd);
+
+/*! Returns the number of containers in the CRAM index within given offsets.
+ *
+ * The cstart and cend offsets are the locations of the start of containers
+ * as returned by index_container_offset.
+ *
+ * If non-NULL, first and last will hold the inclusive range of container
+ * numbers, counting from zero.
+ *
+ * @return
+ * Returns the number of containers, equivalent to *last-*first+1.
+ */
+HTSLIB_EXPORT
+int64_t cram_num_containers_between(cram_fd *fd,
+                                    off_t cstart, off_t cend,
+                                    int64_t *first, int64_t *last);
+
+/*! Returns the byte offset for the start of the n^th container.
+ *
+ * The index must have previously been loaded, otherwise <0 is returned.
+ */
+HTSLIB_EXPORT
+off_t cram_container_num2offset(cram_fd *fd, int64_t n);
+
+/*! Returns the container number for the first container at offset >= pos.
+ *
+ * The index must have previously been loaded, otherwise <0 is returned.
+ */
+HTSLIB_EXPORT
+int64_t cram_container_offset2num(cram_fd *fd, off_t pos);
+
 /**@}*/
 
 #ifdef __cplusplus
diff --git a/htslib/htslib/hfile.h b/htslib/htslib/hfile.h
index bc86757a2..e851faf43 100644
--- a/htslib/htslib/hfile.h
+++ b/htslib/htslib/hfile.h
@@ -57,7 +57,7 @@ typedef struct hFILE {
     char *buffer, *begin, *end, *limit;
     const struct hFILE_backend *backend;
     off_t offset;
-    unsigned at_eof:1, mobile:1, readonly:1;
+    unsigned at_eof:1, mobile:1, readonly:1, preserve:1;
     int has_errno;
     // @endcond
 } hFILE;
diff --git a/htslib/htslib/hts.h b/htslib/htslib/hts.h
index 4baad1e0d..4f85424cf 100644
--- a/htslib/htslib/hts.h
+++ b/htslib/htslib/hts.h
@@ -489,7 +489,7 @@ const char *hts_version(void);
 // Immediately after release, bump ZZ to 90 to distinguish in-development
 // Git repository builds from the release; you may wish to increment this
 // further when significant features are merged.
-#define HTS_VERSION 101800
+#define HTS_VERSION 102100
 
 /*! @abstract Introspection on the features enabled in htslib
  *
@@ -1534,6 +1534,13 @@ static inline int hts_bin_bot(int bin, int n_lvls)
     return (bin - hts_bin_first(l)) << (n_lvls - l) * 3;
 }
 
+/// Compute the (0-based exclusive) maximum position covered by a binning index
+static inline hts_pos_t hts_bin_maxpos(int min_shift, int n_lvls)
+{
+    hts_pos_t one = 1;
+    return one << (min_shift + n_lvls * 3);
+}
+
 /**************
  * Endianness *
  **************/
diff --git a/htslib/htslib/hts_defs.h b/htslib/htslib/hts_defs.h
index 7719215c1..b5cded341 100644
--- a/htslib/htslib/hts_defs.h
+++ b/htslib/htslib/hts_defs.h
@@ -1,6 +1,6 @@
 /*  hts_defs.h -- Miscellaneous definitions.
 
-    Copyright (C) 2013-2015,2017, 2019-2020 Genome Research Ltd.
+    Copyright (C) 2013-2015,2017, 2019-2020, 2024 Genome Research Ltd.
 
     Author: John Marshall <jm18@sanger.ac.uk>
 
@@ -58,6 +58,21 @@ DEALINGS IN THE SOFTWARE.  */
 #define HTS_NORETURN
 #endif
 
+// Enable optimisation level 3, especially for gcc.  To be used
+// where we want to force vectorisation in hot loops and the default -O2
+// just doesn't cut it.
+#if HTS_COMPILER_HAS(optimize) || HTS_GCC_AT_LEAST(4,4)
+#define HTS_OPT3 __attribute__((optimize("O3")))
+#else
+#define HTS_OPT3
+#endif
+
+#if HTS_COMPILER_HAS(aligned) || HTS_GCC_AT_LEAST(4,3)
+#define HTS_ALIGN32 __attribute__((aligned(32)))
+#else
+#define HTS_ALIGN32
+#endif
+
 // GCC introduced warn_unused_result in 3.4 but added -Wno-unused-result later
 #if HTS_COMPILER_HAS(__warn_unused_result__) || HTS_GCC_AT_LEAST(4,5)
 #define HTS_RESULT_USED __attribute__ ((__warn_unused_result__))
diff --git a/htslib/htslib/hts_endian.h b/htslib/htslib/hts_endian.h
index 30ad8055d..12effab7b 100644
--- a/htslib/htslib/hts_endian.h
+++ b/htslib/htslib/hts_endian.h
@@ -100,7 +100,7 @@ DEALINGS IN THE SOFTWARE.  */
 #endif
 
 #if HTS_ALLOW_UNALIGNED != 0
-#    if defined (__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3))
+#    if defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3)) || defined(__clang__)
 // This prevents problems with gcc's vectoriser generating the wrong
 // instructions for unaligned data.
 typedef uint16_t uint16_u __attribute__ ((__aligned__ (1)));
diff --git a/htslib/htslib/khash.h b/htslib/htslib/khash.h
index 4cea91020..02e4917c8 100644
--- a/htslib/htslib/khash.h
+++ b/htslib/htslib/khash.h
@@ -1,7 +1,7 @@
 /* The MIT License
 
    Copyright (c) 2008, 2009, 2011 by Attractive Chaos <attractor@live.co.uk>
-   Copyright (C) 2014-2015, 2018 Genome Research Ltd.
+   Copyright (C) 2014-2015, 2018, 2024 Genome Research Ltd.
 
    Permission is hereby granted, free of charge, to any person obtaining
    a copy of this software and associated documentation files (the
@@ -356,7 +356,39 @@ static const double __ac_HASH_UPPER = 0.77;
 			__ac_set_isdel_true(h->flags, x);							\
 			--h->size;													\
 		}																\
-	}
+	}                                                                   \
+    SCOPE int kh_stats_##name(kh_##name##_t *h, khint_t *empty,         \
+                              khint_t *deleted, khint_t *hist_size,     \
+                              khint_t **hist_out)                       \
+    {                                                                   \
+        khint_t i, *hist = NULL, dist_max = 0, k, dist, step;           \
+        khint_t mask = h->n_buckets - 1;                                \
+        *empty = *deleted = *hist_size = 0;                             \
+        hist = (khint_t *) calloc(1, sizeof(*hist));                    \
+        if (!hist) { return -1; }                                       \
+        for (i = kh_begin(h); i < kh_end(h); ++i) {                     \
+            if (__ac_isempty(h->flags, i)) { (*empty)++; continue; }      \
+            if (__ac_isdel(h->flags, i)) { (*deleted)++; continue; }      \
+            k = __hash_func(h->keys[i]) & (h->n_buckets - 1);           \
+            dist = 0;                                                   \
+            step = 0;                                                   \
+            while (k != i) {                                            \
+                dist++;                                                 \
+                k = (k + (++step)) & mask;                              \
+            }                                                           \
+            if (dist_max <= dist) {                                     \
+                khint_t *new_hist = (khint_t *) realloc(hist, sizeof(*new_hist) * (dist + 1)); \
+                if (!new_hist) { free(hist); return -1; }               \
+                for (k = dist_max + 1; k <= dist; k++) new_hist[k] = 0; \
+                hist = new_hist;                                        \
+                dist_max = dist;                                        \
+            }                                                           \
+            hist[dist]++;                                               \
+        }                                                               \
+        *hist_out = hist;                                               \
+        *hist_size = dist_max + 1;                                      \
+        return 0;                                                       \
+    }
 
 #define KHASH_DECLARE(name, khkey_t, khval_t)		 					\
 	__KHASH_TYPE(name, khkey_t, khval_t) 								\
@@ -391,6 +423,7 @@ static const double __ac_HASH_UPPER = 0.77;
   @abstract     64-bit integer comparison function
  */
 #define kh_int64_hash_equal(a, b) ((a) == (b))
+
 /*! @function
   @abstract     const char* hash function
   @param  s     Pointer to a null terminated string
@@ -402,12 +435,28 @@ static kh_inline khint_t __ac_X31_hash_string(const char *s)
 	if (h) for (++s ; *s; ++s) h = (h << 5) - h + (khint_t)*s;
 	return h;
 }
+
+/*! @function
+  @abstract     const char* FNV1a hash function
+  @param  s     Pointer to a null terminated string
+  @return       The hash value
+ */
+static kh_inline khint_t __ac_FNV1a_hash_string(const char *s)
+{
+	const khint_t offset_basis = 2166136261;
+	const khint_t FNV_prime = 16777619;
+	khint_t h = offset_basis;
+	for (; *s; ++s) h = (h ^ (uint8_t) *s) * FNV_prime;
+	return h;
+}
+
 /*! @function
   @abstract     Another interface to const char* hash function
   @param  key   Pointer to a nul terminated string [const char*]
   @return       The hash value [khint_t]
  */
-#define kh_str_hash_func(key) __ac_X31_hash_string(key)
+#define kh_str_hash_func(key) __ac_FNV1a_hash_string(key)
+
 /*! @function
   @abstract     Const char* comparison function
  */
@@ -426,12 +475,29 @@ static kh_inline khint_t __ac_X31_hash_kstring(const kstring_t ks)
 		h = (h << 5) - h + (khint_t)ks.s[i];
 	return h;
 }
+
+/*! @function
+  @abstract     Kstring hash function
+  @param  s     Pointer to a kstring
+  @return       The hash value
+ */
+static kh_inline khint_t __ac_FNV1a_hash_kstring(const kstring_t ks)
+{
+	const khint_t offset_basis = 2166136261;
+	const khint_t FNV_prime = 16777619;
+	khint_t h = offset_basis;
+	size_t i;
+	for (i = 0; i < ks.l; i++)
+		h = (h ^ (uint8_t) ks.s[i]) * FNV_prime;
+	return h;
+}
+
 /*! @function
   @abstract     Interface to kstring hash function.
   @param  key   Pointer to a khash; permits hashing on non-nul terminated strings.
   @return       The hash value [khint_t]
  */
-#define kh_kstr_hash_func(key) __ac_X31_hash_kstring(key)
+#define kh_kstr_hash_func(key) __ac_FNV1a_hash_kstring(key)
 /*! @function
   @abstract     kstring comparison function
  */
@@ -604,6 +670,19 @@ static kh_inline khint_t __ac_Wang_hash(khint_t key)
 		code;												\
 	} }
 
+/*! @function
+  @abstract  Gather hash table statistics
+  @param  name            Name of the hash table [symbol]
+  @param  h               Pointer to the hash table [khash_t(name)*]
+  @param  empty[out]      Number of empty hash bins
+  @param  deleted[out]    Number of hash bins with the deleted flag
+  @param  hist_size[out]  Size of @p hist array
+  @param  hist[out]       Probe count histogram
+  @return 0 on success; -1 on failure
+ */
+#define kh_stats(name, h, empty, deleted, hist_size, hist) \
+    kh_stats_##name(h, empty, deleted, hist_size, hist)
+
 /* More convenient interfaces */
 
 /*! @function
diff --git a/htslib/htslib/kseq.h b/htslib/htslib/kseq.h
index e9fed44cb..5913f35ad 100644
--- a/htslib/htslib/kseq.h
+++ b/htslib/htslib/kseq.h
@@ -113,8 +113,8 @@
 				unsigned char *sep = (unsigned char *)memchr(ks->buf + ks->begin, '\n', ks->end - ks->begin); \
 				i = sep != NULL ? sep - ks->buf : ks->end; \
 			} else if (delimiter > KS_SEP_MAX) { \
-				for (i = ks->begin; i < ks->end; ++i) \
-					if (ks->buf[i] == delimiter) break; \
+				unsigned char *sep = (unsigned char *)memchr(ks->buf + ks->begin, delimiter, ks->end - ks->begin); \
+				i = sep != NULL ? sep - ks->buf : ks->end; \
 			} else if (delimiter == KS_SEP_SPACE) { \
 				for (i = ks->begin; i < ks->end; ++i) \
 					if (isspace(ks->buf[i])) break; \
diff --git a/htslib/htslib/kstring.h b/htslib/htslib/kstring.h
index 53a19806d..ebb2f9363 100644
--- a/htslib/htslib/kstring.h
+++ b/htslib/htslib/kstring.h
@@ -1,7 +1,7 @@
 /* The MIT License
 
    Copyright (C) 2011 by Attractive Chaos <attractor@live.co.uk>
-   Copyright (C) 2013-2014, 2016, 2018-2020, 2022 Genome Research Ltd.
+   Copyright (C) 2013-2014, 2016, 2018-2020, 2022, 2024 Genome Research Ltd.
 
    Permission is hereby granted, free of charge, to any person obtaining
    a copy of this software and associated documentation files (the
@@ -375,17 +375,63 @@ static inline int kputw(int c, kstring_t *s)
 
 static inline int kputll(long long c, kstring_t *s)
 {
-	char buf[32];
-	int i, l = 0;
-	unsigned long long x = c;
-	if (c < 0) x = -x;
-	do { buf[l++] = x%10 + '0'; x /= 10; } while (x > 0);
-	if (c < 0) buf[l++] = '-';
-	if (ks_resize(s, s->l + l + 2) < 0)
-		return EOF;
-	for (i = l - 1; i >= 0; --i) s->s[s->l++] = buf[i];
-	s->s[s->l] = 0;
-	return 0;
+    // Worst case expansion.  One check reduces function size
+    // and aids inlining chance.  Memory overhead is minimal.
+    if (ks_resize(s, s->l + 23) < 0)
+	return EOF;
+
+    unsigned long long x = c;
+    if (c < 0) {
+	x = -x;
+        s->s[s->l++] = '-';
+    }
+
+    if (x <= UINT32_MAX)
+	return kputuw(x, s);
+
+    static const char kputull_dig2r[] =
+        "00010203040506070809"
+        "10111213141516171819"
+        "20212223242526272829"
+        "30313233343536373839"
+        "40414243444546474849"
+        "50515253545556575859"
+        "60616263646566676869"
+        "70717273747576777879"
+        "80818283848586878889"
+        "90919293949596979899";
+    unsigned int l, j;
+    char *cp;
+
+    // Find out how long the number is (could consider clzll)
+    uint64_t m = 1;
+    l = 0;
+    if (sizeof(long long)==sizeof(uint64_t) && x >= 10000000000000000000ULL) {
+	// avoids overflow below
+	l = 20;
+    } else {
+	do {
+	    l++;
+	    m *= 10;
+	} while (x >= m);
+    }
+
+    // Add digits two at a time
+    j = l;
+    cp = s->s + s->l;
+    while (x >= 10) {
+        const char *d = &kputull_dig2r[2*(x%100)];
+        x /= 100;
+        memcpy(&cp[j-=2], d, 2);
+    }
+
+    // Last one (if necessary).  We know that x < 10 by now.
+    if (j == 1)
+        cp[0] = x + '0';
+
+    s->l += l;
+    s->s[s->l] = 0;
+    return 0;
 }
 
 static inline int kputl(long c, kstring_t *s) {
diff --git a/htslib/htslib/sam.h b/htslib/htslib/sam.h
index cffa04701..0da5f047f 100644
--- a/htslib/htslib/sam.h
+++ b/htslib/htslib/sam.h
@@ -1133,6 +1133,12 @@ ssize_t sam_parse_cigar(const char *in, char **end, uint32_t **a_cigar, size_t *
                        can be NULL
  @param  b       [in/out]  address of the destination bam1_t struct
  @return         number of processed CIGAR operators; -1 on error
+
+ @discussion The BAM record may be partial and empty of existing cigar, seq
+ and quality, as is the case during SAM parsing, or it may be an existing
+ BAM record in which case this function replaces the existing CIGAR field
+ and shuffles data accordingly.  A CIGAR of "*" will remove the CIGAR,
+ returning zero.
  */
 HTSLIB_EXPORT
 ssize_t bam_parse_cigar(const char *in, char **end, bam1_t *b);
@@ -1707,11 +1713,11 @@ static inline int bam_aux_get_str(const bam1_t *b,
 HTSLIB_EXPORT
 int64_t bam_aux2i(const uint8_t *s);
 
-/// Get an integer aux value
+/// Get a float aux value
 /** @param s Pointer to the tag data, as returned by bam_aux_get()
-    @return The value, or 0 if the tag was not an integer type
+    @return The value, or 0 if the tag was not a float type
     If the tag is not an numeric type, errno is set to EINVAL.  The value of
-    integer flags will be returned cast to a double.
+    the float will be returned cast to a double.
 */
 HTSLIB_EXPORT
 double bam_aux2f(const uint8_t *s);
@@ -2210,7 +2216,7 @@ int sam_prob_realn(bam1_t *b, const char *ref, hts_pos_t ref_len, int flag);
  @field canonical_base    The canonical base referred to in the MM tag.
                           One of A, C, G, T or N.  Note this may not be the
                           explicit base recorded in the SEQ column (esp. if N).
- @field stran             0 or 1, indicating + or - strand from MM tag.
+ @field strand            0 or 1, indicating + or - strand from MM tag.
  @field qual              Quality code (256*probability), or -1 if unknown
 
  @discussion
@@ -2224,10 +2230,10 @@ typedef struct hts_base_mod {
     int qual;
 } hts_base_mod;
 
-#define HTS_MOD_UNKNOWN   -1  // In MM but no ML
+#define HTS_MOD_UNKNOWN   -1  // In MM but not ML
 #define HTS_MOD_UNCHECKED -2  // Not in MM and in explicit mode
 
-// Flags for hts_parse_basemod2
+// Flags for bam_parse_basemod2
 #define HTS_MOD_REPORT_UNCHECKED 1
 
 /// Allocates an hts_base_mode_state.
@@ -2253,7 +2259,7 @@ hts_base_mod_state *hts_base_mod_state_alloc(void);
 HTSLIB_EXPORT
 void hts_base_mod_state_free(hts_base_mod_state *state);
 
-/// Parses the Mm and Ml tags out of a bam record.
+/// Parses the MM and ML tags out of a bam record.
 /**
  * @param b        BAM alignment record
  * @param state    The base modification state pointer.
@@ -2262,11 +2268,12 @@ void hts_base_mod_state_free(hts_base_mod_state *state);
  *
  * This fills out the contents of the modification state, resetting the
  * iterator location to the first sequence base.
+ * (Parses the draft Mm/Ml tags instead if MM and/or ML are not present.)
  */
 HTSLIB_EXPORT
 int bam_parse_basemod(const bam1_t *b, hts_base_mod_state *state);
 
-/// Parses the Mm and Ml tags out of a bam record.
+/// Parses the MM and ML tags out of a bam record.
 /**
  * @param b        BAM alignment record
  * @param state    The base modification state pointer.
@@ -2277,6 +2284,7 @@ int bam_parse_basemod(const bam1_t *b, hts_base_mod_state *state);
  *
  * This fills out the contents of the modification state, resetting the
  * iterator location to the first sequence base.
+ * (Parses the draft Mm/Ml tags instead if MM and/or ML are not present.)
  */
 HTSLIB_EXPORT
 int bam_parse_basemod2(const bam1_t *b, hts_base_mod_state *state,
diff --git a/htslib/htslib/tbx.h b/htslib/htslib/tbx.h
index 3d2037cbb..f4b5bd856 100644
--- a/htslib/htslib/tbx.h
+++ b/htslib/htslib/tbx.h
@@ -38,6 +38,7 @@ extern "C" {
 #define TBX_GENERIC 0
 #define TBX_SAM     1
 #define TBX_VCF     2
+#define TBX_GAF     3
 #define TBX_UCSC    0x10000
 
 typedef struct tbx_conf_t {
@@ -53,7 +54,7 @@ typedef struct tbx_t {
 } tbx_t;
 
 HTSLIB_EXPORT
-extern const tbx_conf_t tbx_conf_gff, tbx_conf_bed, tbx_conf_psltbl, tbx_conf_sam, tbx_conf_vcf;
+extern const tbx_conf_t tbx_conf_gff, tbx_conf_bed, tbx_conf_psltbl, tbx_conf_sam, tbx_conf_vcf, tbx_conf_gaf;
 
     #define tbx_itr_destroy(iter) hts_itr_destroy(iter)
     #define tbx_itr_queryi(tbx, tid, beg, end) hts_itr_query((tbx)->idx, (tid), (beg), (end), tbx_readrec)
diff --git a/htslib/htslib/vcf.h b/htslib/htslib/vcf.h
index 83659ae12..9a36cab05 100644
--- a/htslib/htslib/vcf.h
+++ b/htslib/htslib/vcf.h
@@ -596,7 +596,8 @@ set to one of BCF_ERR* codes and must be checked before calling bcf_write().
     int bcf_hdr_append(bcf_hdr_t *h, const char *line);
 
     HTSLIB_EXPORT
-    int bcf_hdr_printf(bcf_hdr_t *h, const char *format, ...);
+    int bcf_hdr_printf(bcf_hdr_t *h, const char *format, ...)
+    HTS_FORMAT(HTS_PRINTF_FMT, 2, 3);
 
     /** VCF version, e.g. VCFv4.2 */
     HTSLIB_EXPORT
@@ -1456,7 +1457,14 @@ which works for both BCF and VCF.
 #define bcf_int16_missing    (-32767-1)      /* INT16_MIN */
 #define bcf_int32_missing    (-2147483647-1) /* INT32_MIN */
 #define bcf_int64_missing    (-9223372036854775807LL - 1LL)  /* INT64_MIN */
-#define bcf_str_missing      0x07
+
+// All of the above are values, which may occur multiple times in lists of
+// integers or lists of floating point.  Strings in VCF don't have
+// lists - a list of strings is just another (comma-separated) string.
+//
+// Hence bcf_str_missing is the whole string being missing rather than
+// an element of a list.  Ie a string of length zero: (0<<4)|BCF_BT_CHAR.
+#define bcf_str_missing      BCF_BT_CHAR
 
 // Limits on BCF values stored in given types.  Max values are the same
 // as for the underlying type.  Min values are slightly different as
@@ -1522,26 +1530,37 @@ static inline int bcf_format_gt(bcf_fmt_t *fmt, int isample, kstring_t *str)
 
 static inline int bcf_enc_size(kstring_t *s, int size, int type)
 {
-    uint32_t e = 0;
-    uint8_t x[4];
-    if (size >= 15) {
-        e |= kputc(15<<4|type, s) < 0;
-        if (size >= 128) {
-            if (size >= 32768) {
-                i32_to_le(size, x);
-                e |= kputc(1<<4|BCF_BT_INT32, s) < 0;
-                e |= kputsn((char*)&x, 4, s) < 0;
-            } else {
-                i16_to_le(size, x);
-                e |= kputc(1<<4|BCF_BT_INT16, s) < 0;
-                e |= kputsn((char*)&x, 2, s) < 0;
-            }
+    // Most common case is first
+    if (size < 15) {
+        if (ks_resize(s, s->l + 1) < 0)
+            return -1;
+        uint8_t *p = (uint8_t *)s->s + s->l;
+        *p++ = (size<<4) | type;
+        s->l++;
+        return 0;
+    }
+
+    if (ks_resize(s, s->l + 6) < 0)
+        return -1;
+    uint8_t *p = (uint8_t *)s->s + s->l;
+    *p++ = 15<<4|type;
+
+    if (size < 128) {
+        *p++ = 1<<4|BCF_BT_INT8;
+        *p++ = size;
+        s->l += 3;
+    } else {
+        if (size < 32768) {
+            *p++ = 1<<4|BCF_BT_INT16;
+            i16_to_le(size, p);
+            s->l += 4;
         } else {
-            e |= kputc(1<<4|BCF_BT_INT8, s) < 0;
-            e |= kputc(size, s) < 0;
+            *p++ = 1<<4|BCF_BT_INT32;
+            i32_to_le(size, p);
+            s->l += 6;
         }
-    } else e |= kputc(size<<4|type, s) < 0;
-    return e == 0 ? 0 : -1;
+    }
+    return 0;
 }
 
 static inline int bcf_enc_inttype(long x)
@@ -1553,27 +1572,35 @@ static inline int bcf_enc_inttype(long x)
 
 static inline int bcf_enc_int1(kstring_t *s, int32_t x)
 {
-    uint32_t e = 0;
-    uint8_t z[4];
+    if (ks_resize(s, s->l + 5) < 0)
+        return -1;
+    uint8_t *p = (uint8_t *)s->s + s->l;
+
     if (x == bcf_int32_vector_end) {
-        e |= bcf_enc_size(s, 1, BCF_BT_INT8);
-        e |= kputc(bcf_int8_vector_end, s) < 0;
+        // An inline implementation of bcf_enc_size with size==1 and
+        // memory allocation already accounted for.
+        *p = (1<<4) | BCF_BT_INT8;
+        p[1] = bcf_int8_vector_end;
+        s->l+=2;
     } else if (x == bcf_int32_missing) {
-        e |= bcf_enc_size(s, 1, BCF_BT_INT8);
-        e |= kputc(bcf_int8_missing, s) < 0;
+        *p = (1<<4) | BCF_BT_INT8;
+        p[1] = bcf_int8_missing;
+        s->l+=2;
     } else if (x <= BCF_MAX_BT_INT8 && x >= BCF_MIN_BT_INT8) {
-        e |= bcf_enc_size(s, 1, BCF_BT_INT8);
-        e |= kputc(x, s) < 0;
+        *p = (1<<4) | BCF_BT_INT8;
+        p[1] = x;
+        s->l+=2;
     } else if (x <= BCF_MAX_BT_INT16 && x >= BCF_MIN_BT_INT16) {
-        i16_to_le(x, z);
-        e |= bcf_enc_size(s, 1, BCF_BT_INT16);
-        e |= kputsn((char*)&z, 2, s) < 0;
+        *p = (1<<4) | BCF_BT_INT16;
+        i16_to_le(x, p+1);
+        s->l+=3;
     } else {
-        i32_to_le(x, z);
-        e |= bcf_enc_size(s, 1, BCF_BT_INT32);
-        e |= kputsn((char*)&z, 4, s) < 0;
+        *p = (1<<4) | BCF_BT_INT32;
+        i32_to_le(x, p+1);
+        s->l+=5;
     }
-    return e == 0 ? 0 : -1;
+
+    return 0;
 }
 
 /// Return the value of a single typed integer.
diff --git a/htslib/kstring.c b/htslib/kstring.c
index 71facf975..9a6142e80 100644
--- a/htslib/kstring.c
+++ b/htslib/kstring.c
@@ -1,7 +1,7 @@
 /* The MIT License
 
    Copyright (C) 2011 by Attractive Chaos <attractor@live.co.uk>
-   Copyright (C) 2013-2018, 2020-2021 Genome Research Ltd.
+   Copyright (C) 2013-2018, 2020-2021, 2023 Genome Research Ltd.
 
    Permission is hereby granted, free of charge, to any person obtaining
    a copy of this software and associated documentation files (the
@@ -63,78 +63,77 @@ int kputd(double d, kstring_t *s) {
 		return len;
 	}
 
-	uint64_t i = d*10000000000LL;
 	// Correction for rounding - rather ugly
-
 	// Optimised for small numbers.
-	// Better still would be __builtin_clz on hi/lo 32 and get the
-	// starting point very rapidly.
-	if (d<.0001)
-		i+=0;
-	else if (d<0.001)
-		i+=5;
-	else if (d < 0.01)
-		i+=50;
-	else if (d < 0.1)
-		i+=500;
-	else if (d < 1)
-		i+=5000;
-	else if (d < 10)
-		i+=50000;
-	else if (d < 100)
-		i+=500000;
-	else if (d < 1000)
-		i+=5000000;
-	else if (d < 10000)
-		i+=50000000;
-	else if (d < 100000)
-		i+=500000000;
-	else
-		i+=5000000000LL;
-
-	do {
-		*--cp = '0' + i%10;
-		i /= 10;
-	} while (i >= 1);
-	buf[20] = 0;
+
+	uint32_t i;
+	if (d<0.001)         i = rint(d*1000000000), cp -= 1;
+	else if (d < 0.01)   i = rint(d*100000000),  cp -= 2;
+	else if (d < 0.1)    i = rint(d*10000000),   cp -= 3;
+	else if (d < 1)      i = rint(d*1000000),    cp -= 4;
+	else if (d < 10)     i = rint(d*100000),     cp -= 5;
+	else if (d < 100)    i = rint(d*10000),      cp -= 6;
+	else if (d < 1000)   i = rint(d*1000),       cp -= 7;
+	else if (d < 10000)  i = rint(d*100),        cp -= 8;
+	else if (d < 100000) i = rint(d*10),         cp -= 9;
+	else                 i = rint(d),            cp -= 10;
+
+	// integer i is always 6 digits, so print it 2 at a time.
+	static const char kputuw_dig2r[] =
+		"00010203040506070809"
+		"10111213141516171819"
+		"20212223242526272829"
+		"30313233343536373839"
+		"40414243444546474849"
+		"50515253545556575859"
+		"60616263646566676869"
+		"70717273747576777879"
+		"80818283848586878889"
+		"90919293949596979899";
+
+	memcpy(cp-=2, &kputuw_dig2r[2*(i%100)], 2); i /= 100;
+	memcpy(cp-=2, &kputuw_dig2r[2*(i%100)], 2); i /= 100;
+	memcpy(cp-=2, &kputuw_dig2r[2*(i%100)], 2);
+
+	// Except when it rounds up (d=0.009999999 is i=1000000)
+	if (i >= 100)
+		*--cp = '0' + (i/100);
+
+
 	int p = buf+20-cp;
-	if (p <= 10) { // d < 1
-		//assert(d/1);
-		cp[6] = 0; ep = cp+5;// 6 precision
-		while (p < 10) {
+	if (p <= 10) { /* d < 1 */
+		// 0.00123 is 123, so add leading zeros and 0.
+		ep = cp+5; // 6 precision
+		while (p < 10) { // aka d < 1
 			*--cp = '0';
 			p++;
 		}
 		*--cp = '.';
 		*--cp = '0';
 	} else {
+		// 123.001 is 123001 with p==13, so move 123 down and add "."
+		// Equiv to memmove(cp-1, cp, p-10); cp--;
 		char *xp = --cp;
+		ep = cp+6;
 		while (p > 10) {
 			xp[0] = xp[1];
-			p--;
 			xp++;
+			p--;
 		}
 		xp[0] = '.';
-		cp[7] = 0; ep=cp+6;
-		if (cp[6] == '.') cp[6] = 0;
 	}
 
 	// Cull trailing zeros
 	while (*ep == '0' && ep > cp)
 		ep--;
-	char *z = ep+1;
-	while (ep > cp) {
-		if (*ep == '.') {
-			if (z[-1] == '.')
-				z[-1] = 0;
-			else
-				z[0] = 0;
-			break;
-		}
-		ep--;
-	}
 
-	int sl = strlen(cp);
+	// End can be 1 out due to the mostly-6 but occasionally 7 (i==1) case.
+	// Also code with "123." which should be "123"
+	if (*ep && *ep != '.')
+		ep++;
+	*ep = 0;
+
+	int sl = ep-cp;
 	len += sl;
 	kputsn(cp, sl, s);
 	return len;
@@ -204,8 +203,17 @@ char *kstrtok(const char *str, const char *sep_in, ks_tokaux_t *aux)
 		for (p = start; *p; ++p)
 			if (aux->tab[*p>>6]>>(*p&0x3f)&1) break;
 	} else {
-		for (p = start; *p; ++p)
-			if (*p == aux->sep) break;
+		// Using strchr is fast for next token, but slower for
+		// last token due to extra pass from strlen.  Overall
+		// on a VCF parse this func was 146% faster with // strchr.
+		// Equiv to:
+		// for (p = start; *p; ++p) if (*p == aux->sep) break;
+
+		// NB: We could use strchrnul() here from glibc if detected,
+		// which is ~40% faster again, but it's not so portable.
+		// i.e.   p = (uint8_t *)strchrnul((char *)start, aux->sep);
+		uint8_t *p2 = (uint8_t *)strchr((char *)start, aux->sep);
+		p = p2 ? p2 : start + strlen((char *)start);
 	}
 	aux->p = (const char *) p; // end of token
 	if (*p == 0) aux->finished = 1; // no more tokens
diff --git a/htslib/m4/hts_check_compile_flags_needed.m4 b/htslib/m4/hts_check_compile_flags_needed.m4
index fb668e86f..7c1b6dec5 100644
--- a/htslib/m4/hts_check_compile_flags_needed.m4
+++ b/htslib/m4/hts_check_compile_flags_needed.m4
@@ -39,7 +39,7 @@
 #   and this notice are preserved.  This file is offered as-is, without any
 #   warranty.
 
-#   AX_CHECK_COMPILE_FLAGS_NEEDED(FEATURE, FLAG, [ACTION-SUCCESS], [ACTION-FAILURE], [EXTRA-FLAGS], [INPUT])
+#   HTS_CHECK_COMPILE_FLAGS_NEEDED(FEATURE, FLAGS, [INPUT], [ACTION-SUCCESS], [ACTION-FAILURE], [EXTRA-FLAGS])
 
 AC_DEFUN([HTS_CHECK_COMPILE_FLAGS_NEEDED],
 [AC_PREREQ(2.64)dnl for _AC_LANG_PREFIX and AS_VAR_IF
@@ -50,7 +50,7 @@ AC_CACHE_CHECK([_AC_LANG compiler flags needed for $1], CACHEVAR, [
     [ax_check_save_flags=$[]_AC_LANG_PREFIX[]FLAGS
      _AC_LANG_PREFIX[]FLAGS="$[]_AC_LANG_PREFIX[]FLAGS $6 $2"
      AC_LINK_IFELSE([m4_default([$3],[AC_LANG_PROGRAM()])],
-       [AS_VAR_SET(CACHEVAR,[$2])],
+       [AS_VAR_SET(CACHEVAR,["$2"])],
        [AS_VAR_SET(CACHEVAR,[unsupported])])
      _AC_LANG_PREFIX[]FLAGS=$ax_check_save_flags])])
 AS_VAR_IF(CACHEVAR,unsupported, [
diff --git a/htslib/regidx.c b/htslib/regidx.c
index 67b356825..602edebf3 100644
--- a/htslib/regidx.c
+++ b/htslib/regidx.c
@@ -135,6 +135,8 @@ static inline int cmp_regs(reg_t *a, reg_t *b)
     if ( a->beg > b->beg ) return 1;
     if ( a->end < b->end ) return 1;    // longer intervals come first
     if ( a->end > b->end ) return -1;
+    if ( a < b ) return -1; // this is are just for qsort reproducibility across platforms
+    if ( a > b ) return 1;
     return 0;
 }
 static int cmp_reg_ptrs(const void *a, const void *b)
diff --git a/htslib/sam.c b/htslib/sam.c
index fc4e677df..7e58da6e7 100644
--- a/htslib/sam.c
+++ b/htslib/sam.c
@@ -1,6 +1,6 @@
 /*  sam.c -- SAM and BAM file I/O and manipulation.
 
-    Copyright (C) 2008-2010, 2012-2023 Genome Research Ltd.
+    Copyright (C) 2008-2010, 2012-2024 Genome Research Ltd.
     Copyright (C) 2010, 2012, 2013 Broad Institute.
 
     Author: Heng Li <lh3@sanger.ac.uk>
@@ -37,6 +37,10 @@ DEALINGS IN THE SOFTWARE.  */
 #include <inttypes.h>
 #include <unistd.h>
 
+#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+#include "fuzz_settings.h"
+#endif
+
 // Suppress deprecation message for cigar_tab, which we initialise
 #include "htslib/hts_defs.h"
 #undef HTS_DEPRECATED
@@ -100,7 +104,7 @@ const int8_t bam_cigar_table[256] = {
     -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1
 };
 
-sam_hdr_t *sam_hdr_init()
+sam_hdr_t *sam_hdr_init(void)
 {
     sam_hdr_t *bh = (sam_hdr_t*)calloc(1, sizeof(sam_hdr_t));
     if (bh == NULL) return NULL;
@@ -251,6 +255,9 @@ sam_hdr_t *bam_hdr_read(BGZF *fp)
 
     bufsize = h->l_text + 1;
     if (bufsize < h->l_text) goto nomem; // so large that adding 1 overflowed
+#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+    if (bufsize > FUZZ_ALLOC_LIMIT) goto nomem;
+#endif
     h->text = (char*)malloc(bufsize);
     if (!h->text) goto nomem;
     h->text[h->l_text] = 0; // make sure it is NULL terminated
@@ -264,6 +271,10 @@ sam_hdr_t *bam_hdr_read(BGZF *fp)
     if (h->n_targets < 0) goto invalid;
 
     // read reference sequence names and lengths
+#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+    if (h->n_targets > (FUZZ_ALLOC_LIMIT - bufsize)/(sizeof(char*)+sizeof(uint32_t)))
+        goto nomem;
+#endif
     if (h->n_targets > 0) {
         h->target_name = (char**)calloc(h->n_targets, sizeof(char*));
         if (!h->target_name) goto nomem;
@@ -410,7 +421,7 @@ const char *sam_parse_region(sam_hdr_t *h, const char *s, int *tid,
  *** BAM alignment I/O ***
  *************************/
 
-bam1_t *bam_init1()
+bam1_t *bam_init1(void)
 {
     return (bam1_t*)calloc(1, sizeof(bam1_t));
 }
@@ -420,11 +431,18 @@ int sam_realloc_bam_data(bam1_t *b, size_t desired)
     uint32_t new_m_data;
     uint8_t *new_data;
     new_m_data = desired;
-    kroundup32(new_m_data);
+    kroundup32(new_m_data); // next power of 2
+    new_m_data += 32; // reduces malloc arena migrations?
     if (new_m_data < desired) {
         errno = ENOMEM; // Not strictly true but we can't store the size
         return -1;
     }
+#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+    if (new_m_data > FUZZ_ALLOC_LIMIT) {
+        errno = ENOMEM;
+        return -1;
+    }
+#endif
     if ((bam_get_mempolicy(b) & BAM_USER_OWNS_DATA) == 0) {
         new_data = realloc(b->data, new_m_data);
     } else {
@@ -655,25 +673,36 @@ hts_pos_t bam_endpos(const bam1_t *b)
 static int bam_tag2cigar(bam1_t *b, int recal_bin, int give_warning) // return 0 if CIGAR is untouched; 1 if CIGAR is updated with CG
 {
     bam1_core_t *c = &b->core;
-    uint32_t cigar_st, n_cigar4, CG_st, CG_en, ori_len = b->l_data, *cigar0, CG_len, fake_bytes;
-    uint8_t *CG;
 
-    // test where there is a real CIGAR in the CG tag to move
-    if (c->n_cigar == 0 || c->tid < 0 || c->pos < 0) return 0;
-    cigar0 = bam_get_cigar(b);
-    if (bam_cigar_op(cigar0[0]) != BAM_CSOFT_CLIP || bam_cigar_oplen(cigar0[0]) != c->l_qseq) return 0;
-    fake_bytes = c->n_cigar * 4;
+    // Bail out as fast as possible for the easy case
+    uint32_t test_CG = BAM_CSOFT_CLIP | (c->l_qseq << BAM_CIGAR_SHIFT);
+    if (c->n_cigar == 0 || test_CG != *bam_get_cigar(b))
+        return 0;
+
+    // The above isn't fool proof - we may have old CIGAR tags that aren't used,
+    // but this is much less likely so do as a secondary check.
+    if (c->tid < 0 || c->pos < 0)
+        return 0;
+
+    // Do we have a CG tag?
+    uint8_t *CG = bam_aux_get(b, "CG");
     int saved_errno = errno;
-    CG = bam_aux_get(b, "CG");
     if (!CG) {
         if (errno != ENOENT) return -1;  // Bad aux data
         errno = saved_errno; // restore errno on expected no-CG-tag case
         return 0;
     }
+
+    // Now we start with the serious work migrating CG to CIGAR
+    uint32_t cigar_st, n_cigar4, CG_st, CG_en, ori_len = b->l_data,
+        *cigar0, CG_len, fake_bytes;
+    cigar0 = bam_get_cigar(b);
+    fake_bytes = c->n_cigar * 4;
     if (CG[0] != 'B' || !(CG[1] == 'I' || CG[1] == 'i'))
         return 0; // not of type B,I
     CG_len = le_to_u32(CG + 2);
-    if (CG_len < c->n_cigar || CG_len >= 1U<<29) return 0; // don't move if the real CIGAR length is shorter than the fake cigar length
+    // don't move if the real CIGAR length is shorter than the fake cigar length
+    if (CG_len < c->n_cigar || CG_len >= 1U<<29) return 0;
 
     // move from the CG tag to the right position
     cigar_st = (uint8_t*)cigar0 - b->data;
@@ -682,16 +711,19 @@ static int bam_tag2cigar(bam1_t *b, int recal_bin, int give_warning) // return 0
     CG_st = CG - b->data - 2;
     CG_en = CG_st + 8 + n_cigar4;
     if (possibly_expand_bam_data(b, n_cigar4 - fake_bytes) < 0) return -1;
-    b->l_data = b->l_data - fake_bytes + n_cigar4; // we need c->n_cigar-fake_bytes bytes to swap CIGAR to the right place
-    memmove(b->data + cigar_st + n_cigar4, b->data + cigar_st + fake_bytes, ori_len - (cigar_st + fake_bytes)); // insert c->n_cigar-fake_bytes empty space to make room
-    memcpy(b->data + cigar_st, b->data + (n_cigar4 - fake_bytes) + CG_st + 8, n_cigar4); // copy the real CIGAR to the right place; -fake_bytes for the fake CIGAR
+    // we need c->n_cigar-fake_bytes bytes to swap CIGAR to the right place
+    b->l_data = b->l_data - fake_bytes + n_cigar4;
+    // insert c->n_cigar-fake_bytes empty space to make room
+    memmove(b->data + cigar_st + n_cigar4, b->data + cigar_st + fake_bytes, ori_len - (cigar_st + fake_bytes));
+    // copy the real CIGAR to the right place; -fake_bytes for the fake CIGAR
+    memcpy(b->data + cigar_st, b->data + (n_cigar4 - fake_bytes) + CG_st + 8, n_cigar4);
     if (ori_len > CG_en) // move data after the CG tag
         memmove(b->data + CG_st + n_cigar4 - fake_bytes, b->data + CG_en + n_cigar4 - fake_bytes, ori_len - CG_en);
     b->l_data -= n_cigar4 + 8; // 8: CGBI (4 bytes) and CGBI length (4)
     if (recal_bin)
         b->core.bin = hts_reg2bin(b->core.pos, bam_endpos(b), 14, 5);
     if (give_warning)
-        hts_log_error("%s encodes a CIGAR with %d operators at the CG tag", bam_get_qname(b), c->n_cigar);
+        hts_log_warning("%s encodes a CIGAR with %d operators at the CG tag", bam_get_qname(b), c->n_cigar);
     return 1;
 }
 
@@ -746,27 +778,41 @@ int bam_read1(BGZF *fp, bam1_t *b)
 {
     bam1_core_t *c = &b->core;
     int32_t block_len, ret, i;
-    uint32_t x[8], new_l_data;
+    uint32_t new_l_data;
+    uint8_t tmp[32], *x;
 
     b->l_data = 0;
 
-    if ((ret = bgzf_read(fp, &block_len, 4)) != 4) {
+    if ((ret = bgzf_read_small(fp, &block_len, 4)) != 4) {
         if (ret == 0) return -1; // normal end-of-file
         else return -2; // truncated
     }
     if (fp->is_be)
         ed_swap_4p(&block_len);
     if (block_len < 32) return -4;  // block_len includes core data
-    if (bgzf_read(fp, x, 32) != 32) return -3;
-    if (fp->is_be) {
-        for (i = 0; i < 8; ++i) ed_swap_4p(x + i);
+    if (fp->block_length - fp->block_offset > 32) {
+        // Avoid bgzf_read and a temporary copy to a local buffer
+        x = (uint8_t *)fp->uncompressed_block + fp->block_offset;
+        fp->block_offset += 32;
+    } else {
+        x = tmp;
+        if (bgzf_read(fp, x, 32) != 32) return -3;
     }
-    c->tid = x[0]; c->pos = (int32_t)x[1];
-    c->bin = x[2]>>16; c->qual = x[2]>>8&0xff; c->l_qname = x[2]&0xff;
+
+    c->tid        = le_to_u32(x);
+    c->pos        = le_to_i32(x+4);
+    uint32_t x2   = le_to_u32(x+8);
+    c->bin        = x2>>16;
+    c->qual       = x2>>8&0xff;
+    c->l_qname    = x2&0xff;
     c->l_extranul = (c->l_qname%4 != 0)? (4 - c->l_qname%4) : 0;
-    c->flag = x[3]>>16; c->n_cigar = x[3]&0xffff;
-    c->l_qseq = x[4];
-    c->mtid = x[5]; c->mpos = (int32_t)x[6]; c->isize = (int32_t)x[7];
+    uint32_t x3   = le_to_u32(x+12);
+    c->flag       = x3>>16;
+    c->n_cigar    = x3&0xffff;
+    c->l_qseq     = le_to_u32(x+16);
+    c->mtid       = le_to_u32(x+20);
+    c->mpos       = le_to_i32(x+24);
+    c->isize      = le_to_i32(x+28);
 
     new_l_data = block_len - 32 + c->l_extranul;
     if (new_l_data > INT_MAX || c->l_qseq < 0 || c->l_qname < 1) return -4;
@@ -776,19 +822,20 @@ int bam_read1(BGZF *fp, bam1_t *b)
     if (realloc_bam_data(b, new_l_data) < 0) return -4;
     b->l_data = new_l_data;
 
-    if (bgzf_read(fp, b->data, c->l_qname) != c->l_qname) return -4;
-    if (b->data[c->l_qname - 1] != '\0') { // Try to fix missing NUL termination
+    if (bgzf_read_small(fp, b->data, c->l_qname) != c->l_qname) return -4;
+    if (b->data[c->l_qname - 1] != '\0') { // try to fix missing nul termination
         if (fixup_missing_qname_nul(b) < 0) return -4;
     }
     for (i = 0; i < c->l_extranul; ++i) b->data[c->l_qname+i] = '\0';
     c->l_qname += c->l_extranul;
     if (b->l_data < c->l_qname ||
-        bgzf_read(fp, b->data + c->l_qname, b->l_data - c->l_qname) != b->l_data - c->l_qname)
+        bgzf_read_small(fp, b->data + c->l_qname, b->l_data - c->l_qname) != b->l_data - c->l_qname)
         return -4;
     if (fp->is_be) swap_data(c, b->l_data, b->data, 0);
     if (bam_tag2cigar(b, 0, 0) < 0)
         return -4;
 
+    // TODO: consider making this conditional
     if (c->n_cigar > 0) { // recompute "bin" and check CIGAR-qlen consistency
         hts_pos_t rlen, qlen;
         bam_cigar2rqlens(c->n_cigar, bam_get_cigar(b), &rlen, &qlen);
@@ -835,15 +882,15 @@ int bam_write1(BGZF *fp, const bam1_t *b)
     if (fp->is_be) {
         for (i = 0; i < 8; ++i) ed_swap_4p(x + i);
         y = block_len;
-        if (ok) ok = (bgzf_write(fp, ed_swap_4p(&y), 4) >= 0);
+        if (ok) ok = (bgzf_write_small(fp, ed_swap_4p(&y), 4) >= 0);
         swap_data(c, b->l_data, b->data, 1);
     } else {
-        if (ok) ok = (bgzf_write(fp, &block_len, 4) >= 0);
+        if (ok) ok = (bgzf_write_small(fp, &block_len, 4) >= 0);
     }
-    if (ok) ok = (bgzf_write(fp, x, 32) >= 0);
-    if (ok) ok = (bgzf_write(fp, b->data, c->l_qname - c->l_extranul) >= 0);
+    if (ok) ok = (bgzf_write_small(fp, x, 32) >= 0);
+    if (ok) ok = (bgzf_write_small(fp, b->data, c->l_qname - c->l_extranul) >= 0);
     if (c->n_cigar <= 0xffff) { // no long CIGAR; write normally
-        if (ok) ok = (bgzf_write(fp, b->data + c->l_qname, b->l_data - c->l_qname) >= 0);
+        if (ok) ok = (bgzf_write_small(fp, b->data + c->l_qname, b->l_data - c->l_qname) >= 0);
     } else { // with long CIGAR, insert a fake CIGAR record and move the real CIGAR to the CG:B,I tag
         uint8_t buf[8];
         uint32_t cigar_st, cigar_en, cigar[2];
@@ -862,12 +909,12 @@ int bam_write1(BGZF *fp, const bam1_t *b)
         cigar[1] = (uint32_t)cigreflen << 4 | BAM_CREF_SKIP;
         u32_to_le(cigar[0], buf);
         u32_to_le(cigar[1], buf + 4);
-        if (ok) ok = (bgzf_write(fp, buf, 8) >= 0); // write cigar: <read_length>S<ref_length>N
-        if (ok) ok = (bgzf_write(fp, &b->data[cigar_en], b->l_data - cigar_en) >= 0); // write data after CIGAR
-        if (ok) ok = (bgzf_write(fp, "CGBI", 4) >= 0); // write CG:B,I
+        if (ok) ok = (bgzf_write_small(fp, buf, 8) >= 0); // write cigar: <read_length>S<ref_length>N
+        if (ok) ok = (bgzf_write_small(fp, &b->data[cigar_en], b->l_data - cigar_en) >= 0); // write data after CIGAR
+        if (ok) ok = (bgzf_write_small(fp, "CGBI", 4) >= 0); // write CG:B,I
         u32_to_le(c->n_cigar, buf);
-        if (ok) ok = (bgzf_write(fp, buf, 4) >= 0); // write the true CIGAR length
-        if (ok) ok = (bgzf_write(fp, &b->data[cigar_st], c->n_cigar * 4) >= 0); // write the real CIGAR
+        if (ok) ok = (bgzf_write_small(fp, buf, 4) >= 0); // write the true CIGAR length
+        if (ok) ok = (bgzf_write_small(fp, &b->data[cigar_st], c->n_cigar * 4) >= 0); // write the real CIGAR
     }
     if (fp->is_be) swap_data(c, b->l_data, b->data, 0);
     return ok? 4 + block_len : -1;
@@ -887,8 +934,6 @@ static int bam_write_idx1(htsFile *fp, const sam_hdr_t *h, const bam1_t *b) {
         return -1;
     if (!bfp->mt)
         hts_idx_amend_last(fp->idx, bgzf_tell(bfp));
-    else
-        bgzf_idx_amend_last(bfp, fp->idx, bgzf_tell(bfp));
 
     int ret = bam_write1(bfp, b);
     if (ret < 0)
@@ -1084,7 +1129,7 @@ int sam_idx_save(htsFile *fp) {
         if (hts_idx_finish(fp->idx, bgzf_tell(fp->fp.bgzf)) < 0)
             return -1;
 
-        return hts_idx_save_as(fp->idx, NULL, fp->fnidx, hts_idx_fmt(fp->idx));
+        return hts_idx_save_but_not_close(fp->idx, fp->fnidx, hts_idx_fmt(fp->idx));
 
     } else if (fp->format.format == cram) {
         // flushed and closed by cram_close
@@ -1255,6 +1300,26 @@ static int bam_sym_lookup(void *data, char *str, char **end,
         }
         break;
 
+    case 'h':
+        if (memcmp(str, "hclen", 5) == 0) {
+            int hclen = 0;
+            uint32_t *cigar = bam_get_cigar(b);
+            uint32_t ncigar = b->core.n_cigar;
+
+            // left
+            if (ncigar > 0 && bam_cigar_op(cigar[0]) == BAM_CHARD_CLIP)
+                hclen = bam_cigar_oplen(cigar[0]);
+
+            // right
+            if (ncigar > 1 && bam_cigar_op(cigar[ncigar-1]) == BAM_CHARD_CLIP)
+                hclen += bam_cigar_oplen(cigar[ncigar-1]);
+
+            *end = str+5;
+            res->d = hclen;
+            return 0;
+        }
+        break;
+
     case 'l':
         if (memcmp(str, "library", 7) == 0) {
             *end = str+7;
@@ -2348,9 +2413,175 @@ int sam_hdr_change_HD(sam_hdr_t *h, const char *key, const char *val)
  *** SAM record I/O ***
  **********************/
 
-static int sam_parse_B_vals(char type, uint32_t n, char *in, char **end,
-                            char *r, bam1_t *b)
-{
+// The speed of this code can vary considerably depending on minor code
+// changes elsewhere as some of the tight loops are particularly prone to
+// speed changes when the instruction blocks are split over a 32-byte
+// boundary.  To protect against this, we explicitly specify an alignment
+// for this function.  If this is insufficient, we may also wish to
+// consider alignment of blocks within this function via
+// __attribute__((optimize("align-loops=5"))) (gcc) or clang equivalents.
+// However it's not very portable.
+// Instead we break into separate functions so we can explicitly specify
+// use __attribute__((aligned(32))) instead and force consistent loop
+// alignment.
+static inline int64_t grow_B_array(bam1_t *b, uint32_t *n, size_t size) {
+    // Avoid overflow on 32-bit platforms, but it breaks BAM anyway
+    if (*n > INT32_MAX*0.666) {
+        errno = ENOMEM;
+        return -1;
+    }
+
+    size_t bytes = (size_t)size * (size_t)(*n>>1);
+    if (possibly_expand_bam_data(b, bytes) < 0) {
+        hts_log_error("Out of memory");
+        return -1;
+    }
+
+    (*n)+=*n>>1;
+    return 0;
+}
+
+
+// This ensures that q always ends up at the next comma after
+// reading a number even if it's followed by junk.  It
+// prevents the possibility of trying to read more than n items.
+#define skip_to_comma_(q) do { while (*(q) > '\t' && *(q) != ',') (q)++; } while (0)
+
+HTS_ALIGN32
+static char *sam_parse_Bc_vals(bam1_t *b, char *q, uint32_t *nused,
+                               uint32_t *nalloc, int *overflow) {
+    while (*q == ',') {
+        if ((*nused)++ >= (*nalloc)) {
+            if (grow_B_array(b, nalloc, 1) < 0)
+                return NULL;
+        }
+        *(b->data + b->l_data) = hts_str2int(q + 1, &q, 8, overflow);
+        b->l_data++;
+    }
+    return q;
+}
+
+HTS_ALIGN32
+static char *sam_parse_BC_vals(bam1_t *b, char *q, uint32_t *nused,
+                               uint32_t *nalloc, int *overflow) {
+    while (*q == ',') {
+        if ((*nused)++ >= (*nalloc)) {
+            if (grow_B_array(b, nalloc, 1) < 0)
+                return NULL;
+        }
+        if (q[1] != '-') {
+            *(b->data + b->l_data) = hts_str2uint(q + 1, &q, 8, overflow);
+            b->l_data++;
+        } else {
+            *overflow = 1;
+            q++;
+            skip_to_comma_(q);
+        }
+    }
+    return q;
+}
+
+HTS_ALIGN32
+static char *sam_parse_Bs_vals(bam1_t *b, char *q, uint32_t *nused,
+                               uint32_t *nalloc, int *overflow) {
+    while (*q == ',') {
+        if ((*nused)++ >= (*nalloc)) {
+            if (grow_B_array(b, nalloc, 2) < 0)
+                return NULL;
+        }
+        i16_to_le(hts_str2int(q + 1, &q, 16, overflow),
+                  b->data + b->l_data);
+        b->l_data += 2;
+    }
+    return q;
+}
+
+HTS_ALIGN32
+static char *sam_parse_BS_vals(bam1_t *b, char *q, uint32_t *nused,
+                               uint32_t *nalloc, int *overflow) {
+    while (*q == ',') {
+        if ((*nused)++ >= (*nalloc)) {
+            if (grow_B_array(b, nalloc, 2) < 0)
+                return NULL;
+        }
+        if (q[1] != '-') {
+            u16_to_le(hts_str2uint(q + 1, &q, 16, overflow),
+                      b->data + b->l_data);
+            b->l_data += 2;
+        } else {
+            *overflow = 1;
+            q++;
+            skip_to_comma_(q);
+        }
+    }
+    return q;
+}
+
+HTS_ALIGN32
+static char *sam_parse_Bi_vals(bam1_t *b, char *q, uint32_t *nused,
+                               uint32_t *nalloc, int *overflow) {
+    while (*q == ',') {
+        if ((*nused)++ >= (*nalloc)) {
+            if (grow_B_array(b, nalloc, 4) < 0)
+                return NULL;
+        }
+        i32_to_le(hts_str2int(q + 1, &q, 32, overflow),
+                  b->data + b->l_data);
+        b->l_data += 4;
+    }
+    return q;
+}
+
+HTS_ALIGN32
+static char *sam_parse_BI_vals(bam1_t *b, char *q, uint32_t *nused,
+                               uint32_t *nalloc, int *overflow) {
+    while (*q == ',') {
+        if ((*nused)++ >= (*nalloc)) {
+            if (grow_B_array(b, nalloc, 4) < 0)
+                return NULL;
+        }
+        if (q[1] != '-') {
+            u32_to_le(hts_str2uint(q + 1, &q, 32, overflow),
+                      b->data + b->l_data);
+            b->l_data += 4;
+        } else {
+            *overflow = 1;
+            q++;
+            skip_to_comma_(q);
+        }
+    }
+    return q;
+}
+
+HTS_ALIGN32
+static char *sam_parse_Bf_vals(bam1_t *b, char *q, uint32_t *nused,
+                               uint32_t *nalloc, int *overflow) {
+    while (*q == ',') {
+        if ((*nused)++ >= (*nalloc)) {
+            if (grow_B_array(b, nalloc, 4) < 0)
+                return NULL;
+        }
+        float_to_le(strtod(q + 1, &q), b->data + b->l_data);
+        b->l_data += 4;
+    }
+    return q;
+}
+
+HTS_ALIGN32
+static int sam_parse_B_vals_r(char type, uint32_t nalloc, char *in,
+                              char **end, bam1_t *b,
+                              int *ctr) {
+    // Protect against infinite recursion when dealing with invalid input.
+    // An example string is "XX:B:C,-".  The lack of a number means min=0,
+    // but it overflowed due to "-" and so we repeat ad-infinitum.
+    //
+    // Loop detection is the safest solution incase there are other
+    // strange corner cases with malformed inputs.
+    if (++(*ctr) > 2) {
+        hts_log_error("Malformed data in B:%c array", type);
+        return -1;
+    }
+
     int orig_l = b->l_data;
     char *q = in;
     int32_t size;
@@ -2363,80 +2594,60 @@ static int sam_parse_B_vals(char type, uint32_t n, char *in, char **end,
         return -1;
     }
 
-    // Ensure space for type + values
-    bytes = (size_t) n * (size_t) size;
-    if (bytes / size != n
+    // Ensure space for type + values.
+    // The first pass through here we don't know the number of entries and
+    // nalloc == 0.  We start with a small working set and then parse the
+    // data, growing as needed.
+    //
+    // If we have a second pass through we do know the number of entries
+    // and nalloc is already known.  We have no need to expand the bam data.
+    if (!nalloc)
+         nalloc=7;
+
+    // Ensure allocated memory is big enough (for current nalloc estimate)
+    bytes = (size_t) nalloc * (size_t) size;
+    if (bytes / size != nalloc
         || possibly_expand_bam_data(b, bytes + 2 + sizeof(uint32_t))) {
         hts_log_error("Out of memory");
         return -1;
     }
 
+    uint32_t nused = 0;
+
     b->data[b->l_data++] = 'B';
     b->data[b->l_data++] = type;
-    i32_to_le(n, b->data + b->l_data);
+    // 32-bit B-array length is inserted later once we know it.
+    int b_len_idx = b->l_data;
     b->l_data += sizeof(uint32_t);
-    // This ensures that q always ends up at the next comma after
-    // reading a number even if it's followed by junk.  It
-    // prevents the possibility of trying to read more than n items.
-#define skip_to_comma_(q) do { while (*(q) > '\t' && *(q) != ',') (q)++; } while (0)
+
     if (type == 'c') {
-        while (q < r) {
-            *(b->data + b->l_data) = hts_str2int(q + 1, &q, 8, &overflow);
-            b->l_data++;
-            skip_to_comma_(q);
-        }
+        if (!(q = sam_parse_Bc_vals(b, q, &nused, &nalloc, &overflow)))
+            return -1;
     } else if (type == 'C') {
-        while (q < r) {
-            if (*q != '-') {
-                *(b->data + b->l_data) = hts_str2uint(q + 1, &q, 8, &overflow);
-                b->l_data++;
-            } else {
-                overflow = 1;
-            }
-            skip_to_comma_(q);
-        }
+        if (!(q = sam_parse_BC_vals(b, q, &nused, &nalloc, &overflow)))
+            return -1;
     } else if (type == 's') {
-        while (q < r) {
-            i16_to_le(hts_str2int(q + 1, &q, 16, &overflow), b->data + b->l_data);
-            b->l_data += 2;
-            skip_to_comma_(q);
-        }
+        if (!(q = sam_parse_Bs_vals(b, q, &nused, &nalloc, &overflow)))
+            return -1;
     } else if (type == 'S') {
-        while (q < r) {
-            if (*q != '-') {
-                u16_to_le(hts_str2uint(q + 1, &q, 16, &overflow), b->data + b->l_data);
-                b->l_data += 2;
-            } else {
-                overflow = 1;
-            }
-            skip_to_comma_(q);
-        }
+        if (!(q = sam_parse_BS_vals(b, q, &nused, &nalloc, &overflow)))
+            return -1;
     } else if (type == 'i') {
-        while (q < r) {
-            i32_to_le(hts_str2int(q + 1, &q, 32, &overflow), b->data + b->l_data);
-            b->l_data += 4;
-            skip_to_comma_(q);
-        }
+        if (!(q = sam_parse_Bi_vals(b, q, &nused, &nalloc, &overflow)))
+            return -1;
     } else if (type == 'I') {
-        while (q < r) {
-            if (*q != '-') {
-                u32_to_le(hts_str2uint(q + 1, &q, 32, &overflow), b->data + b->l_data);
-                b->l_data += 4;
-            } else {
-                overflow = 1;
-            }
-            skip_to_comma_(q);
-        }
+        if (!(q = sam_parse_BI_vals(b, q, &nused, &nalloc, &overflow)))
+            return -1;
     } else if (type == 'f') {
-        while (q < r) {
-            float_to_le(strtod(q + 1, &q), b->data + b->l_data);
-            b->l_data += 4;
-            skip_to_comma_(q);
-        }
-    } else {
-        hts_log_error("Unrecognized type B:%c", type);
+        if (!(q = sam_parse_Bf_vals(b, q, &nused, &nalloc, &overflow)))
+            return -1;
+    }
+    if (*q != '\t' && *q != '\0') {
+        // Unknown B array type or junk in the numbers
+        hts_log_error("Malformed B:%c", type);
         return -1;
     }
+    i32_to_le(nused, b->data + b_len_idx);
 
     if (!overflow) {
         *end = q;
@@ -2444,6 +2655,7 @@ static int sam_parse_B_vals(char type, uint32_t n, char *in, char **end,
     } else {
         int64_t max = 0, min = 0, val;
         // Given type was incorrect.  Try to rescue the situation.
+        char *r = q;
         q = in;
         overflow = 0;
         b->l_data = orig_l;
@@ -2458,19 +2670,19 @@ static int sam_parse_B_vals(char type, uint32_t n, char *in, char **end,
         if (!overflow) {
             if (min < 0) {
                 if (min >= INT8_MIN && max <= INT8_MAX) {
-                    return sam_parse_B_vals('c', n, in, end, r, b);
+                    return sam_parse_B_vals_r('c', nalloc, in, end, b, ctr);
                 } else if (min >= INT16_MIN && max <= INT16_MAX) {
-                    return sam_parse_B_vals('s', n, in, end, r, b);
+                    return sam_parse_B_vals_r('s', nalloc, in, end, b, ctr);
                 } else if (min >= INT32_MIN && max <= INT32_MAX) {
-                    return sam_parse_B_vals('i', n, in, end, r, b);
+                    return sam_parse_B_vals_r('i', nalloc, in, end, b, ctr);
                 }
             } else {
                 if (max < UINT8_MAX) {
-                    return sam_parse_B_vals('C', n, in, end, r, b);
+                    return sam_parse_B_vals_r('C', nalloc, in, end, b, ctr);
                 } else if (max <= UINT16_MAX) {
-                    return sam_parse_B_vals('S', n, in, end, r, b);
+                    return sam_parse_B_vals_r('S', nalloc, in, end, b, ctr);
                 } else if (max <= UINT32_MAX) {
-                    return sam_parse_B_vals('I', n, in, end, r, b);
+                    return sam_parse_B_vals_r('I', nalloc, in, end, b, ctr);
                 }
             }
         }
@@ -2481,6 +2693,14 @@ static int sam_parse_B_vals(char type, uint32_t n, char *in, char **end,
 #undef skip_to_comma_
 }
 
+HTS_ALIGN32
+static int sam_parse_B_vals(char type, char *in, char **end, bam1_t *b)
+{
+    int ctr = 0;
+    uint32_t nalloc = 0;
+    return sam_parse_B_vals_r(type, nalloc, in, end, b, &ctr);
+}
+
 static inline unsigned int parse_sam_flag(char *v, char **rv, int *overflow) {
     if (*v >= '1' && *v <= '9') {
         return hts_str2uint(v, rv, 16, overflow);
@@ -2624,16 +2844,11 @@ static inline int aux_parse(char *start, char *end, bam1_t *b, int lenient,
             b->data[b->l_data++] = '\0';
             q = end;
         } else if (type == 'B') {
-            uint32_t n;
-            char *r;
             type = *q++; // q points to the first ',' following the typing byte
             _parse_err(*q && *q != ',' && *q != '\t',
                        "B aux field type not followed by ','");
 
-            for (r = q, n = 0; *r > '\t'; ++r)
-                if (*r == ',') ++n;
-
-            if (sam_parse_B_vals(type, n, q, &q, r, b) < 0)
+            if (sam_parse_B_vals(type, q, &q, b) < 0)
                 goto err_ret;
         } else _parse_err(1, "unrecognized type %s", hts_strprint(logbuf, sizeof logbuf, '\'', &type, 1));
 
@@ -2732,7 +2947,7 @@ int sam_parse1(kstring_t *s, sam_hdr_t *h, bam1_t *b)
     } else c->tid = -1;
 
     // pos
-    c->pos = hts_str2uint(p, &p, 63, &overflow) - 1;
+    c->pos = hts_str2uint(p, &p, 62, &overflow) - 1;
     if (*p++ != '\t') goto err_ret;
     if (c->pos < 0 && c->tid >= 0) {
         _parse_warn(1, "mapped query cannot have zero coordinate; treated as unmapped");
@@ -2750,7 +2965,6 @@ int sam_parse1(kstring_t *s, sam_hdr_t *h, bam1_t *b)
         int n_cigar = bam_parse_cigar(p, &p, b);
         if (n_cigar < 1 || *p++ != '\t') goto err_ret;
         cigar = (uint32_t *)(b->data + old_l_data);
-        c->n_cigar = n_cigar;
 
         // can't use bam_endpos() directly as some fields not yet set up
         cigreflen = (!(c->flag&BAM_FUNMAP))? bam_cigar2rlen(c->n_cigar, cigar) : 1;
@@ -2776,15 +2990,16 @@ int sam_parse1(kstring_t *s, sam_hdr_t *h, bam1_t *b)
         _parse_warn(c->mtid < 0, "unrecognized mate reference name %s; treated as unmapped", hts_strprint(logbuf, sizeof logbuf, '"', q, SIZE_MAX));
     }
     // mpos
-    c->mpos = hts_str2uint(p, &p, 63, &overflow) - 1;
+    c->mpos = hts_str2uint(p, &p, 62, &overflow) - 1;
     if (*p++ != '\t') goto err_ret;
     if (c->mpos < 0 && c->mtid >= 0) {
         _parse_warn(1, "mapped mate cannot have zero coordinate; treated as unmapped");
         c->mtid = -1;
     }
     // tlen
-    c->isize = hts_str2int(p, &p, 64, &overflow);
+    c->isize = hts_str2int(p, &p, 63, &overflow);
     if (*p++ != '\t') goto err_ret;
+    _parse_err(overflow, "number outside allowed range");
     // seq
     q = _read_token(p);
     if (strcmp(q, "*")) {
@@ -2926,20 +3141,36 @@ ssize_t bam_parse_cigar(const char *in, char **end, bam1_t *b) {
     }
     if (end) *end = (char *)in;
 
-    if (*in == '*') {
-        if (end) (*end)++;
+    n_cigar = (*in == '*') ? 0 : read_ncigar(in);
+    if (!n_cigar && b->core.n_cigar == 0) {
+        if (end) *end = (char *)in+1;
         return 0;
     }
-    n_cigar = read_ncigar(in);
-    if (!n_cigar) return 0;
-    if (possibly_expand_bam_data(b, n_cigar * sizeof(uint32_t)) < 0) {
+
+    ssize_t cig_diff = n_cigar - b->core.n_cigar;
+    if (cig_diff > 0 &&
+        possibly_expand_bam_data(b, cig_diff * sizeof(uint32_t)) < 0) {
         hts_log_error("Memory allocation error");
         return -1;
     }
 
-    if (!(diff = parse_cigar(in, (uint32_t *)(b->data + b->l_data), n_cigar))) return -1;
-    b->l_data += (n_cigar * sizeof(uint32_t));
-    if (end) *end = (char *)in+diff;
+    uint32_t *cig = bam_get_cigar(b);
+    if ((uint8_t *)cig != b->data + b->l_data) {
+        // Modifying an BAM existing BAM record
+        uint8_t  *seq = bam_get_seq(b);
+        memmove(cig + n_cigar, seq, (b->data + b->l_data) - seq);
+    }
+
+    if (n_cigar) {
+        if (!(diff = parse_cigar(in, cig, n_cigar)))
+            return -1;
+    } else {
+        diff = 1; // handle "*"
+    }
+
+    b->l_data += cig_diff * sizeof(uint32_t);
+    b->core.n_cigar = n_cigar;
+    if (end) *end = (char *)in + diff;
 
     return n_cigar;
 }
@@ -4097,6 +4328,9 @@ static inline int sam_read1_sam(htsFile *fp, sam_hdr_t *h, bam1_t *b) {
 
             fd->curr_bam = NULL;
             fd->curr_idx = 0;
+        // Consider prefetching next record?  I.e.
+        // } else {
+        //     __builtin_prefetch(&b_array[fd->curr_idx], 0, 3);
         }
 
         ret = 0;
@@ -4167,6 +4401,15 @@ int sam_read1(htsFile *fp, sam_hdr_t *h, bam1_t *b)
     return pass_filter < 0 ? -2 : ret;
 }
 
+// With gcc, -O3 or -ftree-loop-vectorize is really key here as otherwise
+// this code isn't vectorised and runs far slower than is necessary (even
+// with the restrict keyword being used).
+static inline void HTS_OPT3
+add33(uint8_t *a, const uint8_t * b, int32_t len) {
+    uint32_t i;
+    for (i = 0; i < len; i++)
+        a[i] = b[i]+33;
+}
 
 static int sam_format1_append(const bam_hdr_t *h, const bam1_t *b, kstring_t *str)
 {
@@ -4217,10 +4460,8 @@ static int sam_format1_append(const bam_hdr_t *h, const bam1_t *b, kstring_t *st
         if (s[0] == 0xff) {
             cp[i++] = '*';
         } else {
-            // local copy of c->l_qseq to aid unrolling
-            uint32_t lqseq = c->l_qseq;
-            for (i = 0; i < lqseq; ++i)
-                cp[i]=s[i]+33;
+            add33((uint8_t *)cp, s, c->l_qseq); // cp[i] = s[i]+33;
+            i = c->l_qseq;
         }
         cp[i] = 0;
         cp += i;
@@ -4241,8 +4482,8 @@ static int sam_format1_append(const bam_hdr_t *h, const bam1_t *b, kstring_t *st
     return str->l;
 
  bad_aux:
-    hts_log_error("Corrupted aux data for read %.*s",
-                  b->core.l_qname, bam_get_qname(b));
+    hts_log_error("Corrupted aux data for read %.*s flag %d",
+                  b->core.l_qname, bam_get_qname(b), b->core.flag);
     errno = EINVAL;
     return -1;
 
@@ -4636,7 +4877,7 @@ uint8_t *bam_aux_first(const bam1_t *b)
 {
     uint8_t *s = bam_get_aux(b);
     uint8_t *end = b->data + b->l_data;
-    if (s >= end) { errno = ENOENT; return NULL; }
+    if (end - s <= 2) { errno = ENOENT; return NULL; }
     return s+2;
 }
 
@@ -4645,11 +4886,12 @@ uint8_t *bam_aux_next(const bam1_t *b, const uint8_t *s)
     uint8_t *end = b->data + b->l_data;
     uint8_t *next = s? skip_aux((uint8_t *) s, end) : end;
     if (next == NULL) goto bad_aux;
-    if (next >= end) { errno = ENOENT; return NULL; }
+    if (end - next <= 2) { errno = ENOENT; return NULL; }
     return next+2;
 
  bad_aux:
-    hts_log_error("Corrupted aux data for read %s", bam_get_qname(b));
+    hts_log_error("Corrupted aux data for read %s flag %d",
+                  bam_get_qname(b), b->core.flag);
     errno = EINVAL;
     return NULL;
 }
@@ -4671,7 +4913,8 @@ uint8_t *bam_aux_get(const bam1_t *b, const char tag[2])
     return NULL;
 
  bad_aux:
-    hts_log_error("Corrupted aux data for read %s", bam_get_qname(b));
+    hts_log_error("Corrupted aux data for read %s flag %d",
+                  bam_get_qname(b), b->core.flag);
     errno = EINVAL;
     return NULL;
 }
@@ -4695,7 +4938,8 @@ uint8_t *bam_aux_remove(bam1_t *b, uint8_t *s)
     return s;
 
  bad_aux:
-    hts_log_error("Corrupted aux data for read %s", bam_get_qname(b));
+    hts_log_error("Corrupted aux data for read %s flag %d",
+                  bam_get_qname(b), b->core.flag);
     errno = EINVAL;
     return NULL;
 }
@@ -5500,6 +5744,8 @@ void bam_plp_destroy(bam_plp_t iter)
     lbnode_t *p, *pnext;
     if ( iter->overlaps ) kh_destroy(olap_hash, iter->overlaps);
     for (p = iter->head; p != NULL; p = pnext) {
+        if (iter->plp_destruct && p != iter->tail)
+            iter->plp_destruct(iter->data, &p->b, &p->cd);
         pnext = p->next;
         mp_free(iter->mp, p);
     }
@@ -5649,8 +5895,7 @@ static int tweak_overlap_quality(bam1_t *a, bam1_t *b)
     // Loop over the overlapping region nulling qualities in either
     // seq a or b.
     int err = 0;
-    while ( 1 )
-    {
+    while ( 1 ) {
         // Step to next matching reference position in a and b
         while ( a_ret >= 0 && a_iref>=0 && a_iref < iref - a->core.pos )
             a_ret = cigar_iref2iseq_next(&a_cigar, a_cigar_max,
@@ -5659,8 +5904,6 @@ static int tweak_overlap_quality(bam1_t *a, bam1_t *b)
             err = a_ret<-1?-1:0;
             break;
         }
-        if ( iref < a_iref + a->core.pos )
-            iref = a_iref + a->core.pos;
 
         while ( b_ret >= 0 && b_iref>=0 && b_iref < iref - b->core.pos )
             b_ret = cigar_iref2iseq_next(&b_cigar, b_cigar_max, &b_icig,
@@ -5669,14 +5912,55 @@ static int tweak_overlap_quality(bam1_t *a, bam1_t *b)
             err = b_ret<-1?-1:0;
             break;
         }
+
+        if ( iref < a_iref + a->core.pos )
+            iref = a_iref + a->core.pos;
+
         if ( iref < b_iref + b->core.pos )
             iref = b_iref + b->core.pos;
 
         iref++;
 
-        if ( a_iref+a->core.pos != b_iref+b->core.pos )
-            // only CMATCH positions, don't know what to do with indels
-            continue;
+        // If A or B has a deletion then we catch up the other to this point.
+        // We also amend quality values using the same rules for mismatch.
+        if (a_iref+a->core.pos != b_iref+b->core.pos) {
+            if (a_iref+a->core.pos < b_iref+b->core.pos
+                && b_cigar > bam_get_cigar(b)
+                && bam_cigar_op(b_cigar[-1]) == BAM_CDEL) {
+                // Del in B means it's moved on further than A
+                do {
+                    a_qual[a_iseq] = amul
+                        ? a_qual[a_iseq]*0.8
+                        : 0;
+                    a_ret = cigar_iref2iseq_next(&a_cigar, a_cigar_max,
+                                                 &a_icig, &a_iseq, &a_iref);
+                    if (a_ret < 0)
+                        return -(a_ret<-1); // 0 or -1
+                } while (a_iref + a->core.pos < b_iref+b->core.pos);
+            } else if (a_cigar > bam_get_cigar(a)
+                       && bam_cigar_op(a_cigar[-1]) == BAM_CDEL) {
+                // Del in A means it's moved on further than B
+                do {
+                    b_qual[b_iseq] = bmul
+                        ? b_qual[b_iseq]*0.8
+                        : 0;
+                    b_ret = cigar_iref2iseq_next(&b_cigar, b_cigar_max,
+                                                 &b_icig, &b_iseq, &b_iref);
+                    if (b_ret < 0)
+                        return -(b_ret<-1); // 0 or -1
+                } while (b_iref + b->core.pos < a_iref+a->core.pos);
+            } else {
+                // Anything else, eg ref-skip, we don't support here
+                continue;
+            }
+        }
+
+        // fprintf(stderr, "a_cig=%ld,%ld b_cig=%ld,%ld iref=%ld "
+        //         "a_iref=%ld b_iref=%ld a_iseq=%ld b_iseq=%ld\n",
+        //         a_cigar-bam_get_cigar(a), a_icig,
+        //         b_cigar-bam_get_cigar(b), b_icig,
+        //         iref, a_iref+a->core.pos+1, b_iref+b->core.pos+1,
+        //         a_iseq, b_iseq);
 
         if (a_iseq > a->core.l_qseq || b_iseq > b->core.l_qseq)
             // Fell off end of sequence, bad CIGAR?
diff --git a/htslib/sam_internal.h b/htslib/sam_internal.h
index b1fce9fe4..135b881b1 100644
--- a/htslib/sam_internal.h
+++ b/htslib/sam_internal.h
@@ -1,6 +1,6 @@
 /*  sam_internal.h -- internal functions; not part of the public API.
 
-    Copyright (C) 2019-2020 Genome Research Ltd.
+    Copyright (C) 2019-2020, 2023-2024 Genome Research Ltd.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -25,6 +25,7 @@ DEALINGS IN THE SOFTWARE.  */
 
 #include <errno.h>
 #include <stdint.h>
+
 #include "htslib/sam.h"
 
 #ifdef __cplusplus
@@ -68,7 +69,7 @@ static inline int possibly_expand_bam_data(bam1_t *b, size_t bytes) {
  * for (i = 0; i < len; i++)
  *    seq[i] = seq_nt16_str[bam_seqi(nib, i)];
  */
-static inline void nibble2base(uint8_t *nib, char *seq, int len) {
+static inline void nibble2base_default(uint8_t *nib, char *seq, int len) {
     static const char code2base[512] =
         "===A=C=M=G=R=S=V=T=W=Y=H=K=D=B=N"
         "A=AAACAMAGARASAVATAWAYAHAKADABAN"
@@ -98,6 +99,21 @@ static inline void nibble2base(uint8_t *nib, char *seq, int len) {
         seq[i] = seq_nt16_str[bam_seqi(nib, i)];
 }
 
+#if defined HAVE_ATTRIBUTE_CONSTRUCTOR && \
+    ((defined __x86_64__ && defined HAVE_ATTRIBUTE_TARGET_SSSE3 && defined HAVE_BUILTIN_CPU_SUPPORT_SSSE3) || \
+     (defined __ARM_NEON))
+#define BUILDING_SIMD_NIBBLE2BASE
+#endif
+
+static inline void nibble2base(uint8_t *nib, char *seq, int len) {
+#ifdef BUILDING_SIMD_NIBBLE2BASE
+    extern void (*htslib_nibble2base)(uint8_t *nib, char *seq, int len);
+    htslib_nibble2base(nib, seq, len);
+#else
+    nibble2base_default(nib, seq, len);
+#endif
+}
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/htslib/sam_mods.c b/htslib/sam_mods.c
index fe8db85f7..e45f26d91 100644
--- a/htslib/sam_mods.c
+++ b/htslib/sam_mods.c
@@ -1,6 +1,6 @@
 /*  sam_mods.c -- Base modification handling in SAM and BAM.
 
-    Copyright (C) 2020-2023 Genome Research Ltd.
+    Copyright (C) 2020-2024 Genome Research Ltd.
 
     Author: James Bonfield <jkb@sanger.ac.uk>
 
@@ -24,6 +24,7 @@ DEALINGS IN THE SOFTWARE.  */
 
 #define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
 #include <config.h>
+#include <assert.h>
 
 #include "htslib/sam.h"
 #include "textutils_internal.h"
@@ -245,7 +246,7 @@ int bam_parse_basemod2(const bam1_t *b, hts_base_mod_state *state,
     }
 
     uint8_t *mi = bam_aux_get(b, "MN");
-    if (mi && bam_aux2i(mi) != b->core.l_qseq) {
+    if (mi && bam_aux2i(mi) != b->core.l_qseq && b->core.l_qseq) {
         // bam_aux2i with set errno = EINVAL and return 0 if the tag
         // isn't integer, but 0 will be a seq-length mismatch anyway so
         // triggers an error here too.
@@ -359,7 +360,7 @@ int bam_parse_basemod2(const bam1_t *b, hts_base_mod_state *state,
                 if (!cp_end) {
                     // empty list
                     delta = INT_MAX;
-                    cp_end = cp+1;
+                    cp_end = cp;
                 }
             }
             // Now delta is first in list or computed remainder,
@@ -378,7 +379,7 @@ int bam_parse_basemod2(const bam1_t *b, hts_base_mod_state *state,
                 }
                 state->MMcount  [mod_num] = delta;
                 if (b->core.flag & BAM_FREVERSE) {
-                    state->MM   [mod_num] = cp+1;
+                    state->MM   [mod_num] = me+1;
                     state->MMend[mod_num] = cp_end;
                     state->ML   [mod_num] = ml ? ml+n +(ndelta-1)*stride: NULL;
                 } else {
@@ -426,6 +427,10 @@ int bam_parse_basemod2(const bam1_t *b, hts_base_mod_state *state,
             }
         }
     }
+    if (ml && ml != ml_end) {
+        hts_log_error("%s: Too many entries in ML tag", bam_get_qname(b));
+        return -1;
+    }
 
     state->nmods = mod_num;
 
@@ -496,6 +501,11 @@ int bam_mods_at_next_pos(const bam1_t *b, hts_base_mod_state *state,
         if (b->core.flag & BAM_FREVERSE) {
             // process MM list backwards
             char *cp;
+            if (state->MMend[i]-1 < state->MM[i]) {
+                // Should be impossible to hit if coding is correct
+                hts_log_error("Assert failed while processing base modification states");
+                return -1;
+            }
             for (cp = state->MMend[i]-1; cp != state->MM[i]; cp--)
                 if (*cp == ',')
                     break;
@@ -544,9 +554,6 @@ int bam_mods_at_next_pos(const bam1_t *b, hts_base_mod_state *state,
  */
 int bam_next_basemod(const bam1_t *b, hts_base_mod_state *state,
                      hts_base_mod *mods, int n_mods, int *pos) {
-    if (state->seq_pos >= b->core.l_qseq)
-        return 0;
-
     // Look through state->MMcount arrays to see when the next lowest is
     // per base type;
     int next[16], freq[16] = {0}, i;
@@ -579,18 +586,6 @@ int bam_next_basemod(const bam1_t *b, hts_base_mod_state *state,
     }
     *pos = state->seq_pos = i;
 
-    if (i >= b->core.l_qseq) {
-        // Check for more MM elements than bases present.
-        for (i = 0; i < state->nmods; i++) {
-            if (!(b->core.flag & BAM_FREVERSE) &&
-                state->MMcount[i] < 0x7f000000) {
-                hts_log_warning("MM tag refers to bases beyond sequence length");
-                return -1;
-            }
-        }
-        return 0;
-    }
-
     if (b->core.flag & BAM_FREVERSE) {
         for (i = 0; i < state->nmods; i++)
             state->MMcount[i] -= freq[seqi_rc[state->canonical[i]]];
@@ -599,6 +594,23 @@ int bam_next_basemod(const bam1_t *b, hts_base_mod_state *state,
             state->MMcount[i] -= freq[state->canonical[i]];
     }
 
+    if (b->core.l_qseq && state->seq_pos >= b->core.l_qseq &&
+        !(b->core.flag & BAM_FREVERSE)) {
+        // Spots +ve orientation run-overs.
+        // The -ve orientation is spotted in bam_parse_basemod2
+        int i;
+        for (i = 0; i < state->nmods; i++) {
+            // Check if any remaining items in MM after hitting the end
+            // of the sequence.
+            if (state->MMcount[i] < 0x7f000000 ||
+                (*state->MM[i]!=0 && *state->MM[i]!=';')) {
+                hts_log_warning("MM tag refers to bases beyond sequence length");
+                return -1;
+            }
+        }
+        return 0;
+    }
+
     int r = bam_mods_at_next_pos(b, state, mods, n_mods);
     return r > 0 ? r : 0;
 }
diff --git a/htslib/simd.c b/htslib/simd.c
new file mode 100644
index 000000000..865dd887e
--- /dev/null
+++ b/htslib/simd.c
@@ -0,0 +1,222 @@
+/*  simd.c -- SIMD optimised versions of various internal functions.
+
+    Copyright (C) 2024 Genome Research Ltd.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.  */
+
+#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
+#include <config.h>
+
+// These must be defined before the first system include to ensure that legacy
+// BSD types needed by <sys/sysctl.h> remain defined when _XOPEN_SOURCE is set.
+#if defined __APPLE__
+#define _DARWIN_C_SOURCE
+#elif defined __NetBSD__
+#define _NETBSD_SOURCE
+#endif
+
+#include "htslib/sam.h"
+#include "sam_internal.h"
+
+#if defined __x86_64__
+#include <immintrin.h>
+#elif defined __ARM_NEON
+#include <arm_neon.h>
+#endif
+
+#if defined __arm__ || defined __aarch64__
+
+#if defined __linux__ || defined __FreeBSD__
+#include <sys/auxv.h>
+#elif defined __APPLE__
+#include <sys/types.h>
+#include <sys/sysctl.h>
+#elif defined __NetBSD__
+#include <stddef.h>
+#include <sys/param.h>
+#include <sys/sysctl.h>
+#ifdef __aarch64__
+#include <aarch64/armreg.h>
+#else
+#include <arm/armreg.h>
+#endif
+#elif defined _WIN32
+#include <processthreadsapi.h>
+#endif
+
+static inline int cpu_supports_neon(void) {
+#if defined __linux__ && defined __arm__ && defined HWCAP_NEON
+    return (getauxval(AT_HWCAP) & HWCAP_NEON) != 0;
+#elif defined __linux__ && defined __arm__ && defined HWCAP_ARM_NEON
+    return (getauxval(AT_HWCAP) & HWCAP_ARM_NEON) != 0;
+#elif defined __linux__ && defined __aarch64__ && defined HWCAP_ASIMD
+    return (getauxval(AT_HWCAP) & HWCAP_ASIMD) != 0;
+#elif defined __APPLE__ && defined __aarch64__
+    int32_t ctl;
+    size_t ctlsize = sizeof ctl;
+    if (sysctlbyname("hw.optional.AdvSIMD", &ctl, &ctlsize, NULL, 0) != 0) return 0;
+    if (ctlsize != sizeof ctl) return 0;
+    return ctl;
+#elif defined __FreeBSD__ && defined __arm__ && defined HWCAP_NEON
+    unsigned long cap;
+    if (elf_aux_info(AT_HWCAP, &cap, sizeof cap) != 0) return 0;
+    return (cap & HWCAP_NEON) != 0;
+#elif defined __FreeBSD__ && defined __aarch64__ && defined HWCAP_ASIMD
+    unsigned long cap;
+    if (elf_aux_info(AT_HWCAP, &cap, sizeof cap) != 0) return 0;
+    return (cap & HWCAP_ASIMD) != 0;
+#elif defined __NetBSD__ && defined __arm__ && defined ARM_MVFR0_ASIMD_MASK
+    uint32_t buf[16];
+    size_t buflen = sizeof buf;
+    if (sysctlbyname("machdep.id_mvfr", buf, &buflen, NULL, 0) != 0) return 0;
+    if (buflen < sizeof(uint32_t)) return 0;
+    return (buf[0] & ARM_MVFR0_ASIMD_MASK) == 0x00000002;
+#elif defined __NetBSD__ && defined __aarch64__ && defined ID_AA64PFR0_EL1_ADVSIMD
+    struct aarch64_sysctl_cpu_id buf;
+    size_t buflen = sizeof buf;
+    if (sysctlbyname("machdep.cpu0.cpu_id", &buf, &buflen, NULL, 0) != 0) return 0;
+    if (buflen < offsetof(struct aarch64_sysctl_cpu_id, ac_aa64pfr0) + sizeof(uint64_t)) return 0;
+    return (buf.ac_aa64pfr0 & ID_AA64PFR0_EL1_ADVSIMD & 0x00e00000) == 0;
+#elif defined _WIN32
+    return IsProcessorFeaturePresent(PF_ARM_V8_INSTRUCTIONS_AVAILABLE) != 0;
+#else
+    return 0;
+#endif
+}
+
+#endif
+
+#ifdef BUILDING_SIMD_NIBBLE2BASE
+
+void (*htslib_nibble2base)(uint8_t *nib, char *seq, int len) = nibble2base_default;
+
+#if defined __x86_64__
+
+/*
+ * Convert a nibble encoded BAM sequence to a string of bases.
+ *
+ * Using SSSE3 instructions, 16 codepoints that hold 2 bases each can be
+ * unpacked into 32 indexes from 0-15. Using the pshufb instruction these can
+ * be converted to the IUPAC characters.
+ * It falls back on the nibble2base_default function for the remainder.
+ */
+
+__attribute__((target("ssse3")))
+static void nibble2base_ssse3(uint8_t *nib, char *seq, int len) {
+    const char *seq_end_ptr = seq + len;
+    char *seq_cursor = seq;
+    uint8_t *nibble_cursor = nib;
+    const char *seq_vec_end_ptr = seq_end_ptr - (2 * sizeof(__m128i) - 1);
+    __m128i nuc_lookup_vec = _mm_lddqu_si128((__m128i *)seq_nt16_str);
+    /* Nucleotides are encoded 4-bits per nucleotide and stored in 8-bit bytes
+       as follows: |AB|CD|EF|GH|. The 4-bit codes (going from 0-15) can be used
+       together with the pshufb instruction as a lookup table. The most efficient
+       way is to use bitwise AND and shift to create two vectors. One with all
+       the upper codes (|A|C|E|G|) and one with the lower codes (|B|D|F|H|).
+       The lookup can then be performed and the resulting vectors can be
+       interleaved again using the unpack instructions. */
+    while (seq_cursor < seq_vec_end_ptr) {
+        __m128i encoded = _mm_lddqu_si128((__m128i *)nibble_cursor);
+        __m128i encoded_upper = _mm_srli_epi64(encoded, 4);
+        encoded_upper = _mm_and_si128(encoded_upper, _mm_set1_epi8(15));
+        __m128i encoded_lower = _mm_and_si128(encoded, _mm_set1_epi8(15));
+        __m128i nucs_upper = _mm_shuffle_epi8(nuc_lookup_vec, encoded_upper);
+        __m128i nucs_lower = _mm_shuffle_epi8(nuc_lookup_vec, encoded_lower);
+        __m128i first_nucleotides = _mm_unpacklo_epi8(nucs_upper, nucs_lower);
+        __m128i second_nucleotides = _mm_unpackhi_epi8(nucs_upper, nucs_lower);
+        _mm_storeu_si128((__m128i *)seq_cursor, first_nucleotides);
+        _mm_storeu_si128((__m128i *)(seq_cursor + sizeof(__m128i)),
+                         second_nucleotides);
+        nibble_cursor += sizeof(__m128i);
+        seq_cursor += 2 * sizeof(__m128i);
+    }
+    nibble2base_default(nibble_cursor, seq_cursor, seq_end_ptr - seq_cursor);
+}
+
+__attribute__((constructor))
+static void nibble2base_resolve(void) {
+    if (__builtin_cpu_supports("ssse3")) {
+        htslib_nibble2base = nibble2base_ssse3;
+    }
+}
+
+#elif defined __ARM_NEON
+
+static void nibble2base_neon(uint8_t *nib, char *seq0, int len) {
+    uint8x16_t low_nibbles_mask = vdupq_n_u8(0x0f);
+    uint8x16_t nuc_lookup_vec = vld1q_u8((const uint8_t *) seq_nt16_str);
+#ifndef __aarch64__
+    uint8x8x2_t nuc_lookup_vec2 = {{ vget_low_u8(nuc_lookup_vec), vget_high_u8(nuc_lookup_vec) }};
+#endif
+
+    uint8_t *seq = (uint8_t *) seq0;
+    int blocks;
+
+    for (blocks = len / 32; blocks > 0; --blocks) {
+        uint8x16_t encoded = vld1q_u8(nib);
+        nib += 16;
+
+        /* Translate the high and low nibbles to nucleotide letters separately,
+           then interleave them back together via vzipq for writing. */
+
+        uint8x16_t high_nibbles = vshrq_n_u8(encoded, 4);
+        uint8x16_t low_nibbles  = vandq_u8(encoded, low_nibbles_mask);
+
+#ifdef __aarch64__
+        uint8x16_t high_nucleotides = vqtbl1q_u8(nuc_lookup_vec, high_nibbles);
+        uint8x16_t low_nucleotides  = vqtbl1q_u8(nuc_lookup_vec, low_nibbles);
+#else
+        uint8x8_t high_low  = vtbl2_u8(nuc_lookup_vec2, vget_low_u8(high_nibbles));
+        uint8x8_t high_high = vtbl2_u8(nuc_lookup_vec2, vget_high_u8(high_nibbles));
+        uint8x16_t high_nucleotides = vcombine_u8(high_low, high_high);
+
+        uint8x8_t low_low  = vtbl2_u8(nuc_lookup_vec2, vget_low_u8(low_nibbles));
+        uint8x8_t low_high = vtbl2_u8(nuc_lookup_vec2, vget_high_u8(low_nibbles));
+        uint8x16_t low_nucleotides = vcombine_u8(low_low, low_high);
+#endif
+
+#ifdef __aarch64__
+        vst1q_u8_x2(seq, vzipq_u8(high_nucleotides, low_nucleotides));
+#else
+        // Avoid vst1q_u8_x2 as GCC erroneously omits it on 32-bit ARM
+        uint8x16x2_t nucleotides = {{ high_nucleotides, low_nucleotides }};
+        vst2q_u8(seq, nucleotides);
+#endif
+        seq += 32;
+    }
+
+    if (len % 32 != 0)
+        nibble2base_default(nib, (char *) seq, len % 32);
+}
+
+static __attribute__((constructor)) void nibble2base_resolve(void) {
+    if (cpu_supports_neon()) htslib_nibble2base = nibble2base_neon;
+}
+
+#endif
+
+#endif // BUILDING_SIMD_NIBBLE2BASE
+
+// Potentially useful diagnostic, and prevents "empty translation unit" errors
+const char htslib_simd[] =
+    "SIMD functions present:"
+#ifdef BUILDING_SIMD_NIBBLE2BASE
+    " nibble2base"
+#endif
+    ".";
diff --git a/htslib/synced_bcf_reader.c b/htslib/synced_bcf_reader.c
index a43ab15ae..1835ea2d6 100644
--- a/htslib/synced_bcf_reader.c
+++ b/htslib/synced_bcf_reader.c
@@ -25,6 +25,7 @@ DEALINGS IN THE SOFTWARE.  */
 #define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
 #include <config.h>
 
+#include <stdlib.h>
 #include <assert.h>
 #include <stdio.h>
 #include <unistd.h>
@@ -71,6 +72,7 @@ typedef struct
 }
 aux_t;
 
+static bcf_sr_regions_t *bcf_sr_regions_alloc(void);
 static int _regions_add(bcf_sr_regions_t *reg, const char *chr, hts_pos_t start, hts_pos_t end);
 static bcf_sr_regions_t *_regions_init_string(const char *str);
 static int _regions_match_alleles(bcf_sr_regions_t *reg, int als_idx, bcf1_t *rec);
@@ -368,13 +370,22 @@ int bcf_sr_add_reader(bcf_srs_t *files, const char *fname)
     if ( !files->explicit_regs && !files->streaming )
     {
         int n = 0, i;
-        const char **names = reader->tbx_idx ? tbx_seqnames(reader->tbx_idx, &n) : bcf_hdr_seqnames(reader->header, &n);
-        for (i=0; i<n; i++)
+        const char **names;
+
+        if ( !files->regions )
         {
+            files->regions = bcf_sr_regions_alloc();
             if ( !files->regions )
-                files->regions = _regions_init_string(names[i]);
-            else
-                _regions_add(files->regions, names[i], -1, -1);
+            {
+                hts_log_error("Cannot allocate regions data structure");
+                return 0;
+            }
+        }
+
+        names = reader->tbx_idx ? tbx_seqnames(reader->tbx_idx, &n) : bcf_hdr_seqnames(reader->header, &n);
+        for (i=0; i<n; i++)
+        {
+            _regions_add(files->regions, names[i], -1, -1);
         }
         free(names);
         _regions_sort_and_merge(files->regions);
@@ -532,7 +543,7 @@ static int _reader_seek(bcf_sr_t *reader, const char *seq, hts_pos_t start, hts_
     }
     if (!reader->itr) {
         hts_log_error("Could not seek: %s:%"PRIhts_pos"-%"PRIhts_pos, seq, start + 1, end + 1);
-        assert(0);
+        abort();
     }
     return 0;
 }
@@ -956,6 +967,17 @@ int bcf_sr_set_samples(bcf_srs_t *files, const char *fname, int is_file)
     return 1;
 }
 
+// Allocate a new region list structure.
+static bcf_sr_regions_t *bcf_sr_regions_alloc(void)
+{
+    bcf_sr_regions_t *reg = (bcf_sr_regions_t *) calloc(1, sizeof(bcf_sr_regions_t));
+    if ( !reg ) return NULL;
+
+    reg->start = reg->end = -1;
+    reg->prev_start = reg->prev_end = reg->prev_seq = -1;
+    return reg;
+}
+
 // Add a new region into a list. On input the coordinates are 1-based, inclusive, then stored 0-based,
 // inclusive. Sorting and merging step needed afterwards: qsort(..,cmp_regions) and merge_regions().
 static int _regions_add(bcf_sr_regions_t *reg, const char *chr, hts_pos_t start, hts_pos_t end)
@@ -1037,9 +1059,8 @@ void _regions_sort_and_merge(bcf_sr_regions_t *reg)
 // wouldn't learn the chromosome name.
 static bcf_sr_regions_t *_regions_init_string(const char *str)
 {
-    bcf_sr_regions_t *reg = (bcf_sr_regions_t *) calloc(1, sizeof(bcf_sr_regions_t));
-    reg->start = reg->end = -1;
-    reg->prev_start = reg->prev_end = reg->prev_seq = -1;
+    bcf_sr_regions_t *reg = bcf_sr_regions_alloc();
+    if ( !reg ) return NULL;
 
     kstring_t tmp = {0,0,0};
     const char *sp = str, *ep = str;
@@ -1189,9 +1210,8 @@ bcf_sr_regions_t *bcf_sr_regions_init(const char *regions, int is_file, int ichr
         return reg;
     }
 
-    reg = (bcf_sr_regions_t *) calloc(1, sizeof(bcf_sr_regions_t));
-    reg->start = reg->end = -1;
-    reg->prev_start = reg->prev_end = reg->prev_seq = -1;
+    reg = bcf_sr_regions_alloc();
+    if ( !reg ) return NULL;
 
     reg->file = hts_open(regions, "rb");
     if ( !reg->file )
diff --git a/htslib/tabix.c b/htslib/tabix.c
deleted file mode 100644
index 0798b279f..000000000
--- a/htslib/tabix.c
+++ /dev/null
@@ -1,720 +0,0 @@
-/*  tabix.c -- Generic indexer for TAB-delimited genome position files.
-
-    Copyright (C) 2009-2011 Broad Institute.
-    Copyright (C) 2010-2012, 2014-2020 Genome Research Ltd.
-
-    Author: Heng Li <lh3@sanger.ac.uk>
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-DEALINGS IN THE SOFTWARE.  */
-
-#include <config.h>
-
-#include <assert.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <string.h>
-#include <strings.h>
-#include <getopt.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <errno.h>
-#include "htslib/tbx.h"
-#include "htslib/sam.h"
-#include "htslib/vcf.h"
-#include "htslib/kseq.h"
-#include "htslib/bgzf.h"
-#include "htslib/hts.h"
-#include "htslib/regidx.h"
-#include "htslib/hts_defs.h"
-#include "htslib/hts_log.h"
-
-typedef struct
-{
-    char *regions_fname, *targets_fname;
-    int print_header, header_only, cache_megs, download_index, separate_regs;
-}
-args_t;
-
-static void HTS_FORMAT(HTS_PRINTF_FMT, 1, 2) HTS_NORETURN
-error(const char *format, ...)
-{
-    va_list ap;
-    fflush(stdout);
-    va_start(ap, format);
-    vfprintf(stderr, format, ap);
-    va_end(ap);
-    fflush(stderr);
-    exit(EXIT_FAILURE);
-}
-
-static void HTS_FORMAT(HTS_PRINTF_FMT, 1, 2) HTS_NORETURN
-error_errno(const char *format, ...)
-{
-    va_list ap;
-    int eno = errno;
-    fflush(stdout);
-    if (format) {
-        va_start(ap, format);
-        vfprintf(stderr, format, ap);
-        va_end(ap);
-    }
-    if (eno) {
-        fprintf(stderr, "%s%s\n", format ? ": " : "", strerror(eno));
-    } else {
-        fprintf(stderr, "\n");
-    }
-    fflush(stderr);
-    exit(EXIT_FAILURE);
-}
-
-
-#define IS_GFF  (1<<0)
-#define IS_BED  (1<<1)
-#define IS_SAM  (1<<2)
-#define IS_VCF  (1<<3)
-#define IS_BCF  (1<<4)
-#define IS_BAM  (1<<5)
-#define IS_CRAM (1<<6)
-#define IS_TXT  (IS_GFF|IS_BED|IS_SAM|IS_VCF)
-
-int file_type(const char *fname)
-{
-    int l = strlen(fname);
-    if (l>=7 && strcasecmp(fname+l-7, ".gff.gz") == 0) return IS_GFF;
-    else if (l>=7 && strcasecmp(fname+l-7, ".bed.gz") == 0) return IS_BED;
-    else if (l>=7 && strcasecmp(fname+l-7, ".sam.gz") == 0) return IS_SAM;
-    else if (l>=7 && strcasecmp(fname+l-7, ".vcf.gz") == 0) return IS_VCF;
-    else if (l>=4 && strcasecmp(fname+l-4, ".bcf") == 0) return IS_BCF;
-    else if (l>=4 && strcasecmp(fname+l-4, ".bam") == 0) return IS_BAM;
-    else if (l>=4 && strcasecmp(fname+l-5, ".cram") == 0) return IS_CRAM;
-
-    htsFile *fp = hts_open(fname,"r");
-    if (!fp) {
-        if (errno == ENOEXEC) {
-            // hts_open() uses this to report that it didn't understand the
-            // file format.
-            error("Couldn't understand format of \"%s\"\n", fname);
-        } else {
-            error_errno("Couldn't open \"%s\"", fname);
-        }
-    }
-    enum htsExactFormat format = hts_get_format(fp)->format;
-    hts_close(fp);
-    if ( format == bcf ) return IS_BCF;
-    if ( format == bam ) return IS_BAM;
-    if ( format == cram ) return IS_CRAM;
-    if ( format == vcf ) return IS_VCF;
-
-    return 0;
-}
-
-static char **parse_regions(char *regions_fname, char **argv, int argc, int *nregs)
-{
-    kstring_t str = {0,0,0};
-    int iseq = 0, ireg = 0;
-    char **regs = NULL;
-    *nregs = argc;
-
-    if ( regions_fname )
-    {
-        // improve me: this is a too heavy machinery for parsing regions...
-
-        regidx_t *idx = regidx_init(regions_fname, NULL, NULL, 0, NULL);
-        if ( !idx ) {
-            error_errno("Could not build region list for \"%s\"", regions_fname);
-        }
-        regitr_t *itr = regitr_init(idx);
-        if ( !itr ) {
-            error_errno("Could not initialize an iterator over \"%s\"",
-                        regions_fname);
-        }
-
-        (*nregs) += regidx_nregs(idx);
-        regs = (char**) malloc(sizeof(char*)*(*nregs));
-        if (!regs) error_errno(NULL);
-
-        int nseq;
-        char **seqs = regidx_seq_names(idx, &nseq);
-        for (iseq=0; iseq<nseq; iseq++)
-        {
-            if (regidx_overlap(idx, seqs[iseq], 0, HTS_POS_MAX, itr) < 0)
-                error_errno("Failed to build overlapping regions list");
-
-            while ( regitr_overlap(itr) )
-            {
-                str.l = 0;
-                if (ksprintf(&str, "%s:%"PRIhts_pos"-%"PRIhts_pos, seqs[iseq], itr->beg+1, itr->end+1) < 0) {
-                    error_errno(NULL);
-                }
-                regs[ireg] = strdup(str.s);
-                if (!regs[ireg]) error_errno(NULL);
-                ireg++;
-            }
-        }
-        regidx_destroy(idx);
-        regitr_destroy(itr);
-    }
-    free(str.s);
-
-    if ( !ireg )
-    {
-        if ( argc )
-        {
-            regs = (char**) malloc(sizeof(char*)*argc);
-            if (!regs) error_errno(NULL);
-        }
-        else
-        {
-            regs = (char**) malloc(sizeof(char*));
-            if (!regs) error_errno(NULL);
-            regs[0] = strdup(".");
-            if (!regs[0]) error_errno(NULL);
-            *nregs = 1;
-        }
-    }
-
-    for (iseq=0; iseq<argc; iseq++, ireg++) {
-        regs[ireg] = strdup(argv[iseq]);
-        if (!regs[ireg]) error_errno(NULL);
-    }
-    return regs;
-}
-static int query_regions(args_t *args, tbx_conf_t *conf, char *fname, char **regs, int nregs)
-{
-    int i;
-    htsFile *fp = hts_open(fname,"r");
-    if ( !fp ) error_errno("Could not open \"%s\"", fname);
-    enum htsExactFormat format = hts_get_format(fp)->format;
-
-    if (args->cache_megs)
-        hts_set_cache_size(fp, args->cache_megs * 1048576);
-
-    regidx_t *reg_idx = NULL;
-    if ( args->targets_fname )
-    {
-        reg_idx = regidx_init(args->targets_fname, NULL, NULL, 0, NULL);
-        if (!reg_idx)
-            error_errno("Could not build region list for \"%s\"",
-                        args->targets_fname);
-    }
-
-    if ( format == bcf )
-    {
-        htsFile *out = hts_open("-","w");
-        if ( !out ) error_errno("Could not open stdout");
-        hts_idx_t *idx = bcf_index_load3(fname, NULL, args->download_index ? HTS_IDX_SAVE_REMOTE : 0);
-        if ( !idx ) error_errno("Could not load .csi index of \"%s\"", fname);
-
-        bcf_hdr_t *hdr = bcf_hdr_read(fp);
-        if ( !hdr ) error_errno("Could not read the header from \"%s\"", fname);
-
-        if ( args->print_header ) {
-            if ( bcf_hdr_write(out,hdr)!=0 )
-                error_errno("Failed to write to stdout");
-        }
-        if ( !args->header_only )
-        {
-            assert(regs != NULL);
-            bcf1_t *rec = bcf_init();
-            if (!rec) error_errno(NULL);
-            for (i=0; i<nregs; i++)
-            {
-                int ret, found = 0;
-                hts_itr_t *itr = bcf_itr_querys(idx,hdr,regs[i]);
-                if (!itr) continue;
-                while ((ret = bcf_itr_next(fp, itr, rec)) >=0 )
-                {
-                    if ( reg_idx )
-                    {
-                        const char *chr = bcf_seqname(hdr,rec);
-                        if (!chr) {
-                            error("Bad BCF record in \"%s\" : "
-                                  "Invalid CONTIG id %d\n",
-                                  fname, rec->rid);
-                        }
-                        if ( !regidx_overlap(reg_idx,chr,rec->pos,rec->pos+rec->rlen-1, NULL) ) continue;
-                    }
-                    if (!found) {
-                        if (args->separate_regs) printf("%c%s\n", conf->meta_char, regs[i]);
-                        found = 1;
-                    }
-                    if ( bcf_write(out,hdr,rec)!=0 ) {
-                        error_errno("Failed to write to stdout");
-                    }
-                }
-
-                if (ret < -1) {
-                    error_errno("Reading \"%s\" failed", fname);
-                }
-                bcf_itr_destroy(itr);
-            }
-            bcf_destroy(rec);
-        }
-        if ( hts_close(out) )
-            error_errno("hts_close returned non-zero status for stdout");
-
-        bcf_hdr_destroy(hdr);
-        hts_idx_destroy(idx);
-    }
-    else if ( format==vcf || format==sam || format==bed || format==text_format || format==unknown_format )
-    {
-        tbx_t *tbx = tbx_index_load3(fname, NULL, args->download_index ? HTS_IDX_SAVE_REMOTE : 0);
-        if ( !tbx ) error_errno("Could not load .tbi/.csi index of %s", fname);
-        kstring_t str = {0,0,0};
-        if ( args->print_header )
-        {
-            int ret;
-            while ((ret = hts_getline(fp, KS_SEP_LINE, &str)) >= 0)
-            {
-                if ( !str.l || str.s[0]!=tbx->conf.meta_char ) break;
-                if (puts(str.s) < 0)
-                    error_errno("Error writing to stdout");
-            }
-            if (ret < -1) error_errno("Reading \"%s\" failed", fname);
-        }
-        if ( !args->header_only )
-        {
-            int nseq;
-            const char **seq = NULL;
-            if ( reg_idx ) {
-                seq = tbx_seqnames(tbx, &nseq);
-                if (!seq) error_errno("Failed to get sequence names list");
-            }
-            for (i=0; i<nregs; i++)
-            {
-                int ret, found = 0;
-                hts_itr_t *itr = tbx_itr_querys(tbx, regs[i]);
-                if ( !itr ) continue;
-                while ((ret = tbx_itr_next(fp, tbx, itr, &str)) >= 0)
-                {
-                    if ( reg_idx && !regidx_overlap(reg_idx,seq[itr->curr_tid],itr->curr_beg,itr->curr_end-1, NULL) ) continue;
-                    if (!found) {
-                        if (args->separate_regs) printf("%c%s\n", conf->meta_char, regs[i]);
-                        found = 1;
-                    }
-                    if (puts(str.s) < 0)
-                        error_errno("Failed to write to stdout");
-                }
-                if (ret < -1) error_errno("Reading \"%s\" failed", fname);
-                tbx_itr_destroy(itr);
-            }
-            free(seq);
-        }
-        free(str.s);
-        tbx_destroy(tbx);
-    }
-    else if ( format==bam )
-        error("Please use \"samtools view\" for querying BAM files.\n");
-
-    if ( reg_idx ) regidx_destroy(reg_idx);
-    if ( hts_close(fp) )
-        error_errno("hts_close returned non-zero status: %s", fname);
-
-    for (i=0; i<nregs; i++) free(regs[i]);
-    free(regs);
-    return 0;
-}
-static int query_chroms(char *fname, int download)
-{
-    const char **seq;
-    int i, nseq, ftype = file_type(fname);
-    if ( ftype & IS_TXT || !ftype )
-    {
-        tbx_t *tbx = tbx_index_load3(fname, NULL, download ? HTS_IDX_SAVE_REMOTE : 0);
-        if ( !tbx ) error_errno("Could not load .tbi index of %s", fname);
-        seq = tbx_seqnames(tbx, &nseq);
-        if (!seq) error_errno("Couldn't get list of sequence names");
-        for (i=0; i<nseq; i++) {
-            if (printf("%s\n", seq[i]) < 0)
-                error_errno("Couldn't write to stdout");
-        }
-        free(seq);
-        tbx_destroy(tbx);
-    }
-    else if ( ftype==IS_BCF )
-    {
-        htsFile *fp = hts_open(fname,"r");
-        if ( !fp ) error_errno("Could not open \"%s\"", fname);
-        bcf_hdr_t *hdr = bcf_hdr_read(fp);
-        if ( !hdr ) error_errno("Could not read the header: \"%s\"", fname);
-        hts_close(fp);
-        hts_idx_t *idx = bcf_index_load3(fname, NULL, download ? HTS_IDX_SAVE_REMOTE : 0);
-        if ( !idx ) error_errno("Could not load .csi index of \"%s\"", fname);
-        seq = bcf_index_seqnames(idx, hdr, &nseq);
-        if (!seq) error_errno("Couldn't get list of sequence names");
-        for (i=0; i<nseq; i++) {
-            if (printf("%s\n", seq[i]) < 0)
-                error_errno("Couldn't write to stdout");
-        }
-        free(seq);
-        bcf_hdr_destroy(hdr);
-        hts_idx_destroy(idx);
-    }
-    else if ( ftype==IS_BAM )   // todo: BAM
-        error("BAM: todo\n");
-    return 0;
-}
-
-int reheader_file(const char *fname, const char *header, int ftype, tbx_conf_t *conf)
-{
-    if ( ftype & IS_TXT || !ftype )
-    {
-        BGZF *fp = bgzf_open(fname,"r");
-        if ( !fp || bgzf_read_block(fp) != 0 || !fp->block_length ) return -1;
-
-        char *buffer = fp->uncompressed_block;
-        int skip_until = 0;
-
-        // Skip the header: find out the position of the data block
-        if ( buffer[0]==conf->meta_char )
-        {
-            skip_until = 1;
-            while (1)
-            {
-                if ( buffer[skip_until]=='\n' )
-                {
-                    skip_until++;
-                    if ( skip_until>=fp->block_length )
-                    {
-                        if ( bgzf_read_block(fp) != 0 || !fp->block_length ) error("FIXME: No body in the file: %s\n", fname);
-                        skip_until = 0;
-                    }
-                    // The header has finished
-                    if ( buffer[skip_until]!=conf->meta_char ) break;
-                }
-                skip_until++;
-                if ( skip_until>=fp->block_length )
-                {
-                    if (bgzf_read_block(fp) != 0 || !fp->block_length) error("FIXME: No body in the file: %s\n", fname);
-                    skip_until = 0;
-                }
-            }
-        }
-
-        // Output the new header
-        FILE *hdr  = fopen(header,"r");
-        if ( !hdr ) error("%s: %s", header,strerror(errno));
-        const size_t page_size = 32768;
-        char *buf = malloc(page_size);
-        BGZF *bgzf_out = bgzf_open("-", "w");
-        ssize_t nread;
-
-        if (!buf) error("%s\n", strerror(errno));
-        if (!bgzf_out)
-            error_errno("Couldn't open output stream");
-        while ( (nread=fread(buf,1,page_size-1,hdr))>0 )
-        {
-            if ( nread<page_size-1 && buf[nread-1]!='\n' ) buf[nread++] = '\n';
-            if (bgzf_write(bgzf_out, buf, nread) < 0)
-                error_errno("Write error %d", bgzf_out->errcode);
-        }
-        if ( ferror(hdr) ) error_errno("Failed to read \"%s\"", header);
-        if ( fclose(hdr) ) error_errno("Closing \"%s\" failed", header);
-
-        // Output all remaining data read with the header block
-        if ( fp->block_length - skip_until > 0 )
-        {
-            if (bgzf_write(bgzf_out, buffer+skip_until, fp->block_length-skip_until) < 0) error_errno("Write error %d",fp->errcode);
-        }
-        if (bgzf_flush(bgzf_out) < 0)
-            error_errno("Write error %d", bgzf_out->errcode);
-
-        while (1)
-        {
-            nread = bgzf_raw_read(fp, buf, page_size);
-            if ( nread<=0 ) break;
-
-            int count = bgzf_raw_write(bgzf_out, buf, nread);
-            if (count != nread) error_errno("Write failed, wrote %d instead of %d bytes", count,(int)nread);
-        }
-        if (nread < 0) error_errno("Error reading \"%s\"", fname);
-        if (bgzf_close(bgzf_out) < 0)
-            error_errno("Error %d closing output", bgzf_out->errcode);
-        if (bgzf_close(fp) < 0)
-            error_errno("Error %d closing \"%s\"", bgzf_out->errcode, fname);
-        free(buf);
-    }
-    else
-        error("todo: reheader BCF, BAM\n");  // BCF is difficult, records contain pointers to the header.
-    return 0;
-}
-
-static int usage(FILE *fp, int status)
-{
-    fprintf(fp, "\n");
-    fprintf(fp, "Version: %s\n", hts_version());
-    fprintf(fp, "Usage:   tabix [OPTIONS] [FILE] [REGION [...]]\n");
-    fprintf(fp, "\n");
-    fprintf(fp, "Indexing Options:\n");
-    fprintf(fp, "   -0, --zero-based           coordinates are zero-based\n");
-    fprintf(fp, "   -b, --begin INT            column number for region start [4]\n");
-    fprintf(fp, "   -c, --comment CHAR         skip comment lines starting with CHAR [null]\n");
-    fprintf(fp, "   -C, --csi                  generate CSI index for VCF (default is TBI)\n");
-    fprintf(fp, "   -e, --end INT              column number for region end (if no end, set INT to -b) [5]\n");
-    fprintf(fp, "   -f, --force                overwrite existing index without asking\n");
-    fprintf(fp, "   -m, --min-shift INT        set minimal interval size for CSI indices to 2^INT [14]\n");
-    fprintf(fp, "   -p, --preset STR           gff, bed, sam, vcf\n");
-    fprintf(fp, "   -s, --sequence INT         column number for sequence names (suppressed by -p) [1]\n");
-    fprintf(fp, "   -S, --skip-lines INT       skip first INT lines [0]\n");
-    fprintf(fp, "\n");
-    fprintf(fp, "Querying and other options:\n");
-    fprintf(fp, "   -h, --print-header         print also the header lines\n");
-    fprintf(fp, "   -H, --only-header          print only the header lines\n");
-    fprintf(fp, "   -l, --list-chroms          list chromosome names\n");
-    fprintf(fp, "   -r, --reheader FILE        replace the header with the content of FILE\n");
-    fprintf(fp, "   -R, --regions FILE         restrict to regions listed in the file\n");
-    fprintf(fp, "   -T, --targets FILE         similar to -R but streams rather than index-jumps\n");
-    fprintf(fp, "   -D                         do not download the index file\n");
-    fprintf(fp, "       --cache INT            set cache size to INT megabytes (0 disables) [10]\n");
-    fprintf(fp, "       --separate-regions     separate the output by corresponding regions\n");
-    fprintf(fp, "       --verbosity INT        set verbosity [3]\n");
-    fprintf(fp, "\n");
-    return status;
-}
-
-int main(int argc, char *argv[])
-{
-    int c, detect = 1, min_shift = 0, is_force = 0, list_chroms = 0, do_csi = 0;
-    tbx_conf_t conf = tbx_conf_gff;
-    char *reheader = NULL;
-    args_t args;
-    memset(&args,0,sizeof(args_t));
-    args.cache_megs = 10;
-    args.download_index = 1;
-    int32_t new_line_skip = -1;
-
-    static const struct option loptions[] =
-    {
-        {"help", no_argument, NULL, 2},
-        {"regions", required_argument, NULL, 'R'},
-        {"targets", required_argument, NULL, 'T'},
-        {"csi", no_argument, NULL, 'C'},
-        {"zero-based", no_argument, NULL, '0'},
-        {"print-header", no_argument, NULL, 'h'},
-        {"only-header", no_argument, NULL, 'H'},
-        {"begin", required_argument, NULL, 'b'},
-        {"comment", required_argument, NULL, 'c'},
-        {"end", required_argument, NULL, 'e'},
-        {"force", no_argument, NULL, 'f'},
-        {"min-shift", required_argument, NULL, 'm'},
-        {"preset", required_argument, NULL, 'p'},
-        {"sequence", required_argument, NULL, 's'},
-        {"skip-lines", required_argument, NULL, 'S'},
-        {"list-chroms", no_argument, NULL, 'l'},
-        {"reheader", required_argument, NULL, 'r'},
-        {"version", no_argument, NULL, 1},
-        {"verbosity", required_argument, NULL, 3},
-        {"cache", required_argument, NULL, 4},
-        {"separate-regions", no_argument, NULL, 5},
-        {NULL, 0, NULL, 0}
-    };
-
-    char *tmp;
-    while ((c = getopt_long(argc, argv, "hH?0b:c:e:fm:p:s:S:lr:CR:T:D", loptions,NULL)) >= 0)
-    {
-        switch (c)
-        {
-            case 'R': args.regions_fname = optarg; break;
-            case 'T': args.targets_fname = optarg; break;
-            case 'C': do_csi = 1; break;
-            case 'r': reheader = optarg; break;
-            case 'h': args.print_header = 1; break;
-            case 'H': args.print_header = 1; args.header_only = 1; break;
-            case 'l': list_chroms = 1; break;
-            case '0': conf.preset |= TBX_UCSC; detect = 0; break;
-            case 'b':
-                conf.bc = strtol(optarg,&tmp,10);
-                if ( *tmp ) error("Could not parse argument: -b %s\n", optarg);
-                detect = 0;
-                break;
-            case 'e':
-                conf.ec = strtol(optarg,&tmp,10);
-                if ( *tmp ) error("Could not parse argument: -e %s\n", optarg);
-                detect = 0;
-                break;
-            case 'c': conf.meta_char = *optarg; detect = 0; break;
-            case 'f': is_force = 1; break;
-            case 'm':
-                min_shift = strtol(optarg,&tmp,10);
-                if ( *tmp ) error("Could not parse argument: -m %s\n", optarg);
-                break;
-            case 'p':
-                detect = 0;
-                if (strcmp(optarg, "gff") == 0) conf = tbx_conf_gff;
-                else if (strcmp(optarg, "bed") == 0) conf = tbx_conf_bed;
-                else if (strcmp(optarg, "sam") == 0) conf = tbx_conf_sam;
-                else if (strcmp(optarg, "vcf") == 0) conf = tbx_conf_vcf;
-                else if (strcmp(optarg, "bcf") == 0) detect = 1; // bcf is autodetected, preset is not needed
-                else if (strcmp(optarg, "bam") == 0) detect = 1; // same as bcf
-                else error("The preset string not recognised: '%s'\n", optarg);
-                break;
-            case 's':
-                conf.sc = strtol(optarg,&tmp,10);
-                if ( *tmp ) error("Could not parse argument: -s %s\n", optarg);
-                detect = 0;
-                break;
-            case 'S':
-                new_line_skip = strtol(optarg,&tmp,10);
-                if ( *tmp ) error("Could not parse argument: -S %s\n", optarg);
-                detect = 0;
-                break;
-            case 'D':
-                args.download_index = 0;
-                break;
-            case 1:
-                printf(
-"tabix (htslib) %s\n"
-"Copyright (C) 2023 Genome Research Ltd.\n", hts_version());
-                return EXIT_SUCCESS;
-            case 2:
-                return usage(stdout, EXIT_SUCCESS);
-            case 3: {
-                int v = atoi(optarg);
-                if (v < 0) v = 0;
-                hts_set_log_level(v);
-                break;
-            }
-            case 4:
-                args.cache_megs = atoi(optarg);
-                if (args.cache_megs < 0) {
-                    args.cache_megs = 0;
-                } else if (args.cache_megs >= INT_MAX / 1048576) {
-                    args.cache_megs = INT_MAX / 1048576;
-                }
-                break;
-            case 5:
-                args.separate_regs = 1;
-                break;
-            default: return usage(stderr, EXIT_FAILURE);
-        }
-    }
-
-    if (new_line_skip >= 0)
-        conf.line_skip = new_line_skip;
-
-    if ( optind==argc ) return usage(stderr, EXIT_FAILURE);
-
-    if ( list_chroms )
-        return query_chroms(argv[optind], args.download_index);
-
-    char *fname = argv[optind];
-    int ftype = file_type(fname);
-    if ( detect )  // no preset given
-    {
-        if ( ftype==IS_GFF ) conf = tbx_conf_gff;
-        else if ( ftype==IS_BED ) conf = tbx_conf_bed;
-        else if ( ftype==IS_SAM ) conf = tbx_conf_sam;
-        else if ( ftype==IS_VCF )
-        {
-            conf = tbx_conf_vcf;
-            if ( !min_shift && do_csi ) min_shift = 14;
-        }
-        else if ( ftype==IS_BCF )
-        {
-            if ( !min_shift ) min_shift = 14;
-        }
-        else if ( ftype==IS_BAM )
-        {
-            if ( !min_shift ) min_shift = 14;
-        }
-    }
-    if ( argc > optind+1 || args.header_only || args.regions_fname || args.targets_fname )
-    {
-        int nregs = 0;
-        char **regs = NULL;
-        if ( !args.header_only )
-            regs = parse_regions(args.regions_fname, argv+optind+1, argc-optind-1, &nregs);
-        return query_regions(&args, &conf, fname, regs, nregs);
-    }
-    if ( do_csi )
-    {
-        if ( !min_shift ) min_shift = 14;
-        min_shift *= do_csi;  // positive for CSIv2, negative for CSIv1
-    }
-    if ( min_shift!=0 && !do_csi ) do_csi = 1;
-
-    if ( reheader )
-        return reheader_file(fname, reheader, ftype, &conf);
-
-    char *suffix = ".tbi";
-    if ( do_csi ) suffix = ".csi";
-    else if ( ftype==IS_BAM ) suffix = ".bai";
-    else if ( ftype==IS_CRAM ) suffix = ".crai";
-
-    char *idx_fname = calloc(strlen(fname) + 6, 1);
-    if (!idx_fname) error("%s\n", strerror(errno));
-    strcat(strcpy(idx_fname, fname), suffix);
-
-    struct stat stat_tbi, stat_file;
-    if ( !is_force && stat(idx_fname, &stat_tbi)==0 )
-    {
-        // Before complaining about existing index, check if the VCF file isn't
-        // newer. This is a common source of errors, people tend not to notice
-        // that tabix failed
-        stat(fname, &stat_file);
-        if ( stat_file.st_mtime <= stat_tbi.st_mtime )
-            error("[tabix] the index file exists. Please use '-f' to overwrite.\n");
-    }
-    free(idx_fname);
-
-    int ret;
-    if ( ftype==IS_CRAM )
-    {
-        if ( bam_index_build(fname, min_shift)!=0 ) error("bam_index_build failed: %s\n", fname);
-        return 0;
-    }
-    else if ( do_csi )
-    {
-        if ( ftype==IS_BCF )
-        {
-            if ( bcf_index_build(fname, min_shift)!=0 ) error("bcf_index_build failed: %s\n", fname);
-            return 0;
-        }
-        if ( ftype==IS_BAM )
-        {
-            if ( bam_index_build(fname, min_shift)!=0 ) error("bam_index_build failed: %s\n", fname);
-            return 0;
-        }
-
-        switch (ret = tbx_index_build(fname, min_shift, &conf))
-        {
-            case 0:
-                return 0;
-            case -2:
-                error("[tabix] the compression of '%s' is not BGZF\n", fname);
-            default:
-                error("tbx_index_build failed: %s\n", fname);
-        }
-    }
-    else    // TBI index
-    {
-        switch (ret = tbx_index_build(fname, min_shift, &conf))
-        {
-            case 0:
-                return 0;
-            case -2:
-                error("[tabix] the compression of '%s' is not BGZF\n", fname);
-            default:
-                error("tbx_index_build failed: %s\n", fname);
-        }
-    }
-
-    return 0;
-}
diff --git a/htslib/tbx.c b/htslib/tbx.c
index c2c5c6f9d..662500549 100644
--- a/htslib/tbx.c
+++ b/htslib/tbx.c
@@ -53,6 +53,7 @@ const tbx_conf_t tbx_conf_sam = { TBX_SAM, 3, 4, 0, '@', 0 };
 
 HTSLIB_EXPORT
 const tbx_conf_t tbx_conf_vcf = { TBX_VCF, 1, 2, 0, '#', 0 };
+const tbx_conf_t tbx_conf_gaf = { TBX_GAF, 1, 6, 0, '#', 0 };
 
 typedef struct {
     int64_t beg, end;
@@ -64,6 +65,7 @@ static inline int get_tid(tbx_t *tbx, const char *ss, int is_add)
 {
     khint_t k;
     khash_t(s2i) *d;
+    if ((tbx->conf.preset&0xffff) == TBX_GAF) return(0);
     if (tbx->dict == 0) tbx->dict = kh_init(s2i);
     if (!tbx->dict) return -1; // Out of memory
     d = (khash_t(s2i)*)tbx->dict;
@@ -103,24 +105,45 @@ int tbx_parse1(const tbx_conf_t *conf, size_t len, char *line, tbx_intv_t *intv)
                 intv->ss = line + b; intv->se = line + i;
             } else if (id == conf->bc) {
                 // here ->beg is 0-based.
-                intv->beg = strtoll(line + b, &s, 0);
+                if ((conf->preset&0xffff) == TBX_GAF){
+                    // if gaf find the smallest and largest node id
+                    char *t;
+                    int64_t nodeid = -1;
+                    for (s = line + b + 1; s < line + i;) {
+                        nodeid = strtoll(s, &t, 0);
+                        if(intv->beg == -1){
+                            intv->beg = intv->end = nodeid;
+                        } else {
+                            if(nodeid < intv->beg){
+                                intv->beg = nodeid;
+                            }
 
-                if (conf->bc <= conf->ec) // don't overwrite an already set end point
-                    intv->end = intv->beg;
+                            if(nodeid > intv->end){
+                                intv->end = nodeid;
+                            }
+                        }
+                        s = t + 1;
+                    }
+                } else {
+                    intv->beg = strtoll(line + b, &s, 0);
 
-                if ( s==line+b ) return -1; // expected int
+                    if (conf->bc <= conf->ec) // don't overwrite an already set end point
+                        intv->end = intv->beg;
 
-                if (!(conf->preset&TBX_UCSC))
-                    --intv->beg;
-                else if (conf->bc <= conf->ec)
-                    ++intv->end;
+                    if ( s==line+b ) return -1; // expected int
 
-                if (intv->beg < 0) {
-                    hts_log_warning("Coordinate <= 0 detected. "
-                                    "Did you forget to use the -0 option?");
-                    intv->beg = 0;
+                    if (!(conf->preset&TBX_UCSC))
+                        --intv->beg;
+                    else if (conf->bc <= conf->ec)
+                        ++intv->end;
+
+                    if (intv->beg < 0) {
+                        hts_log_warning("Coordinate <= 0 detected. "
+                                        "Did you forget to use the -0 option?");
+                        intv->beg = 0;
+                    }
+                    if (intv->end < 1) intv->end = 1;
                 }
-                if (intv->end < 1) intv->end = 1;
             } else {
                 if ((conf->preset&0xffff) == TBX_GENERIC) {
                     if (id == conf->ec)
@@ -187,7 +210,13 @@ static inline int get_intv(tbx_t *tbx, kstring_t *str, tbx_intv_t *intv, int is_
 {
     if (tbx_parse1(&tbx->conf, str->l, str->s, intv) == 0) {
         int c = *intv->se;
-        *intv->se = '\0'; intv->tid = get_tid(tbx, intv->ss, is_add); *intv->se = c;
+        *intv->se = '\0';
+        if ((tbx->conf.preset&0xffff) == TBX_GAF){
+            intv->tid = 0;
+        } else {
+            intv->tid = get_tid(tbx, intv->ss, is_add);
+        }
+        *intv->se = c;
         if (intv->tid < 0) return -2;  // get_tid out of memory
         return (intv->beg >= 0 && intv->end >= 0)? 0 : -1;
     } else {
@@ -196,11 +225,15 @@ static inline int get_intv(tbx_t *tbx, kstring_t *str, tbx_intv_t *intv, int is_
         {
             case TBX_SAM: type = "TBX_SAM"; break;
             case TBX_VCF: type = "TBX_VCF"; break;
+            case TBX_GAF: type = "TBX_GAF"; break;
             case TBX_UCSC: type = "TBX_UCSC"; break;
             default: type = "TBX_GENERIC"; break;
         }
-        hts_log_error("Failed to parse %s, was wrong -p [type] used?\nThe offending line was: \"%s\"",
-            type, str->s);
+        if (hts_is_utf16_text(str))
+            hts_log_error("Failed to parse %s: offending line appears to be encoded as UTF-16", type);
+        else
+            hts_log_error("Failed to parse %s: was wrong -p [type] used?\nThe offending line was: \"%s\"",
+                type, str->s);
         return -1;
     }
 }
@@ -291,7 +324,7 @@ static void adjust_max_ref_len_sam(const char *str, int64_t *max_ref_len)
 // files with very large contigs.
 static int adjust_n_lvls(int min_shift, int n_lvls, int64_t max_len)
 {
-    int64_t s = 1LL << (min_shift + n_lvls * 3);
+    int64_t s = hts_bin_maxpos(min_shift, n_lvls);
     max_len += 256;
     for (; max_len > s; ++n_lvls, s <<= 3) {}
     return n_lvls;
diff --git a/htslib/textutils.c b/htslib/textutils.c
index 0cc2af818..b2c29a893 100644
--- a/htslib/textutils.c
+++ b/htslib/textutils.c
@@ -220,7 +220,7 @@ static char token_type(hts_json_token *token)
 }
 
 HTSLIB_EXPORT
-hts_json_token * hts_json_alloc_token() {
+hts_json_token * hts_json_alloc_token(void) {
     return calloc(1, sizeof(hts_json_token));
 }
 
diff --git a/htslib/textutils_internal.h b/htslib/textutils_internal.h
index 1ad096494..faa1d4d11 100644
--- a/htslib/textutils_internal.h
+++ b/htslib/textutils_internal.h
@@ -1,6 +1,6 @@
 /* textutils_internal.h -- non-bioinformatics utility routines for text etc.
 
-   Copyright (C) 2016,2018-2020 Genome Research Ltd.
+   Copyright (C) 2016,2018-2020, 2024 Genome Research Ltd.
 
    Author: John Marshall <jm18@sanger.ac.uk>
 
@@ -221,27 +221,43 @@ static inline int64_t hts_str2int(const char *in, char **end, int bits,
     uint32_t fast = (bits - 1) * 1000 / 3322 + 1; // log(10)/log(2) ~= 3.322
     const unsigned char *v = (const unsigned char *) in;
     const unsigned int ascii_zero = '0'; // Prevents conversion to signed
-    unsigned char d;
-    int neg = 1;
+    unsigned int d;
 
+    int neg;
     switch(*v) {
     case '-':
-        neg=-1;
-        limit++; /* fall through */
-    case '+':
+        limit++;
+        neg=1;
         v++;
+        // See "dup" comment below
+        while (--fast && *v>='0' && *v<='9')
+            n = n*10 + *v++ - ascii_zero;
         break;
+
+    case '+':
+        v++;
+        // fall through
+
     default:
+        neg = 0;
+        // dup of above.  This is somewhat unstable and mainly for code
+        // size cheats to prevent instruction cache lines spanning 32-byte
+        // blocks in the sam_parse_B_vals calling code.  It's been tested
+        // on gcc7, gcc13, clang10 and clang16 with -O2 and -O3.  While
+        // not exhaustive, this code duplication gives stable fast results
+        // while a single copy does not.
+        // (NB: system was "seq4d", so quite old)
+        while (--fast && *v>='0' && *v<='9')
+            n = n*10 + *v++ - ascii_zero;
         break;
     }
 
-    while (--fast && *v>='0' && *v<='9')
-        n = n*10 + *v++ - ascii_zero;
-
-    if (!fast) {
+    // NB gcc7 is slow with (unsigned)(*v - ascii_zero) < 10,
+    // while gcc13 prefers it.
+    if (*v>='0' && !fast) { // rejects ',' and tab
         uint64_t limit_d_10 = limit / 10;
         uint64_t limit_m_10 = limit - 10 * limit_d_10;
-         while ((d = *v - ascii_zero) < 10) {
+        while ((d = *v - ascii_zero) < 10) {
             if (n < limit_d_10 || (n == limit_d_10 && d <= limit_m_10)) {
                 n = n*10 + d;
                 v++;
@@ -256,7 +272,7 @@ static inline int64_t hts_str2int(const char *in, char **end, int bits,
 
     *end = (char *)v;
 
-    return (n && neg < 0) ? -((int64_t) (n - 1)) - 1 : (int64_t) n;
+    return neg ? (int64_t)-n : (int64_t)n;
 }
 
 /// Convert a string to an unsigned integer, with overflow detection
@@ -279,12 +295,12 @@ Both end and failed must be non-NULL.
  */
 
 static inline uint64_t hts_str2uint(const char *in, char **end, int bits,
-                                      int *failed) {
+                                    int *failed) {
     uint64_t n = 0, limit = (bits < 64 ? (1ULL << bits) : 0) - 1;
     const unsigned char *v = (const unsigned char *) in;
     const unsigned int ascii_zero = '0'; // Prevents conversion to signed
     uint32_t fast = bits * 1000 / 3322 + 1; // log(10)/log(2) ~= 3.322
-    unsigned char d;
+    unsigned int d;
 
     if (*v == '+')
         v++;
@@ -292,7 +308,7 @@ static inline uint64_t hts_str2uint(const char *in, char **end, int bits,
     while (--fast && *v>='0' && *v<='9')
         n = n*10 + *v++ - ascii_zero;
 
-    if (!fast) {
+    if ((unsigned)(*v - ascii_zero) < 10 && !fast) {
         uint64_t limit_d_10 = limit / 10;
         uint64_t limit_m_10 = limit - 10 * limit_d_10;
         while ((d = *v - ascii_zero) < 10) {
diff --git a/htslib/vcf.c b/htslib/vcf.c
index 9e589f993..105c7539d 100644
--- a/htslib/vcf.c
+++ b/htslib/vcf.c
@@ -1,7 +1,7 @@
 /*  vcf.c -- VCF/BCF API functions.
 
     Copyright (C) 2012, 2013 Broad Institute.
-    Copyright (C) 2012-2023 Genome Research Ltd.
+    Copyright (C) 2012-2024 Genome Research Ltd.
     Portions copyright (C) 2014 Intel Corporation.
 
     Author: Heng Li <lh3@sanger.ac.uk>
@@ -37,6 +37,10 @@ DEALINGS IN THE SOFTWARE.  */
 #include <inttypes.h>
 #include <errno.h>
 
+#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+#include "fuzz_settings.h"
+#endif
+
 #include "htslib/vcf.h"
 #include "htslib/bgzf.h"
 #include "htslib/tbx.h"
@@ -46,9 +50,29 @@ DEALINGS IN THE SOFTWARE.  */
 #include "htslib/khash_str2int.h"
 #include "htslib/kstring.h"
 #include "htslib/sam.h"
-
 #include "htslib/khash.h"
+
+#if 0
+// This helps on Intel a bit, often 6-7% faster VCF parsing.
+// Conversely sometimes harms AMD Zen4 as ~9% slower.
+// Possibly related to IPC differences.  However for now it's just a
+// curiousity we ignore and stick with the simpler code.
+//
+// Left here as a hint for future explorers.
+static inline int xstreq(const char *a, const char *b) {
+    while (*a && *a == *b)
+        a++, b++;
+    return *a == *b;
+}
+
+#define KHASH_MAP_INIT_XSTR(name, khval_t) \
+  KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, xstreq)
+
+KHASH_MAP_INIT_XSTR(vdict, bcf_idinfo_t)
+#else
 KHASH_MAP_INIT_STR(vdict, bcf_idinfo_t)
+#endif
+
 typedef khash_t(vdict) vdict_t;
 
 KHASH_MAP_INIT_STR(hdict, bcf_hrec_t*)
@@ -91,6 +115,7 @@ typedef struct
 {
     vdict_t dict;   // bcf_hdr_t.dict[0] vdict_t dictionary which keeps bcf_idinfo_t for BCF_HL_FLT,BCF_HL_INFO,BCF_HL_FMT
     hdict_t *gen;   // hdict_t dictionary which keeps bcf_hrec_t* pointers for generic and structured fields
+    size_t *key_len;// length of h->id[BCF_DT_ID] strings
 }
 bcf_hdr_aux_t;
 
@@ -218,6 +243,14 @@ int bcf_hdr_sync(bcf_hdr_t *h)
             h->id[i][kh_val(d,k).id].val = &kh_val(d,k);
         }
     }
+
+    // Invalidate key length cache
+    bcf_hdr_aux_t *aux = get_hdr_aux(h);
+    if (aux && aux->key_len) {
+        free(aux->key_len);
+        aux->key_len = NULL;
+    }
+
     h->dirty = 0;
     return 0;
 }
@@ -683,6 +716,11 @@ static int bcf_hdr_set_idx(bcf_hdr_t *hdr, int dict_type, const char *tag, bcf_i
     }
 
     new_n = idinfo->id >= hdr->n[dict_type] ? idinfo->id+1 : hdr->n[dict_type];
+#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+    // hts_resize() can attempt to allocate up to 2 * requested items
+    if (new_n > FUZZ_ALLOC_LIMIT/(2 * sizeof(bcf_idpair_t)))
+        return -1;
+#endif
     if (hts_resize(bcf_idpair_t, new_n, &hdr->m[dict_type],
                    &hdr->id[dict_type], HTS_RESIZE_CLEAR)) {
         return -1;
@@ -1370,12 +1408,17 @@ bcf_hdr_t *bcf_hdr_init(const char *mode)
     bcf_hdr_t *h;
     h = (bcf_hdr_t*)calloc(1, sizeof(bcf_hdr_t));
     if (!h) return NULL;
-    for (i = 0; i < 3; ++i)
+    for (i = 0; i < 3; ++i) {
         if ((h->dict[i] = kh_init(vdict)) == NULL) goto fail;
+        // Supersize the hash to make collisions very unlikely
+        static int dsize[3] = {16384,16384,2048}; // info, contig, format
+        if (kh_resize(vdict, h->dict[i], dsize[i]) < 0) goto fail;
+    }
 
     bcf_hdr_aux_t *aux = (bcf_hdr_aux_t*)calloc(1,sizeof(bcf_hdr_aux_t));
     if ( !aux ) goto fail;
     if ( (aux->gen = kh_init(hdict))==NULL ) { free(aux); goto fail; }
+    aux->key_len = NULL;
     aux->dict = *((vdict_t*)h->dict[0]);
     free(h->dict[0]);
     h->dict[0] = aux;
@@ -1411,6 +1454,7 @@ void bcf_hdr_destroy(bcf_hdr_t *h)
             for (k=kh_begin(aux->gen); k<kh_end(aux->gen); k++)
                 if ( kh_exist(aux->gen,k) ) free((char*)kh_key(aux->gen,k));
             kh_destroy(hdict, aux->gen);
+            free(aux->key_len); // may exist for dict[0] only
         }
         kh_destroy(vdict, d);
         free(h->id[i]);
@@ -1465,6 +1509,9 @@ bcf_hdr_t *bcf_hdr_read(htsFile *hfp)
     if (bgzf_read(fp, buf, 4) != 4) goto fail;
     hlen = buf[0] | (buf[1] << 8) | (buf[2] << 16) | ((size_t) buf[3] << 24);
     if (hlen >= SIZE_MAX) { errno = ENOMEM; goto fail; }
+#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+    if (hlen > FUZZ_ALLOC_LIMIT/2) { errno = ENOMEM; goto fail; }
+#endif
     htxt = (char*)malloc(hlen + 1);
     if (!htxt) goto fail;
     if (bgzf_read(fp, htxt, hlen) != hlen) goto fail;
@@ -1510,6 +1557,7 @@ int bcf_hdr_write(htsFile *hfp, bcf_hdr_t *h)
     u32_to_le(htxt.l, hlen);
     if ( bgzf_write(fp, hlen, 4) !=4 ) return -1;
     if ( bgzf_write(fp, htxt.s, htxt.l) != htxt.l ) return -1;
+    if ( bgzf_flush(fp) < 0) return -1;
 
     free(htxt.s);
     return 0;
@@ -1519,7 +1567,7 @@ int bcf_hdr_write(htsFile *hfp, bcf_hdr_t *h)
  *** BCF site I/O ***
  ********************/
 
-bcf1_t *bcf_init()
+bcf1_t *bcf_init(void)
 {
     bcf1_t *v;
     v = (bcf1_t*)calloc(1, sizeof(bcf1_t));
@@ -1591,8 +1639,12 @@ static inline int bcf_read1_core(BGZF *fp, bcf1_t *v)
     shared_len = le_to_u32(x);
     if (shared_len < 24) return -2;
     shared_len -= 24; // to exclude six 32-bit integers
-    if (ks_resize(&v->shared, shared_len ? shared_len : 1) != 0) return -2;
     indiv_len = le_to_u32(x + 4);
+#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+    // ks_resize() normally allocates 1.5 * requested size to allow for growth
+    if ((uint64_t) shared_len + indiv_len > FUZZ_ALLOC_LIMIT / 3 * 2) return -2;
+#endif
+    if (ks_resize(&v->shared, shared_len ? shared_len : 1) != 0) return -2;
     if (ks_resize(&v->indiv, indiv_len ? indiv_len : 1) != 0) return -2;
     v->rid  = le_to_i32(x + 8);
     v->pos  = le_to_u32(x + 12);
@@ -2151,7 +2203,7 @@ int bcf_write(htsFile *hfp, bcf_hdr_t *h, bcf1_t *v)
     if ( hfp->format.format == vcf || hfp->format.format == text_format )
         return vcf_write(hfp,h,v);
 
-    if ( v->errcode )
+    if ( v->errcode & ~BCF_ERR_LIMITS ) // todo: unsure about the other BCF_ERR_LIMITS branches in vcf_parse_format_alloc4()
     {
         // vcf_parse1() encountered a new contig or tag, undeclared in the
         // header.  At this point, the header must have been printed,
@@ -2185,7 +2237,8 @@ int bcf_write(htsFile *hfp, bcf_hdr_t *h, bcf1_t *v)
     if ( bgzf_write(fp, v->indiv.s, v->indiv.l) != v->indiv.l ) return -1;
 
     if (hfp->idx) {
-        if (hts_idx_push(hfp->idx, v->rid, v->pos, v->pos + v->rlen, bgzf_tell(fp), 1) < 0)
+        if (bgzf_idx_push(fp, hfp->idx, v->rid, v->pos, v->pos + v->rlen,
+                          bgzf_tell(fp), 1) < 0)
             return -1;
     }
 
@@ -2463,25 +2516,64 @@ int bcf_enc_vint(kstring_t *s, int n, int32_t *a, int wsize)
 {
     int32_t max = INT32_MIN, min = INT32_MAX;
     int i;
-    if (n <= 0) bcf_enc_size(s, 0, BCF_BT_NULL);
-    else if (n == 1) bcf_enc_int1(s, a[0]);
-    else {
+    if (n <= 0) {
+        return bcf_enc_size(s, 0, BCF_BT_NULL);
+    } else if (n == 1) {
+        return bcf_enc_int1(s, a[0]);
+    } else {
         if (wsize <= 0) wsize = n;
-        for (i = 0; i < n; ++i) {
-            if (a[i] == bcf_int32_missing || a[i] == bcf_int32_vector_end ) continue;
+
+        // Equivalent to:
+        // for (i = 0; i < n; ++i) {
+        //     if (a[i] == bcf_int32_missing || a[i] == bcf_int32_vector_end )
+        //         continue;
+        //     if (max < a[i]) max = a[i];
+        //     if (min > a[i]) min = a[i];
+        // }
+        int max4[4] = {INT32_MIN, INT32_MIN, INT32_MIN, INT32_MIN};
+        int min4[4] = {INT32_MAX, INT32_MAX, INT32_MAX, INT32_MAX};
+        for (i = 0; i < (n&~3); i+=4) {
+            // bcf_int32_missing    == INT32_MIN and
+            // bcf_int32_vector_end == INT32_MIN+1.
+            // We skip these, but can mostly avoid explicit checking
+            if (max4[0] < a[i+0]) max4[0] = a[i+0];
+            if (max4[1] < a[i+1]) max4[1] = a[i+1];
+            if (max4[2] < a[i+2]) max4[2] = a[i+2];
+            if (max4[3] < a[i+3]) max4[3] = a[i+3];
+            if (min4[0] > a[i+0] && a[i+0] > INT32_MIN+1) min4[0] = a[i+0];
+            if (min4[1] > a[i+1] && a[i+1] > INT32_MIN+1) min4[1] = a[i+1];
+            if (min4[2] > a[i+2] && a[i+2] > INT32_MIN+1) min4[2] = a[i+2];
+            if (min4[3] > a[i+3] && a[i+3] > INT32_MIN+1) min4[3] = a[i+3];
+        }
+        min = min4[0];
+        if (min > min4[1]) min = min4[1];
+        if (min > min4[2]) min = min4[2];
+        if (min > min4[3]) min = min4[3];
+        max = max4[0];
+        if (max < max4[1]) max = max4[1];
+        if (max < max4[2]) max = max4[2];
+        if (max < max4[3]) max = max4[3];
+        for (; i < n; ++i) {
             if (max < a[i]) max = a[i];
-            if (min > a[i]) min = a[i];
+            if (min > a[i] && a[i] > INT32_MIN+1) min = a[i];
         }
+
         if (max <= BCF_MAX_BT_INT8 && min >= BCF_MIN_BT_INT8) {
-            bcf_enc_size(s, wsize, BCF_BT_INT8);
-            for (i = 0; i < n; ++i)
-                if ( a[i]==bcf_int32_vector_end ) kputc(bcf_int8_vector_end, s);
-                else if ( a[i]==bcf_int32_missing ) kputc(bcf_int8_missing, s);
-                else kputc(a[i], s);
+            if (bcf_enc_size(s, wsize, BCF_BT_INT8) < 0 ||
+                ks_resize(s, s->l + n) < 0)
+                return -1;
+            uint8_t *p = (uint8_t *) s->s + s->l;
+            for (i = 0; i < n; ++i, p++) {
+                if ( a[i]==bcf_int32_vector_end )   *p = bcf_int8_vector_end;
+                else if ( a[i]==bcf_int32_missing ) *p = bcf_int8_missing;
+                else *p = a[i];
+            }
+            s->l += n;
         } else if (max <= BCF_MAX_BT_INT16 && min >= BCF_MIN_BT_INT16) {
             uint8_t *p;
-            bcf_enc_size(s, wsize, BCF_BT_INT16);
-            ks_resize(s, s->l + n * sizeof(int16_t));
+            if (bcf_enc_size(s, wsize, BCF_BT_INT16) < 0 ||
+                ks_resize(s, s->l + n * sizeof(int16_t)) < 0)
+                return -1;
             p = (uint8_t *) s->s + s->l;
             for (i = 0; i < n; ++i)
             {
@@ -2495,8 +2587,9 @@ int bcf_enc_vint(kstring_t *s, int n, int32_t *a, int wsize)
             s->l += n * sizeof(int16_t);
         } else {
             uint8_t *p;
-            bcf_enc_size(s, wsize, BCF_BT_INT32);
-            ks_resize(s, s->l + n * sizeof(int32_t));
+            if (bcf_enc_size(s, wsize, BCF_BT_INT32) < 0 ||
+                ks_resize(s, s->l + n * sizeof(int32_t)) < 0)
+                return -1;
             p = (uint8_t *) s->s + s->l;
             for (i = 0; i < n; ++i) {
                 i32_to_le(a[i], p);
@@ -2506,7 +2599,7 @@ int bcf_enc_vint(kstring_t *s, int n, int32_t *a, int wsize)
         }
     }
 
-    return 0; // FIXME: check for errs in this function
+    return 0;
 }
 
 #ifdef VCF_ALLOW_INT64
@@ -2562,20 +2655,84 @@ int bcf_enc_vchar(kstring_t *s, int l, const char *a)
     return 0; // FIXME: check for errs in this function
 }
 
+// Special case of n==1 as it also occurs quite often in FORMAT data.
+// This version is also small enough to get inlined.
+static inline int bcf_fmt_array1(kstring_t *s, int type, void *data) {
+    uint32_t e = 0;
+    uint8_t *p = (uint8_t *)data;
+    int32_t v;
+
+    // helps gcc more than clang here. In billions of cycles:
+    //          bcf_fmt_array1  bcf_fmt_array
+    // gcc7:    23.2            24.3
+    // gcc13:   21.6            23.0
+    // clang13: 27.1            27.8
+    switch (type) {
+    case BCF_BT_CHAR:
+        e |= kputc_(*p == bcf_str_missing ? '.' : *p, s) < 0;
+        break;
+
+    case BCF_BT_INT8:
+        if (*(int8_t *)p != bcf_int8_vector_end) {
+            e |= ((*(int8_t *)p == bcf_int8_missing)
+                  ? kputc_('.', s)
+                  : kputw(*(int8_t *)p, s)) < 0;
+        }
+        break;
+    case BCF_BT_INT16:
+        v = le_to_i16(p);
+        if (v != bcf_int16_vector_end) {
+            e |= (v == bcf_int16_missing
+                  ? kputc_('.', s)
+                  : kputw(v, s)) < 0;
+        }
+        break;
+
+    case BCF_BT_INT32:
+        v = le_to_i32(p);
+        if (v != bcf_int32_vector_end) {
+            e |= (v == bcf_int32_missing
+                  ? kputc_('.', s)
+                  : kputw(v, s)) < 0;
+        }
+        break;
+
+    case BCF_BT_FLOAT:
+        v = le_to_u32(p);
+        if (v != bcf_float_vector_end) {
+            e |= (v == bcf_float_missing
+                  ? kputc_('.', s)
+                  : kputd(le_to_float(p), s)) < 0;
+        }
+        break;
+
+    default:
+        hts_log_error("Unexpected type %d", type);
+        return -1;
+    }
+
+    return e == 0 ? 0 : -1;
+}
+
 int bcf_fmt_array(kstring_t *s, int n, int type, void *data)
 {
     int j = 0;
     uint32_t e = 0;
     if (n == 0) {
-        return kputc('.', s) >= 0 ? 0 : -1;
+        return kputc_('.', s) >= 0 ? 0 : -1;
     }
+
     if (type == BCF_BT_CHAR)
     {
-        char *p = (char*)data;
-        for (j = 0; j < n && *p; ++j, ++p)
-        {
-            if ( *p==bcf_str_missing ) e |= kputc('.', s) < 0;
-            else e |= kputc(*p, s) < 0;
+        char *p = (char *)data;
+
+        // Note bcf_str_missing is already accounted for in n==0 above.
+        if (n >= 8) {
+            char *p_end = memchr(p, 0, n);
+            e |= kputsn(p, p_end ? p_end-p : n, s) < 0;
+        } else {
+            for (j = 0; j < n && *p; ++j, ++p)
+               e |= kputc(*p, s) < 0;
         }
     }
     else
@@ -2586,9 +2743,8 @@ int bcf_fmt_array(kstring_t *s, int n, int type, void *data)
             { \
                 type_t v = convert(p); \
                 if ( is_vector_end ) break; \
-                if ( j ) kputc(',', s); \
-                if ( is_missing ) kputc('.', s); \
-                else e |= kprint < 0; \
+                if ( j ) e |= kputc_(',', s) < 0; \
+                e |= (is_missing ? kputc('.', s) : kprint) < 0; \
             } \
         }
         switch (type) {
@@ -2616,13 +2772,36 @@ uint8_t *bcf_fmt_sized_array(kstring_t *s, uint8_t *ptr)
  ********************/
 
 typedef struct {
-    int key, max_m, size, offset;
-    uint32_t is_gt:1, max_g:31;
-    uint32_t max_l;
-    uint32_t y;
-    uint8_t *buf;
+    int key;            // Key for h->id[BCF_DT_ID][key] vdict
+    int max_m;          // number of elements in field array (ie commas)
+    int size;           // field size (max_l or max_g*4 if is_gt)
+    int offset;         // offset of buf into h->mem
+    uint32_t is_gt:1,   // is genotype
+             max_g:31;  // maximum number of genotypes
+    uint32_t max_l;     // length of field
+    uint32_t y;         // h->id[0][fmt[j].key].val->info[BCF_HL_FMT]
+    uint8_t *buf;       // Pointer into h->mem
 } fmt_aux_t;
 
+// fmt_aux_t field notes:
+// max_* are biggest sizes of the various FORMAT fields across all samples.
+// We use these after pivoting the data to ensure easy random access
+// of a specific sample.
+//
+// max_m is only used for type BCF_HT_REAL or BCF_HT_INT
+// max_g is only used for is_gt == 1 (will be BCF_HT_STR)
+// max_l is only used for is_gt == 0 (will be BCF_HT_STR)
+//
+// These are computed in vcf_parse_format_max3 and used in
+// vcf_parse_format_alloc4 to get the size.
+//
+// size is computed from max_g, max_l, max_m and is_gt.  Once computed
+// the max values are never accessed again.
+//
+// In theory all 4 vars could be coalesced into a single variable, but this
+// significantly harms speed (even if done via a union).  It's about 25-30%
+// slower.
+
 static inline int align_mem(kstring_t *s)
 {
     int e = 0;
@@ -2633,23 +2812,12 @@ static inline int align_mem(kstring_t *s)
     return e == 0 ? 0 : -1;
 }
 
-// p,q is the start and the end of the FORMAT field
 #define MAX_N_FMT 255   /* Limited by size of bcf1_t n_fmt field */
-static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p, char *q)
-{
-    if ( !bcf_hdr_nsamples(h) ) return 0;
-
-    static int extreme_val_warned = 0;
-    char *r, *t;
-    int j, l, m, g, overflow = 0;
-    khint_t k;
-    ks_tokaux_t aux1;
-    vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID];
-    kstring_t *mem = (kstring_t*)&h->mem;
-    fmt_aux_t fmt[MAX_N_FMT];
-    mem->l = 0;
 
-    char *end = s->s + s->l;
+// detect FORMAT "."
+static int vcf_parse_format_empty1(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
+                                   const char *p, const char *q) {
+    const char *end = s->s + s->l;
     if ( q>=end )
     {
         hts_log_error("FORMAT column with no sample columns starting at %s:%"PRIhts_pos"", bcf_seqname_safe(h,v), v->pos+1);
@@ -2661,10 +2829,20 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p
     if ( p[0]=='.' && p[1]==0 ) // FORMAT field is empty "."
     {
         v->n_sample = bcf_hdr_nsamples(h);
-        return 0;
+        return 1;
     }
 
-    // get format information from the dictionary
+    return 0;
+}
+
+// get format information from the dictionary
+static int vcf_parse_format_dict2(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
+                                  const char *p, const char *q, fmt_aux_t *fmt) {
+    const vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID];
+    char *t;
+    int j;
+    ks_tokaux_t aux1;
+
     for (j = 0, t = kstrtok(p, ":", &aux1); t; t = kstrtok(0, 0, &aux1), ++j) {
         if (j >= MAX_N_FMT) {
             v->errcode |= BCF_ERR_LIMITS;
@@ -2674,7 +2852,7 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p
         }
 
         *(char*)aux1.p = 0;
-        k = kh_get(vdict, d, t);
+        khint_t k = kh_get(vdict, d, t);
         if (k == kh_end(d) || kh_val(d, k).info[BCF_HL_FMT] == 15) {
             if ( t[0]=='.' && t[1]==0 )
             {
@@ -2702,14 +2880,22 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p
         }
         fmt[j].max_l = fmt[j].max_m = fmt[j].max_g = 0;
         fmt[j].key = kh_val(d, k).id;
-        fmt[j].is_gt = !strcmp(t, "GT");
+        fmt[j].is_gt = (t[0] == 'G' && t[1] == 'T' && !t[2]);
         fmt[j].y = h->id[0][fmt[j].key].val->info[BCF_HL_FMT];
         v->n_fmt++;
     }
-    // compute max
+    return 0;
+}
+
+// compute max
+static int vcf_parse_format_max3(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
+                                 char *p, char *q, fmt_aux_t *fmt) {
     int n_sample_ori = -1;
-    r = q + 1;  // r: position in the format string
-    l = 0, m = g = 1, v->n_sample = 0;  // m: max vector size, l: max field len, g: max number of alleles
+    char *r = q + 1;  // r: position in the format string
+    int l = 0, m = 1, g = 1, j;
+    v->n_sample = 0;  // m: max vector size, l: max field len, g: max number of alleles
+    const char *end = s->s + s->l;
+
     while ( r<end )
     {
         // can we skip some samples?
@@ -2727,7 +2913,23 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p
         // collect fmt stats: max vector size, length, number of alleles
         j = 0;  // j-th format field
         fmt_aux_t *f = fmt;
+        static char meta[256] = {
+            // \0 \t , / : |
+            1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+            0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1, 0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,
+            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,
+            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+        };
+
+        char *r_start = r;
         for (;;) {
+            // Quickly skip ahead to an appropriate meta-character
+            while (!meta[(unsigned char)*r]) r++;
+
             switch (*r) {
             case ',':
                 m++;
@@ -2741,8 +2943,10 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p
             case '\t':
                 *r = 0; // fall through
 
+            default: // valid due to while loop above.
             case '\0':
             case ':':
+                l = r - r_start; r_start = r;
                 if (f->max_m < m) f->max_m = m;
                 if (f->max_l < l) f->max_l = l;
                 if (f->is_gt && f->max_g < g) f->max_g = g;
@@ -2759,7 +2963,7 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p
                 break;
             }
             if ( r>=end ) break;
-            r++; l++;
+            r++;
         }
     end_for:
         v->n_sample++;
@@ -2767,20 +2971,30 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p
         r++;
     }
 
-    // allocate memory for arrays
+    return 0;
+}
+
+// allocate memory for arrays
+static int vcf_parse_format_alloc4(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
+                                   const char *p, const char *q,
+                                   fmt_aux_t *fmt) {
+    kstring_t *mem = (kstring_t*)&h->mem;
+
+    int j;
     for (j = 0; j < v->n_fmt; ++j) {
         fmt_aux_t *f = &fmt[j];
         if ( !f->max_m ) f->max_m = 1;  // omitted trailing format field
+
         if ((f->y>>4&0xf) == BCF_HT_STR) {
             f->size = f->is_gt? f->max_g << 2 : f->max_l;
         } else if ((f->y>>4&0xf) == BCF_HT_REAL || (f->y>>4&0xf) == BCF_HT_INT) {
             f->size = f->max_m << 2;
-        } else
-        {
+        } else {
             hts_log_error("The format type %d at %s:%"PRIhts_pos" is currently not supported", f->y>>4&0xf, bcf_seqname_safe(h,v), v->pos+1);
             v->errcode |= BCF_ERR_TAG_INVALID;
             return -1;
         }
+
         if (align_mem(mem) < 0) {
             hts_log_error("Memory allocation failure at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
             v->errcode |= BCF_ERR_LIMITS;
@@ -2791,9 +3005,13 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p
         // malformed VCF data is less likely to take excessive memory and/or
         // time.
         if ((uint64_t) mem->l + v->n_sample * (uint64_t)f->size > INT_MAX) {
-            hts_log_error("Excessive memory required by FORMAT fields at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
+            static int warned = 0;
+            if ( !warned ) hts_log_warning("Excessive memory required by FORMAT fields at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
+            warned = 1;
             v->errcode |= BCF_ERR_LIMITS;
-            return -1;
+            f->size = -1;
+            f->offset = 0;
+            continue;
         }
 
         f->offset = mem->l;
@@ -2804,11 +3022,47 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p
         }
         mem->l += v->n_sample * f->size;
     }
-    for (j = 0; j < v->n_fmt; ++j)
-        fmt[j].buf = (uint8_t*)mem->s + fmt[j].offset;
-    // fill the sample fields; at beginning of the loop, t points to the first char of a format
-    n_sample_ori = -1;
-    t = q + 1; m = 0;   // m: sample id
+
+    {
+        int j;
+        for (j = 0; j < v->n_fmt; ++j)
+            fmt[j].buf = (uint8_t*)mem->s + fmt[j].offset;
+    }
+
+    // check for duplicate tags
+    int i;
+    for (i=1; i<v->n_fmt; i++)
+    {
+        fmt_aux_t *ifmt = &fmt[i];
+        if ( ifmt->size==-1 ) continue; // already marked for removal
+        for (j=0; j<i; j++)
+        {
+            fmt_aux_t *jfmt = &fmt[j];
+            if ( jfmt->size==-1 ) continue; // already marked for removal
+            if ( ifmt->key!=jfmt->key ) continue;
+            static int warned = 0;
+            if ( !warned ) hts_log_warning("Duplicate FORMAT tag %s at %s:%"PRIhts_pos, bcf_hdr_int2id(h,BCF_DT_ID,ifmt->key), bcf_seqname_safe(h,v), v->pos+1);
+            warned = 1;
+            v->errcode |= BCF_ERR_TAG_INVALID;
+            ifmt->size = -1;
+            ifmt->offset = 0;
+            break;
+        }
+    }
+    return 0;
+}
+
+// Fill the sample fields
+static int vcf_parse_format_fill5(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
+                                  const char *p, const char *q, fmt_aux_t *fmt) {
+    static int extreme_val_warned = 0;
+    int n_sample_ori = -1;
+    // At beginning of the loop t points to the first char of a format
+    const char *t = q + 1;
+    int m = 0;   // m: sample id
+    const int nsamples = bcf_hdr_nsamples(h);
+
+    const char *end = s->s + s->l;
     while ( t<end )
     {
         // can we skip some samples?
@@ -2822,32 +3076,52 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p
                 continue;
             }
         }
-        if ( m == bcf_hdr_nsamples(h) ) break;
+        if ( m == nsamples ) break;
 
-        j = 0; // j-th format field, m-th sample
+        int j = 0; // j-th format field, m-th sample
         while ( t < end )
         {
             fmt_aux_t *z = &fmt[j++];
+            const int htype = z->y>>4&0xf;
             if (!z->buf) {
                 hts_log_error("Memory allocation failure for FORMAT field type %d at %s:%"PRIhts_pos,
                               z->y>>4&0xf, bcf_seqname_safe(h,v), v->pos+1);
                 v->errcode |= BCF_ERR_LIMITS;
                 return -1;
             }
-            if ((z->y>>4&0xf) == BCF_HT_STR) {
-                if (z->is_gt) { // genotypes
+
+            if ( z->size==-1 )
+            {
+                // this field is to be ignored, it's either too big or a duplicate
+                while ( *t != ':' && *t ) t++;
+            }
+            else if (htype == BCF_HT_STR) {
+                int l;
+                if (z->is_gt) {
+                    // Genotypes.
+                    // <val>([|/]<val>)+... where <val> is [0-9]+ or ".".
                     int32_t is_phased = 0;
                     uint32_t *x = (uint32_t*)(z->buf + z->size * (size_t)m);
                     uint32_t unreadable = 0;
                     uint32_t max = 0;
-                    overflow = 0;
+                    int overflow = 0;
                     for (l = 0;; ++t) {
                         if (*t == '.') {
                             ++t, x[l++] = is_phased;
                         } else {
-                            char *tt = t;
-                            uint32_t val = hts_str2uint(t, &t, sizeof(val) * CHAR_MAX - 2, &overflow);
-                            unreadable |= tt == t;
+                            const char *tt = t;
+                            uint32_t val;
+                            // Or "v->n_allele < 10", but it doesn't
+                            // seem to be any faster and this feels safer.
+                            if (*t >= '0' && *t <= '9' &&
+                                !(t[1] >= '0' && t[1] <= '9')) {
+                                val = *t++ - '0';
+                            } else {
+                                val = hts_str2uint(t, (char **)&t,
+                                                   sizeof(val) * CHAR_MAX - 2,
+                                                   &overflow);
+                                unreadable |= tt == t;
+                            }
                             if (max < val) max = val;
                             x[l++] = (val + 1) << 1 | is_phased;
                         }
@@ -2864,26 +3138,35 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p
                         return -1;
                     }
                     if ( !l ) x[l++] = 0;   // An empty field, insert missing value
-                    for (; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end;
+                    for (; l < z->size>>2; ++l)
+                        x[l] = bcf_int32_vector_end;
+
                 } else {
+                    // Otherwise arbitrary strings
                     char *x = (char*)z->buf + z->size * (size_t)m;
-                    for (r = t, l = 0; *t != ':' && *t; ++t) x[l++] = *t;
-                    for (; l < z->size; ++l) x[l] = 0;
+                    for (l = 0; *t != ':' && *t; ++t)
+                        x[l++] = *t;
+                    if (z->size > l)
+                        memset(&x[l], 0, (z->size-l) * sizeof(*x));
                 }
-            } else if ((z->y>>4&0xf) == BCF_HT_INT) {
+
+            } else if (htype == BCF_HT_INT) {
+                // One or more integers in an array
                 int32_t *x = (int32_t*)(z->buf + z->size * (size_t)m);
+                int l;
                 for (l = 0;; ++t) {
                     if (*t == '.') {
                         x[l++] = bcf_int32_missing, ++t; // ++t to skip "."
                     } else {
-                        overflow = 0;
+                        int overflow = 0;
                         char *te;
                         long int tmp_val = hts_str2int(t, &te, sizeof(tmp_val)*CHAR_BIT, &overflow);
                         if ( te==t || overflow || tmp_val<BCF_MIN_BT_INT32 || tmp_val>BCF_MAX_BT_INT32 )
                         {
                             if ( !extreme_val_warned )
                             {
-                                hts_log_warning("Extreme FORMAT/%s value encountered and set to missing at %s:%"PRIhts_pos, h->id[BCF_DT_ID][fmt[j-1].key].key, bcf_seqname_safe(h,v), v->pos+1);
+                                hts_log_warning("Extreme FORMAT/%s value encountered and set to missing at %s:%"PRIhts_pos,
+                                                h->id[BCF_DT_ID][fmt[j-1].key].key, bcf_seqname_safe(h,v), v->pos+1);
                                 extreme_val_warned = 1;
                             }
                             tmp_val = bcf_int32_missing;
@@ -2893,15 +3176,20 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p
                     }
                     if (*t != ',') break;
                 }
-                if ( !l ) x[l++] = bcf_int32_missing;
-                for (; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end;
-            } else if ((z->y>>4&0xf) == BCF_HT_REAL) {
+                if ( !l )
+                    x[l++] = bcf_int32_missing;
+                for (; l < z->size>>2; ++l)
+                    x[l] = bcf_int32_vector_end;
+
+            } else if (htype == BCF_HT_REAL) {
+                // One of more floating point values in an array
                 float *x = (float*)(z->buf + z->size * (size_t)m);
+                int l;
                 for (l = 0;; ++t) {
                     if (*t == '.' && !isdigit_c(t[1])) {
                         bcf_float_set_missing(x[l++]), ++t; // ++t to skip "."
                     } else {
-                        overflow = 0;
+                        int overflow = 0;
                         char *te;
                         float tmp_val = hts_str2dbl(t, &te, &overflow);
                         if ( (te==t || overflow) && !extreme_val_warned )
@@ -2914,10 +3202,13 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p
                     }
                     if (*t != ',') break;
                 }
-                if ( !l ) bcf_float_set_missing(x[l++]);    // An empty field, insert missing value
-                for (; l < z->size>>2; ++l) bcf_float_set_vector_end(x[l]);
+                if ( !l )
+                    // An empty field, insert missing value
+                    bcf_float_set_missing(x[l++]);
+                for (; l < z->size>>2; ++l)
+                    bcf_float_set_vector_end(x[l]);
             } else {
-                hts_log_error("Unknown FORMAT field type %d at %s:%"PRIhts_pos, z->y>>4&0xf, bcf_seqname_safe(h,v), v->pos+1);
+                hts_log_error("Unknown FORMAT field type %d at %s:%"PRIhts_pos, htype, bcf_seqname_safe(h,v), v->pos+1);
                 v->errcode |= BCF_ERR_TAG_INVALID;
                 return -1;
             }
@@ -2938,23 +3229,32 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p
             }
         }
 
-        for (; j < v->n_fmt; ++j) { // fill end-of-vector values
+        // fill end-of-vector values
+        for (; j < v->n_fmt; ++j) {
             fmt_aux_t *z = &fmt[j];
-            if ((z->y>>4&0xf) == BCF_HT_STR) {
+            const int htype = z->y>>4&0xf;
+            int l;
+
+            if (z->size == -1) // this field is to be ignored
+                continue;
+
+            if (htype == BCF_HT_STR) {
                 if (z->is_gt) {
                     int32_t *x = (int32_t*)(z->buf + z->size * (size_t)m);
                     if (z->size) x[0] = bcf_int32_missing;
                     for (l = 1; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end;
                 } else {
                     char *x = (char*)z->buf + z->size * (size_t)m;
-                    if ( z->size ) x[0] = '.';
-                    for (l = 1; l < z->size; ++l) x[l] = 0;
+                    if ( z->size ) {
+                        x[0] = '.';
+                        memset(&x[1], 0, (z->size-1) * sizeof(*x));
+                    }
                 }
-            } else if ((z->y>>4&0xf) == BCF_HT_INT) {
+            } else if (htype == BCF_HT_INT) {
                 int32_t *x = (int32_t*)(z->buf + z->size * (size_t)m);
                 x[0] = bcf_int32_missing;
                 for (l = 1; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end;
-            } else if ((z->y>>4&0xf) == BCF_HT_REAL) {
+            } else if (htype == BCF_HT_REAL) {
                 float *x = (float*)(z->buf + z->size * (size_t)m);
                 bcf_float_set_missing(x[0]);
                 for (l = 1; l < z->size>>2; ++l) bcf_float_set_vector_end(x[l]);
@@ -2964,12 +3264,21 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p
         m++; t++;
     }
 
-    // write individual genotype information
+    return 0;
+}
+
+// write individual genotype information
+static int vcf_parse_format_gt6(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
+                                const char *p, const char *q, fmt_aux_t *fmt) {
     kstring_t *str = &v->indiv;
-    int i;
+    int i, need_downsize = 0;
     if (v->n_sample > 0) {
         for (i = 0; i < v->n_fmt; ++i) {
             fmt_aux_t *z = &fmt[i];
+            if ( z->size==-1 ) {
+                need_downsize = 1;
+                continue;
+            }
             bcf_enc_int1(str, z->key);
             if ((z->y>>4&0xf) == BCF_HT_STR && !z->is_gt) {
                 bcf_enc_size(str, z->size, BCF_BT_CHAR);
@@ -2986,8 +3295,25 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p
                 }
             }
         }
+
     }
+    if ( need_downsize ) {
+        i = 0;
+        while ( i < v->n_fmt ) {
+            if ( fmt[i].size==-1 )
+            {
+                v->n_fmt--;
+                if ( i < v->n_fmt ) memmove(&fmt[i],&fmt[i+1],sizeof(*fmt)*(v->n_fmt-i));
+            }
+            else
+                i++;
+        }
+    }
+    return 0;
+}
 
+// validity checking
+static int vcf_parse_format_check7(const bcf_hdr_t *h, bcf1_t *v) {
     if ( v->n_sample!=bcf_hdr_nsamples(h) )
     {
         hts_log_error("Number of columns at %s:%"PRIhts_pos" does not match the number of samples (%d vs %d)",
@@ -3008,6 +3334,65 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p
     return 0;
 }
 
+// p,q is the start and the end of the FORMAT field
+static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
+                            char *p, char *q)
+{
+    if ( !bcf_hdr_nsamples(h) ) return 0;
+    kstring_t *mem = (kstring_t*)&h->mem;
+    mem->l = 0;
+
+    fmt_aux_t fmt[MAX_N_FMT];
+
+    // detect FORMAT "."
+    int ret; // +ve = ok, -ve = err
+    if ((ret = vcf_parse_format_empty1(s, h, v, p, q)))
+        return ret ? 0 : -1;
+
+    // get format information from the dictionary
+    if (vcf_parse_format_dict2(s, h, v, p, q, fmt) < 0)
+        return -1;
+
+    // FORMAT data is per-sample A:B:C A:B:C A:B:C ... but in memory it is
+    // stored as per-type arrays AAA... BBB... CCC...  This is basically
+    // a data rotation or pivot.
+
+    // The size of elements in the array grow to their maximum needed,
+    // permitting fast random access.  This means however we have to first
+    // scan the whole FORMAT line to find the maximum of each type, and
+    // then scan it again to find the store the data.
+    // We break this down into compute-max, allocate, fill-out-buffers
+
+    // TODO: ?
+    // The alternative would be to pivot on the first pass, with fixed
+    // size entries for numerics and concatenated strings otherwise, also
+    // tracking maximum sizes.  Then on a second pass we reallocate and
+    // copy the data again to a uniformly sized array.  Two passes through
+    // memory, but without doubling string parsing.
+
+    // compute max
+    if (vcf_parse_format_max3(s, h, v, p, q, fmt) < 0)
+        return -1;
+
+    // allocate memory for arrays
+    if (vcf_parse_format_alloc4(s, h, v, p, q, fmt) < 0)
+        return -1;
+
+    // fill the sample fields; at beginning of the loop
+    if (vcf_parse_format_fill5(s, h, v, p, q, fmt) < 0)
+        return -1;
+
+    // write individual genotype information
+    if (vcf_parse_format_gt6(s, h, v, p, q, fmt) < 0)
+        return -1;
+
+    // validity checking
+    if (vcf_parse_format_check7(h, v) < 0)
+        return -1;
+
+    return 0;
+}
+
 static khint_t fix_chromosome(const bcf_hdr_t *h, vdict_t *d, const char *p) {
     // Simple error recovery for chromosomes not defined in the header. It will not help when VCF header has
     // been already printed, but will enable tools like vcfcheck to proceed.
@@ -3095,17 +3480,18 @@ static int vcf_parse_info(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p
     for (r = key = p;; ++r) {
         int c;
         char *val, *end;
-        if (*r != ';' && *r != '=' && *r != 0) continue;
+        while (*r > '=' || (*r != ';' && *r != '=' && *r != 0)) r++;
         if (v->n_info == UINT16_MAX) {
             hts_log_error("Too many INFO entries at %s:%"PRIhts_pos,
                           bcf_seqname_safe(h,v), v->pos+1);
             v->errcode |= BCF_ERR_LIMITS;
             goto fail;
         }
-        val = end = 0;
+        val = end = NULL;
         c = *r; *r = 0;
         if (c == '=') {
             val = r + 1;
+
             for (end = val; *end != ';' && *end != 0; ++end);
             c = *end; *end = 0;
         } else end = r;
@@ -3212,7 +3598,8 @@ static int vcf_parse_info(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p
                 } else {
                     bcf_enc_vint(str, n_val, a_val, -1);
                 }
-                if (n_val==1 && (val1!=bcf_int32_missing || is_int64) && strcmp(key, "END") == 0)
+                if (n_val==1 && (val1!=bcf_int32_missing || is_int64)
+                    && memcmp(key, "END", 4) == 0)
                 {
                     if ( val1 <= v->pos )
                     {
@@ -3253,95 +3640,163 @@ static int vcf_parse_info(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p
 
 int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v)
 {
-    int i = 0, ret = -2, overflow = 0;
+    int ret = -2, overflow = 0;
     char *p, *q, *r, *t;
     kstring_t *str;
     khint_t k;
     ks_tokaux_t aux;
 
+//#define NOT_DOT(p) strcmp((p), ".")
+//#define NOT_DOT(p) (!(*p == '.' && !p[1]))
+//#define NOT_DOT(p) ((*p) != '.' || (p)[1])
+//#define NOT_DOT(p) (q-p != 1 || memcmp(p, ".\0", 2))
+#define NOT_DOT(p) (memcmp(p, ".\0", 2))
+
     if (!s || !h || !v || !(s->s))
         return ret;
 
     // Assumed in lots of places, but we may as well spot this early
     assert(sizeof(float) == sizeof(int32_t));
 
+    // Ensure string we parse has space to permit some over-flow when during
+    // parsing.  Eg to do memcmp(key, "END", 4) in vcf_parse_info over
+    // the more straight forward looking strcmp, giving a speed advantage.
+    if (ks_resize(s, s->l+4) < 0)
+        return -1;
+
+    // Force our memory to be initialised so we avoid the technicality of
+    // undefined behaviour in using a 4-byte memcmp.  (The reality is this
+    // almost certainly is never detected by the compiler so has no impact,
+    // but equally so this code has minimal (often beneficial) impact on
+    // performance too.)
+    s->s[s->l+0] = 0;
+    s->s[s->l+1] = 0;
+    s->s[s->l+2] = 0;
+    s->s[s->l+3] = 0;
+
     bcf_clear1(v);
     str = &v->shared;
     memset(&aux, 0, sizeof(ks_tokaux_t));
-    for (p = kstrtok(s->s, "\t", &aux), i = 0; p; p = kstrtok(0, 0, &aux), ++i) {
-        q = (char*)aux.p;
-        *q = 0;
-        if (i == 0) { // CHROM
-            vdict_t *d = (vdict_t*)h->dict[BCF_DT_CTG];
-            k = kh_get(vdict, d, p);
-            if (k == kh_end(d))
-            {
-                hts_log_warning("Contig '%s' is not defined in the header. (Quick workaround: index the file with tabix.)", p);
-                v->errcode = BCF_ERR_CTG_UNDEF;
-                if ((k = fix_chromosome(h, d, p)) == kh_end(d)) {
-                    hts_log_error("Could not add dummy header for contig '%s'", p);
-                    v->errcode |= BCF_ERR_CTG_INVALID;
+
+    // CHROM
+    if (!(p = kstrtok(s->s, "\t", &aux)))
+        goto err;
+    *(q = (char*)aux.p) = 0;
+
+    vdict_t *d = (vdict_t*)h->dict[BCF_DT_CTG];
+    k = kh_get(vdict, d, p);
+    if (k == kh_end(d)) {
+        hts_log_warning("Contig '%s' is not defined in the header. (Quick workaround: index the file with tabix.)", p);
+        v->errcode = BCF_ERR_CTG_UNDEF;
+        if ((k = fix_chromosome(h, d, p)) == kh_end(d)) {
+            hts_log_error("Could not add dummy header for contig '%s'", p);
+            v->errcode |= BCF_ERR_CTG_INVALID;
+            goto err;
+        }
+    }
+    v->rid = kh_val(d, k).id;
+
+    // POS
+    if (!(p = kstrtok(0, 0, &aux)))
+        goto err;
+    *(q = (char*)aux.p) = 0;
+
+    overflow = 0;
+    char *tmp = p;
+    v->pos = hts_str2uint(p, &p, 62, &overflow);
+    if (overflow) {
+        hts_log_error("Position value '%s' is too large", tmp);
+        goto err;
+    } else if ( *p ) {
+        hts_log_error("Could not parse the position '%s'", tmp);
+        goto err;
+    } else {
+        v->pos -= 1;
+    }
+    if (v->pos >= INT32_MAX)
+        v->unpacked |= BCF_IS_64BIT;
+
+    // ID
+    if (!(p = kstrtok(0, 0, &aux)))
+        goto err;
+    *(q = (char*)aux.p) = 0;
+
+    if (NOT_DOT(p)) bcf_enc_vchar(str, q - p, p);
+    else bcf_enc_size(str, 0, BCF_BT_CHAR);
+
+    // REF
+    if (!(p = kstrtok(0, 0, &aux)))
+        goto err;
+    *(q = (char*)aux.p) = 0;
+
+    bcf_enc_vchar(str, q - p, p);
+    v->n_allele = 1, v->rlen = q - p;
+
+    // ALT
+    if (!(p = kstrtok(0, 0, &aux)))
+        goto err;
+    *(q = (char*)aux.p) = 0;
+
+    if (NOT_DOT(p)) {
+        for (r = t = p;; ++r) {
+            if (*r == ',' || *r == 0) {
+                if (v->n_allele == UINT16_MAX) {
+                    hts_log_error("Too many ALT alleles at %s:%"PRIhts_pos,
+                                  bcf_seqname_safe(h,v), v->pos+1);
+                    v->errcode |= BCF_ERR_LIMITS;
                     goto err;
                 }
+                bcf_enc_vchar(str, r - t, t);
+                t = r + 1;
+                ++v->n_allele;
             }
-            v->rid = kh_val(d, k).id;
-        } else if (i == 1) { // POS
-            overflow = 0;
-            char *tmp = p;
-            v->pos = hts_str2uint(p, &p, 63, &overflow);
-            if (overflow) {
-                hts_log_error("Position value '%s' is too large", tmp);
-                goto err;
-            } else if ( *p ) {
-                hts_log_error("Could not parse the position '%s'", tmp);
-                goto err;
-            } else {
-                v->pos -= 1;
-            }
-            if (v->pos >= INT32_MAX)
-                v->unpacked |= BCF_IS_64BIT;
-        } else if (i == 2) { // ID
-            if (strcmp(p, ".")) bcf_enc_vchar(str, q - p, p);
-            else bcf_enc_size(str, 0, BCF_BT_CHAR);
-        } else if (i == 3) { // REF
-            bcf_enc_vchar(str, q - p, p);
-            v->n_allele = 1, v->rlen = q - p;
-        } else if (i == 4) { // ALT
-            if (strcmp(p, ".")) {
-                for (r = t = p;; ++r) {
-                    if (*r == ',' || *r == 0) {
-                        if (v->n_allele == UINT16_MAX) {
-                            hts_log_error("Too many ALT alleles at %s:%"PRIhts_pos,
-                                          bcf_seqname_safe(h,v), v->pos+1);
-                            v->errcode |= BCF_ERR_LIMITS;
-                            goto err;
-                        }
-                        bcf_enc_vchar(str, r - t, t);
-                        t = r + 1;
-                        ++v->n_allele;
-                    }
-                    if (r == q) break;
-                }
-            }
-        } else if (i == 5) { // QUAL
-            if (strcmp(p, ".")) v->qual = atof(p);
-            else bcf_float_set_missing(v->qual);
-            if ( v->max_unpack && !(v->max_unpack>>1) ) goto end; // BCF_UN_STR
-        } else if (i == 6) { // FILTER
-            if (strcmp(p, ".")) {
-                if (vcf_parse_filter(str, h, v, p, q)) goto err;
-            } else bcf_enc_vint(str, 0, 0, -1);
-            if ( v->max_unpack && !(v->max_unpack>>2) ) goto end; // BCF_UN_FLT
-        } else if (i == 7) { // INFO
-            if (strcmp(p, ".")) {
-                if (vcf_parse_info(str, h, v, p, q)) goto err;
-            }
-            if ( v->max_unpack && !(v->max_unpack>>3) ) goto end;
-        } else if (i == 8) {// FORMAT
-            return vcf_parse_format(s, h, v, p, q) == 0 ? 0 : -2;
+            if (r == q) break;
         }
     }
 
+    // QUAL
+    if (!(p = kstrtok(0, 0, &aux)))
+        goto err;
+    *(q = (char*)aux.p) = 0;
+
+    if (NOT_DOT(p)) v->qual = atof(p);
+    else bcf_float_set_missing(v->qual);
+    if ( v->max_unpack && !(v->max_unpack>>1) ) goto end; // BCF_UN_STR
+
+    // FILTER
+    if (!(p = kstrtok(0, 0, &aux)))
+        goto err;
+    *(q = (char*)aux.p) = 0;
+
+    if (NOT_DOT(p)) {
+        if (vcf_parse_filter(str, h, v, p, q)) {
+            goto err;
+        }
+    } else bcf_enc_vint(str, 0, 0, -1);
+    if ( v->max_unpack && !(v->max_unpack>>2) ) goto end; // BCF_UN_FLT
+
+    // INFO
+    if (!(p = kstrtok(0, 0, &aux)))
+        goto err;
+    *(q = (char*)aux.p) = 0;
+
+    if (NOT_DOT(p)) {
+        if (vcf_parse_info(str, h, v, p, q)) {
+            goto err;
+        }
+    }
+    if ( v->max_unpack && !(v->max_unpack>>3) ) goto end;
+
+    // FORMAT; optional
+    p = kstrtok(0, 0, &aux);
+    if (p) {
+        *(q = (char*)aux.p) = 0;
+
+        return vcf_parse_format(s, h, v, p, q) == 0 ? 0 : -2;
+    } else {
+        return 0;
+    }
+
  end:
     ret = 0;
 
@@ -3390,20 +3845,41 @@ static inline uint8_t *bcf_unpack_fmt_core1(uint8_t *ptr, int n_sample, bcf_fmt_
 static inline uint8_t *bcf_unpack_info_core1(uint8_t *ptr, bcf_info_t *info)
 {
     uint8_t *ptr_start = ptr;
+    int64_t len = 0;
     info->key = bcf_dec_typed_int1(ptr, &ptr);
-    info->len = bcf_dec_size(ptr, &ptr, &info->type);
+    len = info->len = bcf_dec_size(ptr, &ptr, &info->type);
     info->vptr = ptr;
     info->vptr_off  = ptr - ptr_start;
     info->vptr_free = 0;
     info->v1.i = 0;
     if (info->len == 1) {
-        if (info->type == BCF_BT_INT8 || info->type == BCF_BT_CHAR) info->v1.i = *(int8_t*)ptr;
-        else if (info->type == BCF_BT_INT32) info->v1.i = le_to_i32(ptr);
-        else if (info->type == BCF_BT_FLOAT) info->v1.f = le_to_float(ptr);
-        else if (info->type == BCF_BT_INT16) info->v1.i = le_to_i16(ptr);
-        else if (info->type == BCF_BT_INT64) info->v1.i = le_to_i64(ptr);
+        switch(info->type) {
+        case BCF_BT_INT8:
+        case BCF_BT_CHAR:
+            info->v1.i = *(int8_t*)ptr;
+            break;
+        case BCF_BT_INT16:
+            info->v1.i = le_to_i16(ptr);
+            len <<= 1;
+            break;
+        case BCF_BT_INT32:
+            info->v1.i = le_to_i32(ptr);
+            len <<= 2;
+            break;
+        case BCF_BT_FLOAT:
+            info->v1.f = le_to_float(ptr);
+            len <<= 2;
+            break;
+        case BCF_BT_INT64:
+            info->v1.i = le_to_i64(ptr);
+            len <<= 3;
+            break;
+        }
+    } else {
+        len <<= bcf_type_shift[info->type];
     }
-    ptr += info->len << bcf_type_shift[info->type];
+    ptr += len;
+
     info->vptr_len = ptr - info->vptr;
     return ptr;
 }
@@ -3425,7 +3901,7 @@ int bcf_unpack(bcf1_t *b, int which)
         ptr_ori = ptr;
         ptr = bcf_fmt_sized_array(&tmp, ptr);
         b->unpack_size[0] = ptr - ptr_ori;
-        kputc('\0', &tmp);
+        kputc_('\0', &tmp);
         d->id = tmp.s; d->m_id = tmp.m;
 
         // REF and ALT are in a single block (d->als) and d->alleles are pointers into this block
@@ -3436,7 +3912,7 @@ int bcf_unpack(bcf1_t *b, int which)
             // Use offset within tmp.s as realloc may change pointer
             d->allele[i] = (char *)(intptr_t)tmp.l;
             ptr = bcf_fmt_sized_array(&tmp, ptr);
-            kputc('\0', &tmp);
+            kputc_('\0', &tmp);
         }
         b->unpack_size[1] = ptr - ptr_ori;
         d->als = tmp.s; d->m_als = tmp.m;
@@ -3489,24 +3965,42 @@ int vcf_format(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s)
         errno = EINVAL;
         return -1;
     }
-    bcf_unpack((bcf1_t*)v, BCF_UN_ALL);
+
+    bcf_unpack((bcf1_t*)v, BCF_UN_ALL & ~(BCF_UN_INFO|BCF_UN_FMT));
+
+    // Cache of key lengths so we don't keep repeatedly using them.
+    // This assumes we're not modifying the header between successive calls
+    // to vcf_format, but that would lead to many other forms of breakage
+    // so it feels like a valid assumption to make.
+    //
+    // We cannot just do this in bcf_hdr_sync as some code (eg bcftools
+    // annotate) manipulates the headers directly without calling sync to
+    // refresh the data structures.  So we must do just-in-time length
+    // calculation during writes instead.
+    bcf_hdr_aux_t *aux = get_hdr_aux(h);
+    if (!aux->key_len) {
+        if (!(aux->key_len = calloc(h->n[BCF_DT_ID]+1, sizeof(*aux->key_len))))
+            return -1;
+    }
+    size_t *key_len = aux->key_len;
+
     kputs(chrom, s); // CHROM
-    kputc('\t', s); kputll(v->pos + 1, s); // POS
-    kputc('\t', s); kputs(v->d.id ? v->d.id : ".", s); // ID
-    kputc('\t', s); // REF
+    kputc_('\t', s); kputll(v->pos + 1, s); // POS
+    kputc_('\t', s); kputs(v->d.id ? v->d.id : ".", s); // ID
+    kputc_('\t', s); // REF
     if (v->n_allele > 0) kputs(v->d.allele[0], s);
-    else kputc('.', s);
-    kputc('\t', s); // ALT
+    else kputc_('.', s);
+    kputc_('\t', s); // ALT
     if (v->n_allele > 1) {
         for (i = 1; i < v->n_allele; ++i) {
-            if (i > 1) kputc(',', s);
+            if (i > 1) kputc_(',', s);
             kputs(v->d.allele[i], s);
         }
-    } else kputc('.', s);
-    kputc('\t', s); // QUAL
-    if ( bcf_float_is_missing(v->qual) ) kputc('.', s); // QUAL
+    } else kputc_('.', s);
+    kputc_('\t', s); // QUAL
+    if ( bcf_float_is_missing(v->qual) ) kputc_('.', s); // QUAL
     else kputd(v->qual, s);
-    kputc('\t', s); // FILTER
+    kputc_('\t', s); // FILTER
     if (v->d.n_flt) {
         for (i = 0; i < v->d.n_flt; ++i) {
             int32_t idx = v->d.flt[i];
@@ -3517,20 +4011,50 @@ int vcf_format(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s)
                 errno = EINVAL;
                 return -1;
             }
-            if (i) kputc(';', s);
-            kputs(h->id[BCF_DT_ID][idx].key, s);
+            if (i) kputc_(';', s);
+            if (!key_len[idx])
+                key_len[idx] = strlen(h->id[BCF_DT_ID][idx].key);
+            kputsn(h->id[BCF_DT_ID][idx].key, key_len[idx], s);
         }
-    } else kputc('.', s);
-    kputc('\t', s); // INFO
+    } else kputc_('.', s);
+
+    kputc_('\t', s); // INFO
     if (v->n_info) {
+        uint8_t *ptr = v->shared.s
+            ? (uint8_t *)v->shared.s + v->unpack_size[0] +
+               v->unpack_size[1] + v->unpack_size[2]
+            : NULL;
         int first = 1;
+        bcf_info_t *info = v->d.info;
+
+        // Note if we duplicate this code into custom packed and unpacked
+        // implementations then we gain a bit more speed, particularly with
+        // clang 13 (up to 5%).  Not sure why this is, but code duplication
+        // isn't pleasant and it's still faster adding packed support than
+        // not so it's a win, just not as good as it should be.
+        const int info_packed = !(v->unpacked & BCF_UN_INFO) && v->shared.l;
         for (i = 0; i < v->n_info; ++i) {
-            bcf_info_t *z = &v->d.info[i];
-            if ( !z->vptr ) continue;
-            if ( !first ) kputc(';', s);
-            first = 0;
-            if (z->key < 0 || z->key >= max_dt_id
-                || h->id[BCF_DT_ID][z->key].key == NULL) {
+            bcf_info_t in, *z;
+            if (info_packed) {
+                // Use a local bcf_info_t when data is packed
+                z = &in;
+                z->key  = bcf_dec_typed_int1(ptr, &ptr);
+                z->len  = bcf_dec_size(ptr, &ptr, &z->type);
+                z->vptr = ptr;
+                ptr += z->len << bcf_type_shift[z->type];
+            } else {
+                // Else previously unpacked INFO struct
+                z = &info[i];
+
+                // Also potentially since deleted
+                if ( !z->vptr ) continue;
+            }
+
+            bcf_idpair_t *id = z->key >= 0 && z->key < max_dt_id
+                ? &h->id[BCF_DT_ID][z->key]
+                : NULL;
+
+            if (!id || !id->key) {
                 hts_log_error("Invalid BCF, the INFO tag id=%d is %s at %s:%"PRIhts_pos,
                               z->key,
                               z->key < 0 ? "negative"
@@ -3539,71 +4063,156 @@ int vcf_format(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s)
                 errno = EINVAL;
                 return -1;
             }
-            kputs(h->id[BCF_DT_ID][z->key].key, s);
+
+            // KEY
+            if (!key_len[z->key])
+                key_len[z->key] = strlen(id->key);
+            size_t id_len = key_len[z->key];
+            if (ks_resize(s, s->l + 3 + id_len) < 0)
+                return -1;
+            char *sptr = s->s + s->l;
+            if ( !first ) {
+                *sptr++ = ';';
+                s->l++;
+            }
+            first = 0;
+            memcpy(sptr, id->key, id_len);
+            s->l += id_len;
+
+            // VALUE
             if (z->len <= 0) continue;
-            kputc('=', s);
-            if (z->len == 1)
-            {
-                switch (z->type)
-                {
-                    case BCF_BT_INT8:  if ( z->v1.i==bcf_int8_missing ) kputc('.', s); else kputw(z->v1.i, s); break;
-                    case BCF_BT_INT16: if ( z->v1.i==bcf_int16_missing ) kputc('.', s); else kputw(z->v1.i, s); break;
-                    case BCF_BT_INT32: if ( z->v1.i==bcf_int32_missing ) kputc('.', s); else kputw(z->v1.i, s); break;
-                    case BCF_BT_INT64: if ( z->v1.i==bcf_int64_missing ) kputc('.', s); else kputll(z->v1.i, s); break;
-                    case BCF_BT_FLOAT: if ( bcf_float_is_missing(z->v1.f) ) kputc('.', s); else kputd(z->v1.f, s); break;
-                    case BCF_BT_CHAR:  kputc(z->v1.i, s); break;
-                    default:
-                        hts_log_error("Unexpected type %d at %s:%"PRIhts_pos, z->type, bcf_seqname_safe(h, v), v->pos+1);
-                        errno = EINVAL;
-                        return -1;
+            sptr[id_len] = '=';
+            s->l++;
+
+            if (z->len != 1 || info_packed) {
+                bcf_fmt_array(s, z->len, z->type, z->vptr);
+            } else {
+                // Single length vectors are unpacked into their
+                // own info.v1 union and handled separately.
+                if (z->type == BCF_BT_FLOAT) {
+                    if ( bcf_float_is_missing(z->v1.f) )
+                        kputc_('.', s);
+                    else
+                        kputd(z->v1.f, s);
+                } else if (z->type == BCF_BT_CHAR) {
+                    kputc_(z->v1.i, s);
+                } else if (z->type < BCF_BT_INT64) {
+                    int64_t missing[] = {
+                        0, // BCF_BT_NULL
+                        bcf_int8_missing,
+                        bcf_int16_missing,
+                        bcf_int32_missing,
+                    };
+                    if (z->v1.i == missing[z->type])
+                        kputc_('.', s);
+                    else
+                        kputw(z->v1.i, s);
+                } else if (z->type == BCF_BT_INT64) {
+                    if (z->v1.i == bcf_int64_missing)
+                        kputc_('.', s);
+                    else
+                        kputll(z->v1.i, s);
+                } else {
+                    hts_log_error("Unexpected type %d at %s:%"PRIhts_pos, z->type, bcf_seqname_safe(h, v), v->pos+1);
+                    errno = EINVAL;
+                    return -1;
                 }
             }
-            else bcf_fmt_array(s, z->len, z->type, z->vptr);
         }
-        if ( first ) kputc('.', s);
-    } else kputc('.', s);
+        if ( first ) kputc_('.', s);
+    } else kputc_('.', s);
+
     // FORMAT and individual information
-    if (v->n_sample)
-    {
+    if (v->n_sample) {
         int i,j;
-        if ( v->n_fmt)
-        {
+        if ( v->n_fmt) {
+            uint8_t *ptr = (uint8_t *)v->indiv.s;
             int gt_i = -1;
             bcf_fmt_t *fmt = v->d.fmt;
             int first = 1;
+            int fmt_packed = !(v->unpacked & BCF_UN_FMT);
+
+            if (fmt_packed) {
+                // Local fmt as we have an array of num FORMAT keys,
+                // each of which points to N.Sample values.
+
+                // No real gain to be had in handling unpacked data here,
+                // but it doesn't cost us much in complexity either and
+                // it gives us flexibility.
+                fmt = malloc(v->n_fmt * sizeof(*fmt));
+                if (!fmt)
+                    return -1;
+            }
+
+            // KEYS
             for (i = 0; i < (int)v->n_fmt; ++i) {
-                if ( !fmt[i].p ) continue;
-                kputc(!first ? ':' : '\t', s); first = 0;
-                if (fmt[i].id < 0 || fmt[i].id >= max_dt_id
-                    || h->id[BCF_DT_ID][fmt[i].id].key == NULL) //!bcf_hdr_idinfo_exists(h,BCF_HL_FMT,fmt[i].id) )
-                {
-                    hts_log_error("Invalid BCF, the FORMAT tag id=%d at %s:%"PRIhts_pos" not present in the header", fmt[i].id, bcf_seqname_safe(h, v), v->pos+1);
+                bcf_fmt_t *z;
+                z = &fmt[i];
+                if (fmt_packed) {
+                    z->id   = bcf_dec_typed_int1(ptr, &ptr);
+                    z->n    = bcf_dec_size(ptr, &ptr, &z->type);
+                    z->p    = ptr;
+                    z->size = z->n << bcf_type_shift[z->type];
+                    ptr += v->n_sample * z->size;
+                }
+                if ( !z->p ) continue;
+                kputc_(!first ? ':' : '\t', s); first = 0;
+
+                bcf_idpair_t *id = z->id >= 0 && z->id < max_dt_id
+                    ? &h->id[BCF_DT_ID][z->id]
+                    : NULL;
+
+                if (!id || !id->key) {
+                    hts_log_error("Invalid BCF, the FORMAT tag id=%d at %s:%"PRIhts_pos" not present in the header", z->id, bcf_seqname_safe(h, v), v->pos+1);
                     errno = EINVAL;
                     return -1;
                 }
-                kputs(h->id[BCF_DT_ID][fmt[i].id].key, s);
-                if (strcmp(h->id[BCF_DT_ID][fmt[i].id].key, "GT") == 0) gt_i = i;
+
+                if (!key_len[z->id])
+                    key_len[z->id] = strlen(id->key);
+                size_t id_len = key_len[z->id];
+                kputsn(id->key, id_len, s);
+                if (id_len == 2 && id->key[0] == 'G' && id->key[1] == 'T')
+                    gt_i = i;
             }
-            if ( first ) kputs("\t.", s);
+            if ( first ) kputsn("\t.", 2, s);
+
+            // VALUES per sample
             for (j = 0; j < v->n_sample; ++j) {
-                kputc('\t', s);
+                kputc_('\t', s);
                 first = 1;
-                for (i = 0; i < (int)v->n_fmt; ++i) {
-                    bcf_fmt_t *f = &fmt[i];
+                bcf_fmt_t *f = fmt;
+                for (i = 0; i < (int)v->n_fmt; i++, f++) {
                     if ( !f->p ) continue;
-                    if (!first) kputc(':', s);
+                    if (!first) kputc_(':', s);
                     first = 0;
-                    if (gt_i == i)
+                    if (gt_i == i) {
                         bcf_format_gt(f,j,s);
+                        break;
+                    }
+                    else if (f->n == 1)
+                        bcf_fmt_array1(s, f->type, f->p + j * (size_t)f->size);
                     else
                         bcf_fmt_array(s, f->n, f->type, f->p + j * (size_t)f->size);
                 }
-                if ( first ) kputc('.', s);
+
+                // Simpler loop post GT and at least 1 iteration
+                for (i++, f++; i < (int)v->n_fmt; i++, f++) {
+                    if ( !f->p ) continue;
+                    kputc_(':', s);
+                    if (f->n == 1)
+                        bcf_fmt_array1(s, f->type, f->p + j * (size_t)f->size);
+                    else
+                        bcf_fmt_array(s, f->n, f->type, f->p + j * (size_t)f->size);
+                }
+                if ( first ) kputc_('.', s);
             }
+            if (fmt_packed)
+                free(fmt);
         }
         else
             for (j=0; j<=v->n_sample; j++)
-                kputs("\t.", s);
+                kputsn("\t.", 2, s);
     }
     kputc('\n', s);
     return 0;
@@ -3629,19 +4238,21 @@ int vcf_write(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
     if ( fp->format.compression!=no_compression ) {
         if (bgzf_flush_try(fp->fp.bgzf, fp->line.l) < 0)
             return -1;
-        if (fp->idx)
+        if (fp->idx && !fp->fp.bgzf->mt)
             hts_idx_amend_last(fp->idx, bgzf_tell(fp->fp.bgzf));
         ret = bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l);
     } else {
         ret = hwrite(fp->fp.hfile, fp->line.s, fp->line.l);
     }
 
-    if (fp->idx) {
+    if (fp->idx && fp->format.compression == bgzf) {
         int tid;
         if ((tid = hts_idx_tbi_name(fp->idx, v->rid, bcf_seqname_safe(h, v))) < 0)
             return -1;
 
-        if (hts_idx_push(fp->idx, tid, v->pos, v->pos + v->rlen, bgzf_tell(fp->fp.bgzf), 1) < 0)
+        if (bgzf_idx_push(fp->fp.bgzf, fp->idx,
+                          tid, v->pos, v->pos + v->rlen,
+                          bgzf_tell(fp->fp.bgzf), 1) < 0)
             return -1;
     }
 
@@ -3682,7 +4293,7 @@ static int idx_calc_n_lvls_ids(const bcf_hdr_t *h, int min_shift,
     }
     if ( !max_len ) max_len = (1LL<<31) - 1;  // In case contig line is broken.
     max_len += 256;
-    s = 1LL << (min_shift + starting_n_lvls * 3);
+    s = hts_bin_maxpos(min_shift, starting_n_lvls);
     for (n_lvls = starting_n_lvls; max_len > s; ++n_lvls, s <<= 3);
 
     if (nids_out) *nids_out = nids;
@@ -3830,6 +4441,11 @@ static int vcf_idx_init(htsFile *fp, bcf_hdr_t *h, int min_shift, const char *fn
 int bcf_idx_init(htsFile *fp, bcf_hdr_t *h, int min_shift, const char *fnidx) {
     int n_lvls, nids = 0;
 
+    if (fp->format.compression != bgzf) {
+        hts_log_error("Indexing is only supported on BGZF-compressed files");
+        return -3; // Matches no-compression return for bcf_index_build3()
+    }
+
     if (fp->format.format == vcf)
         return vcf_idx_init(fp, h, min_shift, fnidx);
 
diff --git a/htslib/version.sh b/htslib/version.sh
index 7cb5c179c..f35234c2d 100755
--- a/htslib/version.sh
+++ b/htslib/version.sh
@@ -24,7 +24,7 @@
 # DEALINGS IN THE SOFTWARE.
 
 # Master version, for use in tarballs or non-git source copies
-VERSION=1.18
+VERSION=1.21
 
 # If we have a git clone, then check against the current tag
 srcdir=${0%/version.sh}
diff --git a/import/pysam.h b/import/pysam.h
index da0728199..94cf203b8 100644
--- a/import/pysam.h
+++ b/import/pysam.h
@@ -69,6 +69,12 @@ extern int @pysam@_main(int argc, char *argv[]);
 #define bam_smpl_destroy @pysam@_bam_smpl_destroy
 #define read_file_list @pysam@_read_file_list
 
+/*! A non-static error() function name is used in bcftools, which collides
+    with glibc's error() function and leads to the wrong function being called
+    on some platforms. #define this name with a prefix to avoid this collision.
+ */
+#define error @pysam@_error
+
 #endif
 
 #endif
diff --git a/pyproject.toml b/pyproject.toml
index 1f89f9b2b..aa99371ce 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,7 +5,7 @@ license = { text = "MIT License" }
 authors = [
  { name = "Andreas Heger", email = "andreas.heger@gmail.com"}
 ]
-requires-python = ">=3.6"
+requires-python = ">=3.8"
 
 dynamic = [
     "classifiers",
@@ -22,7 +22,7 @@ requires = ["setuptools>=59.0", "Cython>=0.29.12,<4"]
 build-backend = "setuptools.build_meta:__legacy__"
 
 [tool.cibuildwheel]
-before-all = "{project}/devtools/install-prerequisites.sh"
+before-all = "{project}/devtools/install-prerequisites.sh {project}/devtools/emulate-tools.py /usr/local/bin"
 # Necessary until we build libhts.a out-of-tree from within build_temp
 before-build = "make -C {project}/htslib distclean"
 
@@ -32,7 +32,7 @@ test-command = "REF_PATH=: pytest {project}/tests"
 [tool.tox]
 legacy_tox_ini = """
     [tox]
-    envlist = py36, py311
+    envlist = py38, py311
 
     [testenv]
     deps = pytest
diff --git a/pysam/bcftools.py b/pysam/bcftools.py
index 7f3c56682..ed539e016 100644
--- a/pysam/bcftools.py
+++ b/pysam/bcftools.py
@@ -1,65 +1,32 @@
-try:
-    from typing import Final
-    HAVE_FINAL = True
-except ImportError:
-    HAVE_FINAL = False
+import pysam.utils
 
-from pysam.utils import PysamDispatcher
+annotate = pysam.utils.PysamDispatcher('bcftools', 'annotate')
+call = pysam.utils.PysamDispatcher('bcftools', 'call')
+cnv = pysam.utils.PysamDispatcher('bcftools', 'cnv')
+concat = pysam.utils.PysamDispatcher('bcftools', 'concat')
+consensus = pysam.utils.PysamDispatcher('bcftools', 'consensus')
+convert = pysam.utils.PysamDispatcher('bcftools', 'convert')
+csq = pysam.utils.PysamDispatcher('bcftools', 'csq')
+filter = pysam.utils.PysamDispatcher('bcftools', 'filter')
+gtcheck = pysam.utils.PysamDispatcher('bcftools', 'gtcheck')
+head = pysam.utils.PysamDispatcher('bcftools', 'head')
+index = pysam.utils.PysamDispatcher('bcftools', 'index')
+isec = pysam.utils.PysamDispatcher('bcftools', 'isec')
+merge = pysam.utils.PysamDispatcher('bcftools', 'merge')
+mpileup = pysam.utils.PysamDispatcher('bcftools', 'mpileup')
+norm = pysam.utils.PysamDispatcher('bcftools', 'norm')
+plugin = pysam.utils.PysamDispatcher('bcftools', 'plugin')
+query = pysam.utils.PysamDispatcher('bcftools', 'query')
+reheader = pysam.utils.PysamDispatcher('bcftools', 'reheader')
+roh = pysam.utils.PysamDispatcher('bcftools', 'roh')
+sort = pysam.utils.PysamDispatcher('bcftools', 'sort')
+stats = pysam.utils.PysamDispatcher('bcftools', 'stats')
+view = pysam.utils.PysamDispatcher('bcftools', 'view')
 
-_BCFTOOLS_DISPATCH = [
-    "index",
-    "annotate",
-    "concat",
-    "convert",
-    "isec",
-    "merge",
-    "norm",
-    "plugin",
-    "query",
-    "reheader",
-    "sort",
-    "view",
-    "head",
-    "call",
-    "consensus",
-    "cnv",
-    "csq",
-    "filter",
-    "gtcheck",
-    "mpileup",
-    "roh",
-    "stats"]
-
-
-def _wrap_command(dispatch: str) -> PysamDispatcher:
-    return PysamDispatcher("bcftools", dispatch, ())
-
-
-if not HAVE_FINAL:
-    # instantiate bcftools commands as python functions
-    for cmd in _BCFTOOLS_DISPATCH:
-        globals()[cmd] = PysamDispatcher("bcftools", cmd, None)
-else:
-    # python >=3.8
-    index: Final[PysamDispatcher] = _wrap_command("index")
-    annotate: Final[PysamDispatcher] = _wrap_command("annotate")
-    concat: Final[PysamDispatcher] = _wrap_command("concat")
-    convert: Final[PysamDispatcher] = _wrap_command("convert")
-    isec: Final[PysamDispatcher] = _wrap_command("isec")
-    merge: Final[PysamDispatcher] = _wrap_command("merge")
-    norm: Final[PysamDispatcher] = _wrap_command("norm")
-    plugin: Final[PysamDispatcher] = _wrap_command("plugin")
-    query: Final[PysamDispatcher] = _wrap_command("query")
-    reheader: Final[PysamDispatcher] = _wrap_command("reheader")
-    sort: Final[PysamDispatcher] = _wrap_command("sort")
-    view: Final[PysamDispatcher] = _wrap_command("view")
-    head: Final[PysamDispatcher] = _wrap_command("head")
-    call: Final[PysamDispatcher] = _wrap_command("call")
-    consensus: Final[PysamDispatcher] = _wrap_command("consensus")
-    cnv: Final[PysamDispatcher] = _wrap_command("cnv")
-    csq: Final[PysamDispatcher] = _wrap_command("csq")
-    filter: Final[PysamDispatcher] = _wrap_command("filter")
-    gtcheck: Final[PysamDispatcher] = _wrap_command("gtcheck")
-    mpileup: Final[PysamDispatcher] = _wrap_command("mpileup")
-    roh: Final[PysamDispatcher] = _wrap_command("roh")
-    stats: Final[PysamDispatcher] = _wrap_command("stats")
+__all__ = [
+    'annotate', 'call', 'cnv', 'concat', 'consensus',
+    'convert', 'csq', 'filter', 'gtcheck', 'head',
+    'index', 'isec', 'merge', 'mpileup', 'norm',
+    'plugin', 'query', 'reheader', 'roh', 'sort',
+    'stats', 'view',
+]
diff --git a/pysam/cbcftools_util.h b/pysam/cbcftools_util.h
deleted file mode 100644
index 4a9f2e9ca..000000000
--- a/pysam/cbcftools_util.h
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef CBCFTOOLS_UTIL_H
-#define CBCFTOOLS_UTIL_H
-
-int bcftools_main(int argc, char *argv[]);
-
-#endif
diff --git a/pysam/conftest_cstd.c b/pysam/conftest_cstd.c
new file mode 100644
index 000000000..58248470c
--- /dev/null
+++ b/pysam/conftest_cstd.c
@@ -0,0 +1,11 @@
+#include <string.h>
+#include <unistd.h>
+
+// C++-style comment; for-decl; optind
+
+int main(int argc, char **argv) {
+    int sum = 0;
+    for (int i = optind; i < argc; i++)
+        sum += strlen(argv[i]);
+    return sum;
+}
diff --git a/pysam/csamtools_util.h b/pysam/csamtools_util.h
deleted file mode 100644
index 0a03c1389..000000000
--- a/pysam/csamtools_util.h
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef CSAMTOOLS_UTIL_H
-#define CSAMTOOLS_UTIL_H
-
-int samtools_main(int argc, char *argv[]);
-
-#endif
diff --git a/pysam/htslib_util.c b/pysam/htslib_util.c
index 08309006e..bc8ab894b 100644
--- a/pysam/htslib_util.c
+++ b/pysam/htslib_util.c
@@ -1,13 +1,8 @@
-#include <ctype.h>
 #include <assert.h>
 #include "htslib/khash.h"
-#include "htslib/ksort.h"
 #include "htslib/sam.h"
 #include "htslib/hts.h"
-#include "htslib/knetfile.h"
-#include "htslib/kseq.h"
 #include "htslib_util.h"
-#include <stdio.h>
 
 #ifndef inline
 #define inline __inline
diff --git a/pysam/libcalignedsegment.pxd b/pysam/libcalignedsegment.pxd
index e14cbb1f6..1c36319f1 100644
--- a/pysam/libcalignedsegment.pxd
+++ b/pysam/libcalignedsegment.pxd
@@ -31,6 +31,18 @@ from pysam.libcalignmentfile cimport AlignmentFile, AlignmentHeader
 ctypedef AlignmentFile AlignmentFile_t
 
 
+cdef class _AlignedSegment_Cache:  # For internal use only
+    cdef clear_query_sequences(self)
+    cdef clear_query_qualities(self)
+
+    cdef object query_sequence
+    cdef object query_alignment_sequence
+    cdef object query_qualities
+    cdef object query_qualities_str
+    cdef object query_alignment_qualities
+    cdef object query_alignment_qualities_str
+
+
 # Note: need to declare all C fields and methods here
 cdef class AlignedSegment:
 
@@ -41,10 +53,11 @@ cdef class AlignedSegment:
     cdef readonly AlignmentHeader header
 
     # caching of array properties for quick access
-    cdef object cache_query_qualities
-    cdef object cache_query_alignment_qualities
-    cdef object cache_query_sequence
-    cdef object cache_query_alignment_sequence
+    cdef _AlignedSegment_Cache cache
+
+    cdef object unused1
+    cdef object unused2
+    cdef object unused3
 
     # add an alignment tag with value to the AlignedSegment
     # an existing tag of the same name will be replaced.
diff --git a/pysam/libcalignedsegment.pyi b/pysam/libcalignedsegment.pyi
index bea806e72..66da76ed5 100644
--- a/pysam/libcalignedsegment.pyi
+++ b/pysam/libcalignedsegment.pyi
@@ -2,12 +2,7 @@ import enum
 import re
 import sys
 from array import array
-from typing import Any, List, Optional, Dict, Tuple, Union, overload
-
-if sys.version_info < (3, 8):
-    from typing_extensions import Literal
-else:
-    from typing import Literal
+from typing import Any, List, Literal, Optional, Dict, Tuple, Union, overload
 
 from pysam import AlignmentHeader  # type: ignore
 
@@ -43,30 +38,30 @@ KEY_NAMES: List[str]
 TagValue = Union[str, int, float, array]
 
 class CIGAR_OPS(enum.IntEnum):
-    CBACK: int
-    CDEL: int
-    CDIFF: int
-    CEQUAL: int
-    CHARD_CLIP: int
-    CINS: int
-    CMATCH: int
-    CPAD: int
-    CREF_SKIP: int
-    CSOFT_CLIP: int
+    CBACK = ...
+    CDEL = ...
+    CDIFF = ...
+    CEQUAL = ...
+    CHARD_CLIP = ...
+    CINS = ...
+    CMATCH = ...
+    CPAD = ...
+    CREF_SKIP = ...
+    CSOFT_CLIP = ...
 
 class SAM_FLAGS(enum.IntEnum):
-    FDUP: int
-    FMREVERSE: int
-    FMUNMAP: int
-    FPAIRED: int
-    FPROPER_PAIR: int
-    FQCFAIL: int
-    FREAD1: int
-    FREAD2: int
-    FREVERSE: int
-    FSECONDARY: int
-    FSUPPLEMENTARY: int
-    FUNMAP: int
+    FDUP = ...
+    FMREVERSE = ...
+    FMUNMAP = ...
+    FPAIRED = ...
+    FPROPER_PAIR = ...
+    FQCFAIL = ...
+    FREAD1 = ...
+    FREAD2 = ...
+    FREVERSE = ...
+    FSECONDARY = ...
+    FSUPPLEMENTARY = ...
+    FUNMAP = ...
 
 class AlignedSegment:
     header: AlignmentHeader
@@ -83,6 +78,7 @@ class AlignedSegment:
     template_length: int
     query_sequence: Optional[str]
     query_qualities: Optional[array]
+    query_qualities_str: Optional[str]
     bin: int
     is_paired: bool
     is_proper_pair: bool
@@ -121,6 +117,8 @@ class AlignedSegment:
     @property
     def query_alignment_qualities(self) -> Optional[array]: ...
     @property
+    def query_alignment_qualities_str(self) -> Optional[str]: ...
+    @property
     def query_alignment_start(self) -> int: ...
     @property
     def query_alignment_end(self) -> int: ...
@@ -135,9 +133,28 @@ class AlignedSegment:
     def get_reference_sequence(self) -> str: ...
     def get_forward_sequence(self) -> Optional[str]: ...
     def get_forward_qualities(self) -> Optional[array]: ...
-    def get_aligned_pairs(
-        self, matches_only: bool = ..., with_seq: bool = ...
-    ) -> List[Tuple[int, int]]: ...
+
+    @overload
+    def get_aligned_pairs(self, matches_only: Literal[True], with_seq: Literal[False] = ..., with_cigar: Literal[False] = ...) -> List[Tuple[int, int]]: ...
+    @overload
+    def get_aligned_pairs(self, matches_only: Literal[True], with_seq: Literal[False], with_cigar: Literal[True]) -> List[Tuple[int, int, CIGAR_OPS]]: ...
+    @overload
+    def get_aligned_pairs(self, matches_only: Literal[True], with_seq: Literal[True], with_cigar: Literal[False] = ...) -> List[Tuple[int, int, str]]: ...
+    @overload
+    def get_aligned_pairs(self, matches_only: Literal[True], with_seq: Literal[True], with_cigar: Literal[True]) -> List[Tuple[int, int, str, CIGAR_OPS]]: ...
+
+    @overload
+    def get_aligned_pairs(self, matches_only: bool = ..., with_seq: Literal[False] = ..., with_cigar: Literal[False] = ...) -> List[Tuple[Optional[int], Optional[int]]]: ...
+    @overload
+    def get_aligned_pairs(self, matches_only: bool, with_seq: Literal[False], with_cigar: Literal[True]) -> List[Tuple[Optional[int], Optional[int], CIGAR_OPS]]: ...
+    @overload
+    def get_aligned_pairs(self, matches_only: bool, with_seq: Literal[True], with_cigar: Literal[False] = ...) -> List[Tuple[Optional[int], Optional[int], Optional[str]]]: ...
+    @overload
+    def get_aligned_pairs(self, matches_only: bool, with_seq: Literal[True], with_cigar: Literal[True]) -> List[Tuple[Optional[int], Optional[int], Optional[str], CIGAR_OPS]]: ...
+
+    @overload
+    def get_aligned_pairs(self, matches_only: bool = ..., with_seq: bool = ..., with_cigar: bool = ...) -> List[Tuple]: ...
+
     def get_blocks(self) -> List[Tuple[int, int]]: ...
     def get_overlap(self, start: int, end: int) -> Optional[int]: ...
     def get_cigar_stats(self) -> Tuple[array, array]: ...
diff --git a/pysam/libcalignedsegment.pyx b/pysam/libcalignedsegment.pyx
index 3071f3753..ceffc3e30 100644
--- a/pysam/libcalignedsegment.pyx
+++ b/pysam/libcalignedsegment.pyx
@@ -65,7 +65,6 @@ cimport cython
 from cpython cimport array as c_array
 from cpython cimport PyBytes_FromStringAndSize
 from libc.string cimport memset, strchr
-from cpython cimport array as c_array
 from libc.stdint cimport INT8_MIN, INT16_MIN, INT32_MIN, \
     INT8_MAX, INT16_MAX, INT32_MAX, \
     UINT8_MAX, UINT16_MAX, UINT32_MAX
@@ -73,8 +72,6 @@ from libc.stdint cimport INT8_MIN, INT16_MIN, INT32_MIN, \
 from pysam.libchtslib cimport HTS_IDX_NOCOOR
 from pysam.libcutils cimport force_bytes, force_str, \
     charptr_to_str, charptr_to_bytes
-from pysam.libcutils cimport qualities_to_qualitystring, qualitystring_to_array, \
-    array_to_qualitystring
 
 # Constants for binary tag conversion
 cdef char * htslib_types = 'cCsSiIf'
@@ -382,7 +379,7 @@ cdef inline pack_tags(tags):
                     raise ValueError("unsupported type code '{}'".format(value.typecode))
 
             if typecode not in DATATYPE2FORMAT:
-                raise ValueError("invalid value type '{}' ({})".format(chr(typecode), array.typecode))
+                raise ValueError("invalid value type '{}'".format(chr(typecode)))
 
             # use array.tostring() to retrieve byte representation and
             # save as bytes
@@ -407,8 +404,10 @@ cdef inline pack_tags(tags):
 
             if typecode == b'Z' or typecode == b'H':
                 datafmt = "2sB%is" % (len(value)+1)
-            else:
+            elif typecode in DATATYPE2FORMAT:
                 datafmt = "2sB%s" % DATATYPE2FORMAT[typecode][0]
+            else:
+                raise ValueError("invalid value type '{}'".format(chr(typecode)))
 
             args.extend([pytag[:2],
                          typecode,
@@ -554,28 +553,6 @@ cdef inline bytes getSequenceInRange(bam1_t *src,
     return charptr_to_bytes(seq)
 
 
-cdef inline object getQualitiesInRange(bam1_t *src,
-                                       uint32_t start,
-                                       uint32_t end):
-    """return python array of quality values from a bam1_t object"""
-
-    cdef uint8_t * p
-    cdef uint32_t k
-
-    p = pysam_bam_get_qual(src)
-    if p[0] == 0xff:
-        return None
-
-    # 'B': unsigned char
-    cdef c_array.array result = array.array('B', [0])
-    c_array.resize(result, end - start)
-
-    # copy data
-    memcpy(result.data.as_voidptr, <void*>&p[start], end - start)
-
-    return result
-
-
 #####################################################################
 ## factory methods for instantiating extension classes
 cdef class AlignedSegment
@@ -586,6 +563,7 @@ cdef AlignedSegment makeAlignedSegment(bam1_t *src,
     cdef AlignedSegment dest = AlignedSegment.__new__(AlignedSegment)
     dest._delegate = bam_dup1(src)
     dest.header = header
+    dest.cache = _AlignedSegment_Cache()
     return dest
 
 
@@ -757,7 +735,8 @@ cdef inline bytes build_alignment_sequence(bam1_t * src):
         elif op == BAM_CHARD_CLIP:
             pass # advances neither
 
-    cdef char *md_tag, md_buffer[2];
+    cdef char md_buffer[2]
+    cdef char *md_tag
     cdef uint8_t md_typecode = md_tag_ptr[0]
     if md_typecode == b'Z':
         md_tag = bam_aux2Z(md_tag_ptr)
@@ -891,6 +870,48 @@ cdef inline bytes build_reference_sequence(bam1_t * src):
     return seq
 
 
+cdef inline str safe_reference_name(AlignmentHeader header, int tid):
+    if tid == -1: return "*"
+    elif header is not None: return header.get_reference_name(tid)
+    else: return f"#{tid}"
+
+
+# Tuple-building helper functions used by AlignedSegment.get_aligned_pairs()
+
+cdef _alignedpairs_positions(qpos, pos, ref_seq, uint32_t r_idx, int op):
+    return (qpos, pos)
+
+
+cdef _alignedpairs_with_seq(qpos, pos, ref_seq, uint32_t r_idx, int op):
+    ref_base = ref_seq[r_idx] if ref_seq is not None else None
+    return (qpos, pos, ref_base)
+
+
+cdef _alignedpairs_with_cigar(qpos, pos, ref_seq, uint32_t r_idx, int op):
+    return (qpos, pos, CIGAR_OPS(op))
+
+
+cdef _alignedpairs_with_seq_cigar(qpos, pos, ref_seq, uint32_t r_idx, int op):
+    ref_base = ref_seq[r_idx] if ref_seq is not None else None
+    return (qpos, pos, ref_base, CIGAR_OPS(op))
+
+
+cdef class _AlignedSegment_Cache:
+    def __cinit__(self):
+        self.clear_query_sequences()
+        self.clear_query_qualities()
+
+    cdef clear_query_sequences(self):
+        self.query_sequence = NotImplemented
+        self.query_alignment_sequence = NotImplemented
+
+    cdef clear_query_qualities(self):
+        self.query_qualities = NotImplemented
+        self.query_qualities_str = NotImplemented
+        self.query_alignment_qualities = NotImplemented
+        self.query_alignment_qualities_str = NotImplemented
+
+
 cdef class AlignedSegment:
     '''Class representing an aligned segment.
 
@@ -942,10 +963,7 @@ cdef class AlignedSegment:
         self._delegate.core.mpos = -1
 
         # caching for selected fields
-        self.cache_query_qualities = None
-        self.cache_query_alignment_qualities = None
-        self.cache_query_sequence = None
-        self.cache_query_alignment_sequence = None
+        self.cache = _AlignedSegment_Cache()
 
         self.header = header
 
@@ -957,7 +975,8 @@ cdef class AlignedSegment:
 
         The representation is an approximate :term:`SAM` format, because
         an aligned read might not be associated with a :term:`AlignmentFile`.
-        As a result :term:`tid` is shown instead of the reference name.
+        Hence when the read does not have an associated :class:`AlignedHeader`,
+        :term:`tid` is shown instead of the reference name.
         Similarly, the tags field is returned in its parsed state.
 
         To get a valid SAM record, use :meth:`to_string`.
@@ -966,17 +985,21 @@ cdef class AlignedSegment:
         # requires a valid header.
         return "\t".join(map(str, (self.query_name,
                                    self.flag,
-                                   "#%d" % self.reference_id if self.reference_id >= 0 else "*",
+                                   safe_reference_name(self.header, self.reference_id),
                                    self.reference_start + 1,
                                    self.mapping_quality,
                                    self.cigarstring,
-                                   "#%d" % self.next_reference_id if self.next_reference_id >= 0 else "*",
+                                   safe_reference_name(self.header, self.next_reference_id),
                                    self.next_reference_start + 1,
                                    self.template_length,
                                    self.query_sequence,
                                    self.query_qualities,
                                    self.tags)))
 
+    def __repr__(self):
+        ref = self.reference_name if self.header is not None else self.reference_id
+        return f'<{type(self).__name__}({self.query_name!r}, flags={self.flag}={self.flag:#x}, ref={ref!r}, zpos={self.reference_start}, mapq={self.mapping_quality}, cigar={self.cigarstring!r}, ...)>'
+
     def __copy__(self):
         return makeAlignedSegment(self._delegate, self.header)
 
@@ -1088,6 +1111,7 @@ cdef class AlignedSegment:
         cdef AlignedSegment dest = cls.__new__(cls)
         dest._delegate = <bam1_t*>calloc(1, sizeof(bam1_t))
         dest.header = header
+        dest.cache = _AlignedSegment_Cache()
 
         cdef kstring_t line
         line.l = line.m = len(sam)
@@ -1279,15 +1303,30 @@ cdef class AlignedSegment:
         empty string.
         '''
         def __get__(self):
-            c = self.cigartuples
-            if c is None:
+            cdef bam1_t *src = self._delegate
+            if pysam_get_n_cigar(src) == 0:
                 return None
-            # reverse order
-            else:
-                return "".join([ "%i%c" % (y,CODE2CIGAR[x]) for x,y in c])
+
+            cdef kstring_t buf
+            buf.l = buf.m = 0
+            buf.s = NULL
+
+            cdef uint32_t *cigar_p = pysam_bam_get_cigar(src)
+            cdef uint32_t op, l
+            cdef int k
+            for k from 0 <= k < pysam_get_n_cigar(src):
+                op = cigar_p[k] & BAM_CIGAR_MASK
+                l = cigar_p[k] >> BAM_CIGAR_SHIFT
+                kputl(l, &buf)
+                kputc(CODE2CIGAR[op], &buf)
+
+            try:
+                return buf.s[:buf.l].decode("ascii")
+            finally:
+                free(buf.s)
 
         def __set__(self, cigar):
-            if cigar is None or len(cigar) == 0:
+            if cigar is None or len(cigar) == 0 or cigar == "*":
                 self.cigartuples = []
             else:
                 parts = CIGAR_REGEX.findall(cigar)
@@ -1386,19 +1425,20 @@ cdef class AlignedSegment:
         has aligned the read to the reverse strand.)
         """
         def __get__(self):
-            if self.cache_query_sequence:
-                return self.cache_query_sequence
+            if self.cache.query_sequence is not NotImplemented:
+                return self.cache.query_sequence
 
             cdef bam1_t * src
             cdef char * s
             src = self._delegate
 
             if src.core.l_qseq == 0:
+                self.cache.query_sequence = None
                 return None
 
-            self.cache_query_sequence = force_str(getSequenceInRange(
+            self.cache.query_sequence = force_str(getSequenceInRange(
                 src, 0, src.core.l_qseq))
-            return self.cache_query_sequence
+            return self.cache.query_sequence
 
         def __set__(self, seq):
             # samtools manages sequence and quality length memory together
@@ -1409,7 +1449,7 @@ cdef class AlignedSegment:
             cdef int l, k
             cdef Py_ssize_t nbytes_new, nbytes_old
 
-            if seq == None:
+            if seq is None or len(seq) == 0 or seq == "*":
                 l = 0
             else:
                 l = len(seq)
@@ -1450,11 +1490,8 @@ cdef class AlignedSegment:
                 p = pysam_bam_get_qual(src)
                 memset(p, 0xff, l)
 
-            self.cache_query_sequence = force_str(seq)
-
-            # clear cached values for quality values
-            self.cache_query_qualities = None
-            self.cache_query_alignment_qualities = None
+            self.cache.clear_query_sequences()
+            self.cache.clear_query_qualities()
 
     property query_qualities:
         """read sequence base qualities, including :term:`soft clipped` bases 
@@ -1467,62 +1504,113 @@ cdef class AlignedSegment:
 
         Note that to set quality scores the sequence has to be set
         beforehand as this will determine the expected length of the
-        quality score array.
-
-        This method raises a ValueError if the length of the
-        quality scores and the sequence are not the same.
+        quality score array. Setting will raise a ValueError if the
+        length of the new quality scores is not the same as the
+        length of the existing sequence.
 
+        Quality scores to be set may be specified as a Python array
+        or other iterable of ints, or as a string of ASCII-encooded
+        FASTQ/SAM-style base quality characters.
         """
         def __get__(self):
+            if self.cache.query_qualities is not NotImplemented:
+                return self.cache.query_qualities
 
-            if self.cache_query_qualities:
-                return self.cache_query_qualities
+            cdef bam1_t *src = self._delegate
+            cdef int qual_len = src.core.l_qseq
+            cdef uint8_t *qual = pysam_bam_get_qual(src)
 
-            cdef bam1_t * src
-            cdef char * q
+            if qual_len == 0 or qual[0] == 0xff:
+                self.cache.query_qualities = None
+                return None
 
-            src = self._delegate
+            cdef c_array.array qual_array = array.array('B')
+            c_array.resize(qual_array, qual_len)
+            memcpy(qual_array.data.as_uchars, qual, qual_len)
+            self.cache.query_qualities = qual_array
+            return qual_array
 
-            if src.core.l_qseq == 0:
-                return None
+        def __set__(self, new_qual):
+            if isinstance(new_qual, str):
+                self.query_qualities_str = new_qual
+                return
 
-            self.cache_query_qualities = getQualitiesInRange(src, 0, src.core.l_qseq)
-            return self.cache_query_qualities
+            cdef bam1_t *src = self._delegate
+            cdef int qual_len = src.core.l_qseq
+            cdef uint8_t *qual = pysam_bam_get_qual(src)
 
-        def __set__(self, qual):
+            cdef int new_qual_len = len(new_qual) if new_qual is not None else 0
+            if new_qual_len == 0:
+                if qual_len != 0: memset(qual, 0xff, qual_len)
+                self.cache.clear_query_qualities()
+                return
 
-            # note that memory is already allocated via setting the sequence
-            # hence length match of sequence and quality needs is checked.
-            cdef bam1_t * src
-            cdef uint8_t * p
-            cdef int l
+            if new_qual_len != qual_len:
+                raise ValueError(f"quality ({new_qual_len}) and sequence ({qual_len}) length mismatch")
 
-            src = self._delegate
-            p = pysam_bam_get_qual(src)
-            if qual is None or len(qual) == 0:
-                # if absent and there is a sequence: set to 0xff
-                memset(p, 0xff, src.core.l_qseq)
+            if isinstance(new_qual, array.array) and new_qual.typecode == 'B':
+                memcpy(qual, (<c_array.array> new_qual).data.as_uchars, qual_len)
+                self.cache.clear_query_qualities()
                 return
 
-            # check for length match
-            l = len(qual)
-            if src.core.l_qseq != l:
-                raise ValueError(
-                    "quality and sequence mismatch: %i != %i" %
-                    (l, src.core.l_qseq))
+            cdef uint8_t *s = qual
+            cdef uint8_t q
+            for q in new_qual:
+                s[0] = q
+                s += 1
+
+            self.cache.clear_query_qualities()
+
+    property query_qualities_str:
+        """read sequence base qualities, including :term:`soft clipped` bases,
+        returned as an ASCII-encoded string similar to that in FASTQ or SAM files,
+        or None if base qualities are not present.
+
+        Note that to set quality scores the sequence has to be set beforehand
+        as this will determine the expected length of the quality score string.
+        Setting will raise a ValueError if the length of the new quality scores
+        is not the same as the length of the existing sequence.
+        """
+        def __get__(self):
+            if self.cache.query_qualities_str is not NotImplemented:
+                return self.cache.query_qualities_str
+
+            cdef bam1_t *src = self._delegate
+            cdef int qual_len = src.core.l_qseq
+            cdef uint8_t *qual = pysam_bam_get_qual(src)
+
+            if qual_len == 0 or qual[0] == 0xff:
+                self.cache.query_qualities_str = None
+                return None
+
+            cdef bytes qual_bytes = qual[:qual_len]
+            cdef char *s = qual_bytes
+            cdef int i
+            for i in range(qual_len): s[i] += 33
 
-            # create a python array object filling it
-            # with the quality scores
+            self.cache.query_qualities_str = qual_bytes.decode('ascii')
+            return self.cache.query_qualities_str
 
-            # NB: should avoid this copying if qual is
-            # already of the correct type.
-            cdef c_array.array result = c_array.array('B', qual)
+        def __set__(self, new_qual):
+            cdef bam1_t *src = self._delegate
+            cdef int qual_len = src.core.l_qseq
+            cdef uint8_t *qual = pysam_bam_get_qual(src)
 
-            # copy data
-            memcpy(p, result.data.as_voidptr, l)
+            cdef int new_qual_len = len(new_qual) if new_qual is not None else 0
+            if new_qual_len == 0 or new_qual == "*":
+                if qual_len != 0: memset(qual, 0xff, qual_len)
+                self.cache.clear_query_qualities()
+                return
+
+            if new_qual_len != qual_len:
+                raise ValueError(f"quality ({new_qual_len}) and sequence ({qual_len}) length mismatch")
 
-            # save in cache
-            self.cache_query_qualities = qual
+            cdef bytes new_qual_bytes = new_qual.encode('ascii')
+            cdef const char *s = new_qual_bytes
+            cdef int i
+            for i in range(qual_len): qual[i] = s[i] - 33
+
+            self.cache.clear_query_qualities()
 
     property bin:
         """properties bin"""
@@ -1701,8 +1789,8 @@ cdef class AlignedSegment:
         """
 
         def __get__(self):
-            if self.cache_query_alignment_sequence:
-                return self.cache_query_alignment_sequence
+            if self.cache.query_alignment_sequence is not NotImplemented:
+                return self.cache.query_alignment_sequence
 
             cdef bam1_t * src
             cdef uint32_t start, end
@@ -1710,14 +1798,14 @@ cdef class AlignedSegment:
             src = self._delegate
 
             if src.core.l_qseq == 0:
+                self.cache.query_alignment_sequence = None
                 return None
 
             start = getQueryStart(src)
             end   = getQueryEnd(src)
 
-            self.cache_query_alignment_sequence = force_str(
-                getSequenceInRange(src, start, end))
-            return self.cache_query_alignment_sequence
+            self.cache.query_alignment_sequence = force_str(getSequenceInRange(src, start, end))
+            return self.cache.query_alignment_sequence
 
     property query_alignment_qualities:
         """aligned query sequence quality values (None if not present). These
@@ -1732,26 +1820,44 @@ cdef class AlignedSegment:
         needs to be subtracted.
 
         This property is read-only.
-
         """
         def __get__(self):
+            if self.cache.query_alignment_qualities is not NotImplemented:
+                return self.cache.query_alignment_qualities
 
-            if self.cache_query_alignment_qualities:
-                return self.cache_query_alignment_qualities
+            cdef object full_qual = self.query_qualities
+            if full_qual is None:
+                self.cache.query_alignment_qualities = None
+                return None
 
-            cdef bam1_t * src
-            cdef uint32_t start, end
+            cdef bam1_t *src = self._delegate
+            cdef uint32_t start = getQueryStart(src)
+            cdef uint32_t end = getQueryEnd(src)
+            self.cache.query_alignment_qualities = full_qual[start:end]
+            return self.cache.query_alignment_qualities
 
-            src = self._delegate
+    property query_alignment_qualities_str:
+        """aligned query sequence quality values, returned as an ASCII-encoded string
+        similar to that in FASTQ or SAM files, or None if base qualities are not present.
+        These are the quality values that correspond to :attr:`query_alignment_sequence`,
+        i.e., excluding qualities corresponding to soft-clipped bases.
 
-            if src.core.l_qseq == 0:
+        This property is read-only.
+        """
+        def __get__(self):
+            if self.cache.query_alignment_qualities_str is not NotImplemented:
+                return self.cache.query_alignment_qualities_str
+
+            cdef object full_qual = self.query_qualities_str
+            if full_qual is None:
+                self.cache.query_alignment_qualities_str = None
                 return None
 
-            start = getQueryStart(src)
-            end   = getQueryEnd(src)
-            self.cache_query_alignment_qualities = \
-                getQualitiesInRange(src, start, end)
-            return self.cache_query_alignment_qualities
+            cdef bam1_t *src = self._delegate
+            cdef uint32_t start = getQueryStart(src)
+            cdef uint32_t end = getQueryEnd(src)
+            self.cache.query_alignment_qualities_str = full_qual[start:end]
+            return self.cache.query_alignment_qualities_str
 
     property query_alignment_start:
         """start index of the aligned query portion of the sequence (0-based,
@@ -1972,8 +2078,7 @@ cdef class AlignedSegment:
         else:
             return self.query_qualities
 
-
-    def get_aligned_pairs(self, matches_only=False, with_seq=False):
+    def get_aligned_pairs(self, matches_only=False, with_seq=False, with_cigar=False):
         """a list of aligned read (query) and reference positions.
 
         Each item in the returned list is a tuple consisting of
@@ -1997,6 +2102,9 @@ cdef class AlignedSegment:
           reference sequence. For CIGAR 'P' (padding in the reference)
           operations, the third tuple element will be None. Substitutions
           are lower-case. This option requires an MD tag to be present.
+        with_cigar : bool
+          If True, return an extra element in the tuple containing the
+          CIGAR operator corresponding to this position tuple.
 
         Returns
         -------
@@ -2010,6 +2118,8 @@ cdef class AlignedSegment:
         cdef bam1_t * src = self._delegate
         cdef bint _matches_only = bool(matches_only)
         cdef bint _with_seq = bool(with_seq)
+        cdef bint _with_cigar = bool(with_cigar)
+        cdef object (*make_tuple)(object, object, object, uint32_t, int)
 
         # TODO: this method performs no checking and assumes that
         # read sequence, cigar and MD tag are consistent.
@@ -2019,6 +2129,10 @@ cdef class AlignedSegment:
             ref_seq = force_str(build_reference_sequence(src))
             if ref_seq is None:
                 raise ValueError("MD tag not present")
+            make_tuple = _alignedpairs_with_seq_cigar if _with_cigar else _alignedpairs_with_seq
+        else:
+            ref_seq = None
+            make_tuple = _alignedpairs_with_cigar if _with_cigar else _alignedpairs_positions
 
         r_idx = 0
 
@@ -2034,39 +2148,25 @@ cdef class AlignedSegment:
             l = cigar_p[k] >> BAM_CIGAR_SHIFT
 
             if op == BAM_CMATCH or op == BAM_CEQUAL or op == BAM_CDIFF:
-                if _with_seq:
-                    for i from pos <= i < pos + l:
-                        result.append((qpos, i, ref_seq[r_idx]))
-                        r_idx += 1
-                        qpos += 1
-                else:
-                    for i from pos <= i < pos + l:
-                        result.append((qpos, i))
-                        qpos += 1
+                for i from pos <= i < pos + l:
+                    result.append(make_tuple(qpos, i, ref_seq, r_idx, op))
+                    r_idx += 1
+                    qpos += 1
                 pos += l
 
             elif op == BAM_CINS or op == BAM_CSOFT_CLIP or op == BAM_CPAD:
                 if not _matches_only:
-                    if _with_seq:
-                        for i from pos <= i < pos + l:
-                            result.append((qpos, None, None))
-                            qpos += 1
-                    else:
-                        for i from pos <= i < pos + l:
-                            result.append((qpos, None))
-                            qpos += 1
+                    for i from pos <= i < pos + l:
+                        result.append(make_tuple(qpos, None, None, 0, op))
+                        qpos += 1
                 else:
                     qpos += l
 
             elif op == BAM_CDEL:
                 if not _matches_only:
-                    if _with_seq:
-                        for i from pos <= i < pos + l:
-                            result.append((None, i, ref_seq[r_idx]))
-                            r_idx += 1
-                    else:
-                        for i from pos <= i < pos + l:
-                            result.append((None, i))
+                    for i from pos <= i < pos + l:
+                        result.append(make_tuple(None, i, ref_seq, r_idx, op))
+                        r_idx += 1
                 else:
                     r_idx += l
                 pos += l
@@ -2076,12 +2176,8 @@ cdef class AlignedSegment:
 
             elif op == BAM_CREF_SKIP:
                 if not _matches_only:
-                    if _with_seq:
-                        for i from pos <= i < pos + l:
-                            result.append((None, i, None))
-                    else:
-                        for i from pos <= i < pos + l:
-                            result.append((None, i))
+                    for i from pos <= i < pos + l:
+                        result.append(make_tuple(None, i, None, 0, op))
 
                 pos += l
 
@@ -2163,31 +2259,32 @@ cdef class AlignedSegment:
 
         The output order in the array is "MIDNSHP=X" followed by a
         field for the NM tag. If the NM tag is not present, this
-        field will always be 0.
-
-        +-----+--------------+-----+
-        |M    |BAM_CMATCH    |0    |
-        +-----+--------------+-----+
-        |I    |BAM_CINS      |1    |
-        +-----+--------------+-----+
-        |D    |BAM_CDEL      |2    |
-        +-----+--------------+-----+
-        |N    |BAM_CREF_SKIP |3    |
-        +-----+--------------+-----+
-        |S    |BAM_CSOFT_CLIP|4    |
-        +-----+--------------+-----+
-        |H    |BAM_CHARD_CLIP|5    |
-        +-----+--------------+-----+
-        |P    |BAM_CPAD      |6    |
-        +-----+--------------+-----+
-        |=    |BAM_CEQUAL    |7    |
-        +-----+--------------+-----+
-        |X    |BAM_CDIFF     |8    |
-        +-----+--------------+-----+
-        |B    |BAM_CBACK     |9    |
-        +-----+--------------+-----+
-        |NM   |NM tag        |10   |
-        +-----+--------------+-----+
+        field will always be 0. (Accessing this field via index -1
+        avoids changes if more CIGAR operators are added in future.)
+
+        +-----+--------------------------+--------+
+        |M    |pysam.CIGAR_OPS.CMATCH    |0       |
+        +-----+--------------------------+--------+
+        |I    |pysam.CIGAR_OPS.CINS      |1       |
+        +-----+--------------------------+--------+
+        |D    |pysam.CIGAR_OPS.CDEL      |2       |
+        +-----+--------------------------+--------+
+        |N    |pysam.CIGAR_OPS.CREF_SKIP |3       |
+        +-----+--------------------------+--------+
+        |S    |pysam.CIGAR_OPS.CSOFT_CLIP|4       |
+        +-----+--------------------------+--------+
+        |H    |pysam.CIGAR_OPS.CHARD_CLIP|5       |
+        +-----+--------------------------+--------+
+        |P    |pysam.CIGAR_OPS.CPAD      |6       |
+        +-----+--------------------------+--------+
+        |=    |pysam.CIGAR_OPS.CEQUAL    |7       |
+        +-----+--------------------------+--------+
+        |X    |pysam.CIGAR_OPS.CDIFF     |8       |
+        +-----+--------------------------+--------+
+        |B    |pysam.CIGAR_OPS.CBACK     |9       |
+        +-----+--------------------------+--------+
+        |NM   |NM tag                    |10 or -1|
+        +-----+--------------------------+--------+
 
         If no cigar string is present, empty arrays will be returned.
 
@@ -2242,27 +2339,27 @@ cdef class AlignedSegment:
 
         The operations are:
 
-        +-----+--------------+-----+
-        |M    |BAM_CMATCH    |0    |
-        +-----+--------------+-----+
-        |I    |BAM_CINS      |1    |
-        +-----+--------------+-----+
-        |D    |BAM_CDEL      |2    |
-        +-----+--------------+-----+
-        |N    |BAM_CREF_SKIP |3    |
-        +-----+--------------+-----+
-        |S    |BAM_CSOFT_CLIP|4    |
-        +-----+--------------+-----+
-        |H    |BAM_CHARD_CLIP|5    |
-        +-----+--------------+-----+
-        |P    |BAM_CPAD      |6    |
-        +-----+--------------+-----+
-        |=    |BAM_CEQUAL    |7    |
-        +-----+--------------+-----+
-        |X    |BAM_CDIFF     |8    |
-        +-----+--------------+-----+
-        |B    |BAM_CBACK     |9    |
-        +-----+--------------+-----+
+        +-----+--------------------------+-----+
+        |M    |pysam.CIGAR_OPS.CMATCH    |0    |
+        +-----+--------------------------+-----+
+        |I    |pysam.CIGAR_OPS.CINS      |1    |
+        +-----+--------------------------+-----+
+        |D    |pysam.CIGAR_OPS.CDEL      |2    |
+        +-----+--------------------------+-----+
+        |N    |pysam.CIGAR_OPS.CREF_SKIP |3    |
+        +-----+--------------------------+-----+
+        |S    |pysam.CIGAR_OPS.CSOFT_CLIP|4    |
+        +-----+--------------------------+-----+
+        |H    |pysam.CIGAR_OPS.CHARD_CLIP|5    |
+        +-----+--------------------------+-----+
+        |P    |pysam.CIGAR_OPS.CPAD      |6    |
+        +-----+--------------------------+-----+
+        |=    |pysam.CIGAR_OPS.CEQUAL    |7    |
+        +-----+--------------------------+-----+
+        |X    |pysam.CIGAR_OPS.CDIFF     |8    |
+        +-----+--------------------------+-----+
+        |B    |pysam.CIGAR_OPS.CBACK     |9    |
+        +-----+--------------------------+-----+
 
         .. note::
             The output is a list of (operation, length) tuples, such as
@@ -2769,11 +2866,11 @@ cdef class AlignedSegment:
         def __set__(self, v):
             self.query_sequence = v
     property qual:
-        """deprecated, use :attr:`query_qualities` instead."""
+        """deprecated, use :attr:`query_qualities` or :attr:`query_qualities_str` instead."""
         def __get__(self):
-            return array_to_qualitystring(self.query_qualities)
+            return self.query_qualities_str
         def __set__(self, v):
-            self.query_qualities = qualitystring_to_array(v)
+            self.query_qualities_str = v
     property alen:
         """deprecated, use :attr:`reference_length` instead."""
         def __get__(self):
@@ -2797,34 +2894,24 @@ cdef class AlignedSegment:
         instead."""
         def __get__(self):
             return self.query_alignment_sequence
-        def __set__(self, v):
-            self.query_alignment_sequence = v
     property qqual:
-        """deprecated, use :attr:`query_alignment_qualities` 
+        """deprecated, use :attr:`query_alignment_qualities` or :attr:`query_alignment_qualities_str`
         instead."""
         def __get__(self):
-            return array_to_qualitystring(self.query_alignment_qualities)
-        def __set__(self, v):
-            self.query_alignment_qualities = qualitystring_to_array(v)
+            return self.query_alignment_qualities_str
     property qstart:
         """deprecated, use :attr:`query_alignment_start` instead."""
         def __get__(self):
             return self.query_alignment_start
-        def __set__(self, v):
-            self.query_alignment_start = v
     property qend:
         """deprecated, use :attr:`query_alignment_end` instead."""
         def __get__(self):
             return self.query_alignment_end
-        def __set__(self, v):
-            self.query_alignment_end = v
     property qlen:
         """deprecated, use :attr:`query_alignment_length` 
         instead."""
         def __get__(self):
             return self.query_alignment_length
-        def __set__(self, v):
-            self.query_alignment_length = v
     property mrnm:
         """deprecated, use :attr:`next_reference_id` instead."""
         def __get__(self):
@@ -3384,10 +3471,16 @@ cpdef enum SAM_FLAGS:
     FSUPPLEMENTARY = 2048
 
 
+# TODO Remove these and remove the enumerators from __all__
+globals().update(getattr(CIGAR_OPS, "__members__"))
+globals().update(getattr(SAM_FLAGS, "__members__"))
+
+
 __all__ = [
     "AlignedSegment",
     "PileupColumn",
     "PileupRead",
+    "CIGAR_OPS",
     "CMATCH",
     "CINS",
     "CDEL",
@@ -3398,6 +3491,7 @@ __all__ = [
     "CEQUAL",
     "CDIFF",
     "CBACK",
+    "SAM_FLAGS",
     "FPAIRED",
     "FPROPER_PAIR",
     "FUNMAP",
diff --git a/pysam/libcalignmentfile.pyi b/pysam/libcalignmentfile.pyi
index 28b395aed..5723a5afb 100644
--- a/pysam/libcalignmentfile.pyi
+++ b/pysam/libcalignmentfile.pyi
@@ -11,16 +11,12 @@ from typing import (
     Union,
     Callable,
     List,
+    Literal,
     Iterable,
 )
 
-if sys.version_info < (3, 8):
-    from typing_extensions import Literal
-else:
-    from typing import Literal
-
 from pysam.libchtslib import HTSFile, _HasFileNo
-from pysam.libcalignedsegment import AlignedSegment
+from pysam.libcalignedsegment import AlignedSegment, PileupColumn
 from pysam.libcfaidx import FastaFile
 
 class IndexStats(NamedTuple):
@@ -67,7 +63,7 @@ class AlignmentHeader:
     def lengths(self) -> Tuple[int]: ...
     def to_dict(self) -> Dict: ...
     def get_reference_name(self, tid: int) -> Optional[str]: ...
-    def get_reference_length(self, reference: int) -> int: ...
+    def get_reference_length(self, reference: str) -> int: ...
     def is_valid_tid(self, tid: int) -> bool: ...
     def get_tid(self, reference: str) -> int: ...
 
@@ -204,7 +200,7 @@ class IteratorRowRegion(IteratorRow): ...
 class IteratorRowSelection(IteratorRow): ...
 
 class IteratorColumn:
-    def __iter__(self) -> IteratorRow: ...
+    def __iter__(self) -> IteratorColumn: ...
     def __next__(self) -> PileupColumn: ...
     @property
     def seq_len(self) -> int: ...
diff --git a/pysam/libcalignmentfile.pyx b/pysam/libcalignmentfile.pyx
index 9a92d076c..d65d06b4c 100644
--- a/pysam/libcalignmentfile.pyx
+++ b/pysam/libcalignmentfile.pyx
@@ -65,13 +65,13 @@ import re
 import warnings
 import array
 from libc.errno  cimport errno, EPIPE
-from libc.string cimport strcmp, strpbrk, strerror
+from libc.string cimport strcmp, strpbrk
 from libc.stdint cimport INT32_MAX
 
 from cpython cimport array as c_array
 
 from pysam.libcutils cimport force_bytes, force_str, charptr_to_str
-from pysam.libcutils cimport encode_filename, from_string_and_size
+from pysam.libcutils cimport OSError_from_errno, encode_filename, from_string_and_size
 from pysam.libcalignedsegment cimport makeAlignedSegment, makePileupColumn
 from pysam.libchtslib cimport HTSFile, hisremote, sam_index_load2, sam_index_load3, \
                               HTS_IDX_SAVE_REMOTE, HTS_IDX_SILENT_FAIL
@@ -302,7 +302,7 @@ cdef class AlignmentHeader(object):
                 data = header_dict[record]
                 if not isinstance(data, VALID_HEADER_TYPES[record]):
                     raise ValueError(
-                        "invalid type for record %s: %s, expected %s".format(
+                        "invalid type for record {}: {}, expected {}".format(
                             record, type(data), VALID_HEADER_TYPES[record]))
                 if isinstance(data, Mapping):
                     lines.append(build_header_line(data, record))
@@ -818,6 +818,7 @@ cdef class AlignmentFile(HTSFile):
         cdef char *cindexname = NULL
         cdef char *cmode = NULL
         cdef bam_hdr_t * hdr = NULL
+        cdef int ret
 
         if threads > 1 and ignore_truncation:
            # This won't raise errors if reaching a truncated alignment,
@@ -921,9 +922,7 @@ cdef class AlignmentFile(HTSFile):
 
             if self.htsfile == NULL:
                 if errno:
-                    raise IOError(errno, "could not open alignment file `{}`: {}".format(
-                        force_str(filename),
-                        force_str(strerror(errno))))
+                    raise OSError_from_errno("Could not open alignment file", filename)
                 else:
                     raise ValueError("could not open alignment file `{}`".format(force_str(filename)))
             if format_options and len(format_options):
@@ -939,7 +938,9 @@ cdef class AlignmentFile(HTSFile):
             if "b" in mode or "c" in mode or "h" in mode:
                 hdr = self.header.ptr
                 with nogil:
-                    sam_hdr_write(self.htsfile, hdr)
+                    ret = sam_hdr_write(self.htsfile, hdr)
+                if ret < 0:
+                    raise OSError_from_errno("Could not write headers", filename)
 
         elif mode[0] == "r":
             # open file for reading
@@ -947,12 +948,11 @@ cdef class AlignmentFile(HTSFile):
 
             if self.htsfile == NULL:
                 if errno:
-                    raise IOError(errno, "could not open alignment file `{}`: {}".format(force_str(filename),
-                                  force_str(strerror(errno))))
+                    raise OSError_from_errno("Could not open alignment file", filename)
                 else:
                     raise ValueError("could not open alignment file `{}`".format(force_str(filename)))
 
-            if self.htsfile.format.category != sequence_data:
+            if hts_get_format(self.htsfile).category != sequence_data:
                 raise ValueError("file does not contain alignment data")
 
             if format_options and len(format_options):
@@ -1014,7 +1014,7 @@ cdef class AlignmentFile(HTSFile):
 
                     if not self.index and (cindexname or require_index):
                         if errno:
-                            raise IOError(errno, force_str(strerror(errno)))
+                            raise OSError_from_errno("Could not open index file", self.index_filename)
                         else:
                             raise IOError('unable to open index file `%s`' % self.index_filename)
 
@@ -1679,12 +1679,11 @@ cdef class AlignmentFile(HTSFile):
 
         self.header = None
 
-        if ret < 0:
-            global errno
-            if errno == EPIPE:
-                errno = 0
+        if ret < 0 and errno != EPIPE:
+            if isinstance(self.filename, (str, bytes)):
+                raise OSError_from_errno("Closing failed", self.filename)
             else:
-                raise IOError(errno, force_str(strerror(errno)))
+                raise OSError_from_errno("Closing failed")
 
     def __dealloc__(self):
         cdef int ret = 0
@@ -1703,12 +1702,11 @@ cdef class AlignmentFile(HTSFile):
             bam_destroy1(self.b)
             self.b = NULL
 
-        if ret < 0:
-            global errno
-            if errno == EPIPE:
-                errno = 0
+        if ret < 0 and errno != EPIPE:
+            if isinstance(self.filename, (str, bytes)):
+                raise OSError_from_errno("Closing failed", self.filename)
             else:
-                raise IOError(errno, force_str(strerror(errno)))
+                raise OSError_from_errno("Closing failed")
 
     cpdef int write(self, AlignedSegment read) except -1:
         '''
@@ -1742,8 +1740,7 @@ cdef class AlignmentFile(HTSFile):
         #      when ret == -1 we get a "SystemError: error return without
         #      exception set".
         if ret < 0:
-            raise IOError(
-            "sam_write1 failed with error code {}".format(ret))
+            raise IOError("sam_write1 failed with error code {}".format(ret))
 
         return ret
 
@@ -2298,14 +2295,14 @@ cdef class IteratorRowSelection(IteratorRow):
         # end iteration if out of positions
         if self.current_pos >= len(self.positions): return -1
 
+        cdef int ret
         cdef uint64_t pos = self.positions[self.current_pos]
         with nogil:
-            bgzf_seek(hts_get_bgzfp(self.htsfile),
-                      pos,
-                      0)
+            ret = bgzf_seek(hts_get_bgzfp(self.htsfile), pos, 0)
+        if ret < 0:
+            raise OSError_from_errno("Can't seek", self.samfile.filename)
         self.current_pos += 1
 
-        cdef int ret
         cdef bam_hdr_t * hdr = self.header.ptr
         with nogil:
             ret = sam_read1(self.htsfile,
@@ -2323,38 +2320,30 @@ cdef class IteratorRowSelection(IteratorRow):
             raise IOError(read_failure_reason(ret))
 
 
-cdef int __advance_nofilter(void *data, bam1_t *b):
+cdef int __advance_nofilter(void *data, bam1_t *b) noexcept nogil:
     '''advance without any read filtering.
     '''
     cdef __iterdata * d = <__iterdata*>data
-    cdef int ret
-    with nogil:
-        ret = sam_itr_next(d.htsfile, d.iter, b)
-    return ret
+    return sam_itr_next(d.htsfile, d.iter, b)
 
 
-cdef int __advance_raw_nofilter(void *data, bam1_t *b):
+cdef int __advance_raw_nofilter(void *data, bam1_t *b) noexcept nogil:
     '''advance (without iterator) without any read filtering.
     '''
     cdef __iterdata * d = <__iterdata*>data
-    cdef int ret
-    with nogil:
-        ret = sam_read1(d.htsfile, d.header, b)
-    return ret
+    return sam_read1(d.htsfile, d.header, b)
 
 
-cdef int __advance_all(void *data, bam1_t *b):
+cdef int __advance_all(void *data, bam1_t *b) noexcept nogil:
     '''only use reads for pileup passing basic filters such as
 
     BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL, BAM_FDUP
     '''
 
     cdef __iterdata * d = <__iterdata*>data
-    cdef mask = BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP
     cdef int ret
     while 1:
-        with nogil:
-            ret = sam_itr_next(d.htsfile, d.iter, b)
+        ret = sam_itr_next(d.htsfile, d.iter, b)
         if ret < 0:
             break
         if b.core.flag & d.flag_filter:
@@ -2363,7 +2352,7 @@ cdef int __advance_all(void *data, bam1_t *b):
     return ret
 
 
-cdef int __advance_raw_all(void *data, bam1_t *b):
+cdef int __advance_raw_all(void *data, bam1_t *b) noexcept nogil:
     '''only use reads for pileup passing basic filters such as
 
     BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL, BAM_FDUP
@@ -2372,8 +2361,7 @@ cdef int __advance_raw_all(void *data, bam1_t *b):
     cdef __iterdata * d = <__iterdata*>data
     cdef int ret
     while 1:
-        with nogil:
-            ret = sam_read1(d.htsfile, d.header, b)
+        ret = sam_read1(d.htsfile, d.header, b)
         if ret < 0:
             break
         if b.core.flag & d.flag_filter:
@@ -2382,7 +2370,7 @@ cdef int __advance_raw_all(void *data, bam1_t *b):
     return ret
 
 
-cdef int __advance_samtools(void * data, bam1_t * b):
+cdef int __advance_samtools(void * data, bam1_t *b) nogil:
     '''advance using same filter and read processing as in
     the samtools pileup.
     '''
@@ -2391,8 +2379,7 @@ cdef int __advance_samtools(void * data, bam1_t * b):
     cdef int q
 
     while 1:
-        with nogil:
-            ret = sam_itr_next(d.htsfile, d.iter, b) if d.iter else sam_read1(d.htsfile, d.header, b)
+        ret = sam_itr_next(d.htsfile, d.iter, b) if d.iter else sam_read1(d.htsfile, d.header, b)
         if ret < 0:
             break
         if b.core.flag & d.flag_filter:
@@ -2405,13 +2392,7 @@ cdef int __advance_samtools(void * data, bam1_t * b):
             if d.seq != NULL:
                 free(d.seq)
             d.tid = b.core.tid
-            with nogil:
-                d.seq = faidx_fetch_seq(
-                    d.fastafile,
-                    d.header.target_name[d.tid],
-                    0, MAX_POS,
-                    &d.seq_len)
-
+            d.seq = faidx_fetch_seq(d.fastafile, d.header.target_name[d.tid], 0, MAX_POS, &d.seq_len)
             if d.seq == NULL:
                 raise ValueError(
                     "reference sequence for '{}' (tid={}) not found".format(
@@ -2563,19 +2544,13 @@ cdef class IteratorColumn:
 
         if self.stepper is None or self.stepper == "all":
             with nogil:
-                self.pileup_iter = bam_mplp_init(1,
-                                                 <bam_plp_auto_f>&__advance_all,
-                                                 data)
+                self.pileup_iter = bam_mplp_init(1, __advance_all, data)
         elif self.stepper == "nofilter":
             with nogil:
-                self.pileup_iter = bam_mplp_init(1,
-                                                 <bam_plp_auto_f>&__advance_nofilter,
-                                                 data)
+                self.pileup_iter = bam_mplp_init(1, __advance_nofilter, data)
         elif self.stepper == "samtools":
             with nogil:
-                self.pileup_iter = bam_mplp_init(1,
-                                                 <bam_plp_auto_f>&__advance_samtools,
-                                                 data)
+                self.pileup_iter = bam_mplp_init(1, <bam_plp_auto_f>__advance_samtools, data)
         else:
             raise ValueError(
                 "unknown stepper option `%s` in IteratorColumn" % self.stepper)
@@ -2612,19 +2587,13 @@ cdef class IteratorColumn:
 
         if self.stepper is None or self.stepper == "all":
             with nogil:
-                self.pileup_iter = bam_mplp_init(1,
-                                                 <bam_plp_auto_f>&__advance_raw_all,
-                                                 data)
+                self.pileup_iter = bam_mplp_init(1, __advance_raw_all, data)
         elif self.stepper == "nofilter":
             with nogil:
-                self.pileup_iter = bam_mplp_init(1,
-                                                 <bam_plp_auto_f>&__advance_raw_nofilter,
-                                                 data)
+                self.pileup_iter = bam_mplp_init(1, __advance_raw_nofilter, data)
         elif self.stepper == "samtools":
             with nogil:
-                self.pileup_iter = bam_mplp_init(1,
-                                                 <bam_plp_auto_f>&__advance_samtools,
-                                                 data)
+                self.pileup_iter = bam_mplp_init(1, <bam_plp_auto_f>__advance_samtools, data)
         else:
             raise ValueError(
                 "unknown stepper option `%s` in IteratorColumn" % self.stepper)
@@ -2911,7 +2880,7 @@ cdef class IndexedReads:
             with nogil:
                 self.htsfile = hts_open(cfilename, 'r')
             if self.htsfile == NULL:
-                raise OSError("unable to reopen htsfile")
+                raise OSError_from_errno("Unable to reopen file", cfilename)
 
             # need to advance in newly opened file to position after header
             # better: use seek/tell?
diff --git a/pysam/libcbcf.pyi b/pysam/libcbcf.pyi
index d62f16966..f5a7e34cd 100644
--- a/pysam/libcbcf.pyi
+++ b/pysam/libcbcf.pyi
@@ -7,6 +7,7 @@ from typing import (
     Tuple,
     Iterator,
     List,
+    Literal,
     Iterable,
     Dict,
     overload,
@@ -15,11 +16,6 @@ from typing import (
     Generic,
 )
 
-if sys.version_info < (3, 8):
-    from typing_extensions import Literal
-else:
-    from typing import Literal
-
 from pysam.libchtslib import HTSFile, _HasFileNo
 
 _D = TypeVar("_D")
@@ -152,7 +148,7 @@ class VariantHeader:
         stop: int = ...,
         alleles: Optional[Tuple[str, ...]] = ...,
         id: Optional[str] = ...,
-        qual: Optional[int] = ...,
+        qual: Optional[float] = ...,
         filter: Optional[Any] = ...,
         info: Optional[Mapping[str, _InfoValue]] = ...,
         samples: Optional[Iterable[Optional[Mapping[str, _FormatValue]]]] = ...,
@@ -218,7 +214,7 @@ class VariantRecord:
     start: int
     stop: int
     rlen: int
-    qual: Optional[int]
+    qual: Optional[float]
     id: Optional[str]
     ref: Optional[str]
     alleles: Optional[Tuple[str, ...]]
@@ -241,8 +237,8 @@ class VariantRecordSample(_Mapping[str, _FormatValue]):
     def index(self) -> int: ...
     @property
     def name(self) -> str: ...
-    allele_indices: Optional[Tuple[Optional[int, ...]]]
-    alleles: Optional[Tuple[Optional[str, ...]]]
+    allele_indices: Optional[Tuple[Optional[int], ...]]
+    alleles: Optional[Tuple[Optional[str], ...]]
     phased: bool
     def __setitem__(self, key: str, value: _FormatValue) -> None: ...
     def __delitem__(self, key: str) -> None: ...
diff --git a/pysam/libcbcf.pyx b/pysam/libcbcf.pyx
index 8ecfe5f38..24179d6c0 100644
--- a/pysam/libcbcf.pyx
+++ b/pysam/libcbcf.pyx
@@ -143,7 +143,7 @@ cdef tuple METADATA_LENGTHS = ('FIXED', 'VARIABLE', 'A', 'G', 'R')
 ########################################################################
 
 from pysam.libcutils cimport force_bytes, force_str, charptr_to_str, charptr_to_str_w_len
-from pysam.libcutils cimport encode_filename, from_string_and_size, decode_bytes
+from pysam.libcutils cimport OSError_from_errno, encode_filename, from_string_and_size, decode_bytes
 
 
 ########################################################################
@@ -2111,14 +2111,10 @@ cdef class VariantHeader(object):
             rec.info.update(info)
 
         if kwargs:
-            if 'GT' in kwargs:
-                rec.samples[0]['GT'] = kwargs.pop('GT')
             rec.samples[0].update(kwargs)
 
         if samples:
             for i, sample in enumerate(samples):
-                if 'GT' in sample:
-                    rec.samples[i]['GT'] = sample.pop('GT')
                 rec.samples[i].update(sample)
 
         return rec
@@ -2162,10 +2158,12 @@ cdef class VariantHeader(object):
                     quoted = not isinstance(value, unquoted_str) and key not in ("ID", "Number", "Type")
 
                     key = force_bytes(key)
-                    bcf_hrec_add_key(hrec, key, <int>len(key))
+                    if bcf_hrec_add_key(hrec, key, <int>len(key)) < 0:
+                        raise MemoryError("Could not allocate VCF header record")
 
                     value = force_bytes(str(value))
-                    bcf_hrec_set_val(hrec, hrec.nkeys-1, value, <int>len(value), quoted)
+                    if bcf_hrec_set_val(hrec, hrec.nkeys-1, value, <int>len(value), quoted) < 0:
+                        raise MemoryError("Could not allocate VCF header record")
         except:
             bcf_hrec_destroy(hrec)
             raise
@@ -4120,42 +4118,42 @@ cdef class VariantFile(HTSFile):
         if not self.htsfile or not self.header:
             return
 
+        cdef int ret
+
         # Write header if no records were written
         if self.htsfile.is_write and not self.header_written:
             with nogil:
-                bcf_hdr_write(self.htsfile, self.header.ptr)
+                ret = bcf_hdr_write(self.htsfile, self.header.ptr)
+            if ret < 0 and errno != EPIPE:
+                raise OSError_from_errno("Can't write headers", self.filename)
 
-        cdef int ret = hts_close(self.htsfile)
+        ret = hts_close(self.htsfile)
         self.htsfile = NULL
         self.header = self.index = None
 
-        if ret < 0:
-            global errno
-            if errno == EPIPE:
-                errno = 0
-            else:
-                raise IOError(errno, force_str(strerror(errno)))
+        if ret < 0 and errno != EPIPE:
+            raise OSError_from_errno("Closing failed", self.filename)
 
     def close(self):
         """closes the :class:`pysam.VariantFile`."""
         if not self.htsfile:
             return
 
+        cdef int ret
+
         # Write header if no records were written
         if self.htsfile.is_write and not self.header_written:
             with nogil:
-                bcf_hdr_write(self.htsfile, self.header.ptr)
+                ret = bcf_hdr_write(self.htsfile, self.header.ptr)
+            if ret < 0 and errno != EPIPE:
+                raise OSError_from_errno("Can't write headers", self.filename)
 
-        cdef int ret = hts_close(self.htsfile)
+        ret = hts_close(self.htsfile)
         self.htsfile = NULL
         self.header = self.index = None
 
-        if ret < 0:
-            global errno
-            if errno == EPIPE:
-                errno = 0
-            else:
-                raise IOError(errno, force_str(strerror(errno)))
+        if ret < 0 and errno != EPIPE:
+            raise OSError_from_errno("Closing failed", self.filename)
 
     def __iter__(self):
         if not self.is_open:
@@ -4192,7 +4190,7 @@ cdef class VariantFile(HTSFile):
             elif ret == -2:
                 raise IOError('truncated file')
             elif errno:
-                raise IOError(errno, strerror(errno))
+                raise OSError_from_errno("Unable to fetch next record", self.filename)
             else:
                 raise IOError('unable to fetch next record')
 
@@ -4250,6 +4248,7 @@ cdef class VariantFile(HTSFile):
         """
         cdef bcf_hdr_t *hdr
         cdef BGZF *bgzfp
+        cdef const htsFormat *fmt
         cdef hts_idx_t *idx
         cdef tbx_t *tidx
         cdef char *cfilename
@@ -4339,11 +4338,12 @@ cdef class VariantFile(HTSFile):
 
             if not self.htsfile:
                 if errno:
-                    raise IOError(errno, 'could not open variant file `{}`: {}'.format(filename, force_str(strerror(errno))))
+                    raise OSError_from_errno("Could not open variant file", filename)
                 else:
                     raise ValueError('could not open variant file `{}`'.format(filename))
 
-            if self.htsfile.format.format not in (bcf, vcf):
+            fmt = hts_get_format(self.htsfile)
+            if fmt.format not in (bcf, vcf):
                 raise ValueError('invalid file `{}` (mode=`{}`) - is it VCF/BCF format?'.format(filename, mode))
 
             self.check_truncation(ignore_truncation)
@@ -4362,14 +4362,14 @@ cdef class VariantFile(HTSFile):
                 cfilename = NULL
 
             # check for index and open if present
-            if self.htsfile.format.format == bcf and cfilename:
+            if fmt.format == bcf and cfilename:
                 if index_filename is not None:
                     cindex_filename = index_filename
                 with nogil:
                     idx = bcf_index_load2(cfilename, cindex_filename)
                 self.index = makeBCFIndex(self.header, idx)
 
-            elif self.htsfile.format.compression == bgzf and cfilename:
+            elif fmt.compression == bgzf and cfilename:
                 if index_filename is not None:
                     cindex_filename = index_filename
                 with nogil:
@@ -4489,15 +4489,19 @@ cdef class VariantFile(HTSFile):
             raise ValueError('record must not be None')
 
         if not self.is_open:
-            return ValueError('I/O operation on closed file')
+            raise ValueError('I/O operation on closed file')
 
         if not self.htsfile.is_write:
             raise ValueError('cannot write to a Variantfile opened for reading')
 
+        cdef int ret
+
         if not self.header_written:
             self.header_written = True
             with nogil:
-                bcf_hdr_write(self.htsfile, self.header.ptr)
+                ret = bcf_hdr_write(self.htsfile, self.header.ptr)
+            if ret < 0:
+                raise OSError_from_errno("Can't write headers", self.filename)
 
         #if record.header is not self.header:
         #    record.translate(self.header)
@@ -4510,13 +4514,11 @@ cdef class VariantFile(HTSFile):
         # Sync END annotation before writing
         bcf_sync_end(record)
 
-        cdef int ret
-
         with nogil:
             ret = bcf_write1(self.htsfile, self.header.ptr, record.ptr)
 
         if ret < 0:
-            raise IOError(errno, strerror(errno))
+            raise OSError_from_errno("Can't write record", self.filename)
 
         return ret
 
diff --git a/pysam/libcbgzf.pxd b/pysam/libcbgzf.pxd
new file mode 100644
index 000000000..d81984d7f
--- /dev/null
+++ b/pysam/libcbgzf.pxd
@@ -0,0 +1,7 @@
+# cython: language_level=3
+
+from pysam.libchtslib cimport BGZF
+
+cdef class BGZFile(object):
+    cdef BGZF *bgzf
+    cdef readonly object name, index
diff --git a/pysam/libcbgzf.pyi b/pysam/libcbgzf.pyi
index 4d64e8dbe..6c19d785e 100644
--- a/pysam/libcbgzf.pyi
+++ b/pysam/libcbgzf.pyi
@@ -1,11 +1,6 @@
 import sys
 
-from typing import Optional, Union, Any, NoReturn
-
-if sys.version_info < (3, 8):
-    from typing_extensions import Literal
-else:
-    from typing import Literal
+from typing import Literal, Optional, Union, Any, NoReturn
 
 BUFFER_SIZE: int
 
diff --git a/pysam/libcbgzf.pyx b/pysam/libcbgzf.pyx
index d66a3c612..9a333f4cc 100644
--- a/pysam/libcbgzf.pyx
+++ b/pysam/libcbgzf.pyx
@@ -34,9 +34,6 @@ cdef class BGZFile(object):
     This class only supports opening files in binary mode. If you need to open a
     compressed file in text mode, use the gzip.open() function.
     """
-    cdef BGZF* bgzf
-    cdef readonly object name, index
-
     def __init__(self, filename, mode=None, index=None):
         """Constructor for the BGZFile class.
 
diff --git a/pysam/libcfaidx.pxd b/pysam/libcfaidx.pxd
index c17d0ba0f..8380e90eb 100644
--- a/pysam/libcfaidx.pxd
+++ b/pysam/libcfaidx.pxd
@@ -9,31 +9,19 @@ cimport cython
 from cpython cimport array
 from pysam.libchtslib cimport faidx_t, kstring_t, BGZF
 
-# These functions are put here and not in chtslib.pxd in order
-# to avoid warnings for unused functions.
-cdef extern from "pysam_stream.h" nogil:
-
-    ctypedef struct kstream_t:
-        pass
-
+cdef extern from "htslib/kseq.h" nogil:
+    """
+    struct __kstream_t;
+    #define kstream_t  struct __kstream_t
+    __KSEQ_TYPE(type_t_unused_here)
+    #undef kstream_t
+    """
     ctypedef struct kseq_t:
         kstring_t name
         kstring_t comment
         kstring_t seq
         kstring_t qual
 
-    kseq_t *kseq_init(BGZF *)
-    int kseq_read(kseq_t *)
-    void kseq_destroy(kseq_t *)
-    kstream_t *ks_init(BGZF *)
-    void ks_destroy(kstream_t *)
-
-    # Retrieve characters from stream until delimiter
-    # is reached placing results in str.
-    int ks_getuntil(kstream_t *,
-                    int delimiter,
-                    kstring_t * str,
-                    int * dret)
 
 cdef class FastaFile:
     cdef bint is_remote
diff --git a/pysam/libcfaidx.pyx b/pysam/libcfaidx.pyx
index d4e7427b7..dd214170c 100644
--- a/pysam/libcfaidx.pyx
+++ b/pysam/libcfaidx.pyx
@@ -70,6 +70,17 @@ from pysam.libcutils cimport force_bytes, force_str, charptr_to_str
 from pysam.libcutils cimport encode_filename, from_string_and_size
 from pysam.libcutils cimport qualitystring_to_array, parse_region
 
+cdef extern from "htslib/kseq.h" nogil:
+    """
+    #undef __KSEQ_TYPE
+    #define __KSEQ_TYPE(type_t)
+    KSEQ_INIT2(static, BGZF *, bgzf_read)
+    """
+    kseq_t *kseq_init(BGZF *)
+    int kseq_read(kseq_t *)
+    void kseq_destroy(kseq_t *)
+
+
 cdef class FastqProxy
 cdef makeFastqProxy(kseq_t * src):
     '''enter src into AlignedRead.'''
diff --git a/pysam/libchtslib.pxd b/pysam/libchtslib.pxd
index 56e746074..99ea15760 100644
--- a/pysam/libchtslib.pxd
+++ b/pysam/libchtslib.pxd
@@ -573,6 +573,12 @@ cdef extern from "htslib/hts.h" nogil:
     # @param mode     Open mode, as per hts_open()
     htsFile *hts_hopen(hFILE *fp, const char *fn, const char *mode)
 
+    # @abstract  For output streams, flush any buffered data
+    # @param fp  The file handle to be flushed
+    # @return    0 for success, or negative if an error occurred.
+    # @since     1.14
+    int hts_flush(htsFile *fp)
+
     # @abstract  Close a file handle, flushing buffered data for output streams
     # @param fp  The file handle to be closed
     # @return    0 for success, or negative if an error occurred.
@@ -1865,10 +1871,10 @@ cdef extern from "htslib/vcf.h" nogil:
     #
     bcf_hrec_t *bcf_hdr_get_hrec(const bcf_hdr_t *hdr, int type, const char *key, const char *value, const char *str_class)
     bcf_hrec_t *bcf_hrec_dup(bcf_hrec_t *hrec)
-    void bcf_hrec_add_key(bcf_hrec_t *hrec, const char *str, int len)
-    void bcf_hrec_set_val(bcf_hrec_t *hrec, int i, const char *str, int len, int is_quoted)
+    int bcf_hrec_add_key(bcf_hrec_t *hrec, const char *str, size_t len)
+    int bcf_hrec_set_val(bcf_hrec_t *hrec, int i, const char *str, size_t len, int is_quoted)
     int bcf_hrec_find_key(bcf_hrec_t *hrec, const char *key)
-    void hrec_add_idx(bcf_hrec_t *hrec, int idx)
+    int hrec_add_idx(bcf_hrec_t *hrec, int idx)
     void bcf_hrec_destroy(bcf_hrec_t *hrec)
 
     #************************************************************************
diff --git a/pysam/libchtslib.pyi b/pysam/libchtslib.pyi
index fcd793596..ffa1b43da 100644
--- a/pysam/libchtslib.pyi
+++ b/pysam/libchtslib.pyi
@@ -1,10 +1,5 @@
 import sys
-from typing import List, Union, NoReturn, Iterable, Any, Tuple, Optional, TypeVar
-
-if sys.version_info < (3, 8):
-    from typing_extensions import Protocol
-else:
-    from typing import Protocol
+from typing import List, Union, NoReturn, Iterable, Any, Tuple, Optional, Protocol, TypeVar
 
 class _HasFileNo(Protocol):
     def fileno(self) -> int: ...
@@ -64,6 +59,7 @@ class HTSFile:
     @property
     def duplicate_filehandle(self) -> bool: ...
     def close(self) -> None: ...
+    def flush(self) -> None: ...
     def check_truncation(self, ignore_truncation: bool = ...) -> None: ...
     @property
     def category(self) -> str: ...
diff --git a/pysam/libchtslib.pyx b/pysam/libchtslib.pyx
index 3cb7b7aff..ce471765d 100644
--- a/pysam/libchtslib.pyx
+++ b/pysam/libchtslib.pyx
@@ -338,12 +338,18 @@ cdef class HTSFile(object):
             hts_close(self.htsfile)
             self.htsfile = NULL
 
+    def flush(self):
+        """Flush any buffered data to the underlying output stream."""
+        if self.htsfile:
+            if hts_flush(self.htsfile) < 0:
+                raise OSError(errno, f'Flushing {type(self).__name__} failed', force_str(self.filename))
+
     def check_truncation(self, ignore_truncation=False):
         """Check if file is truncated."""
         if not self.htsfile:
             return
 
-        if self.htsfile.format.compression != bgzf:
+        if hts_get_format(self.htsfile).compression != bgzf:
             return
 
         cdef BGZF *bgzfp = hts_get_bgzfp(self.htsfile)
@@ -373,7 +379,7 @@ cdef class HTSFile(object):
         VARIANTS, INDEX, REGIONS"""
         if not self.htsfile:
             raise ValueError('metadata not available on closed file')
-        return FORMAT_CATEGORIES[self.htsfile.format.category]
+        return FORMAT_CATEGORIES[hts_get_format(self.htsfile).category]
 
     @property
     def format(self):
@@ -384,14 +390,15 @@ cdef class HTSFile(object):
         """
         if not self.htsfile:
             raise ValueError('metadata not available on closed file')
-        return FORMATS[self.htsfile.format.format]
+        return FORMATS[hts_get_format(self.htsfile).format]
 
     @property
     def version(self):
         """Tuple of file format version numbers (major, minor)"""
         if not self.htsfile:
             raise ValueError('metadata not available on closed file')
-        return self.htsfile.format.version.major, self.htsfile.format.version.minor
+        cdef const htsFormat *fmt = hts_get_format(self.htsfile)
+        return fmt.version.major, fmt.version.minor
 
     @property
     def compression(self):
@@ -400,14 +407,14 @@ cdef class HTSFile(object):
         One of NONE, GZIP, BGZF, CUSTOM."""
         if not self.htsfile:
             raise ValueError('metadata not available on closed file')
-        return COMPRESSION[self.htsfile.format.compression]
+        return COMPRESSION[hts_get_format(self.htsfile).compression]
 
     @property
     def description(self):
         """Vaguely human readable description of the file format"""
         if not self.htsfile:
             raise ValueError('metadata not available on closed file')
-        cdef char *desc = hts_format_description(&self.htsfile.format)
+        cdef char *desc = hts_format_description(hts_get_format(self.htsfile))
         try:
             return charptr_to_str(desc)
         finally:
@@ -441,27 +448,27 @@ cdef class HTSFile(object):
     @property
     def is_sam(self):
         """return True if HTSFile is reading or writing a SAM alignment file"""
-        return self.htsfile != NULL and self.htsfile.format.format == sam
+        return self.htsfile != NULL and hts_get_format(self.htsfile).format == sam
 
     @property
     def is_bam(self):
         """return True if HTSFile is reading or writing a BAM alignment file"""
-        return self.htsfile != NULL and self.htsfile.format.format == bam
+        return self.htsfile != NULL and hts_get_format(self.htsfile).format == bam
 
     @property
     def is_cram(self):
         """return True if HTSFile is reading or writing a BAM alignment file"""
-        return self.htsfile != NULL and self.htsfile.format.format == cram
+        return self.htsfile != NULL and hts_get_format(self.htsfile).format == cram
 
     @property
     def is_vcf(self):
         """return True if HTSFile is reading or writing a VCF variant file"""
-        return self.htsfile != NULL and self.htsfile.format.format == vcf
+        return self.htsfile != NULL and hts_get_format(self.htsfile).format == vcf
 
     @property
     def is_bcf(self):
         """return True if HTSFile is reading or writing a BCF variant file"""
-        return self.htsfile != NULL and self.htsfile.format.format == bcf
+        return self.htsfile != NULL and hts_get_format(self.htsfile).format == bcf
 
     def reset(self):
         """reset file position to beginning of file just after the header.
@@ -484,14 +491,14 @@ cdef class HTSFile(object):
         whence = libc_whence_from_io(whence)
 
         cdef int64_t ret
-        if self.htsfile.format.compression == bgzf:
+        cdef htsCompression compression = hts_get_format(self.htsfile).compression
+        if compression == bgzf:
             with nogil:
                 ret = bgzf_seek(hts_get_bgzfp(self.htsfile), offset, whence)
-        elif self.htsfile.format.compression == no_compression:
+        elif compression == no_compression:
             ret = 0 if (hseek(self.htsfile.fp.hfile, offset, whence) >= 0) else -1
         else:
-            raise NotImplementedError("seek not implemented in files compressed by method {}".format(
-                self.htsfile.format.compression))
+            raise NotImplementedError(f"seek not implemented in files compressed by method {compression}")
         return ret
 
     def tell(self):
@@ -502,17 +509,17 @@ cdef class HTSFile(object):
             raise IOError('tell not available in streams')
 
         cdef int64_t ret
-        if self.htsfile.format.compression == bgzf:
+        cdef const htsFormat *fmt = hts_get_format(self.htsfile)
+        if fmt.compression == bgzf:
             with nogil:
                 ret = bgzf_tell(hts_get_bgzfp(self.htsfile))
-        elif self.htsfile.format.compression == no_compression:
+        elif fmt.compression == no_compression:
             ret = htell(self.htsfile.fp.hfile)
-        elif self.htsfile.format.format == cram:
+        elif fmt.format == cram:
             with nogil:
                 ret = htell(cram_fd_get_fp(self.htsfile.fp.cram))
         else:
-            raise NotImplementedError("seek not implemented in files compressed by method {}".format(
-                self.htsfile.format.compression))
+            raise NotImplementedError(f"seek not implemented in files compressed by method {fmt.compression}")
 
         return ret
 
diff --git a/pysam/libctabix.pxd b/pysam/libctabix.pxd
index 174dd8b48..4096790c9 100644
--- a/pysam/libctabix.pxd
+++ b/pysam/libctabix.pxd
@@ -18,36 +18,9 @@ from pysam.libchtslib cimport hts_idx_t, hts_itr_t, htsFile, \
     tbx_t, kstring_t, BGZF, HTSFile
 
 
-# These functions are put here and not in chtslib.pxd in order
-# to avoid warnings for unused functions.
-cdef extern from "pysam_stream.h" nogil:
-
-    ctypedef struct kstream_t:
-        pass
-
-    ctypedef struct kseq_t:
-        kstring_t name
-        kstring_t comment
-        kstring_t seq
-        kstring_t qual
-
-    kseq_t *kseq_init(BGZF *)
-    int kseq_read(kseq_t *)
-    void kseq_destroy(kseq_t *)
-    kstream_t *ks_init(BGZF *)
-    void ks_destroy(kstream_t *)
-
-    # Retrieve characters from stream until delimiter
-    # is reached placing results in str.
-    int ks_getuntil(kstream_t *,
-                    int delimiter,
-                    kstring_t * str,
-                    int * dret)
-
-
 cdef class tabix_file_iterator:
     cdef BGZF * fh
-    cdef kstream_t * kstream
+    cdef void * unused
     cdef kstring_t buffer
     cdef size_t size
     cdef Parser parser
@@ -109,7 +82,7 @@ cdef class TabixIteratorParsed(TabixIterator):
 cdef class GZIterator:
     cdef object _filename
     cdef BGZF * gzipfile
-    cdef kstream_t * kstream
+    cdef void * unused
     cdef kstring_t buffer
     cdef int __cnext__(self)
     cdef encoding
diff --git a/pysam/libctabix.pyi b/pysam/libctabix.pyi
index e1ba2113e..3a4f5b571 100644
--- a/pysam/libctabix.pyi
+++ b/pysam/libctabix.pyi
@@ -1,4 +1,5 @@
-from typing import Optional, Literal, List, Any
+import sys
+from typing import Optional, List, Literal, Any
 
 from pysam.libchtslib import HTSFile
 
diff --git a/pysam/libctabix.pyx b/pysam/libctabix.pyx
index b38cf55cd..54a2006a2 100644
--- a/pysam/libctabix.pyx
+++ b/pysam/libctabix.pyx
@@ -69,7 +69,7 @@ from cpython cimport PyErr_SetString, PyBytes_Check, \
 cimport pysam.libctabixproxies as ctabixproxies
 
 from pysam.libchtslib cimport htsFile, hts_open, hts_close, HTS_IDX_START,\
-    BGZF, bgzf_open, bgzf_dopen, bgzf_close, bgzf_write, \
+    BGZF, bgzf_open, bgzf_dopen, bgzf_close, bgzf_getline, bgzf_write, \
     tbx_index_build2, tbx_index_load2, tbx_itr_queryi, tbx_itr_querys, \
     tbx_conf_t, tbx_seqnames, tbx_itr_next, tbx_itr_destroy, \
     tbx_destroy, hisremote, region_list, hts_getline, \
@@ -79,6 +79,7 @@ from pysam.libchtslib cimport htsFile, hts_open, hts_close, HTS_IDX_START,\
 from pysam.libcutils cimport force_bytes, force_str, charptr_to_str
 from pysam.libcutils cimport encode_filename, from_string_and_size
 
+
 cdef class Parser:
 
     def __init__(self, encoding="ascii"):
@@ -389,7 +390,7 @@ cdef class TabixFile:
         if self.htsfile == NULL:
             raise IOError("could not open file `%s`" % filename)
         
-        #if self.htsfile.format.category != region_list:
+        #if hts_get_format(self.htsfile).category != region_list:
         #    raise ValueError("file does not contain region data")
 
         with nogil:
@@ -658,6 +659,12 @@ cdef class TabixIterator:
 
         return retval
 
+    def _itr_error(self, err: int):
+        if err == -5:
+            return IOError("iteration on closed file")
+        else:
+            return ValueError(f"iteration failed (error code {err})")
+
     def __next__(self): 
         """python version of next().
 
@@ -665,10 +672,8 @@ cdef class TabixIterator:
         """
         
         cdef int retval = self.__cnext__()
-        if retval == -5:
-            raise IOError("iteration on closed file")
-        elif retval < 0:
-            raise StopIteration
+        if retval < 0:
+            raise StopIteration if retval == -1 else self._itr_error(retval)
 
         return charptr_to_str(self.buffer.s, self.encoding)
 
@@ -708,10 +713,8 @@ cdef class TabixIteratorParsed(TabixIterator):
         """
         
         cdef int retval = self.__cnext__()
-        if retval == -5:
-            raise IOError("iteration on closed file")
-        elif retval < 0:
-            raise StopIteration
+        if retval < 0:
+            raise StopIteration if retval == -1 else self._itr_error(retval)
 
         return self.parser.parse(self.buffer.s,
                                  self.buffer.l)
@@ -730,7 +733,6 @@ cdef class GZIterator:
         with nogil:
             self.gzipfile = bgzf_open(cfilename, "r")
         self._filename = filename
-        self.kstream = ks_init(self.gzipfile)
         self.encoding = encoding
 
         self.buffer.l = 0
@@ -744,24 +746,15 @@ cdef class GZIterator:
             self.gzipfile = NULL
         if self.buffer.s != NULL:
             free(self.buffer.s)
-        if self.kstream != NULL:
-            ks_destroy(self.kstream)
 
     def __iter__(self):
         return self
 
     cdef int __cnext__(self):
-        cdef int dret = 0
-        cdef int retval = 0
-        while 1:
-            with nogil:
-                retval = ks_getuntil(self.kstream, b'\n', &self.buffer, &dret)
-            
-            if retval < 0: 
-                break
-
-            return dret
-        return -1
+        cdef int retval
+        with nogil:
+            retval = bgzf_getline(self.gzipfile, b'\n', &self.buffer)
+        return retval
 
     def __next__(self):
         """python version of next().
@@ -1019,70 +1012,6 @@ def tabix_index(filename,
     
     return filename
 
-# #########################################################
-# cdef class tabix_file_iterator_old:
-#     '''iterate over ``infile``.
-
-#     This iterator is not safe. If the :meth:`__next__()` method is called 
-#     after ``infile`` is closed, the result is undefined (see ``fclose()``).
-
-#     The iterator might either raise a StopIteration or segfault.
-#     '''
-
-
-#     def __cinit__(self, 
-#                   infile, 
-#                   Parser parser,
-#                   int buffer_size = 65536 ):
-
-#         cdef int fd = PyObject_AsFileDescriptor( infile )
-#         if fd == -1: raise ValueError( "I/O operation on closed file." )
-#         self.infile = fdopen( fd, 'r')
-
-#         if self.infile == NULL: raise ValueError( "I/O operation on closed file." )
-
-#         self.buffer = <char*>malloc( buffer_size )        
-#         self.size = buffer_size
-#         self.parser = parser
-
-#     def __iter__(self):
-#         return self
-
-#     cdef __cnext__(self):
-
-#         cdef char * b
-#         cdef size_t nbytes
-#         b = self.buffer
-
-#         while not feof( self.infile ):
-#             nbytes = getline( &b, &self.size, self.infile)
-
-#             # stop at first error or eof
-#             if (nbytes == -1): break
-#             # skip comments
-#             if (b[0] == '#'): continue
-
-#             # skip empty lines
-#             if b[0] == '\0' or b[0] == '\n' or b[0] == '\r': continue
-
-#             # make sure that entry is complete
-#             if b[nbytes-1] != '\n' and b[nbytes-1] != '\r':
-#                 result = b
-#                 raise ValueError( "incomplete line at %s" % result )
-
-#             # make sure that this goes fully through C
-#             # otherwise buffer is copied to/from a
-#             # Python object causing segfaults as
-#             # the wrong memory is freed
-#             return self.parser.parse( b, nbytes )
-
-#         raise StopIteration
-
-#     def __dealloc__(self):
-#         free(self.buffer)
-
-#     def __next__(self):
-#         return self.__cnext__()
 
 #########################################################
 #########################################################
@@ -1125,8 +1054,6 @@ cdef class tabix_file_iterator:
         if self.fh == NULL: 
             raise IOError('%s' % strerror(errno))
 
-        self.kstream = ks_init(self.fh) 
-        
         self.buffer.s = <char*>malloc(buffer_size)
         #if self.buffer == NULL:
         #    raise MemoryError( "tabix_file_iterator: could not allocate %i bytes" % buffer_size)
@@ -1139,12 +1066,11 @@ cdef class tabix_file_iterator:
     cdef __cnext__(self):
 
         cdef char * b
-        cdef int dret = 0
         cdef int retval = 0
         while 1:
             with nogil:
-                retval = ks_getuntil(self.kstream, b'\n', &self.buffer, &dret)
-            
+                retval = bgzf_getline(self.fh, b'\n', &self.buffer)
+
             if retval < 0: 
                 break
                 #raise IOError('gzip error: %s' % buildGzipError( self.fh ))
@@ -1168,7 +1094,6 @@ cdef class tabix_file_iterator:
 
     def __dealloc__(self):
         free(self.buffer.s)
-        ks_destroy(self.kstream)
         bgzf_close(self.fh)
         
     def __next__(self):
diff --git a/pysam/libcutils.pxd b/pysam/libcutils.pxd
index 1bce05707..7e2e098b6 100644
--- a/pysam/libcutils.pxd
+++ b/pysam/libcutils.pxd
@@ -9,6 +9,8 @@ cpdef parse_region(contig=*, start=*, stop=*, region=*, reference=*, end=*)
 
 cdef int libc_whence_from_io(int whence)
 
+cdef OSError_from_errno(message, filename=*)
+
 #########################################################################
 # Utility functions for quality string conversions
 
diff --git a/pysam/libcutils.pyx b/pysam/libcutils.pyx
index 64cb97ae6..7a03f3c44 100644
--- a/pysam/libcutils.pyx
+++ b/pysam/libcutils.pyx
@@ -12,13 +12,14 @@ from codecs import register_error
 from cpython.version cimport PY_MAJOR_VERSION, PY_MINOR_VERSION
 from cpython cimport PyBytes_Check, PyUnicode_Check
 from cpython cimport array as c_array
+from libc.errno cimport errno
 from libc.stdlib cimport calloc, free
-from libc.string cimport strncpy
+from libc.string cimport strerror, strncpy
 from libc.stdint cimport INT32_MAX, int32_t
 from libc.stdio cimport fprintf, stderr, fflush
 from libc.stdio cimport stdout as c_stdout
-from posix.fcntl cimport open as c_open, O_WRONLY
-from posix.unistd cimport SEEK_SET, SEEK_CUR, SEEK_END
+from posix.fcntl cimport open as c_open, O_WRONLY, O_CREAT, O_TRUNC
+from posix.unistd cimport dup as c_dup, SEEK_SET, SEEK_CUR, SEEK_END, STDOUT_FILENO
 
 from pysam.libcsamtools cimport samtools_dispatch, samtools_set_stdout, samtools_set_stderr, \
     samtools_close_stdout, samtools_close_stderr, samtools_set_stdout_fn
@@ -174,6 +175,12 @@ cdef decode_bytes(bytes s, encoding=None, errors=None):
         return s.decode(encoding or TEXT_ENCODING, errors or ERROR_HANDLER)
 
 
+cdef OSError_from_errno(message, filename=None):
+    cdef int err = errno
+    if filename is not None: filename = os.fsdecode(filename)
+    return OSError(err, f"{message}: {strerror(err).decode()}", filename)
+
+
 cpdef parse_region(contig=None,
                    start=None,
                    stop=None,
@@ -321,9 +328,9 @@ def _pysam_dispatch(collection,
     if save_stdout:
         stdout_f = save_stdout
         stdout_h = c_open(force_bytes(stdout_f),
-                          O_WRONLY)
+                          O_WRONLY|O_CREAT|O_TRUNC, 0666)
         if stdout_h == -1:
-            raise IOError("error while opening {} for writing".format(stdout_f))
+            raise OSError_from_errno("Could not redirect standard output", stdout_f)
 
         samtools_set_stdout_fn(force_bytes(stdout_f))
         bcftools_set_stdout_fn(force_bytes(stdout_f))
@@ -344,7 +351,7 @@ def _pysam_dispatch(collection,
         if collection == "bcftools":
             # in bcftools, most methods accept -o, the exceptions
             # are below:
-            if method not in ("index", "roh", "stats"):
+            if method not in ("head", "index", "roh", "stats"):
                 stdout_option = "-o {}"
         elif method in MAP_STDOUT_OPTIONS[collection]:
             # special case - samtools view -c outputs on stdout
@@ -360,7 +367,8 @@ def _pysam_dispatch(collection,
     else:
         samtools_set_stdout_fn("-")
         bcftools_set_stdout_fn("-")
-        stdout_h = c_open(b"/dev/null", O_WRONLY)
+        if catch_stdout is None: stdout_h = c_dup(STDOUT_FILENO)
+        else: stdout_h = c_open(b"/dev/null", O_WRONLY)
 
     # setup the function call to samtools/bcftools main
     cdef char ** cargs
diff --git a/pysam/pysam_stream.h b/pysam/pysam_stream.h
deleted file mode 100644
index 3a4eb16cf..000000000
--- a/pysam/pysam_stream.h
+++ /dev/null
@@ -1,13 +0,0 @@
-#ifndef PYSAM_STREAM_H
-#define PYSAM_STREAM_H
-
-#include "htslib/kseq.h"
-
-// #######################################################
-// fastq parsing
-// KSEQ_INIT(gzFile, gzread)
-KSEQ_INIT(BGZF *, bgzf_read)
-
-//KSTREAM_INIT( gzFile, gzread, 16384)
-
-#endif
diff --git a/pysam/pysam_util.c b/pysam/pysam_util.c
deleted file mode 100644
index 349af44d8..000000000
--- a/pysam/pysam_util.c
+++ /dev/null
@@ -1,36 +0,0 @@
-#include <ctype.h>
-#include <assert.h>
-#include <unistd.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "htslib/khash.h"
-#include "htslib/ksort.h"
-#include "htslib/knetfile.h"
-
-#if !(_POSIX_C_SOURCE >= 200809L || _XOPEN_SOURCE >= 700)
-/*
- * A rudimentary emulation of getline() for systems that dont support it
- * natively.  Since this is used for PPD file reading, it assumes (possibly
- * falsely) that BUFSIZ is big enough.
- */
-ssize_t
-getline(char **line, size_t *linelen, FILE *fp)
-{
-  if (*linelen == 0) 
-    {
-      *linelen = BUFSIZ;
-      *line = malloc(*linelen);
-    }
-
-  memset(*line, 0, *linelen);
-  fgets(*line, *linelen, fp);
-
-  return (strlen(*line));
-
-}
-#endif
-
-
-
diff --git a/pysam/pysam_util.h b/pysam/pysam_util.h
deleted file mode 100644
index 789e9d0d3..000000000
--- a/pysam/pysam_util.h
+++ /dev/null
@@ -1,5 +0,0 @@
-#ifndef PYSAM_UTIL_H
-#define PYSAM_UTIL_H
-
-
-#endif
diff --git a/pysam/samtools.py b/pysam/samtools.py
index 046658f95..56f6fb077 100644
--- a/pysam/samtools.py
+++ b/pysam/samtools.py
@@ -1,197 +1,59 @@
-import platform
-from typing import (
-    Callable,
-    List,
-    Tuple,
-    Iterable,
-    Union,
-)
-try:
-    from typing import Final
-    HAVE_FINAL = True
-except ImportError:
-    HAVE_FINAL = False
-
-from pysam.utils import PysamDispatcher
-
-
-# samtools command line options to export in python
-_SAMTOOLS_DISPATCH = {
-    # samtools 'documented' commands
-    "view": ("view", ()),
-    "head": ("head", ()),
-    "sort": ("sort", ()),
-    "mpileup": ("mpileup", ()),
-    "consensus": ("consensus", ()),
-    "depth": ("depth", ()),
-    "faidx": ("faidx", ()),
-    "fqidx": ("fqidx", ()),
-    "tview": ("tview", ()),
-    "index": ("index", ()),
-    "idxstats": ("idxstats", ()),
-    "fixmate": ("fixmate", ()),
-    "flagstat": ("flagstat", ()),
-    "calmd": ("calmd", ()),
-    "merge": ("merge", ()),
-    "markdup": ("markdup", ()),
-    "rmdup": ("rmdup", ()),
-    "reference": ("reference", ()),
-    "reheader": ("reheader", ()),
-    "reset": ("reset", ()),
-    "cat": ("cat", ()),
-    "targetcut": ("targetcut", ()),
-    "phase": ("phase", ()),
-    "bam2fq": ("bam2fq", ()),
-    "dict": ("dict", ()),
-    "addreplacerg": ("addreplacerg", ()),
-    "pad2unpad": ("pad2unpad", ()),
-    "depad": ("pad2unpad", ()),
-    "bedcov": ("bedcov", ()),
-    "coverage": ("coverage", ()),
-    "bamshuf": ("bamshuf", ()),
-    "collate": ("collate", ()),
-    "stats": ("stats", ()),
-    "fasta": ("fasta", ()),
-    "fastq": ("fastq", ()),
-    "cram_size": ("cram-size", ()),
-    "quickcheck": ("quickcheck", ()),
-    "split": ("split", ()),
-    "flags": ("flags", ()),
-    "ampliconclip": ("ampliconclip", ()),
-    "ampliconstats": ("ampliconstats", ()),
-    "version": ("version", ()),
-    "fqimport": ("import", ()),
-    "import_": ("import", ()),
-    "samples": ("samples", ()),
-}
-
-
-def _wrap_command(
-    dispatch: str,
-    parsers: Iterable[Tuple[str, Callable[[Union[str, List[str]]], Union[str, List[str]]]]],
-) -> PysamDispatcher:
-    return PysamDispatcher("samtools", dispatch, parsers)
-
-
-if not HAVE_FINAL:
-    # python 3.7
-    for key, options in _SAMTOOLS_DISPATCH.items():
-        cmd, parser = options
-        globals()[key] = PysamDispatcher("samtools", cmd, parser)
-
-    __all__ = list(_SAMTOOLS_DISPATCH)
-else:
-    # python >=3.8
-    view: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["view"][0], _SAMTOOLS_DISPATCH["view"][1])
-
-    head: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["head"][0], _SAMTOOLS_DISPATCH["head"][1])
-
-    sort: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["sort"][0], _SAMTOOLS_DISPATCH["sort"][1])
-
-    mpileup: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["mpileup"][0], _SAMTOOLS_DISPATCH["mpileup"][1])
-
-    consensus: Final[PysamDispatcher] = _wrap_command(
-        _SAMTOOLS_DISPATCH["consensus"][0],
-        _SAMTOOLS_DISPATCH["consensus"][1],
-    )
-
-    depth: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["depth"][0], _SAMTOOLS_DISPATCH["depth"][1])
-
-    faidx: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["faidx"][0], _SAMTOOLS_DISPATCH["faidx"][1])
-
-    fqidx: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["fqidx"][0], _SAMTOOLS_DISPATCH["fqidx"][1])
-
-    tview: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["tview"][0], _SAMTOOLS_DISPATCH["tview"][1])
-
-    index: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["index"][0], _SAMTOOLS_DISPATCH["index"][1])
-
-    idxstats: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["idxstats"][0], _SAMTOOLS_DISPATCH["idxstats"][1])
-
-    fixmate: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["fixmate"][0], _SAMTOOLS_DISPATCH["fixmate"][1])
-
-    flagstat: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["flagstat"][0], _SAMTOOLS_DISPATCH["flagstat"][1])
-
-    calmd: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["calmd"][0], _SAMTOOLS_DISPATCH["calmd"][1])
-
-    merge: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["merge"][0], _SAMTOOLS_DISPATCH["merge"][1])
-
-    markdup: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["markdup"][0], _SAMTOOLS_DISPATCH["markdup"][1])
-
-    rmdup: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["rmdup"][0], _SAMTOOLS_DISPATCH["rmdup"][1])
-
-    reference: Final[PysamDispatcher] = _wrap_command(
-        _SAMTOOLS_DISPATCH["reference"][0],
-        _SAMTOOLS_DISPATCH["reference"][1],
-    )
-
-    reheader: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["reheader"][0], _SAMTOOLS_DISPATCH["reheader"][1])
-
-    reset: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["reset"][0], _SAMTOOLS_DISPATCH["reset"][1])
-
-    cat: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["cat"][0], _SAMTOOLS_DISPATCH["cat"][1])
-
-    targetcut: Final[PysamDispatcher] = _wrap_command(
-        _SAMTOOLS_DISPATCH["targetcut"][0],
-        _SAMTOOLS_DISPATCH["targetcut"][1],
-    )
-
-    phase: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["phase"][0], _SAMTOOLS_DISPATCH["phase"][1])
-
-    bam2fq: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["bam2fq"][0], _SAMTOOLS_DISPATCH["bam2fq"][1])
-
-    dict: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["dict"][0], _SAMTOOLS_DISPATCH["dict"][1])
-
-    addreplacerg: Final[PysamDispatcher] = _wrap_command(
-        _SAMTOOLS_DISPATCH["addreplacerg"][0],
-        _SAMTOOLS_DISPATCH["addreplacerg"][1],
-    )
-
-    pad2unpad: Final[PysamDispatcher] = _wrap_command(
-        _SAMTOOLS_DISPATCH["pad2unpad"][0],
-        _SAMTOOLS_DISPATCH["pad2unpad"][1],
-    )
-
-    depad: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["depad"][0], _SAMTOOLS_DISPATCH["depad"][1])
-
-    bedcov: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["bedcov"][0], _SAMTOOLS_DISPATCH["bedcov"][1])
-
-    coverage: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["coverage"][0], _SAMTOOLS_DISPATCH["coverage"][1])
-
-    bamshuf: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["bamshuf"][0], _SAMTOOLS_DISPATCH["bamshuf"][1])
-
-    collate: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["collate"][0], _SAMTOOLS_DISPATCH["collate"][1])
-
-    stats: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["stats"][0], _SAMTOOLS_DISPATCH["stats"][1])
-
-    fasta: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["fasta"][0], _SAMTOOLS_DISPATCH["fasta"][1])
-
-    fastq: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["fastq"][0], _SAMTOOLS_DISPATCH["fastq"][1])
-
-    cram_size: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["cram_size"][0], _SAMTOOLS_DISPATCH["cram_size"][1])
-
-    quickcheck: Final[PysamDispatcher] = _wrap_command(
-        _SAMTOOLS_DISPATCH["quickcheck"][0],
-        _SAMTOOLS_DISPATCH["quickcheck"][1],
-    )
-
-    split: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["split"][0], _SAMTOOLS_DISPATCH["split"][1])
-
-    flags: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["flags"][0], _SAMTOOLS_DISPATCH["flags"][1])
-
-    ampliconclip: Final[PysamDispatcher] = _wrap_command(
-        _SAMTOOLS_DISPATCH["ampliconclip"][0],
-        _SAMTOOLS_DISPATCH["ampliconclip"][1],
-    )
-
-    ampliconstats: Final[PysamDispatcher] = _wrap_command(
-        _SAMTOOLS_DISPATCH["ampliconstats"][0],
-        _SAMTOOLS_DISPATCH["ampliconstats"][1],
-    )
-
-    version: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["version"][0], _SAMTOOLS_DISPATCH["version"][1])
-
-    fqimport: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["fqimport"][0], _SAMTOOLS_DISPATCH["fqimport"][1])
-
-    import_: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["import_"][0], _SAMTOOLS_DISPATCH["import_"][1])
-
-    samples: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["samples"][0], _SAMTOOLS_DISPATCH["samples"][1])
+import pysam.utils
+
+addreplacerg = pysam.utils.PysamDispatcher('samtools', 'addreplacerg')
+ampliconclip = pysam.utils.PysamDispatcher('samtools', 'ampliconclip')
+ampliconstats = pysam.utils.PysamDispatcher('samtools', 'ampliconstats')
+bam2fq = pysam.utils.PysamDispatcher('samtools', 'bam2fq')
+bamshuf = pysam.utils.PysamDispatcher('samtools', 'bamshuf')
+bedcov = pysam.utils.PysamDispatcher('samtools', 'bedcov')
+calmd = pysam.utils.PysamDispatcher('samtools', 'calmd')
+cat = pysam.utils.PysamDispatcher('samtools', 'cat')
+collate = pysam.utils.PysamDispatcher('samtools', 'collate')
+consensus = pysam.utils.PysamDispatcher('samtools', 'consensus')
+coverage = pysam.utils.PysamDispatcher('samtools', 'coverage')
+cram_size = pysam.utils.PysamDispatcher('samtools', 'cram-size')
+depad = pysam.utils.PysamDispatcher('samtools', 'depad')
+depth = pysam.utils.PysamDispatcher('samtools', 'depth')
+dict = pysam.utils.PysamDispatcher('samtools', 'dict')
+faidx = pysam.utils.PysamDispatcher('samtools', 'faidx')
+fasta = pysam.utils.PysamDispatcher('samtools', 'fasta')
+fastq = pysam.utils.PysamDispatcher('samtools', 'fastq')
+fixmate = pysam.utils.PysamDispatcher('samtools', 'fixmate')
+flags = pysam.utils.PysamDispatcher('samtools', 'flags')
+flagstat = pysam.utils.PysamDispatcher('samtools', 'flagstat')
+fqidx = pysam.utils.PysamDispatcher('samtools', 'fqidx')
+fqimport = pysam.utils.PysamDispatcher('samtools', 'import')
+head = pysam.utils.PysamDispatcher('samtools', 'head')
+idxstats = pysam.utils.PysamDispatcher('samtools', 'idxstats')
+index = pysam.utils.PysamDispatcher('samtools', 'index')
+markdup = pysam.utils.PysamDispatcher('samtools', 'markdup')
+merge = pysam.utils.PysamDispatcher('samtools', 'merge')
+mpileup = pysam.utils.PysamDispatcher('samtools', 'mpileup')
+pad2unpad = pysam.utils.PysamDispatcher('samtools', 'pad2unpad')
+phase = pysam.utils.PysamDispatcher('samtools', 'phase')
+quickcheck = pysam.utils.PysamDispatcher('samtools', 'quickcheck')
+reference = pysam.utils.PysamDispatcher('samtools', 'reference')
+reheader = pysam.utils.PysamDispatcher('samtools', 'reheader')
+reset = pysam.utils.PysamDispatcher('samtools', 'reset')
+rmdup = pysam.utils.PysamDispatcher('samtools', 'rmdup')
+samples = pysam.utils.PysamDispatcher('samtools', 'samples')
+sort = pysam.utils.PysamDispatcher('samtools', 'sort')
+split = pysam.utils.PysamDispatcher('samtools', 'split')
+stats = pysam.utils.PysamDispatcher('samtools', 'stats')
+targetcut = pysam.utils.PysamDispatcher('samtools', 'targetcut')
+tview = pysam.utils.PysamDispatcher('samtools', 'tview')
+version = pysam.utils.PysamDispatcher('samtools', 'version')
+view = pysam.utils.PysamDispatcher('samtools', 'view')
+
+__all__ = [
+    'addreplacerg', 'ampliconclip', 'ampliconstats',
+    'bam2fq', 'bamshuf', 'bedcov', 'calmd', 'cat',
+    'collate', 'consensus', 'coverage', 'cram_size',
+    'depad', 'depth', 'dict', 'faidx', 'fasta',
+    'fastq', 'fixmate', 'flags', 'flagstat', 'fqidx',
+    'fqimport', 'head', 'idxstats', 'index',
+    'markdup', 'merge', 'mpileup', 'pad2unpad',
+    'phase', 'quickcheck', 'reference', 'reheader',
+    'reset', 'rmdup', 'samples', 'sort', 'split',
+    'stats', 'targetcut', 'tview', 'version', 'view',
+]
diff --git a/pysam/utils.py b/pysam/utils.py
index d15f43171..dab58ff61 100644
--- a/pysam/utils.py
+++ b/pysam/utils.py
@@ -1,6 +1,7 @@
 from typing import (
     Callable,
     List,
+    Optional,
     Tuple,
     Iterable,
     Union,
@@ -48,7 +49,7 @@ def __init__(
         self,
         collection: str,
         dispatch: str,
-        parsers: Iterable[Tuple[str, Callable[[Union[str, List[str]]], Union[str, List[str]]]]],
+        parsers: Optional[Iterable[Tuple[str, Callable[[Union[str, List[str]]], Union[str, List[str]]]]]] = None,
     ):
         self.collection = collection
         self.dispatch = dispatch
diff --git a/pysam/version.h b/pysam/version.h
index 645557ba7..1fb0cff53 100644
--- a/pysam/version.h
+++ b/pysam/version.h
@@ -1,5 +1,5 @@
 // Version information used while compiling samtools, bcftools, and htslib
 
-#define SAMTOOLS_VERSION "1.18 (pysam)"
-#define BCFTOOLS_VERSION "1.18 (pysam)"
-#define HTS_VERSION_TEXT "1.18 (pysam)"
+#define SAMTOOLS_VERSION "1.21 (pysam)"
+#define BCFTOOLS_VERSION "1.21 (pysam)"
+#define HTS_VERSION_TEXT "1.21 (pysam)"
diff --git a/pysam/version.py b/pysam/version.py
index 8625167b4..0e26758fc 100644
--- a/pysam/version.py
+++ b/pysam/version.py
@@ -1,6 +1,6 @@
 # pysam versioning information
-__version__ = "0.22.1"
+__version__ = "0.23.3"
 
-__samtools_version__ = "1.18"
-__bcftools_version__ = "1.18"
-__htslib_version__ = "1.18"
+__samtools_version__ = "1.21"
+__bcftools_version__ = "1.21"
+__htslib_version__ = "1.21"
diff --git a/samtools/LICENSE b/samtools/LICENSE
index e72eb6360..f096c2de0 100644
--- a/samtools/LICENSE
+++ b/samtools/LICENSE
@@ -1,6 +1,6 @@
 The MIT/Expat License
 
-Copyright (C) 2008-2023 Genome Research Ltd.
+Copyright (C) 2008-2024 Genome Research Ltd.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/samtools/README b/samtools/README
index 8f4f2369a..d681d281e 100644
--- a/samtools/README
+++ b/samtools/README
@@ -9,7 +9,7 @@ Building samtools
 The typical simple case of building Samtools using the HTSlib bundled within
 this Samtools release tarball is done as follows:
 
-    cd .../samtools-1.18 # Within the unpacked release directory
+    cd .../samtools-1.21 # Within the unpacked release directory
     ./configure
     make
 
@@ -21,7 +21,7 @@ install samtools etc properly into a directory of your choosing.  Building for
 installation using the HTSlib bundled within this Samtools release tarball,
 and building the various HTSlib utilities such as bgzip is done as follows:
 
-    cd .../samtools-1.18 # Within the unpacked release directory
+    cd .../samtools-1.21 # Within the unpacked release directory
     ./configure --prefix=/path/to/location
     make all all-htslib
     make install install-htslib
@@ -48,7 +48,7 @@ There are two advantages to this:
 To build with plug-ins, you need to use the --enable-plugins configure option
 as follows:
 
-    cd .../samtools-1.18 # Within the unpacked release directory
+    cd .../samtools-1.21 # Within the unpacked release directory
     ./configure --enable-plugins --prefix=/path/to/location
     make all all-htslib
     make install install-htslib
@@ -66,8 +66,8 @@ Setting --with-plugin-path is useful if you want to run directly from
 the source distribution instead of installing the package.  In that case
 you can use:
 
-    cd .../samtools-1.18 # Within the unpacked release directory
-    ./configure --enable-plugins --with-plugin-path=$PWD/htslib-1.18
+    cd .../samtools-1.21 # Within the unpacked release directory
+    ./configure --enable-plugins --with-plugin-path=$PWD/htslib-1.21
     make all all-htslib
 
 It is possible to override the built-in search path using the HTS_PATH
diff --git a/samtools/amplicon_stats.c b/samtools/amplicon_stats.c
index 3842fb319..5dff050c3 100644
--- a/samtools/amplicon_stats.c
+++ b/samtools/amplicon_stats.c
@@ -1,6 +1,6 @@
 /*  stats.c -- This is the former bamcheck integrated into samtools/htslib.
 
-    Copyright (C) 2020-2021 Genome Research Ltd.
+    Copyright (C) 2020-2021, 2024 Genome Research Ltd.
 
     Author: James Bonfield <jkb@sanger.ac.uk>
 
@@ -1652,15 +1652,34 @@ int main_ampliconstats(int argc, char **argv) {
         {"single-ref", no_argument, NULL, 'S'},
         {NULL, 0, NULL, 0}
     };
-    int opt;
+    int opt, tmp_flag;
 
     while ( (opt=getopt_long(argc,argv,"?hf:F:@:p:m:d:sa:l:t:o:c:b:D:S",loptions,NULL))>0 ) {
         switch (opt) {
-        case 'f': args.flag_require = bam_str2flag(optarg); break;
+        case 'f':
+            tmp_flag = bam_str2flag(optarg);
+
+            if (tmp_flag < 0) {
+                print_error("ampliconstats", "Unknown flag '%s'\n", optarg);
+                return 1;
+            }
+
+            args.flag_require = tmp_flag;
+            break;
+
         case 'F':
+            tmp_flag = bam_str2flag(optarg);
+
             if (args.flag_filter & 0x10000)
                 args.flag_filter = 0; // strip default on first -F usage
-            args.flag_filter |= bam_str2flag(optarg); break;
+
+            if (tmp_flag < 0) {
+                print_error("ampliconstats", "Unknown flag '%s'\n", optarg);
+                return 1;
+            }
+
+            args.flag_filter |= tmp_flag;
+            break;
 
         case 'm': args.max_delta = atoi(optarg); break; // margin
         case 'D': args.depth_bin = atof(optarg); break; // depth bin fraction
@@ -1718,7 +1737,7 @@ int main_ampliconstats(int argc, char **argv) {
         return usage(&oargs, stderr, EXIT_FAILURE);
 
     khash_t(bed_list_hash) *bed_hash = kh_init(bed_list_hash);
-    if (load_bed_file_multi_ref(argv[optind], 1, 0, bed_hash)) {
+    if (load_bed_file_multi_ref(argv[optind], 1, 0, bed_hash, NULL, NULL)) {
         print_error_errno("ampliconstats",
                           "Could not read file \"%s\"", argv[optind]);
         return 1;
diff --git a/samtools/amplicon_stats.c.pysam.c b/samtools/amplicon_stats.c.pysam.c
index b71ac4adf..99e804b57 100644
--- a/samtools/amplicon_stats.c.pysam.c
+++ b/samtools/amplicon_stats.c.pysam.c
@@ -2,7 +2,7 @@
 
 /*  stats.c -- This is the former bamcheck integrated into samtools/htslib.
 
-    Copyright (C) 2020-2021 Genome Research Ltd.
+    Copyright (C) 2020-2021, 2024 Genome Research Ltd.
 
     Author: James Bonfield <jkb@sanger.ac.uk>
 
@@ -1654,15 +1654,34 @@ int main_ampliconstats(int argc, char **argv) {
         {"single-ref", no_argument, NULL, 'S'},
         {NULL, 0, NULL, 0}
     };
-    int opt;
+    int opt, tmp_flag;
 
     while ( (opt=getopt_long(argc,argv,"?hf:F:@:p:m:d:sa:l:t:o:c:b:D:S",loptions,NULL))>0 ) {
         switch (opt) {
-        case 'f': args.flag_require = bam_str2flag(optarg); break;
+        case 'f':
+            tmp_flag = bam_str2flag(optarg);
+
+            if (tmp_flag < 0) {
+                print_error("ampliconstats", "Unknown flag '%s'\n", optarg);
+                return 1;
+            }
+
+            args.flag_require = tmp_flag;
+            break;
+
         case 'F':
+            tmp_flag = bam_str2flag(optarg);
+
             if (args.flag_filter & 0x10000)
                 args.flag_filter = 0; // strip default on first -F usage
-            args.flag_filter |= bam_str2flag(optarg); break;
+
+            if (tmp_flag < 0) {
+                print_error("ampliconstats", "Unknown flag '%s'\n", optarg);
+                return 1;
+            }
+
+            args.flag_filter |= tmp_flag;
+            break;
 
         case 'm': args.max_delta = atoi(optarg); break; // margin
         case 'D': args.depth_bin = atof(optarg); break; // depth bin fraction
@@ -1720,7 +1739,7 @@ int main_ampliconstats(int argc, char **argv) {
         return usage(&oargs, samtools_stderr, EXIT_FAILURE);
 
     khash_t(bed_list_hash) *bed_hash = kh_init(bed_list_hash);
-    if (load_bed_file_multi_ref(argv[optind], 1, 0, bed_hash)) {
+    if (load_bed_file_multi_ref(argv[optind], 1, 0, bed_hash, NULL, NULL)) {
         print_error_errno("ampliconstats",
                           "Could not read file \"%s\"", argv[optind]);
         return 1;
diff --git a/samtools/bam2depth.c b/samtools/bam2depth.c
index 9266b6199..c3e4f04d4 100644
--- a/samtools/bam2depth.c
+++ b/samtools/bam2depth.c
@@ -1,7 +1,7 @@
 /*  bam2depth.c -- depth subcommand.
 
     Copyright (C) 2011, 2012 Broad Institute.
-    Copyright (C) 2012-2016, 2018, 2019-2022 Genome Research Ltd.
+    Copyright (C) 2012-2016, 2018, 2019-2022, 2024 Genome Research Ltd.
 
     Author: Heng Li <lh3@sanger.ac.uk> (to 2020)
     Author: James Bonfield <jkb@sanger.ac.uk> (2021 rewrite)
@@ -158,6 +158,42 @@ hts_pos_t qlen_used(bam1_t *b) {
 
 }
 
+// Without HTS_OPT3 gcc11 and earlier don't vectorize this, nor even
+// unroll it.  By adding HTS_OPT3 we can force a better level of optimization.
+// On an Illumina BAM with gcc11 -O2, with HTS_OPT3 is 9% quicker for the
+// entire process (ie decompress, iterator, aggregate & report)
+static inline void HTS_OPT3 incr_hist(int *hist, int oplen) {
+    const int N = 16;
+    int k;
+    for (k = 0; k < (oplen & ~(N-1)); k+=N) {
+        for (int i = 0; i < N; i++)
+            hist[k+i]++;
+    }
+    for (; k < oplen; k++)
+        hist[k]++;
+}
+
+static inline void HTS_OPT3
+incr_hist_qual(int *hist, uint8_t *qual, int min_qual, int oplen) {
+    if (!min_qual) {
+        incr_hist(hist, oplen);
+        return;
+    }
+
+    int k;
+    for (k = 0; k < (oplen & ~31); k+=32) {
+        int pass[32];
+        // Two separate loops helps clang to vectorize
+        for (int i = 0; i < 32; i++)
+            pass[i]=qual[k+i]>=min_qual;
+
+        for (int i = 0; i < 32; i++)
+            hist[k+i]+=pass[i];
+    }
+    for (; k < oplen; k++)
+        hist[k]+=qual[k]>=min_qual;
+}
+
 // Adds the depth for a single read to a depth_hist struct.
 // For just one file, this is easy.  We just have a circular buffer
 // where we increment values for bits that overlap existing data
@@ -379,7 +415,6 @@ static int add_depth(depth_opt *opt, depth_hist *dh, sam_hdr_t *h, bam1_t *b,
                 // We've explicitly asked to include them, and the quality
                 // is wrong anyway (it's the neighbouring base).  We do this
                 // for now for compatibility with the old depth command.
-
                 if (spos < b->core.l_qseq)
                     for (; k < oplen; k++, i++)
                         hist[i & hmask]+=qual[spos]>=min_qual;
@@ -391,80 +426,30 @@ static int add_depth(depth_opt *opt, depth_hist *dh, sam_hdr_t *h, bam1_t *b,
 
         case BAM_CMATCH:
         case BAM_CEQUAL:
-        case BAM_CDIFF:
-            if ((i & hmask) < ((i+oplen) & hmask)) {
-                // Optimisation when not wrapping around
-
-                // Unrolling doesn't help clang, but helps gcc,
-                // especially when not using -O3.
-                int *hist = &dh->hist[file][i & hmask];
-                if (min_qual || overlap_clip) {
-                    k = 0;
-                    if (overlap_clip) {
-                        if (i+oplen < overlap_clip) {
-                            i += oplen;
-                            spos += oplen;
-                            break;
-                        } else if (i < overlap_clip) {
-                            oplen -= overlap_clip - i;
-                            spos += overlap_clip - i;
-                            hist += overlap_clip - i;
-                            i = overlap_clip;
-                        }
-                    }
-
-                    // approx 50% of this func cpu time in this loop
-                    for (; k < (oplen & ~7); k+=8) {
-                        hist[k+0]+=qual[spos+0]>=min_qual;
-                        hist[k+1]+=qual[spos+1]>=min_qual;
-                        hist[k+2]+=qual[spos+2]>=min_qual;
-                        hist[k+3]+=qual[spos+3]>=min_qual;
-                        hist[k+4]+=qual[spos+4]>=min_qual;
-                        hist[k+5]+=qual[spos+5]>=min_qual;
-                        hist[k+6]+=qual[spos+6]>=min_qual;
-                        hist[k+7]+=qual[spos+7]>=min_qual;
-                        spos += 8;
-                    }
-                } else {
-                    // easier to vectorize when no min_qual
-                    for (k = 0; k < (oplen & ~7); k+=8) {
-                        hist[k+0]++;
-                        hist[k+1]++;
-                        hist[k+2]++;
-                        hist[k+3]++;
-                        hist[k+4]++;
-                        hist[k+5]++;
-                        hist[k+6]++;
-                        hist[k+7]++;
-                    }
-                    spos += k;
-                }
-                for (; k < oplen && spos < b->core.l_qseq; k++, spos++)
-                    hist[k]+=qual[spos]>=min_qual;
-                for (; k < oplen; k++, spos++)
-                    hist[k]++;
-                i += oplen;
-            } else {
-                // Simple to understand case, but slower.
-                // We use this only for reads with wrap-around.
-                int *hist = dh->hist[file];
-                k = 0;
-                if (overlap_clip) {
-                    if (i+oplen < overlap_clip) {
-                        i += oplen;
-                        break;
-                    } else if (i < overlap_clip) {
-                        oplen -= overlap_clip - i;
-                        spos += overlap_clip - i;
-                        i = overlap_clip;
-                    }
+        case BAM_CDIFF: {
+            int *hist = dh->hist[file];
+            k = 0;
+            if (overlap_clip) {
+                if (i+oplen < overlap_clip) {
+                    i += oplen;
+                    break;
+                } else if (i < overlap_clip) {
+                    oplen -= overlap_clip - i;
+                    spos += overlap_clip - i;
+                    i = overlap_clip;
                 }
-                for (; k < oplen && spos < b->core.l_qseq; k++, i++, spos++)
-                    hist[i & hmask]+=qual[spos]>=min_qual;
-                for (; k < oplen; k++, i++, spos++)
-                    hist[i & hmask]++;
             }
+
+            int len = (i & hmask) < ((i+oplen) & hmask)
+                ? oplen                  // doesn't wrap around
+                : dh->size - (i&hmask);  // does wrap around
+            incr_hist_qual(&hist[i & hmask], &qual[spos], min_qual, len);
+            if (oplen > len)
+                incr_hist_qual(hist, &qual[spos+len], min_qual, oplen-len);
+            spos += oplen;
+            i += oplen;
             break;
+        }
 
         case BAM_CINS:
         case BAM_CSOFT_CLIP:
@@ -745,7 +730,7 @@ static void usage_exit(FILE *fp, int exit_status)
 
 int main_depth(int argc, char *argv[])
 {
-    int nfiles, i;
+    int nfiles, i, tmp_flag;
     samFile **fp;
     sam_hdr_t **header;
     int c, has_index_file = 0;
@@ -806,16 +791,44 @@ int main_depth(int argc, char *argv[])
             break;
 
         case 'g':
-            opt.flag &= ~bam_str2flag(optarg);
+            tmp_flag = bam_str2flag(optarg);
+
+            if (tmp_flag < 0) {
+                print_error("depth", "Unknown flag '%s'", optarg);
+                return 1;
+            }
+
+            opt.flag &= ~tmp_flag;
             break;
         case 'G': // reject if any set
-            opt.flag |= bam_str2flag(optarg);
+            tmp_flag = bam_str2flag(optarg);
+
+            if (tmp_flag < 0) {
+                print_error("depth", "Unknown flag '%s'", optarg);
+                return 1;
+            }
+
+            opt.flag |= tmp_flag;
             break;
         case 1: // reject unless at least one set (0 means ignore option)
-            opt.incl_flag |= bam_str2flag(optarg);
+            tmp_flag = bam_str2flag(optarg);
+
+            if (tmp_flag < 0) {
+                print_error("depth", "Unknown flag '%s'", optarg);
+                return 1;
+            }
+
+            opt.incl_flag |= tmp_flag;
             break;
         case 2: // reject unless all set
-            opt.require_flag |= bam_str2flag(optarg);
+            tmp_flag = bam_str2flag(optarg);
+
+            if (tmp_flag < 0) {
+                print_error("depth", "Unknown flag '%s'", optarg);
+                return 1;
+            }
+
+            opt.require_flag |= tmp_flag;
             break;
 
         case 'l':
diff --git a/samtools/bam2depth.c.pysam.c b/samtools/bam2depth.c.pysam.c
index abe6141be..1a1176c74 100644
--- a/samtools/bam2depth.c.pysam.c
+++ b/samtools/bam2depth.c.pysam.c
@@ -3,7 +3,7 @@
 /*  bam2depth.c -- depth subcommand.
 
     Copyright (C) 2011, 2012 Broad Institute.
-    Copyright (C) 2012-2016, 2018, 2019-2022 Genome Research Ltd.
+    Copyright (C) 2012-2016, 2018, 2019-2022, 2024 Genome Research Ltd.
 
     Author: Heng Li <lh3@sanger.ac.uk> (to 2020)
     Author: James Bonfield <jkb@sanger.ac.uk> (2021 rewrite)
@@ -160,6 +160,42 @@ hts_pos_t qlen_used(bam1_t *b) {
 
 }
 
+// Without HTS_OPT3 gcc11 and earlier don't vectorize this, nor even
+// unroll it.  By adding HTS_OPT3 we can force a better level of optimization.
+// On an Illumina BAM with gcc11 -O2, with HTS_OPT3 is 9% quicker for the
+// entire process (ie decompress, iterator, aggregate & report)
+static inline void HTS_OPT3 incr_hist(int *hist, int oplen) {
+    const int N = 16;
+    int k;
+    for (k = 0; k < (oplen & ~(N-1)); k+=N) {
+        for (int i = 0; i < N; i++)
+            hist[k+i]++;
+    }
+    for (; k < oplen; k++)
+        hist[k]++;
+}
+
+static inline void HTS_OPT3
+incr_hist_qual(int *hist, uint8_t *qual, int min_qual, int oplen) {
+    if (!min_qual) {
+        incr_hist(hist, oplen);
+        return;
+    }
+
+    int k;
+    for (k = 0; k < (oplen & ~31); k+=32) {
+        int pass[32];
+        // Two separate loops helps clang to vectorize
+        for (int i = 0; i < 32; i++)
+            pass[i]=qual[k+i]>=min_qual;
+
+        for (int i = 0; i < 32; i++)
+            hist[k+i]+=pass[i];
+    }
+    for (; k < oplen; k++)
+        hist[k]+=qual[k]>=min_qual;
+}
+
 // Adds the depth for a single read to a depth_hist struct.
 // For just one file, this is easy.  We just have a circular buffer
 // where we increment values for bits that overlap existing data
@@ -381,7 +417,6 @@ static int add_depth(depth_opt *opt, depth_hist *dh, sam_hdr_t *h, bam1_t *b,
                 // We've explicitly asked to include them, and the quality
                 // is wrong anyway (it's the neighbouring base).  We do this
                 // for now for compatibility with the old depth command.
-
                 if (spos < b->core.l_qseq)
                     for (; k < oplen; k++, i++)
                         hist[i & hmask]+=qual[spos]>=min_qual;
@@ -393,80 +428,30 @@ static int add_depth(depth_opt *opt, depth_hist *dh, sam_hdr_t *h, bam1_t *b,
 
         case BAM_CMATCH:
         case BAM_CEQUAL:
-        case BAM_CDIFF:
-            if ((i & hmask) < ((i+oplen) & hmask)) {
-                // Optimisation when not wrapping around
-
-                // Unrolling doesn't help clang, but helps gcc,
-                // especially when not using -O3.
-                int *hist = &dh->hist[file][i & hmask];
-                if (min_qual || overlap_clip) {
-                    k = 0;
-                    if (overlap_clip) {
-                        if (i+oplen < overlap_clip) {
-                            i += oplen;
-                            spos += oplen;
-                            break;
-                        } else if (i < overlap_clip) {
-                            oplen -= overlap_clip - i;
-                            spos += overlap_clip - i;
-                            hist += overlap_clip - i;
-                            i = overlap_clip;
-                        }
-                    }
-
-                    // approx 50% of this func cpu time in this loop
-                    for (; k < (oplen & ~7); k+=8) {
-                        hist[k+0]+=qual[spos+0]>=min_qual;
-                        hist[k+1]+=qual[spos+1]>=min_qual;
-                        hist[k+2]+=qual[spos+2]>=min_qual;
-                        hist[k+3]+=qual[spos+3]>=min_qual;
-                        hist[k+4]+=qual[spos+4]>=min_qual;
-                        hist[k+5]+=qual[spos+5]>=min_qual;
-                        hist[k+6]+=qual[spos+6]>=min_qual;
-                        hist[k+7]+=qual[spos+7]>=min_qual;
-                        spos += 8;
-                    }
-                } else {
-                    // easier to vectorize when no min_qual
-                    for (k = 0; k < (oplen & ~7); k+=8) {
-                        hist[k+0]++;
-                        hist[k+1]++;
-                        hist[k+2]++;
-                        hist[k+3]++;
-                        hist[k+4]++;
-                        hist[k+5]++;
-                        hist[k+6]++;
-                        hist[k+7]++;
-                    }
-                    spos += k;
-                }
-                for (; k < oplen && spos < b->core.l_qseq; k++, spos++)
-                    hist[k]+=qual[spos]>=min_qual;
-                for (; k < oplen; k++, spos++)
-                    hist[k]++;
-                i += oplen;
-            } else {
-                // Simple to understand case, but slower.
-                // We use this only for reads with wrap-around.
-                int *hist = dh->hist[file];
-                k = 0;
-                if (overlap_clip) {
-                    if (i+oplen < overlap_clip) {
-                        i += oplen;
-                        break;
-                    } else if (i < overlap_clip) {
-                        oplen -= overlap_clip - i;
-                        spos += overlap_clip - i;
-                        i = overlap_clip;
-                    }
+        case BAM_CDIFF: {
+            int *hist = dh->hist[file];
+            k = 0;
+            if (overlap_clip) {
+                if (i+oplen < overlap_clip) {
+                    i += oplen;
+                    break;
+                } else if (i < overlap_clip) {
+                    oplen -= overlap_clip - i;
+                    spos += overlap_clip - i;
+                    i = overlap_clip;
                 }
-                for (; k < oplen && spos < b->core.l_qseq; k++, i++, spos++)
-                    hist[i & hmask]+=qual[spos]>=min_qual;
-                for (; k < oplen; k++, i++, spos++)
-                    hist[i & hmask]++;
             }
+
+            int len = (i & hmask) < ((i+oplen) & hmask)
+                ? oplen                  // doesn't wrap around
+                : dh->size - (i&hmask);  // does wrap around
+            incr_hist_qual(&hist[i & hmask], &qual[spos], min_qual, len);
+            if (oplen > len)
+                incr_hist_qual(hist, &qual[spos+len], min_qual, oplen-len);
+            spos += oplen;
+            i += oplen;
             break;
+        }
 
         case BAM_CINS:
         case BAM_CSOFT_CLIP:
@@ -747,7 +732,7 @@ static void usage_exit(FILE *fp, int exit_status)
 
 int main_depth(int argc, char *argv[])
 {
-    int nfiles, i;
+    int nfiles, i, tmp_flag;
     samFile **fp;
     sam_hdr_t **header;
     int c, has_index_file = 0;
@@ -808,16 +793,44 @@ int main_depth(int argc, char *argv[])
             break;
 
         case 'g':
-            opt.flag &= ~bam_str2flag(optarg);
+            tmp_flag = bam_str2flag(optarg);
+
+            if (tmp_flag < 0) {
+                print_error("depth", "Unknown flag '%s'", optarg);
+                return 1;
+            }
+
+            opt.flag &= ~tmp_flag;
             break;
         case 'G': // reject if any set
-            opt.flag |= bam_str2flag(optarg);
+            tmp_flag = bam_str2flag(optarg);
+
+            if (tmp_flag < 0) {
+                print_error("depth", "Unknown flag '%s'", optarg);
+                return 1;
+            }
+
+            opt.flag |= tmp_flag;
             break;
         case 1: // reject unless at least one set (0 means ignore option)
-            opt.incl_flag |= bam_str2flag(optarg);
+            tmp_flag = bam_str2flag(optarg);
+
+            if (tmp_flag < 0) {
+                print_error("depth", "Unknown flag '%s'", optarg);
+                return 1;
+            }
+
+            opt.incl_flag |= tmp_flag;
             break;
         case 2: // reject unless all set
-            opt.require_flag |= bam_str2flag(optarg);
+            tmp_flag = bam_str2flag(optarg);
+
+            if (tmp_flag < 0) {
+                print_error("depth", "Unknown flag '%s'", optarg);
+                return 1;
+            }
+
+            opt.require_flag |= tmp_flag;
             break;
 
         case 'l':
diff --git a/samtools/bam_ampliconclip.c b/samtools/bam_ampliconclip.c
index 72f39bd5d..fe911c81e 100644
--- a/samtools/bam_ampliconclip.c
+++ b/samtools/bam_ampliconclip.c
@@ -1,7 +1,7 @@
 /*  bam_ampliconclip.c -- loads amplicon primers from a BED file and cuts reads
                           from the 5' end.
 
-    Copyright (C) 2020-2023 Genome Research Ltd.
+    Copyright (C) 2020-2024 Genome Research Ltd.
 
     Authors: Andrew Whitwham <aw7@sanger.ac.uk>
              Rob Davies <rmd+git@sanger.ac.uk>
@@ -62,6 +62,7 @@ typedef struct {
     int unmap_len;
     char *arg_list;
     char *stats_file;
+    char *primer_counts_file;
     char *rejects_file;
 } cl_param_t;
 
@@ -72,49 +73,63 @@ static int bed_entry_sort(const void *av, const void *bv) {
     return a->right < b->right ? -1 : (a->right == b->right ? 0 : 1);
 }
 
-
-int load_bed_file_multi_ref(char *infile, int get_strand, int sort_by_pos, khash_t(bed_list_hash) *bed_lists) {
+int load_bed_file_multi_ref(char *infile, int get_strand, int sort_by_pos, khash_t(bed_list_hash) *bed_lists, char ***ref_list, size_t *num_refs_out) {
     hFILE *fp;
     int line_count = 0, ret;
+
+    //variables to store the bed file data for each record
+    char ref[1024] = "";
     int64_t left, right;
+    char name[1024] = "", score[1024] = "";
+    char strand;
+
+    //hash table to store clipping results and bed file data
     kstring_t line = KS_INITIALIZE;
     bed_entry_list_t *list;
     khiter_t bed_itr;
 
+    //ordered ref names list
+    size_t ref_list_sz = 0;
+    size_t num_refs = 0;
+
+    if (ref_list)
+        *ref_list = NULL;
+
+    if (num_refs_out)
+        *num_refs_out = 0;
+
     if ((fp = hopen(infile, "r")) == NULL) {
         print_error_errno("amplicon", "unable to open file %s.", infile);
         return 1;
     }
 
-    char ref[1024];
 
     while (line.l = 0, kgetline(&line, (kgets_func *)hgets, fp) >= 0) {
         line_count++;
         int hret;
-        char strand;
 
         if (line.l == 0 || *line.s == '#') continue;
         if (strncmp(line.s, "track ", 6) == 0) continue;
         if (strncmp(line.s, "browser ", 8) == 0) continue;
 
-        if (get_strand) {
-            if (sscanf(line.s, "%1023s %"SCNd64" %"SCNd64" %*s %*s %c",
-                       ref, &left, &right, &strand) != 4) {
-                fprintf(stderr, "[amplicon] error: bad bed file format in line %d of %s.\n"
-                                "(N.B. ref/chrom name limited to 1023 characters.)\n",
-                                    line_count, infile);
-                ret = 1;
-                goto error;
-            }
-        } else {
-            if (sscanf(line.s, "%1023s %"SCNd64" %"SCNd64,
-                       ref, &left, &right) != 3) {
-                fprintf(stderr, "[amplicon] error: bad bed file format in line %d of %s\n"
-                                "(N.B. ref/chrom name limited to 1023 characters.)\n",
-                                    line_count, infile);
-                ret = 1;
-                goto error;
-            }
+        // A list of the maximal number of columns we may want to parse.
+        // There may be more, but we don't use the ones beyond these.
+        const char *const scanf_str =
+            "%1023s %"SCNd64" %"SCNd64" %1023s %1023s %c";
+
+        // Extract the data from the line into the variables.
+        // Variables corresponding to any missing columns will remain
+        // uninitialised.  We asked for all columns, but cols_parsed will
+        // return how many we found which can be validated against num_columns.
+        int cols_parsed = sscanf(line.s, scanf_str, ref,
+                                 &left, &right, name, score, &strand);
+        if (cols_parsed < (get_strand ? 6 : 3)) {
+            fprintf(stderr, "[amplicon] error: invalid bed file format in line %d of %s.\n"
+                    "Parsed %d columns, but need at least %d\n"
+                    "(N.B. ref/chrom name limited to 1023 characters.)\n",
+                    line_count, infile, cols_parsed, get_strand ? 6 : 3);
+            ret = 1;
+            goto error;
         }
 
         bed_itr = kh_get(bed_list_hash, bed_lists, ref);
@@ -128,6 +143,14 @@ int load_bed_file_multi_ref(char *infile, int get_strand, int sort_by_pos, khash
                 goto error;
             }
 
+            if (ref_list) {
+                if (hts_resize(char **, num_refs + 1, &ref_list_sz, ref_list, 0) < 0) {
+                    fprintf(stderr, "[amplicon] error: unable to allocate memory for ref name list.\n");
+                    ret = 1;
+                    goto error;
+                }
+                (*ref_list)[num_refs++] = ref_name;
+            }
             bed_itr = kh_put(bed_list_hash, bed_lists, ref_name, &hret);
 
             if (hret > 0) {
@@ -147,6 +170,7 @@ int load_bed_file_multi_ref(char *infile, int get_strand, int sort_by_pos, khash
             list = &kh_val(bed_lists, bed_itr);
         }
 
+        // add the bed entry to the list, growing the list if necessary
         if (list->length == list->size) {
            bed_entry_t *tmp;
 
@@ -163,6 +187,26 @@ int load_bed_file_multi_ref(char *infile, int get_strand, int sort_by_pos, khash
 
         list->bp[list->length].left  = left;
         list->bp[list->length].right = right;
+        list->bp[list->length].name  = NULL;
+        list->bp[list->length].score = NULL;
+        if (cols_parsed >= 4) {
+            list->bp[list->length].name = strdup(name);
+            if (list->bp[list->length].name == NULL) {
+                fprintf(stderr, "[amplicon] error: unable to allocate memory for name in line %d of %s: %s.\n", line_count, infile, line.s);
+                ret = 1;
+                goto error;
+            }
+        }
+        if (cols_parsed >= 5) {
+            list->bp[list->length].score = strdup(score);
+            if (list->bp[list->length].score == NULL) {
+                fprintf(stderr, "[amplicon] error: unable to allocate memory for score in line %d of %s: %s\n", line_count, infile, line.s);
+                ret = 1;
+                goto error;
+            }
+        }
+
+        list->bp[list->length].num_reads = 0;
 
         if (get_strand) {
             if (strand == '+') {
@@ -198,6 +242,9 @@ int load_bed_file_multi_ref(char *infile, int get_strand, int sort_by_pos, khash
         ret = 1;
     }
 
+    if (num_refs_out)
+        *num_refs_out = num_refs;
+
 error:
     ks_free(&line);
 
@@ -214,7 +261,12 @@ void destroy_bed_hash(khash_t(bed_list_hash) *hash) {
 
     for (itr = kh_begin(hash); itr != kh_end(hash); ++itr) {
        if (kh_exist(hash, itr)) {
-           free(kh_val(hash, itr).bp);
+           bed_entry_list_t list = kh_val(hash, itr);
+           for (int i = 0; i < list.length; i++) {
+               free(list.bp[i].name);
+               free(list.bp[i].score);
+           }
+           free(list.bp);
            free((char *)kh_key(hash, itr));
            kh_key(hash, itr) = NULL;
         }
@@ -227,7 +279,7 @@ void destroy_bed_hash(khash_t(bed_list_hash) *hash) {
 static int matching_clip_site(bed_entry_list_t *sites, hts_pos_t pos,
                               int is_rev, int use_strand, int64_t longest,
                               cl_param_t *param) {
-    int i, size;  // may need this to be variable
+    int i, size, used_i;
     int tol = param->tol;
     int l = 0, mid = sites->length / 2, r = sites->length;
     int pos_tol = is_rev ? (pos > tol ? pos - tol : 0) : pos;
@@ -242,6 +294,7 @@ static int matching_clip_site(bed_entry_list_t *sites, hts_pos_t pos,
     }
 
     size = 0;
+    used_i = -1;
 
     for (i = l; i < sites->length; i++) {
         hts_pos_t mod_left, mod_right;
@@ -268,15 +321,19 @@ static int matching_clip_site(bed_entry_list_t *sites, hts_pos_t pos,
             if (is_rev) {
                 if (size < pos - sites->bp[i].left) {
                     size = pos - sites->bp[i].left;
+                    used_i = i;
                 }
             } else {
                 if (size < sites->bp[i].right - pos) {
                     size = sites->bp[i].right - pos;
+                    used_i = i;
                 }
             }
         }
     }
-
+    if (used_i >= 0) {
+        sites->bp[used_i].num_reads++;
+    }
     return size;
 }
 
@@ -411,7 +468,7 @@ static int bam_trim_left(bam1_t *rec, bam1_t *rec_out, uint32_t bases,
     }
 
     // Copy remaining QUAL
-    memmove(new_qual, orig_qual, rec->core.l_qseq - qry_removed);
+    memmove(new_qual, orig_qual + qry_removed, rec->core.l_qseq - qry_removed);
 
     // Set new l_qseq
     rec_out->core.l_qseq -= qry_removed;
@@ -642,9 +699,13 @@ static int bam_clip(samFile *in, samFile *out, samFile *reject, char *bedfile,
     kstring_t seq = KS_INITIALIZE;
     bed_entry_list_t *sites;
     FILE *stats_fp = stderr;
+    FILE *bed_count_summary_fp = stderr;
     khash_t(bed_list_hash) *bed_hash = kh_init(bed_list_hash);
+    char **bed_ref_list = NULL;
+    size_t num_bed_refs = 0;
 
-    if (load_bed_file_multi_ref(bedfile, param->use_strand, 1, bed_hash)) {
+    if (load_bed_file_multi_ref(bedfile, param->use_strand, 1, bed_hash,
+                                &bed_ref_list, &num_bed_refs)) {
         fprintf(stderr, "[ampliconclip] error: unable to load bed file.\n");
         goto fail;
     }
@@ -938,11 +999,43 @@ static int bam_clip(samFile *in, samFile *out, samFile *reject, char *bedfile,
 
     if (file_open) {
         fclose(stats_fp);
+        file_open = 0;
+    }
+
+    if (param->primer_counts_file) {
+        if ((bed_count_summary_fp = fopen(param->primer_counts_file, "w")) == NULL) {
+            fprintf(stderr, "[ampliconclip] warning: cannot write count summary to %s.\n", param->primer_counts_file);
+        } else {
+            file_open = 1;
+        }
+
+        //print out the number of reads for each bed entry, bedgraph format
+        fprintf(bed_count_summary_fp, "#CHR\tLEFT\tRIGHT\tNAME\tSCORE\tSTRAND\tNUM_CLIPPED\n");
+        size_t refidx;
+        for (refidx = 0; refidx < num_bed_refs; refidx++) {
+            khiter_t itr = kh_get(bed_list_hash, bed_hash, bed_ref_list[refidx]);
+            if (itr >= kh_end(bed_hash)) {
+                fprintf(stderr, "[ampliconclip] error: %s has gone missing from the hash table\n", bed_ref_list[refidx]);
+                goto fail;
+            }
+            sites = &kh_val(bed_hash, itr);
+            int i;
+            for (i = 0; i < sites->length; i++) {
+                char* strand_out = param->use_strand ? (sites->bp[i].rev ? "-" : "+") : ".";
+                fprintf(bed_count_summary_fp, "%s\t%"PRId64"\t%"PRId64"\t%s\t%s\t%s\t%"PRId64"\n",
+                        kh_key(bed_hash, itr), sites->bp[i].left, sites->bp[i].right, sites->bp[i].name,
+                        sites->bp[i].score, strand_out, sites->bp[i].num_reads);
+            }
+        }
+        if (file_open) {
+            fclose(bed_count_summary_fp);
+        }
     }
 
     ret = 0;
 
 fail:
+    free(bed_ref_list);
     destroy_bed_hash(bed_hash);
     ks_free(&oat);
     ks_free(&seq);
@@ -956,25 +1049,26 @@ static int bam_clip(samFile *in, samFile *out, samFile *reject, char *bedfile,
 static void usage(void) {
     fprintf(stderr, "Usage: samtools ampliconclip -b BED file <input.bam> -o <output.bam>\n\n");
     fprintf(stderr, "Option: \n");
-    fprintf(stderr, " -b  FILE            BED file of regions (eg amplicon primers) to be removed.\n");
-    fprintf(stderr, " -o  FILE            output file name (default stdout).\n");
-    fprintf(stderr, " -f  FILE            write stats to file name (default stderr)\n");
-    fprintf(stderr, " -u                  Output uncompressed data\n");
-    fprintf(stderr, " --soft-clip         soft clip amplicon primers from reads (default)\n");
-    fprintf(stderr, " --hard-clip         hard clip amplicon primers from reads.\n");
-    fprintf(stderr, " --both-ends         clip on both 5' and 3' ends.\n");
-    fprintf(stderr, " --strand            use strand data from BED file to match read direction.\n");
-    fprintf(stderr, " --clipped           only output clipped reads.\n");
-    fprintf(stderr, " --fail              mark unclipped, mapped reads as QCFAIL.\n");
-    fprintf(stderr, " --filter-len INT    do not output reads INT size or shorter.\n");
-    fprintf(stderr, " --fail-len   INT    mark as QCFAIL reads INT size or shorter.\n");
-    fprintf(stderr, " --unmap-len  INT    unmap reads INT size or shorter, default 0.\n");
-    fprintf(stderr, " --no-excluded       do not write excluded reads (unmapped or QCFAIL).\n");
-    fprintf(stderr, " --rejects-file FILE file to write filtered reads.\n");
-    fprintf(stderr, " --original          for clipped entries add an OA tag with original data.\n");
-    fprintf(stderr, " --keep-tag          for clipped entries keep the old NM and MD tags.\n");
-    fprintf(stderr, " --tolerance         match region within this number of bases, default 5.\n");
-    fprintf(stderr, " --no-PG             do not add an @PG line.\n");
+    fprintf(stderr, " -b  FILE             BED file of regions (eg amplicon primers) to be removed.\n");
+    fprintf(stderr, " -o  FILE             output file name (default: stdout).\n");
+    fprintf(stderr, " -f  FILE             write stats to file name (default: stderr)\n");
+    fprintf(stderr, " -u                   Output uncompressed data\n");
+    fprintf(stderr, " --soft-clip          soft clip amplicon primers from reads (default)\n");
+    fprintf(stderr, " --hard-clip          hard clip amplicon primers from reads.\n");
+    fprintf(stderr, " --both-ends          clip on both 5' and 3' ends.\n");
+    fprintf(stderr, " --strand             use strand data from BED file to match read direction.\n");
+    fprintf(stderr, " --clipped            only output clipped reads.\n");
+    fprintf(stderr, " --fail               mark unclipped, mapped reads as QCFAIL.\n");
+    fprintf(stderr, " --filter-len INT     do not output reads INT size or shorter.\n");
+    fprintf(stderr, " --fail-len   INT     mark as QCFAIL reads INT size or shorter.\n");
+    fprintf(stderr, " --unmap-len  INT     unmap reads INT size or shorter, default 0.\n");
+    fprintf(stderr, " --no-excluded        do not write excluded reads (unmapped or QCFAIL).\n");
+    fprintf(stderr, " --rejects-file FILE  file to write filtered reads.\n");
+    fprintf(stderr, " --primer-counts FILE file to write read counts per bed entry (bedgraph format).\n");
+    fprintf(stderr, " --original           for clipped entries add an OA tag with original data.\n");
+    fprintf(stderr, " --keep-tag           for clipped entries keep the old NM and MD tags.\n");
+    fprintf(stderr, " --tolerance          match region within this number of bases, default 5.\n");
+    fprintf(stderr, " --no-PG              do not add an @PG line.\n");
     sam_global_opt_help(stderr, "-.O..@-.");
     fprintf(stderr, "\nAbout: Soft clips read alignments where they match BED file defined regions.\n"
                     "Default clipping is only on the 5' end.\n\n");
@@ -1004,10 +1098,11 @@ int amplicon_clip_main(int argc, char **argv) {
         {"fail-len", required_argument, NULL, 1010},
         {"no-excluded", no_argument, NULL, 1011},
         {"rejects-file", required_argument, NULL, 1012},
-        {"original", no_argument, NULL, 1013},
-        {"keep-tag", no_argument, NULL, 1014},
-        {"tolerance", required_argument, NULL, 1015},
-        {"unmap-len", required_argument, NULL, 1016},
+        {"primer-counts", required_argument, NULL, 1013},
+        {"original", no_argument, NULL, 1014},
+        {"keep-tag", no_argument, NULL, 1015},
+        {"tolerance", required_argument, NULL, 1016},
+        {"unmap-len", required_argument, NULL, 1017},
         {NULL, 0, NULL, 0}
     };
 
@@ -1028,10 +1123,11 @@ int amplicon_clip_main(int argc, char **argv) {
             case 1010: param.fail_len = atoi(optarg); break;
             case 1011: param.unmapped = 1; break;
             case 1012: param.rejects_file = optarg; break;
-            case 1013: param.oa_tag = 1; break;
-            case 1014: param.del_tag = 0; break;
-            case 1015: param.tol = atoi(optarg); break;
-            case 1016: param.unmap_len = atoi(optarg); break;
+            case 1013: param.primer_counts_file = optarg; break;
+            case 1014: param.oa_tag = 1; break;
+            case 1015: param.del_tag = 0; break;
+            case 1016: param.tol = atoi(optarg); break;
+            case 1017: param.unmap_len = atoi(optarg); break;
             default:  if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
                       /* else fall-through */
             case '?': usage(); exit(1);
diff --git a/samtools/bam_ampliconclip.c.pysam.c b/samtools/bam_ampliconclip.c.pysam.c
index 0c368508e..0a86fdb31 100644
--- a/samtools/bam_ampliconclip.c.pysam.c
+++ b/samtools/bam_ampliconclip.c.pysam.c
@@ -3,7 +3,7 @@
 /*  bam_ampliconclip.c -- loads amplicon primers from a BED file and cuts reads
                           from the 5' end.
 
-    Copyright (C) 2020-2023 Genome Research Ltd.
+    Copyright (C) 2020-2024 Genome Research Ltd.
 
     Authors: Andrew Whitwham <aw7@sanger.ac.uk>
              Rob Davies <rmd+git@sanger.ac.uk>
@@ -64,6 +64,7 @@ typedef struct {
     int unmap_len;
     char *arg_list;
     char *stats_file;
+    char *primer_counts_file;
     char *rejects_file;
 } cl_param_t;
 
@@ -74,49 +75,63 @@ static int bed_entry_sort(const void *av, const void *bv) {
     return a->right < b->right ? -1 : (a->right == b->right ? 0 : 1);
 }
 
-
-int load_bed_file_multi_ref(char *infile, int get_strand, int sort_by_pos, khash_t(bed_list_hash) *bed_lists) {
+int load_bed_file_multi_ref(char *infile, int get_strand, int sort_by_pos, khash_t(bed_list_hash) *bed_lists, char ***ref_list, size_t *num_refs_out) {
     hFILE *fp;
     int line_count = 0, ret;
+
+    //variables to store the bed file data for each record
+    char ref[1024] = "";
     int64_t left, right;
+    char name[1024] = "", score[1024] = "";
+    char strand;
+
+    //hash table to store clipping results and bed file data
     kstring_t line = KS_INITIALIZE;
     bed_entry_list_t *list;
     khiter_t bed_itr;
 
+    //ordered ref names list
+    size_t ref_list_sz = 0;
+    size_t num_refs = 0;
+
+    if (ref_list)
+        *ref_list = NULL;
+
+    if (num_refs_out)
+        *num_refs_out = 0;
+
     if ((fp = hopen(infile, "r")) == NULL) {
         print_error_errno("amplicon", "unable to open file %s.", infile);
         return 1;
     }
 
-    char ref[1024];
 
     while (line.l = 0, kgetline(&line, (kgets_func *)hgets, fp) >= 0) {
         line_count++;
         int hret;
-        char strand;
 
         if (line.l == 0 || *line.s == '#') continue;
         if (strncmp(line.s, "track ", 6) == 0) continue;
         if (strncmp(line.s, "browser ", 8) == 0) continue;
 
-        if (get_strand) {
-            if (sscanf(line.s, "%1023s %"SCNd64" %"SCNd64" %*s %*s %c",
-                       ref, &left, &right, &strand) != 4) {
-                fprintf(samtools_stderr, "[amplicon] error: bad bed file format in line %d of %s.\n"
-                                "(N.B. ref/chrom name limited to 1023 characters.)\n",
-                                    line_count, infile);
-                ret = 1;
-                goto error;
-            }
-        } else {
-            if (sscanf(line.s, "%1023s %"SCNd64" %"SCNd64,
-                       ref, &left, &right) != 3) {
-                fprintf(samtools_stderr, "[amplicon] error: bad bed file format in line %d of %s\n"
-                                "(N.B. ref/chrom name limited to 1023 characters.)\n",
-                                    line_count, infile);
-                ret = 1;
-                goto error;
-            }
+        // A list of the maximal number of columns we may want to parse.
+        // There may be more, but we don't use the ones beyond these.
+        const char *const scanf_str =
+            "%1023s %"SCNd64" %"SCNd64" %1023s %1023s %c";
+
+        // Extract the data from the line into the variables.
+        // Variables corresponding to any missing columns will remain
+        // uninitialised.  We asked for all columns, but cols_parsed will
+        // return how many we found which can be validated against num_columns.
+        int cols_parsed = sscanf(line.s, scanf_str, ref,
+                                 &left, &right, name, score, &strand);
+        if (cols_parsed < (get_strand ? 6 : 3)) {
+            fprintf(samtools_stderr, "[amplicon] error: invalid bed file format in line %d of %s.\n"
+                    "Parsed %d columns, but need at least %d\n"
+                    "(N.B. ref/chrom name limited to 1023 characters.)\n",
+                    line_count, infile, cols_parsed, get_strand ? 6 : 3);
+            ret = 1;
+            goto error;
         }
 
         bed_itr = kh_get(bed_list_hash, bed_lists, ref);
@@ -130,6 +145,14 @@ int load_bed_file_multi_ref(char *infile, int get_strand, int sort_by_pos, khash
                 goto error;
             }
 
+            if (ref_list) {
+                if (hts_resize(char **, num_refs + 1, &ref_list_sz, ref_list, 0) < 0) {
+                    fprintf(samtools_stderr, "[amplicon] error: unable to allocate memory for ref name list.\n");
+                    ret = 1;
+                    goto error;
+                }
+                (*ref_list)[num_refs++] = ref_name;
+            }
             bed_itr = kh_put(bed_list_hash, bed_lists, ref_name, &hret);
 
             if (hret > 0) {
@@ -149,6 +172,7 @@ int load_bed_file_multi_ref(char *infile, int get_strand, int sort_by_pos, khash
             list = &kh_val(bed_lists, bed_itr);
         }
 
+        // add the bed entry to the list, growing the list if necessary
         if (list->length == list->size) {
            bed_entry_t *tmp;
 
@@ -165,6 +189,26 @@ int load_bed_file_multi_ref(char *infile, int get_strand, int sort_by_pos, khash
 
         list->bp[list->length].left  = left;
         list->bp[list->length].right = right;
+        list->bp[list->length].name  = NULL;
+        list->bp[list->length].score = NULL;
+        if (cols_parsed >= 4) {
+            list->bp[list->length].name = strdup(name);
+            if (list->bp[list->length].name == NULL) {
+                fprintf(samtools_stderr, "[amplicon] error: unable to allocate memory for name in line %d of %s: %s.\n", line_count, infile, line.s);
+                ret = 1;
+                goto error;
+            }
+        }
+        if (cols_parsed >= 5) {
+            list->bp[list->length].score = strdup(score);
+            if (list->bp[list->length].score == NULL) {
+                fprintf(samtools_stderr, "[amplicon] error: unable to allocate memory for score in line %d of %s: %s\n", line_count, infile, line.s);
+                ret = 1;
+                goto error;
+            }
+        }
+
+        list->bp[list->length].num_reads = 0;
 
         if (get_strand) {
             if (strand == '+') {
@@ -200,6 +244,9 @@ int load_bed_file_multi_ref(char *infile, int get_strand, int sort_by_pos, khash
         ret = 1;
     }
 
+    if (num_refs_out)
+        *num_refs_out = num_refs;
+
 error:
     ks_free(&line);
 
@@ -216,7 +263,12 @@ void destroy_bed_hash(khash_t(bed_list_hash) *hash) {
 
     for (itr = kh_begin(hash); itr != kh_end(hash); ++itr) {
        if (kh_exist(hash, itr)) {
-           free(kh_val(hash, itr).bp);
+           bed_entry_list_t list = kh_val(hash, itr);
+           for (int i = 0; i < list.length; i++) {
+               free(list.bp[i].name);
+               free(list.bp[i].score);
+           }
+           free(list.bp);
            free((char *)kh_key(hash, itr));
            kh_key(hash, itr) = NULL;
         }
@@ -229,7 +281,7 @@ void destroy_bed_hash(khash_t(bed_list_hash) *hash) {
 static int matching_clip_site(bed_entry_list_t *sites, hts_pos_t pos,
                               int is_rev, int use_strand, int64_t longest,
                               cl_param_t *param) {
-    int i, size;  // may need this to be variable
+    int i, size, used_i;
     int tol = param->tol;
     int l = 0, mid = sites->length / 2, r = sites->length;
     int pos_tol = is_rev ? (pos > tol ? pos - tol : 0) : pos;
@@ -244,6 +296,7 @@ static int matching_clip_site(bed_entry_list_t *sites, hts_pos_t pos,
     }
 
     size = 0;
+    used_i = -1;
 
     for (i = l; i < sites->length; i++) {
         hts_pos_t mod_left, mod_right;
@@ -270,15 +323,19 @@ static int matching_clip_site(bed_entry_list_t *sites, hts_pos_t pos,
             if (is_rev) {
                 if (size < pos - sites->bp[i].left) {
                     size = pos - sites->bp[i].left;
+                    used_i = i;
                 }
             } else {
                 if (size < sites->bp[i].right - pos) {
                     size = sites->bp[i].right - pos;
+                    used_i = i;
                 }
             }
         }
     }
-
+    if (used_i >= 0) {
+        sites->bp[used_i].num_reads++;
+    }
     return size;
 }
 
@@ -413,7 +470,7 @@ static int bam_trim_left(bam1_t *rec, bam1_t *rec_out, uint32_t bases,
     }
 
     // Copy remaining QUAL
-    memmove(new_qual, orig_qual, rec->core.l_qseq - qry_removed);
+    memmove(new_qual, orig_qual + qry_removed, rec->core.l_qseq - qry_removed);
 
     // Set new l_qseq
     rec_out->core.l_qseq -= qry_removed;
@@ -644,9 +701,13 @@ static int bam_clip(samFile *in, samFile *out, samFile *reject, char *bedfile,
     kstring_t seq = KS_INITIALIZE;
     bed_entry_list_t *sites;
     FILE *stats_fp = samtools_stderr;
+    FILE *bed_count_summary_fp = samtools_stderr;
     khash_t(bed_list_hash) *bed_hash = kh_init(bed_list_hash);
+    char **bed_ref_list = NULL;
+    size_t num_bed_refs = 0;
 
-    if (load_bed_file_multi_ref(bedfile, param->use_strand, 1, bed_hash)) {
+    if (load_bed_file_multi_ref(bedfile, param->use_strand, 1, bed_hash,
+                                &bed_ref_list, &num_bed_refs)) {
         fprintf(samtools_stderr, "[ampliconclip] error: unable to load bed file.\n");
         goto fail;
     }
@@ -940,11 +1001,43 @@ static int bam_clip(samFile *in, samFile *out, samFile *reject, char *bedfile,
 
     if (file_open) {
         fclose(stats_fp);
+        file_open = 0;
+    }
+
+    if (param->primer_counts_file) {
+        if ((bed_count_summary_fp = fopen(param->primer_counts_file, "w")) == NULL) {
+            fprintf(samtools_stderr, "[ampliconclip] warning: cannot write count summary to %s.\n", param->primer_counts_file);
+        } else {
+            file_open = 1;
+        }
+
+        //print out the number of reads for each bed entry, bedgraph format
+        fprintf(bed_count_summary_fp, "#CHR\tLEFT\tRIGHT\tNAME\tSCORE\tSTRAND\tNUM_CLIPPED\n");
+        size_t refidx;
+        for (refidx = 0; refidx < num_bed_refs; refidx++) {
+            khiter_t itr = kh_get(bed_list_hash, bed_hash, bed_ref_list[refidx]);
+            if (itr >= kh_end(bed_hash)) {
+                fprintf(samtools_stderr, "[ampliconclip] error: %s has gone missing from the hash table\n", bed_ref_list[refidx]);
+                goto fail;
+            }
+            sites = &kh_val(bed_hash, itr);
+            int i;
+            for (i = 0; i < sites->length; i++) {
+                char* strand_out = param->use_strand ? (sites->bp[i].rev ? "-" : "+") : ".";
+                fprintf(bed_count_summary_fp, "%s\t%"PRId64"\t%"PRId64"\t%s\t%s\t%s\t%"PRId64"\n",
+                        kh_key(bed_hash, itr), sites->bp[i].left, sites->bp[i].right, sites->bp[i].name,
+                        sites->bp[i].score, strand_out, sites->bp[i].num_reads);
+            }
+        }
+        if (file_open) {
+            fclose(bed_count_summary_fp);
+        }
     }
 
     ret = 0;
 
 fail:
+    free(bed_ref_list);
     destroy_bed_hash(bed_hash);
     ks_free(&oat);
     ks_free(&seq);
@@ -958,25 +1051,26 @@ static int bam_clip(samFile *in, samFile *out, samFile *reject, char *bedfile,
 static void usage(void) {
     fprintf(samtools_stderr, "Usage: samtools ampliconclip -b BED file <input.bam> -o <output.bam>\n\n");
     fprintf(samtools_stderr, "Option: \n");
-    fprintf(samtools_stderr, " -b  FILE            BED file of regions (eg amplicon primers) to be removed.\n");
-    fprintf(samtools_stderr, " -o  FILE            output file name (default samtools_stdout).\n");
-    fprintf(samtools_stderr, " -f  FILE            write stats to file name (default samtools_stderr)\n");
-    fprintf(samtools_stderr, " -u                  Output uncompressed data\n");
-    fprintf(samtools_stderr, " --soft-clip         soft clip amplicon primers from reads (default)\n");
-    fprintf(samtools_stderr, " --hard-clip         hard clip amplicon primers from reads.\n");
-    fprintf(samtools_stderr, " --both-ends         clip on both 5' and 3' ends.\n");
-    fprintf(samtools_stderr, " --strand            use strand data from BED file to match read direction.\n");
-    fprintf(samtools_stderr, " --clipped           only output clipped reads.\n");
-    fprintf(samtools_stderr, " --fail              mark unclipped, mapped reads as QCFAIL.\n");
-    fprintf(samtools_stderr, " --filter-len INT    do not output reads INT size or shorter.\n");
-    fprintf(samtools_stderr, " --fail-len   INT    mark as QCFAIL reads INT size or shorter.\n");
-    fprintf(samtools_stderr, " --unmap-len  INT    unmap reads INT size or shorter, default 0.\n");
-    fprintf(samtools_stderr, " --no-excluded       do not write excluded reads (unmapped or QCFAIL).\n");
-    fprintf(samtools_stderr, " --rejects-file FILE file to write filtered reads.\n");
-    fprintf(samtools_stderr, " --original          for clipped entries add an OA tag with original data.\n");
-    fprintf(samtools_stderr, " --keep-tag          for clipped entries keep the old NM and MD tags.\n");
-    fprintf(samtools_stderr, " --tolerance         match region within this number of bases, default 5.\n");
-    fprintf(samtools_stderr, " --no-PG             do not add an @PG line.\n");
+    fprintf(samtools_stderr, " -b  FILE             BED file of regions (eg amplicon primers) to be removed.\n");
+    fprintf(samtools_stderr, " -o  FILE             output file name (default: samtools_stdout).\n");
+    fprintf(samtools_stderr, " -f  FILE             write stats to file name (default: samtools_stderr)\n");
+    fprintf(samtools_stderr, " -u                   Output uncompressed data\n");
+    fprintf(samtools_stderr, " --soft-clip          soft clip amplicon primers from reads (default)\n");
+    fprintf(samtools_stderr, " --hard-clip          hard clip amplicon primers from reads.\n");
+    fprintf(samtools_stderr, " --both-ends          clip on both 5' and 3' ends.\n");
+    fprintf(samtools_stderr, " --strand             use strand data from BED file to match read direction.\n");
+    fprintf(samtools_stderr, " --clipped            only output clipped reads.\n");
+    fprintf(samtools_stderr, " --fail               mark unclipped, mapped reads as QCFAIL.\n");
+    fprintf(samtools_stderr, " --filter-len INT     do not output reads INT size or shorter.\n");
+    fprintf(samtools_stderr, " --fail-len   INT     mark as QCFAIL reads INT size or shorter.\n");
+    fprintf(samtools_stderr, " --unmap-len  INT     unmap reads INT size or shorter, default 0.\n");
+    fprintf(samtools_stderr, " --no-excluded        do not write excluded reads (unmapped or QCFAIL).\n");
+    fprintf(samtools_stderr, " --rejects-file FILE  file to write filtered reads.\n");
+    fprintf(samtools_stderr, " --primer-counts FILE file to write read counts per bed entry (bedgraph format).\n");
+    fprintf(samtools_stderr, " --original           for clipped entries add an OA tag with original data.\n");
+    fprintf(samtools_stderr, " --keep-tag           for clipped entries keep the old NM and MD tags.\n");
+    fprintf(samtools_stderr, " --tolerance          match region within this number of bases, default 5.\n");
+    fprintf(samtools_stderr, " --no-PG              do not add an @PG line.\n");
     sam_global_opt_help(samtools_stderr, "-.O..@-.");
     fprintf(samtools_stderr, "\nAbout: Soft clips read alignments where they match BED file defined regions.\n"
                     "Default clipping is only on the 5' end.\n\n");
@@ -1006,10 +1100,11 @@ int amplicon_clip_main(int argc, char **argv) {
         {"fail-len", required_argument, NULL, 1010},
         {"no-excluded", no_argument, NULL, 1011},
         {"rejects-file", required_argument, NULL, 1012},
-        {"original", no_argument, NULL, 1013},
-        {"keep-tag", no_argument, NULL, 1014},
-        {"tolerance", required_argument, NULL, 1015},
-        {"unmap-len", required_argument, NULL, 1016},
+        {"primer-counts", required_argument, NULL, 1013},
+        {"original", no_argument, NULL, 1014},
+        {"keep-tag", no_argument, NULL, 1015},
+        {"tolerance", required_argument, NULL, 1016},
+        {"unmap-len", required_argument, NULL, 1017},
         {NULL, 0, NULL, 0}
     };
 
@@ -1030,10 +1125,11 @@ int amplicon_clip_main(int argc, char **argv) {
             case 1010: param.fail_len = atoi(optarg); break;
             case 1011: param.unmapped = 1; break;
             case 1012: param.rejects_file = optarg; break;
-            case 1013: param.oa_tag = 1; break;
-            case 1014: param.del_tag = 0; break;
-            case 1015: param.tol = atoi(optarg); break;
-            case 1016: param.unmap_len = atoi(optarg); break;
+            case 1013: param.primer_counts_file = optarg; break;
+            case 1014: param.oa_tag = 1; break;
+            case 1015: param.del_tag = 0; break;
+            case 1016: param.tol = atoi(optarg); break;
+            case 1017: param.unmap_len = atoi(optarg); break;
             default:  if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
                       /* else fall-through */
             case '?': usage(); samtools_exit(1);
diff --git a/samtools/bam_ampliconclip.h b/samtools/bam_ampliconclip.h
index ef3535702..d17e74fd3 100644
--- a/samtools/bam_ampliconclip.h
+++ b/samtools/bam_ampliconclip.h
@@ -30,7 +30,10 @@ DEALINGS IN THE SOFTWARE.  */
 typedef struct {
     int64_t left;
     int64_t right;
+    char *name;
+    char *score;
     int rev;
+    int64_t num_reads;
 } bed_entry_t;
 
 typedef struct {
@@ -40,13 +43,14 @@ typedef struct {
     int size;
 } bed_entry_list_t;
 
-KHASH_MAP_INIT_STR(bed_list_hash, bed_entry_list_t);
+KHASH_MAP_INIT_STR(bed_list_hash, bed_entry_list_t)
 
 #define BED_LIST_INIT {NULL, 0, 0, 0, {0}}
 
 
 int load_bed_file_multi_ref(char *infile, int get_strand,
-                        int sort_by_pos, khash_t(bed_list_hash) *bed_lists);
+                            int sort_by_pos, khash_t(bed_list_hash) *bed_lists,
+                            char ***ref_list, size_t *num_refs);
 
 void destroy_bed_hash(khash_t(bed_list_hash) *hash);
 
diff --git a/samtools/bam_cat.c b/samtools/bam_cat.c
index ed8cf58c5..ec045c61b 100644
--- a/samtools/bam_cat.c
+++ b/samtools/bam_cat.c
@@ -1,6 +1,7 @@
 /*  bam_cat.c -- efficiently concatenates bam files.
 
-    Copyright (C) 2008-2009, 2011-2013, 2015-2017, 2019, 2021 Genome Research Ltd.
+    Copyright (C) 2008-2009, 2011-2013, 2015-2017, 2019, 2021,
+                  2023-2024 Genome Research Ltd.
     Modified SAMtools work copyright (C) 2010 Illumina, Inc.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
@@ -46,55 +47,80 @@ Illumina.
 #include "htslib/sam.h"
 #include "htslib/cram.h"
 #include "htslib/kstring.h"
+#include "htslib/hfile.h"
 #include "samtools.h"
 #include "sam_opts.h"
 
-/*
- * Check the files are consistent and capable of being concatenated.
- * Also fills out the version numbers and produces a new sam_hdr_t
- * structure with merged RG lines.
- * Note it is only a simple merge.
- *
- * Returns updated header on success;
- *        NULL on failure.
- */
-static sam_hdr_t *cram_cat_check_hdr(int nfn, char * const *fn, const sam_hdr_t *h,
-                                     int *vers_maj_p, int *vers_min_p) {
+/// cat_check_merge_hdr - check compatibility and merge RG hearders merges RGon both CRAM and BAM.
+/** @param firstfile - pointer to the 1sr file opened in caller
+ *  @param nfn - number of files to be processed, including the firstfile
+ *  @param fn - array of file paths to be processed
+ *  @param h - sam header pointer which contains explicitly given header
+ *  @param vers_maj_p - cram major version set and send out for output creation
+ *  @param vers_min_p - cram min version set and send out for output creation
+ *  @param out_h - pointer to sam header pointer, outputs the merged header
+ * returns array of opened samFile pointers on success and NULL on failure
+ * This method has the merged header processing for cram and bam.
+ * RG lines are merged for both cram and bam. For cram, version match for each
+ * file and order match of RG lines are compared as well.
+ * Note: it is a simple merge of RG lines alone.
+*/
+static samFile** cat_check_merge_hdr(samFile * const firstfile, int nfn, char * const *fn, const sam_hdr_t *h,
+                                     int *vers_maj_p, int *vers_min_p, sam_hdr_t **out_h) {
     int i, vers_maj = -1, vers_min = -1;
     sam_hdr_t *new_h = NULL, *old_h = NULL;
     samFile *in = NULL;
     kstring_t ks = KS_INITIALIZE;
-
-    if (h) {
-        new_h = sam_hdr_dup(h);
-        if (!new_h) {
-            fprintf(stderr, "[%s] ERROR: header duplication failed.\n",
-                    __func__);
-            goto fail;
+    samFile **files = calloc(nfn, sizeof(samFile *));
+    if(!files) {
+        fprintf(stderr, "[%s] ERROR: failed to allocate space for file handles.\n", __func__);
+        return NULL;
+    }
+    if (!out_h || !firstfile) {
+        fprintf(stderr, "[%s] ERROR: header check failed.\n", __func__);
+        goto fail;
+    }
+    if (*out_h) {           //use header if one is already present
+        new_h = *out_h;
+    }
+    else {
+        if (h) {            //use the explicit header given
+            new_h = sam_hdr_dup(h);
+            if (!new_h) {
+                fprintf(stderr, "[%s] ERROR: header duplication failed.\n",
+                        __func__);
+                goto fail;
+            }
         }
     }
 
     for (i = 0; i < nfn; ++i) {
-        cram_fd *in_c;
         int ki;
-
-        in = sam_open(fn[i], "rc");
+        //1st file is already open and passed, rest open locally
+        files[i] = in = i ? sam_open(fn[i], "r") : firstfile;
         if (in == 0) {
             print_error_errno("cat", "fail to open file '%s'", fn[i]);
             goto fail;
         }
-        in_c = in->fp.cram;
-
-        int vmaj = cram_major_vers(in_c);
-        int vmin = cram_minor_vers(in_c);
-        if ((vers_maj != -1 && vers_maj != vmaj) ||
-            (vers_min != -1 && vers_min != vmin)) {
-            fprintf(stderr, "[%s] ERROR: input files have differing version numbers.\n",
-                    __func__);
+        if (firstfile->format.format != in->format.format) {
+            print_error("cat", "File %s is of different format!", fn[i]);
             goto fail;
         }
-        vers_maj = vmaj;
-        vers_min = vmin;
+        if (firstfile->format.format == cram) {     //version check for cram
+            cram_fd *in_c;
+            in_c = in->fp.cram;
+
+            int vmaj = cram_major_vers(in_c);
+            int vmin = cram_minor_vers(in_c);
+            if ((vers_maj != -1 && vers_maj != vmaj) ||
+                (vers_min != -1 && vers_min != vmin)) {
+                fprintf(stderr, "[%s] ERROR: input files have differing version numbers.\n",
+                        __func__);
+                goto fail;
+            }
+            vers_maj = vmaj;
+            vers_min = vmin;
+        }
 
         old_h = sam_hdr_read(in);
         if (!old_h) {
@@ -111,10 +137,10 @@ static sam_hdr_t *cram_cat_check_hdr(int nfn, char * const *fn, const sam_hdr_t
                 goto fail;
             }
             sam_hdr_destroy(old_h);
-            sam_close(in);
+            old_h = NULL;
             continue;
         }
-
+        //merge RG lines
         int old_count = sam_hdr_count_lines(old_h, "RG");
         for (ki = 0; ki < old_count; ki++) {
             const char *old_name = sam_hdr_line_name(old_h, "RG", ki);
@@ -136,7 +162,8 @@ static sam_hdr_t *cram_cat_check_hdr(int nfn, char * const *fn, const sam_hdr_t
             }
         }
 
-        if (old_count > 1 && sam_hdr_count_lines(new_h, "RG") == old_count) {
+        if (firstfile->format.format == cram && old_count > 1 && sam_hdr_count_lines(new_h, "RG") == old_count) {
+            //RG order check for cram
             for (ki = 0; ki < old_count; ki++) {
                 const char *old_name = sam_hdr_line_name(old_h, "RG", ki);
                 const char *new_name = sam_hdr_line_name(new_h, "RG", ki);
@@ -148,107 +175,281 @@ static sam_hdr_t *cram_cat_check_hdr(int nfn, char * const *fn, const sam_hdr_t
             }
         }
 
-        sam_hdr_destroy(old_h);
-        sam_close(in);
+        sam_hdr_destroy(old_h); old_h = NULL;
     }
 
     ks_free(&ks);
 
-    *vers_maj_p = vers_maj;
-    *vers_min_p = vers_min;
-
-    return new_h;
+    if (vers_maj_p) {
+        *vers_maj_p = vers_maj;
+    }
+    if (vers_min_p) {
+        *vers_min_p = vers_min;
+    }
+    *out_h = new_h;
+    return files;
 
 fail:
     ks_free(&ks);
     if (old_h) sam_hdr_destroy(old_h);
     if (new_h) sam_hdr_destroy(new_h);
-    if (in) sam_close(in);
+    *out_h = NULL;
+    for (i = 1; i < nfn; ++i) {         //close files other than the firstfile
+        if (files[i]) {
+            sam_close(files[i]);
+        }
+    }
+    free(files);
 
     return NULL;
 }
 
 
-/*
- * CRAM files don't store the RG:Z:ID per read in the aux field.
- * Instead they have a numerical data series (RG) to point each read
- * back to the Nth @RG line in the file.  This means that we may need
- * to edit the RG data series (if the files were produced from
- * "samtools split" for example).
- *
- * The encoding method is stored in the compression header. Typical
- * examples:
- *
- * RG => EXTERNAL {18}           # Block content-id 18 holds RG values
- *                               # as a series of ITF8 encoded values
- *
- * RG => HUFFMAN {1, 255, 255, 255, 255, 255, 1, 0}
- *                               # One RG value #-1.  (No RG)
- *
- * RG => HUFFMAN {1, 0, 1, 0}    # One RG value #0 (always first RG)
- *
- * RG => HUFFMAN {2, 0, 1, 2, 1, 1}
- *                               # Two RG values, #0 and #1, written
- *                               # to the CORE block and possibly
- *                               # mixed with other data series.
- *
- * A single value can (but may not be) implemented as a zero bit
- * huffman code.  In this situation we can change the meta-data in the
- * compression header to renumber an RG value..
+/* ----------------------------------------------------------------------
+ * CRAM cat
  */
-int cram_cat(int nfn, char * const *fn, const sam_hdr_t *h, const char* outcram, sam_global_args *ga, char *arg_list, int no_pg)
+
+// Reports the number of CRAM containers spanning a specified region if
+// specified, or the entire file if not.
+// This is the implements the "samtools cat -q [-r reg]" functionality.
+//
+// Returns 0 on success, <0 on error.
+static int cram_query_ncont(int nfn, char * const *fn, char *reg) {
+    int i;
+    hts_idx_t *idx = NULL;
+    sam_hdr_t *hdr = NULL;
+    hts_itr_t *iter = NULL;
+    samFile *in = NULL;
+
+    for (i = 0; i < nfn; i++) {
+        in = sam_open(fn[i], "r");
+        if (!in) {
+            print_error_errno("cat", "Couldn't open file %s", fn[i]);
+            return -1;
+        }
+        idx = sam_index_load(in, fn[i]);
+        if (!idx) {
+            print_error("cat", "No index found");
+            goto err;
+        }
+
+        off_t cstart = 0, cend = 0;
+        if (reg) {
+            sam_hdr_t *hdr = sam_hdr_read(in);
+            hts_itr_t *iter = sam_itr_querys(idx, hdr, reg);
+            if (!hdr) {
+                print_error("cat", "Unable to read header");
+                goto err;
+            }
+            if (!iter) {
+                print_error("cat", "Unable to parse region");
+                goto err;
+            }
+
+            if (cram_index_extents(in->fp.cram, iter->tid, iter->beg,
+                                   iter->end, &cstart, &cend) < 0) {
+                print_error("cat", "Failed to query index");
+                goto err;
+            }
+
+            hts_itr_destroy(iter);
+            sam_hdr_destroy(hdr);
+        }
+
+        int64_t first, last;
+        int64_t nc = cram_num_containers_between(in->fp.cram, cstart, cend,
+                                                 &first, &last);
+        printf("%s\t%"PRId64"\t%"PRId64"\t%"PRId64"\n",
+               fn[i], nc, first, last);
+        sam_close(in);
+        hts_idx_destroy(idx);
+    }
+
+    return 0;
+
+ err:
+    if (idx)
+        hts_idx_destroy(idx);
+    if (hdr)
+        sam_hdr_destroy(hdr);
+    if (iter)
+        hts_itr_destroy(iter);
+
+    sam_close(in);
+
+    return -1;
+}
+
+// Container range #:A-B or #:A.
+// Returns 0 on success, -1 on failure.
+static int cram_handle_cnum_region(cram_fd *fd, hts_idx_t *idx,
+                                   char *reg, off_t *cstart, off_t *cend) {
+    int cnum_start, cnum_end; // container versions
+    int n = sscanf(reg, "#:%d-%d", &cnum_start, &cnum_end);
+
+    if (n == 0) {
+        print_error("cat", "ERROR: Malformed region: %s", reg);
+        return -1;
+    } else if (n == 1) {
+        cnum_end = cnum_start;
+    }
+
+    int64_t nc = cram_num_containers(fd);
+    if (cnum_end >= nc) {
+        print_error("cat", "Too many containers.  "
+                    "The end range should be < %"PRId64, nc);
+        return -1;
+    }
+
+    // Container number to offset
+    *cstart = cram_container_num2offset(fd, cnum_start);
+    *cend   = cram_container_num2offset(fd, cnum_end);
+    if (*cstart < 0 || *cend < 0)
+        return -1;
+
+    // Seek manually
+    return cram_seek(fd, *cstart, SEEK_SET);
+}
+
+// Normal range chr:start-end.
+// Returns an hts iterator on success, NULL on failure.
+static hts_itr_t *cram_handle_region(cram_fd *fd, hts_idx_t *idx, sam_hdr_t *h,
+                                     char *reg, off_t *cstart, off_t *cend) {
+    hts_itr_t *iter;
+
+    if (!idx) {
+        fprintf(stderr, "[%s] ERROR: No index found.\n", __func__);
+        return NULL;
+    }
+
+    // This does an implicit seek and modifies the cram_fd.
+    if (!(iter = sam_itr_querys(idx, h, reg))) {
+        print_error("cat", "Unable to parse region %s", reg);
+        return NULL;
+    }
+    if (cram_index_extents(fd, iter->tid, iter->beg,
+                           iter->end, cstart, cend) < 0) {
+        print_error("cat", "Failed to query index");
+        return NULL;
+    }
+
+    return iter;
+}
+
+// Handle the -p A/B option to subdivide our region or file into portions.
+// Updates cstart/cend.
+// returns 0 on success (do something),
+//         1 on success (but nothing to do),
+//        -1 on failure.
+static int cram_subdivide_part(cram_fd *fd, hts_idx_t *idx, char *part,
+                               off_t *cstart, off_t *cend) {
+    int a, b;
+    // Part N of M
+    if (sscanf(part, "%d/%d", &a, &b) != 2) {
+        print_error("cat", "malformed region %s. Should be e.g. '1/10'", part);
+        return -1;
+    }
+
+    // Inclusive container numbers for range, 0 to NC-1
+    // Our part N/M is in container percentages as we can't have
+    // partial containers, so convert to that first.
+    int64_t cnum1, cnum2;
+    if (cstart) {
+        cnum1 = cram_container_offset2num(fd, *cstart);
+        cnum2 = cram_container_offset2num(fd, *cend);
+    } else {
+        cnum1 = 0;
+        cnum2 = cram_num_containers(fd)-1;
+    }
+
+    // Subdivide cnum1/cnum2 container numbers to new range cnum_start/end
+    int64_t nc = cnum2 - cnum1 + 1;
+    if (b > nc)
+        b = nc;
+
+    int cnum_start = (a-1)*(double)nc/b;
+    int cnum_end   = a*(double)nc/b - 1;
+    if (cnum_start < 0 || cnum_end >= nc)
+        return 1;
+
+    // Then convert back to file offsets so we can seek and do htell
+    // to detect EOR/EOF.
+    *cstart = cram_container_num2offset(fd, cnum_start + cnum1);
+    *cend   = cram_container_num2offset(fd, cnum_end   + cnum1);
+
+    return 0;
+}
+
+// The main cram_cat interface.
+// Returns 0 on success, < 0 on error.
+int cram_cat(samFile * const firstfile, int nfn, char * const *fn,
+             const sam_hdr_t *h, const char* outcram, sam_global_args *ga,
+             char *arg_list, int no_pg, char *reg, char *part, int fast_reg)
 {
-    samFile *out;
+    samFile *out = NULL;
     cram_fd *out_c;
-    int i, vers_maj, vers_min;
+    int i, vers_maj, vers_min, ret = -1;
     sam_hdr_t *new_h = NULL;
-
-    /* Check consistent versioning and compatible headers */
-    if (!(new_h = cram_cat_check_hdr(nfn, fn, h, &vers_maj, &vers_min)))
+    samFile **files = NULL;
+    hts_idx_t *idx = NULL;
+    hts_itr_t *iter = NULL;
+    sam_hdr_t *old_h = NULL;
+
+    // Check consistent versioning and compatible headers;
+    // merges RG lines, opens all files and returns them that multiple
+    // non-seekable stream inputs can be handled
+    if (!(files = cat_check_merge_hdr(firstfile, nfn, fn, h, &vers_maj,
+                                      &vers_min, &new_h)))
         return -1;
 
+    if (!new_h) {
+        print_error_errno("cat", "failed to make output header");
+        goto closefiles;
+    }
+
     /* Open the file with cram_vers */
     char vers[100];
-    sprintf(vers, "%d.%d", vers_maj, vers_min);
-    out = sam_open_format(outcram, "wc", &ga->out);
+    snprintf(vers, sizeof(vers), "%d.%d", vers_maj, vers_min);
+
+    // Can override level=1 with e.g. "--output-fmt-option level=9"
+    out = sam_open_format(outcram, "wc1", &ga->out);
     if (out == 0) {
         print_error_errno("cat", "fail to open output file '%s'", outcram);
-        return -1;
+        goto closefiles;
     }
     out_c = out->fp.cram;
     cram_set_option(out_c, CRAM_OPT_VERSION, vers);
-    //fprintf(stderr, "Creating cram vers %s\n", vers);
 
     if (!no_pg && sam_hdr_add_pg(new_h, "samtools",
                                  "VN", samtools_version(),
                                  arg_list ? "CL": NULL,
                                  arg_list ? arg_list : NULL,
                                  NULL))
-        return -1;
+        goto closefiles;
 
     if (sam_hdr_write(out, new_h) < 0) {
         print_error_errno("cat", "Couldn't write header");
-        return -1;
+        goto closefiles;
     }
+    out_c = out->fp.cram;
 
     for (i = 0; i < nfn; ++i) {
         samFile *in;
         cram_fd *in_c;
         cram_container *c;
-        sam_hdr_t *old_h;
         int new_rg = -1;
 
-        in = sam_open(fn[i], "rc");
+        in = files[i];
         if (in == 0) {
             print_error_errno("cat", "fail to open file '%s'", fn[i]);
-            return -1;
+            goto closefiles;
         }
         in_c = in->fp.cram;
 
         old_h = sam_hdr_read(in);
         if (!old_h) {
             print_error("cat", "fail to read the header of file '%s'", fn[i]);
-            return -1;
+            goto closefiles;
         }
 
         // Compute RG mapping if suitable for changing.
@@ -258,71 +459,237 @@ int cram_cat(int nfn, char * const *fn, const sam_hdr_t *h, const char* outcram,
                 new_rg = sam_hdr_line_index(new_h, "RG", old_name);
                 if (new_rg < 0) {
                     print_error("cat", "fail to find @RG line '%s' in the new header", old_name);
-                    return -1;
+                    goto closefiles;
                 }
             } else {
                 print_error("cat", "fail to find @RG line in file '%s'", fn[i]);
-                return -1;
+                goto closefiles;
             }
         } else {
             new_rg = 0;
         }
 
+        // We have multiple region syntax.  Either the standard chr:start-end
+        // or a cat-specific #:num-num for explicit container numbers.
+        // Both of these seek to a specific file offset and also have the
+        // end offset known so we can use htell to detect when we're done.
+        // Those offsets are in cstart and cend.
+        //
+        // However for the e.g. -p 1/10 syntax we need to know the container
+        // numbers corresponding to the cstart/cend offsets as we can't
+        // start half way through a container when doing fractions.
+        // These are cnum1 and cnum2.  (We could short cut this for the
+        // #:num-num, but it's simpler to just treat all regions identically.)
+        off_t cstart = 0, cend = 0;
+        int filter_by_cnum = 0;
+
+        if (reg || part) {
+            idx = sam_index_load(in, fn[i]);
+            if (!idx) {
+                print_error("cat", "failed to load index");
+                goto closefiles;
+            }
+        }
+
+        if (reg) {
+            if (strncmp(reg, "#:", 2) == 0) {
+                // Region as container numbers
+                if (cram_handle_cnum_region(in_c, idx, reg, &cstart,&cend) < 0)
+                    goto closefiles;
+
+                filter_by_cnum = 1;
+            } else {
+                // Normal range chr:start-end
+                if (!(iter = cram_handle_region(in_c, idx, old_h, reg,
+                                                &cstart, &cend)))
+                    goto closefiles;
+            }
+        }
+
+        // We can also take a range above and subdivide it into parts.
+        // Eg -r chr1 -p 1/10 (... to -p 10/10).  Part only implies
+        // portions of the entire file.
+        if (part) {
+            int r = cram_subdivide_part(in_c, idx, part, &cstart, &cend);
+            if (r != 0) {
+                if (r > 0) // Not an error, just nothing to do
+                    ret = 0;
+                goto closefiles;
+            }
+        }
+
+        if (cstart) // reg or part
+            if (0 != cram_seek(in_c, cstart, SEEK_SET))
+                goto closefiles;
+
+
+        // Make refid -2 ("*") come after other chromosomes, for easy sort
+        int itid = iter
+            ? (iter->tid == HTS_IDX_NOCOOR ? INT_MAX : iter->tid)
+            : 0;
+        int last_ref_id = -99;
+
+        off_t before_hdr = htell(cram_fd_get_fp(in_c));
+
         // Copy contains and blocks within them
         while ((c = cram_read_container(in_c))) {
             if (cram_container_is_empty(in_c)) {
                 cram_block *blk;
                 // Container compression header
                 if (!(blk = cram_read_block(in_c)))
-                    return -1;
+                    goto closefiles;
                 cram_free_block(blk);
                 cram_free_container(c);
                 continue;
             }
 
+            int filter = 0;
+
             // If we have just one RG key and new_rg != 0 then
             // we need to edit the compression header. IF WE CAN.
             if (new_rg) {
+                if (reg) {
+                    print_error("cat", "Cannot specify a region while "
+                                "transcoding RG lines");
+                    goto closefiles;
+                }
                 int zero = 0;
                 //fprintf(stderr, "Transcode RG %d to %d\n", 0, new_rg);
                 cram_transcode_rg(in_c, out_c, c, 1, &zero, &new_rg);
             } else {
                 int32_t num_slices;
-                cram_block *blk;
 
-                // Not switching rg so do the usual read/write loop
-                if (cram_write_container(out_c, c) != 0)
-                    return -1;
+                if (reg) {
+                    if (before_hdr > cend) {
+                        cram_free_container(c);
+                        break;
+                    }
+                }
 
-                // Container compression header
-                if (!(blk = cram_read_block(in_c)))
-                    return -1;
-                if (cram_write_block(out_c, blk) != 0) {
-                    cram_free_block(blk);
-                    return -1;
+                // For chr:start-end regions, do we need to filter or skip?
+                if (iter && reg) {
+                    int refid;
+                    hts_pos_t start, span, end;
+
+                    cram_container_get_coords(c, &refid, &start, &span);
+                    end = start+span;
+
+                    // Make refid -1 ("*") come after other chromosomes
+                    if (refid == -1)
+                        refid = INT_MAX;
+
+                    if (refid > itid || start > iter->end) {
+                        // Beyond the requested range
+                        break;
+                    } else if (refid == -2) {
+                        // Multi-ref containers.  We only support this if
+                        // the RI data series is in a container by itself.
+                        filter = 3;
+                    } else if (refid < itid || end < iter->beg) {
+                        // Skip, in case of mixed size containers.
+                        // Eg: Use "=", skip "-".
+                        // ==========|======|=
+                        //   ----  ==|==  ==|== ----
+                        //     ----- | ====   ----
+                        filter = 2;
+                    } else if (start < iter->beg || end > iter->end) {
+                        // Container overlaps region.
+                        // Fast mode just copies overlapping containers.
+                        // Slow mode does a precise filtering by reading and
+                        // filtering each record in turn so it's the same as
+                        // a samtools view command.
+                        filter = fast_reg ? 0 : 1;
+                    }
+                    // else we're in an "internal" container, so just copy
                 }
-                cram_free_block(blk);
 
+                if (filter && last_ref_id == -1 && itid == INT_MAX)
+                    // Multi-ref containers consisting solely of ref "*" are
+                    // common, but if it's sorted then we know it's "*" from
+                    // here on so we don't need to filter despite multi-ref.
+                    filter = 0;
 
-                // Container num_blocks can be invalid, due to a bug.
-                // Instead we iterate in slice context instead.
-                (void)cram_container_get_landmarks(c, &num_slices);
-                cram_copy_slice(in_c, out_c, num_slices);
-            }
+                if (filter) {
+                    // Filter or skip
+                    cram_filter_container(in_c, out_c, c, &last_ref_id);
+                } else {
+                    // Copy. Consider adding a cram_copy_container API instead.
 
+                    // Container compression header
+                    cram_block *blk;
+                    if (!(blk = cram_read_block(in_c)))
+                        goto closefiles;
+
+                    // Not switching rg so do the usual read/write loop
+                    if (cram_write_container(out_c, c) != 0)
+                        goto closefiles;
+
+                    // Contatiner compression header
+                    if (cram_write_block(out_c, blk) != 0) {
+                        cram_free_block(blk);
+                        goto closefiles;
+                    }
+
+                    // Container num_blocks can be invalid, due to a bug.
+                    // Instead we iterate in slice context instead.
+                    (void)cram_container_get_landmarks(c, &num_slices);
+                    if (cram_copy_slice(in_c, out_c, num_slices) < 0) {
+                        cram_free_block(blk);
+                        goto closefiles;
+                    }
+                    cram_free_block(blk);
+                }
+            }
             cram_free_container(c);
-        }
 
+            // Location of next container start
+            before_hdr = htell(cram_fd_get_fp(in_c));
+            if (filter_by_cnum && before_hdr > cend)
+                break;
+        }
         sam_hdr_destroy(old_h);
-        sam_close(in);
+        old_h = NULL;
+
+        if (idx) {
+            hts_idx_destroy(idx);
+            idx = NULL;
+        }
+
+        if (iter) {
+            hts_itr_destroy(iter);
+            iter = NULL;
+        }
     }
-    sam_close(out);
-    sam_hdr_destroy(new_h);
+    ret = 0;
 
-    return 0;
-}
+closefiles:
+    if (old_h)
+        sam_hdr_destroy(old_h);
+
+    if (idx)
+        hts_idx_destroy(idx);
+
+    if (iter)
+        hts_itr_destroy(iter);
 
+    if (out)
+        sam_close(out);
+
+    if (new_h)
+        sam_hdr_destroy(new_h);
+
+    for (i = 1; i < nfn; ++i) {     //skip firstfile and close rest
+        if (files[i]) {
+            sam_close(files[i]);
+        }
+    }
+    free(files);
+    return ret;
+}
 
+/* ----------------------------------------------------------------------
+ * BAM cat
+ */
 #define BUF_SIZE 0x10000
 
 #define GZIPID1 31
@@ -330,31 +697,40 @@ int cram_cat(int nfn, char * const *fn, const sam_hdr_t *h, const char* outcram,
 
 #define BGZF_EMPTY_BLOCK_SIZE 28
 
-int bam_cat(int nfn, char * const *fn, sam_hdr_t *h, const char* outbam, char *arg_list, int no_pg)
+int bam_cat(samFile * const firstfile, int nfn, char * const *fn, sam_hdr_t *h, const char* outbam, char *arg_list, int no_pg)
 {
-    BGZF *fp, *in = NULL;
+    BGZF *fp = NULL, *in = NULL;
     uint8_t *buf = NULL;
     uint8_t ebuf[BGZF_EMPTY_BLOCK_SIZE];
     const int es=BGZF_EMPTY_BLOCK_SIZE;
     int i;
+    samFile **files = NULL;
+    sam_hdr_t *new_h = NULL;
 
+    /* merges RG lines, opens all files and returns them that multiple non-seekable
+    stream inputs can be handled */
+    if (!(files = cat_check_merge_hdr(firstfile, nfn, fn, h, NULL, NULL, &new_h)))
+        return -1;
+    if (!new_h) {
+        print_error_errno("cat", "failed to make output header");
+        goto fail;
+    }
     fp = strcmp(outbam, "-")? bgzf_open(outbam, "w") : bgzf_fdopen(fileno(stdout), "w");
     if (fp == 0) {
         print_error_errno("cat", "fail to open output file '%s'", outbam);
-        return -1;
+        goto fail;
     }
-    if (h) {
-        if (!no_pg && sam_hdr_add_pg(h, "samtools",
-                                     "VN", samtools_version(),
-                                     arg_list ? "CL": NULL,
-                                     arg_list ? arg_list : NULL,
-                                     NULL))
-            goto fail;
 
-        if (bam_hdr_write(fp, h) < 0) {
-            print_error_errno("cat", "Couldn't write header");
-            goto fail;
-        }
+    if (!no_pg && sam_hdr_add_pg(new_h, "samtools",
+                                    "VN", samtools_version(),
+                                    arg_list ? "CL": NULL,
+                                    arg_list ? arg_list : NULL,
+                                    NULL))
+        goto fail;
+
+    if (bam_hdr_write(fp, new_h) < 0) {
+        print_error_errno("cat", "Couldn't write header");
+        goto fail;
     }
 
     buf = (uint8_t*) malloc(BUF_SIZE);
@@ -363,35 +739,13 @@ int bam_cat(int nfn, char * const *fn, sam_hdr_t *h, const char* outbam, char *a
         goto fail;
     }
     for(i = 0; i < nfn; ++i){
-        sam_hdr_t *old;
         int len,j;
-
-        in = strcmp(fn[i], "-")? bgzf_open(fn[i], "r") : bgzf_fdopen(fileno(stdin), "r");
+        in = files[i]->fp.bgzf;
         if (in == 0) {
             print_error_errno("cat", "fail to open file '%s'", fn[i]);
             goto fail;
         }
-        if (in->is_write) return -1;
-
-        old = bam_hdr_read(in);
-        if (old == NULL) {
-            fprintf(stderr, "[%s] ERROR: couldn't read header for '%s'.\n",
-                    __func__, fn[i]);
-            goto fail;
-        }
-        if (h == 0 && i == 0) {
-            if (!no_pg && sam_hdr_add_pg(old, "samtools",
-                                         "VN", samtools_version(),
-                                         arg_list ? "CL": NULL,
-                                         arg_list ? arg_list : NULL,
-                                         NULL))
-                goto fail;
-
-            if (bam_hdr_write(fp, old) < 0) {
-                print_error_errno("cat", "Couldn't write header");
-                goto fail;
-            }
-        }
+        if (in->is_write) goto fail;
 
         if (in->block_offset < in->block_length) {
             if (bgzf_write(fp, (char *)in->uncompressed_block + in->block_offset, in->block_length - in->block_offset) < 0) goto write_fail;
@@ -432,39 +786,55 @@ int bam_cat(int nfn, char * const *fn, sam_hdr_t *h, const char* outbam, char *a
                 if (bgzf_raw_write(fp, ebuf, es) < 0) goto write_fail;
             }
         }
-        sam_hdr_destroy(old);
-        bgzf_close(in);
         in = NULL;
     }
     free(buf);
     if (bgzf_close(fp) < 0) {
         fprintf(stderr, "[%s] Error on closing '%s'.\n", __func__, outbam);
-        return -1;
+        goto fail;
+    }
+    for (i = 1; i < nfn; ++i) {     //skip firstfile and close rest
+        if (files[i]) {
+            sam_close(files[i]);
+        }
     }
+    free(files);
+    sam_hdr_destroy(new_h);
     return 0;
 
  write_fail:
     fprintf(stderr, "[%s] Error writing to '%s'.\n", __func__, outbam);
  fail:
-    if (in) bgzf_close(in);
+    if (new_h) {
+        sam_hdr_destroy(new_h);
+    }
     if (fp) bgzf_close(fp);
     free(buf);
+
+    if (files) {
+        for(i = 1; i < nfn; ++i) {  //except the firstfile
+            if(files[i]) {
+                sam_close(files[i]);
+            }
+        }
+        free(files);
+    }
     return -1;
 }
 
-
 int main_cat(int argc, char *argv[])
 {
     sam_hdr_t *h = 0;
     char *outfn = 0;
     char **infns = NULL; // files to concatenate
     int infns_size = 0;
-    int c, ret = 0, no_pg = 0, usage = 0;
+    int c, ret = 0, no_pg = 0, usage = 0, query_ncont = 0, fast_mode = 0;
     samFile *in;
     sam_global_args ga;
+    char *reg = NULL, *part = NULL;
 
     static const struct option lopts[] = {
-        SAM_OPT_GLOBAL_OPTIONS('-', '-', '-', 0, '-', '@'),
+        SAM_OPT_GLOBAL_OPTIONS('-', '-', '-', 0, '-', '-'),
         {"no-PG", no_argument, NULL, 1},
         { NULL, 0, NULL, 0 }
     };
@@ -472,21 +842,22 @@ int main_cat(int argc, char *argv[])
     char *arg_list = NULL;
 
     sam_global_args_init(&ga);
-
-    while ((c = getopt_long(argc, argv, "h:o:b:@:", lopts, NULL)) >= 0) {
+    while ((c = getopt_long(argc, argv, "h:o:b:r:p:qf", lopts, NULL)) >= 0) {
         switch (c) {
             case 'h': {
                 samFile *fph = sam_open(optarg, "r");
                 if (fph == 0) {
                     fprintf(stderr, "[%s] ERROR: fail to read the header from '%s'.\n", __func__, optarg);
-                    return 1;
+                    ret = 1;
+                    goto end;
                 }
                 h = sam_hdr_read(fph);
                 if (h == NULL) {
                     fprintf(stderr,
                             "[%s] ERROR: failed to read the header from '%s'.\n",
                             __func__, optarg);
-                    return 1;
+                    ret = 1;
+                    goto end;
                 }
                 sam_close(fph);
                 break;
@@ -512,6 +883,19 @@ int main_cat(int argc, char *argv[])
             case 1:
                 no_pg = 1;
                 break;
+            case 'r':
+                reg = optarg;
+                break;
+            case 'p':
+                part = optarg;
+                break;
+            case 'f':
+                fast_mode = 1;
+                break;
+            case 'q':
+                query_ncont=1;
+                break;
+
             default:
                 if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
                 /* else fall-through */
@@ -521,7 +905,8 @@ int main_cat(int argc, char *argv[])
 
     if (!no_pg && !(arg_list = stringify_argv(argc+1, argv-1))) {
         print_error("cat", "failed to create arg_list");
-        return 1;
+        ret = 1;
+        goto end;
     }
 
     // Append files specified in argv to the list.
@@ -541,34 +926,48 @@ int main_cat(int argc, char *argv[])
         fprintf(stderr, "         -h FILE  copy the header from FILE [default is 1st input file]\n");
         fprintf(stderr, "         -o FILE  output BAM/CRAM\n");
         fprintf(stderr, "         --no-PG  do not add a PG line\n");
-        sam_global_opt_help(stderr, "--..-@-.");
-        return 1;
+        fprintf(stderr, "\nCRAM only options for filtering:\n");
+        fprintf(stderr, "         -r REG   filter to region REG.\n");
+        fprintf(stderr, "                  REG can also be #:cstart-cend for specific container numbers\n");
+        fprintf(stderr, "         -p N/M   Specify part N of M (where N is 1 to M inclusive)\n");
+        fprintf(stderr, "         -f       Fast mode: don't filter containers to exactly match region\n");
+        fprintf(stderr, "         -q       Query the total number of indexed containers\n");
+        fprintf(stderr, "\nStandard options:\n");
+        sam_global_opt_help(stderr, "---.---.");
+        ret = 1;
+        goto end;
     }
 
     in = sam_open(infns[0], "r");
     if (!in) {
         print_error_errno("cat", "failed to open file '%s'", infns[0]);
-        return 1;
+        ret = 1;
+        goto end;
     }
 
     switch (hts_get_format(in)->format) {
     case bam:
-        sam_close(in);
-        if (bam_cat(infns_size+nargv_fns, infns, h, outfn? outfn : "-", arg_list, no_pg) < 0)
+        if (bam_cat(in, infns_size+nargv_fns, infns, h, outfn? outfn : "-", arg_list, no_pg) < 0)
             ret = 1;
         break;
 
     case cram:
-        sam_close(in);
-        if (cram_cat(infns_size+nargv_fns, infns, h, outfn? outfn : "-", &ga, arg_list, no_pg) < 0)
+        if (query_ncont) {
+            if (cram_query_ncont(infns_size+nargv_fns, infns, reg) < 0)
+                ret = 1;
+        } else {
+            if (cram_cat(in, infns_size+nargv_fns, infns, h,
+                         outfn? outfn : "-", &ga, arg_list, no_pg, reg,
+                         part, fast_mode) < 0)
             ret = 1;
+        }
         break;
 
     default:
-        sam_close(in);
         fprintf(stderr, "[%s] ERROR: input is not BAM or CRAM\n", __func__);
-        return 1;
+        ret = 1;
     }
+    sam_close(in);
 
  end:
     if (infns_size > 0) {
@@ -582,6 +981,7 @@ int main_cat(int argc, char *argv[])
     free(arg_list);
     if (h)
         sam_hdr_destroy(h);
+    sam_global_args_free(&ga);
 
     return ret;
 }
diff --git a/samtools/bam_cat.c.pysam.c b/samtools/bam_cat.c.pysam.c
index ef2199c78..af70d00e2 100644
--- a/samtools/bam_cat.c.pysam.c
+++ b/samtools/bam_cat.c.pysam.c
@@ -2,7 +2,8 @@
 
 /*  bam_cat.c -- efficiently concatenates bam files.
 
-    Copyright (C) 2008-2009, 2011-2013, 2015-2017, 2019, 2021 Genome Research Ltd.
+    Copyright (C) 2008-2009, 2011-2013, 2015-2017, 2019, 2021,
+                  2023-2024 Genome Research Ltd.
     Modified SAMtools work copyright (C) 2010 Illumina, Inc.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
@@ -48,55 +49,80 @@ Illumina.
 #include "htslib/sam.h"
 #include "htslib/cram.h"
 #include "htslib/kstring.h"
+#include "htslib/hfile.h"
 #include "samtools.h"
 #include "sam_opts.h"
 
-/*
- * Check the files are consistent and capable of being concatenated.
- * Also fills out the version numbers and produces a new sam_hdr_t
- * structure with merged RG lines.
- * Note it is only a simple merge.
- *
- * Returns updated header on success;
- *        NULL on failure.
- */
-static sam_hdr_t *cram_cat_check_hdr(int nfn, char * const *fn, const sam_hdr_t *h,
-                                     int *vers_maj_p, int *vers_min_p) {
+/// cat_check_merge_hdr - check compatibility and merge RG hearders merges RGon both CRAM and BAM.
+/** @param firstfile - pointer to the 1sr file opened in caller
+ *  @param nfn - number of files to be processed, including the firstfile
+ *  @param fn - array of file paths to be processed
+ *  @param h - sam header pointer which contains explicitly given header
+ *  @param vers_maj_p - cram major version set and send out for output creation
+ *  @param vers_min_p - cram min version set and send out for output creation
+ *  @param out_h - pointer to sam header pointer, outputs the merged header
+ * returns array of opened samFile pointers on success and NULL on failure
+ * This method has the merged header processing for cram and bam.
+ * RG lines are merged for both cram and bam. For cram, version match for each
+ * file and order match of RG lines are compared as well.
+ * Note: it is a simple merge of RG lines alone.
+*/
+static samFile** cat_check_merge_hdr(samFile * const firstfile, int nfn, char * const *fn, const sam_hdr_t *h,
+                                     int *vers_maj_p, int *vers_min_p, sam_hdr_t **out_h) {
     int i, vers_maj = -1, vers_min = -1;
     sam_hdr_t *new_h = NULL, *old_h = NULL;
     samFile *in = NULL;
     kstring_t ks = KS_INITIALIZE;
-
-    if (h) {
-        new_h = sam_hdr_dup(h);
-        if (!new_h) {
-            fprintf(samtools_stderr, "[%s] ERROR: header duplication failed.\n",
-                    __func__);
-            goto fail;
+    samFile **files = calloc(nfn, sizeof(samFile *));
+    if(!files) {
+        fprintf(samtools_stderr, "[%s] ERROR: failed to allocate space for file handles.\n", __func__);
+        return NULL;
+    }
+    if (!out_h || !firstfile) {
+        fprintf(samtools_stderr, "[%s] ERROR: header check failed.\n", __func__);
+        goto fail;
+    }
+    if (*out_h) {           //use header if one is already present
+        new_h = *out_h;
+    }
+    else {
+        if (h) {            //use the explicit header given
+            new_h = sam_hdr_dup(h);
+            if (!new_h) {
+                fprintf(samtools_stderr, "[%s] ERROR: header duplication failed.\n",
+                        __func__);
+                goto fail;
+            }
         }
     }
 
     for (i = 0; i < nfn; ++i) {
-        cram_fd *in_c;
         int ki;
-
-        in = sam_open(fn[i], "rc");
+        //1st file is already open and passed, rest open locally
+        files[i] = in = i ? sam_open(fn[i], "r") : firstfile;
         if (in == 0) {
             print_error_errno("cat", "fail to open file '%s'", fn[i]);
             goto fail;
         }
-        in_c = in->fp.cram;
-
-        int vmaj = cram_major_vers(in_c);
-        int vmin = cram_minor_vers(in_c);
-        if ((vers_maj != -1 && vers_maj != vmaj) ||
-            (vers_min != -1 && vers_min != vmin)) {
-            fprintf(samtools_stderr, "[%s] ERROR: input files have differing version numbers.\n",
-                    __func__);
+        if (firstfile->format.format != in->format.format) {
+            print_error("cat", "File %s is of different format!", fn[i]);
             goto fail;
         }
-        vers_maj = vmaj;
-        vers_min = vmin;
+        if (firstfile->format.format == cram) {     //version check for cram
+            cram_fd *in_c;
+            in_c = in->fp.cram;
+
+            int vmaj = cram_major_vers(in_c);
+            int vmin = cram_minor_vers(in_c);
+            if ((vers_maj != -1 && vers_maj != vmaj) ||
+                (vers_min != -1 && vers_min != vmin)) {
+                fprintf(samtools_stderr, "[%s] ERROR: input files have differing version numbers.\n",
+                        __func__);
+                goto fail;
+            }
+            vers_maj = vmaj;
+            vers_min = vmin;
+        }
 
         old_h = sam_hdr_read(in);
         if (!old_h) {
@@ -113,10 +139,10 @@ static sam_hdr_t *cram_cat_check_hdr(int nfn, char * const *fn, const sam_hdr_t
                 goto fail;
             }
             sam_hdr_destroy(old_h);
-            sam_close(in);
+            old_h = NULL;
             continue;
         }
-
+        //merge RG lines
         int old_count = sam_hdr_count_lines(old_h, "RG");
         for (ki = 0; ki < old_count; ki++) {
             const char *old_name = sam_hdr_line_name(old_h, "RG", ki);
@@ -138,7 +164,8 @@ static sam_hdr_t *cram_cat_check_hdr(int nfn, char * const *fn, const sam_hdr_t
             }
         }
 
-        if (old_count > 1 && sam_hdr_count_lines(new_h, "RG") == old_count) {
+        if (firstfile->format.format == cram && old_count > 1 && sam_hdr_count_lines(new_h, "RG") == old_count) {
+            //RG order check for cram
             for (ki = 0; ki < old_count; ki++) {
                 const char *old_name = sam_hdr_line_name(old_h, "RG", ki);
                 const char *new_name = sam_hdr_line_name(new_h, "RG", ki);
@@ -150,107 +177,281 @@ static sam_hdr_t *cram_cat_check_hdr(int nfn, char * const *fn, const sam_hdr_t
             }
         }
 
-        sam_hdr_destroy(old_h);
-        sam_close(in);
+        sam_hdr_destroy(old_h); old_h = NULL;
     }
 
     ks_free(&ks);
 
-    *vers_maj_p = vers_maj;
-    *vers_min_p = vers_min;
-
-    return new_h;
+    if (vers_maj_p) {
+        *vers_maj_p = vers_maj;
+    }
+    if (vers_min_p) {
+        *vers_min_p = vers_min;
+    }
+    *out_h = new_h;
+    return files;
 
 fail:
     ks_free(&ks);
     if (old_h) sam_hdr_destroy(old_h);
     if (new_h) sam_hdr_destroy(new_h);
-    if (in) sam_close(in);
+    *out_h = NULL;
+    for (i = 1; i < nfn; ++i) {         //close files other than the firstfile
+        if (files[i]) {
+            sam_close(files[i]);
+        }
+    }
+    free(files);
 
     return NULL;
 }
 
 
-/*
- * CRAM files don't store the RG:Z:ID per read in the aux field.
- * Instead they have a numerical data series (RG) to point each read
- * back to the Nth @RG line in the file.  This means that we may need
- * to edit the RG data series (if the files were produced from
- * "samtools split" for example).
- *
- * The encoding method is stored in the compression header. Typical
- * examples:
- *
- * RG => EXTERNAL {18}           # Block content-id 18 holds RG values
- *                               # as a series of ITF8 encoded values
- *
- * RG => HUFFMAN {1, 255, 255, 255, 255, 255, 1, 0}
- *                               # One RG value #-1.  (No RG)
- *
- * RG => HUFFMAN {1, 0, 1, 0}    # One RG value #0 (always first RG)
- *
- * RG => HUFFMAN {2, 0, 1, 2, 1, 1}
- *                               # Two RG values, #0 and #1, written
- *                               # to the CORE block and possibly
- *                               # mixed with other data series.
- *
- * A single value can (but may not be) implemented as a zero bit
- * huffman code.  In this situation we can change the meta-data in the
- * compression header to renumber an RG value..
+/* ----------------------------------------------------------------------
+ * CRAM cat
  */
-int cram_cat(int nfn, char * const *fn, const sam_hdr_t *h, const char* outcram, sam_global_args *ga, char *arg_list, int no_pg)
+
+// Reports the number of CRAM containers spanning a specified region if
+// specified, or the entire file if not.
+// This is the implements the "samtools cat -q [-r reg]" functionality.
+//
+// Returns 0 on success, <0 on error.
+static int cram_query_ncont(int nfn, char * const *fn, char *reg) {
+    int i;
+    hts_idx_t *idx = NULL;
+    sam_hdr_t *hdr = NULL;
+    hts_itr_t *iter = NULL;
+    samFile *in = NULL;
+
+    for (i = 0; i < nfn; i++) {
+        in = sam_open(fn[i], "r");
+        if (!in) {
+            print_error_errno("cat", "Couldn't open file %s", fn[i]);
+            return -1;
+        }
+        idx = sam_index_load(in, fn[i]);
+        if (!idx) {
+            print_error("cat", "No index found");
+            goto err;
+        }
+
+        off_t cstart = 0, cend = 0;
+        if (reg) {
+            sam_hdr_t *hdr = sam_hdr_read(in);
+            hts_itr_t *iter = sam_itr_querys(idx, hdr, reg);
+            if (!hdr) {
+                print_error("cat", "Unable to read header");
+                goto err;
+            }
+            if (!iter) {
+                print_error("cat", "Unable to parse region");
+                goto err;
+            }
+
+            if (cram_index_extents(in->fp.cram, iter->tid, iter->beg,
+                                   iter->end, &cstart, &cend) < 0) {
+                print_error("cat", "Failed to query index");
+                goto err;
+            }
+
+            hts_itr_destroy(iter);
+            sam_hdr_destroy(hdr);
+        }
+
+        int64_t first, last;
+        int64_t nc = cram_num_containers_between(in->fp.cram, cstart, cend,
+                                                 &first, &last);
+        fprintf(samtools_stdout, "%s\t%"PRId64"\t%"PRId64"\t%"PRId64"\n",
+               fn[i], nc, first, last);
+        sam_close(in);
+        hts_idx_destroy(idx);
+    }
+
+    return 0;
+
+ err:
+    if (idx)
+        hts_idx_destroy(idx);
+    if (hdr)
+        sam_hdr_destroy(hdr);
+    if (iter)
+        hts_itr_destroy(iter);
+
+    sam_close(in);
+
+    return -1;
+}
+
+// Container range #:A-B or #:A.
+// Returns 0 on success, -1 on failure.
+static int cram_handle_cnum_region(cram_fd *fd, hts_idx_t *idx,
+                                   char *reg, off_t *cstart, off_t *cend) {
+    int cnum_start, cnum_end; // container versions
+    int n = sscanf(reg, "#:%d-%d", &cnum_start, &cnum_end);
+
+    if (n == 0) {
+        print_error("cat", "ERROR: Malformed region: %s", reg);
+        return -1;
+    } else if (n == 1) {
+        cnum_end = cnum_start;
+    }
+
+    int64_t nc = cram_num_containers(fd);
+    if (cnum_end >= nc) {
+        print_error("cat", "Too many containers.  "
+                    "The end range should be < %"PRId64, nc);
+        return -1;
+    }
+
+    // Container number to offset
+    *cstart = cram_container_num2offset(fd, cnum_start);
+    *cend   = cram_container_num2offset(fd, cnum_end);
+    if (*cstart < 0 || *cend < 0)
+        return -1;
+
+    // Seek manually
+    return cram_seek(fd, *cstart, SEEK_SET);
+}
+
+// Normal range chr:start-end.
+// Returns an hts iterator on success, NULL on failure.
+static hts_itr_t *cram_handle_region(cram_fd *fd, hts_idx_t *idx, sam_hdr_t *h,
+                                     char *reg, off_t *cstart, off_t *cend) {
+    hts_itr_t *iter;
+
+    if (!idx) {
+        fprintf(samtools_stderr, "[%s] ERROR: No index found.\n", __func__);
+        return NULL;
+    }
+
+    // This does an implicit seek and modifies the cram_fd.
+    if (!(iter = sam_itr_querys(idx, h, reg))) {
+        print_error("cat", "Unable to parse region %s", reg);
+        return NULL;
+    }
+    if (cram_index_extents(fd, iter->tid, iter->beg,
+                           iter->end, cstart, cend) < 0) {
+        print_error("cat", "Failed to query index");
+        return NULL;
+    }
+
+    return iter;
+}
+
+// Handle the -p A/B option to subdivide our region or file into portions.
+// Updates cstart/cend.
+// returns 0 on success (do something),
+//         1 on success (but nothing to do),
+//        -1 on failure.
+static int cram_subdivide_part(cram_fd *fd, hts_idx_t *idx, char *part,
+                               off_t *cstart, off_t *cend) {
+    int a, b;
+    // Part N of M
+    if (sscanf(part, "%d/%d", &a, &b) != 2) {
+        print_error("cat", "malformed region %s. Should be e.g. '1/10'", part);
+        return -1;
+    }
+
+    // Inclusive container numbers for range, 0 to NC-1
+    // Our part N/M is in container percentages as we can't have
+    // partial containers, so convert to that first.
+    int64_t cnum1, cnum2;
+    if (cstart) {
+        cnum1 = cram_container_offset2num(fd, *cstart);
+        cnum2 = cram_container_offset2num(fd, *cend);
+    } else {
+        cnum1 = 0;
+        cnum2 = cram_num_containers(fd)-1;
+    }
+
+    // Subdivide cnum1/cnum2 container numbers to new range cnum_start/end
+    int64_t nc = cnum2 - cnum1 + 1;
+    if (b > nc)
+        b = nc;
+
+    int cnum_start = (a-1)*(double)nc/b;
+    int cnum_end   = a*(double)nc/b - 1;
+    if (cnum_start < 0 || cnum_end >= nc)
+        return 1;
+
+    // Then convert back to file offsets so we can seek and do htell
+    // to detect EOR/EOF.
+    *cstart = cram_container_num2offset(fd, cnum_start + cnum1);
+    *cend   = cram_container_num2offset(fd, cnum_end   + cnum1);
+
+    return 0;
+}
+
+// The main cram_cat interface.
+// Returns 0 on success, < 0 on error.
+int cram_cat(samFile * const firstfile, int nfn, char * const *fn,
+             const sam_hdr_t *h, const char* outcram, sam_global_args *ga,
+             char *arg_list, int no_pg, char *reg, char *part, int fast_reg)
 {
-    samFile *out;
+    samFile *out = NULL;
     cram_fd *out_c;
-    int i, vers_maj, vers_min;
+    int i, vers_maj, vers_min, ret = -1;
     sam_hdr_t *new_h = NULL;
-
-    /* Check consistent versioning and compatible headers */
-    if (!(new_h = cram_cat_check_hdr(nfn, fn, h, &vers_maj, &vers_min)))
+    samFile **files = NULL;
+    hts_idx_t *idx = NULL;
+    hts_itr_t *iter = NULL;
+    sam_hdr_t *old_h = NULL;
+
+    // Check consistent versioning and compatible headers;
+    // merges RG lines, opens all files and returns them that multiple
+    // non-seekable stream inputs can be handled
+    if (!(files = cat_check_merge_hdr(firstfile, nfn, fn, h, &vers_maj,
+                                      &vers_min, &new_h)))
         return -1;
 
+    if (!new_h) {
+        print_error_errno("cat", "failed to make output header");
+        goto closefiles;
+    }
+
     /* Open the file with cram_vers */
     char vers[100];
-    sprintf(vers, "%d.%d", vers_maj, vers_min);
-    out = sam_open_format(outcram, "wc", &ga->out);
+    snprintf(vers, sizeof(vers), "%d.%d", vers_maj, vers_min);
+
+    // Can override level=1 with e.g. "--output-fmt-option level=9"
+    out = sam_open_format(outcram, "wc1", &ga->out);
     if (out == 0) {
         print_error_errno("cat", "fail to open output file '%s'", outcram);
-        return -1;
+        goto closefiles;
     }
     out_c = out->fp.cram;
     cram_set_option(out_c, CRAM_OPT_VERSION, vers);
-    //fprintf(samtools_stderr, "Creating cram vers %s\n", vers);
 
     if (!no_pg && sam_hdr_add_pg(new_h, "samtools",
                                  "VN", samtools_version(),
                                  arg_list ? "CL": NULL,
                                  arg_list ? arg_list : NULL,
                                  NULL))
-        return -1;
+        goto closefiles;
 
     if (sam_hdr_write(out, new_h) < 0) {
         print_error_errno("cat", "Couldn't write header");
-        return -1;
+        goto closefiles;
     }
+    out_c = out->fp.cram;
 
     for (i = 0; i < nfn; ++i) {
         samFile *in;
         cram_fd *in_c;
         cram_container *c;
-        sam_hdr_t *old_h;
         int new_rg = -1;
 
-        in = sam_open(fn[i], "rc");
+        in = files[i];
         if (in == 0) {
             print_error_errno("cat", "fail to open file '%s'", fn[i]);
-            return -1;
+            goto closefiles;
         }
         in_c = in->fp.cram;
 
         old_h = sam_hdr_read(in);
         if (!old_h) {
             print_error("cat", "fail to read the header of file '%s'", fn[i]);
-            return -1;
+            goto closefiles;
         }
 
         // Compute RG mapping if suitable for changing.
@@ -260,71 +461,237 @@ int cram_cat(int nfn, char * const *fn, const sam_hdr_t *h, const char* outcram,
                 new_rg = sam_hdr_line_index(new_h, "RG", old_name);
                 if (new_rg < 0) {
                     print_error("cat", "fail to find @RG line '%s' in the new header", old_name);
-                    return -1;
+                    goto closefiles;
                 }
             } else {
                 print_error("cat", "fail to find @RG line in file '%s'", fn[i]);
-                return -1;
+                goto closefiles;
             }
         } else {
             new_rg = 0;
         }
 
+        // We have multiple region syntax.  Either the standard chr:start-end
+        // or a cat-specific #:num-num for explicit container numbers.
+        // Both of these seek to a specific file offset and also have the
+        // end offset known so we can use htell to detect when we're done.
+        // Those offsets are in cstart and cend.
+        //
+        // However for the e.g. -p 1/10 syntax we need to know the container
+        // numbers corresponding to the cstart/cend offsets as we can't
+        // start half way through a container when doing fractions.
+        // These are cnum1 and cnum2.  (We could short cut this for the
+        // #:num-num, but it's simpler to just treat all regions identically.)
+        off_t cstart = 0, cend = 0;
+        int filter_by_cnum = 0;
+
+        if (reg || part) {
+            idx = sam_index_load(in, fn[i]);
+            if (!idx) {
+                print_error("cat", "failed to load index");
+                goto closefiles;
+            }
+        }
+
+        if (reg) {
+            if (strncmp(reg, "#:", 2) == 0) {
+                // Region as container numbers
+                if (cram_handle_cnum_region(in_c, idx, reg, &cstart,&cend) < 0)
+                    goto closefiles;
+
+                filter_by_cnum = 1;
+            } else {
+                // Normal range chr:start-end
+                if (!(iter = cram_handle_region(in_c, idx, old_h, reg,
+                                                &cstart, &cend)))
+                    goto closefiles;
+            }
+        }
+
+        // We can also take a range above and subdivide it into parts.
+        // Eg -r chr1 -p 1/10 (... to -p 10/10).  Part only implies
+        // portions of the entire file.
+        if (part) {
+            int r = cram_subdivide_part(in_c, idx, part, &cstart, &cend);
+            if (r != 0) {
+                if (r > 0) // Not an error, just nothing to do
+                    ret = 0;
+                goto closefiles;
+            }
+        }
+
+        if (cstart) // reg or part
+            if (0 != cram_seek(in_c, cstart, SEEK_SET))
+                goto closefiles;
+
+
+        // Make refid -2 ("*") come after other chromosomes, for easy sort
+        int itid = iter
+            ? (iter->tid == HTS_IDX_NOCOOR ? INT_MAX : iter->tid)
+            : 0;
+        int last_ref_id = -99;
+
+        off_t before_hdr = htell(cram_fd_get_fp(in_c));
+
         // Copy contains and blocks within them
         while ((c = cram_read_container(in_c))) {
             if (cram_container_is_empty(in_c)) {
                 cram_block *blk;
                 // Container compression header
                 if (!(blk = cram_read_block(in_c)))
-                    return -1;
+                    goto closefiles;
                 cram_free_block(blk);
                 cram_free_container(c);
                 continue;
             }
 
+            int filter = 0;
+
             // If we have just one RG key and new_rg != 0 then
             // we need to edit the compression header. IF WE CAN.
             if (new_rg) {
+                if (reg) {
+                    print_error("cat", "Cannot specify a region while "
+                                "transcoding RG lines");
+                    goto closefiles;
+                }
                 int zero = 0;
                 //fprintf(samtools_stderr, "Transcode RG %d to %d\n", 0, new_rg);
                 cram_transcode_rg(in_c, out_c, c, 1, &zero, &new_rg);
             } else {
                 int32_t num_slices;
-                cram_block *blk;
 
-                // Not switching rg so do the usual read/write loop
-                if (cram_write_container(out_c, c) != 0)
-                    return -1;
+                if (reg) {
+                    if (before_hdr > cend) {
+                        cram_free_container(c);
+                        break;
+                    }
+                }
 
-                // Container compression header
-                if (!(blk = cram_read_block(in_c)))
-                    return -1;
-                if (cram_write_block(out_c, blk) != 0) {
-                    cram_free_block(blk);
-                    return -1;
+                // For chr:start-end regions, do we need to filter or skip?
+                if (iter && reg) {
+                    int refid;
+                    hts_pos_t start, span, end;
+
+                    cram_container_get_coords(c, &refid, &start, &span);
+                    end = start+span;
+
+                    // Make refid -1 ("*") come after other chromosomes
+                    if (refid == -1)
+                        refid = INT_MAX;
+
+                    if (refid > itid || start > iter->end) {
+                        // Beyond the requested range
+                        break;
+                    } else if (refid == -2) {
+                        // Multi-ref containers.  We only support this if
+                        // the RI data series is in a container by itself.
+                        filter = 3;
+                    } else if (refid < itid || end < iter->beg) {
+                        // Skip, in case of mixed size containers.
+                        // Eg: Use "=", skip "-".
+                        // ==========|======|=
+                        //   ----  ==|==  ==|== ----
+                        //     ----- | ====   ----
+                        filter = 2;
+                    } else if (start < iter->beg || end > iter->end) {
+                        // Container overlaps region.
+                        // Fast mode just copies overlapping containers.
+                        // Slow mode does a precise filtering by reading and
+                        // filtering each record in turn so it's the same as
+                        // a samtools view command.
+                        filter = fast_reg ? 0 : 1;
+                    }
+                    // else we're in an "internal" container, so just copy
                 }
-                cram_free_block(blk);
 
+                if (filter && last_ref_id == -1 && itid == INT_MAX)
+                    // Multi-ref containers consisting solely of ref "*" are
+                    // common, but if it's sorted then we know it's "*" from
+                    // here on so we don't need to filter despite multi-ref.
+                    filter = 0;
 
-                // Container num_blocks can be invalid, due to a bug.
-                // Instead we iterate in slice context instead.
-                (void)cram_container_get_landmarks(c, &num_slices);
-                cram_copy_slice(in_c, out_c, num_slices);
-            }
+                if (filter) {
+                    // Filter or skip
+                    cram_filter_container(in_c, out_c, c, &last_ref_id);
+                } else {
+                    // Copy. Consider adding a cram_copy_container API instead.
 
+                    // Container compression header
+                    cram_block *blk;
+                    if (!(blk = cram_read_block(in_c)))
+                        goto closefiles;
+
+                    // Not switching rg so do the usual read/write loop
+                    if (cram_write_container(out_c, c) != 0)
+                        goto closefiles;
+
+                    // Contatiner compression header
+                    if (cram_write_block(out_c, blk) != 0) {
+                        cram_free_block(blk);
+                        goto closefiles;
+                    }
+
+                    // Container num_blocks can be invalid, due to a bug.
+                    // Instead we iterate in slice context instead.
+                    (void)cram_container_get_landmarks(c, &num_slices);
+                    if (cram_copy_slice(in_c, out_c, num_slices) < 0) {
+                        cram_free_block(blk);
+                        goto closefiles;
+                    }
+                    cram_free_block(blk);
+                }
+            }
             cram_free_container(c);
-        }
 
+            // Location of next container start
+            before_hdr = htell(cram_fd_get_fp(in_c));
+            if (filter_by_cnum && before_hdr > cend)
+                break;
+        }
         sam_hdr_destroy(old_h);
-        sam_close(in);
+        old_h = NULL;
+
+        if (idx) {
+            hts_idx_destroy(idx);
+            idx = NULL;
+        }
+
+        if (iter) {
+            hts_itr_destroy(iter);
+            iter = NULL;
+        }
     }
-    sam_close(out);
-    sam_hdr_destroy(new_h);
+    ret = 0;
 
-    return 0;
-}
+closefiles:
+    if (old_h)
+        sam_hdr_destroy(old_h);
+
+    if (idx)
+        hts_idx_destroy(idx);
+
+    if (iter)
+        hts_itr_destroy(iter);
 
+    if (out)
+        sam_close(out);
+
+    if (new_h)
+        sam_hdr_destroy(new_h);
+
+    for (i = 1; i < nfn; ++i) {     //skip firstfile and close rest
+        if (files[i]) {
+            sam_close(files[i]);
+        }
+    }
+    free(files);
+    return ret;
+}
 
+/* ----------------------------------------------------------------------
+ * BAM cat
+ */
 #define BUF_SIZE 0x10000
 
 #define GZIPID1 31
@@ -332,31 +699,40 @@ int cram_cat(int nfn, char * const *fn, const sam_hdr_t *h, const char* outcram,
 
 #define BGZF_EMPTY_BLOCK_SIZE 28
 
-int bam_cat(int nfn, char * const *fn, sam_hdr_t *h, const char* outbam, char *arg_list, int no_pg)
+int bam_cat(samFile * const firstfile, int nfn, char * const *fn, sam_hdr_t *h, const char* outbam, char *arg_list, int no_pg)
 {
-    BGZF *fp, *in = NULL;
+    BGZF *fp = NULL, *in = NULL;
     uint8_t *buf = NULL;
     uint8_t ebuf[BGZF_EMPTY_BLOCK_SIZE];
     const int es=BGZF_EMPTY_BLOCK_SIZE;
     int i;
+    samFile **files = NULL;
+    sam_hdr_t *new_h = NULL;
 
+    /* merges RG lines, opens all files and returns them that multiple non-seekable
+    stream inputs can be handled */
+    if (!(files = cat_check_merge_hdr(firstfile, nfn, fn, h, NULL, NULL, &new_h)))
+        return -1;
+    if (!new_h) {
+        print_error_errno("cat", "failed to make output header");
+        goto fail;
+    }
     fp = strcmp(outbam, "-")? bgzf_open(outbam, "w") : bgzf_fdopen(fileno(samtools_stdout), "w");
     if (fp == 0) {
         print_error_errno("cat", "fail to open output file '%s'", outbam);
-        return -1;
+        goto fail;
     }
-    if (h) {
-        if (!no_pg && sam_hdr_add_pg(h, "samtools",
-                                     "VN", samtools_version(),
-                                     arg_list ? "CL": NULL,
-                                     arg_list ? arg_list : NULL,
-                                     NULL))
-            goto fail;
 
-        if (bam_hdr_write(fp, h) < 0) {
-            print_error_errno("cat", "Couldn't write header");
-            goto fail;
-        }
+    if (!no_pg && sam_hdr_add_pg(new_h, "samtools",
+                                    "VN", samtools_version(),
+                                    arg_list ? "CL": NULL,
+                                    arg_list ? arg_list : NULL,
+                                    NULL))
+        goto fail;
+
+    if (bam_hdr_write(fp, new_h) < 0) {
+        print_error_errno("cat", "Couldn't write header");
+        goto fail;
     }
 
     buf = (uint8_t*) malloc(BUF_SIZE);
@@ -365,35 +741,13 @@ int bam_cat(int nfn, char * const *fn, sam_hdr_t *h, const char* outbam, char *a
         goto fail;
     }
     for(i = 0; i < nfn; ++i){
-        sam_hdr_t *old;
         int len,j;
-
-        in = strcmp(fn[i], "-")? bgzf_open(fn[i], "r") : bgzf_fdopen(fileno(stdin), "r");
+        in = files[i]->fp.bgzf;
         if (in == 0) {
             print_error_errno("cat", "fail to open file '%s'", fn[i]);
             goto fail;
         }
-        if (in->is_write) return -1;
-
-        old = bam_hdr_read(in);
-        if (old == NULL) {
-            fprintf(samtools_stderr, "[%s] ERROR: couldn't read header for '%s'.\n",
-                    __func__, fn[i]);
-            goto fail;
-        }
-        if (h == 0 && i == 0) {
-            if (!no_pg && sam_hdr_add_pg(old, "samtools",
-                                         "VN", samtools_version(),
-                                         arg_list ? "CL": NULL,
-                                         arg_list ? arg_list : NULL,
-                                         NULL))
-                goto fail;
-
-            if (bam_hdr_write(fp, old) < 0) {
-                print_error_errno("cat", "Couldn't write header");
-                goto fail;
-            }
-        }
+        if (in->is_write) goto fail;
 
         if (in->block_offset < in->block_length) {
             if (bgzf_write(fp, (char *)in->uncompressed_block + in->block_offset, in->block_length - in->block_offset) < 0) goto write_fail;
@@ -434,39 +788,55 @@ int bam_cat(int nfn, char * const *fn, sam_hdr_t *h, const char* outbam, char *a
                 if (bgzf_raw_write(fp, ebuf, es) < 0) goto write_fail;
             }
         }
-        sam_hdr_destroy(old);
-        bgzf_close(in);
         in = NULL;
     }
     free(buf);
     if (bgzf_close(fp) < 0) {
         fprintf(samtools_stderr, "[%s] Error on closing '%s'.\n", __func__, outbam);
-        return -1;
+        goto fail;
+    }
+    for (i = 1; i < nfn; ++i) {     //skip firstfile and close rest
+        if (files[i]) {
+            sam_close(files[i]);
+        }
     }
+    free(files);
+    sam_hdr_destroy(new_h);
     return 0;
 
  write_fail:
     fprintf(samtools_stderr, "[%s] Error writing to '%s'.\n", __func__, outbam);
  fail:
-    if (in) bgzf_close(in);
+    if (new_h) {
+        sam_hdr_destroy(new_h);
+    }
     if (fp) bgzf_close(fp);
     free(buf);
+
+    if (files) {
+        for(i = 1; i < nfn; ++i) {  //except the firstfile
+            if(files[i]) {
+                sam_close(files[i]);
+            }
+        }
+        free(files);
+    }
     return -1;
 }
 
-
 int main_cat(int argc, char *argv[])
 {
     sam_hdr_t *h = 0;
     char *outfn = 0;
     char **infns = NULL; // files to concatenate
     int infns_size = 0;
-    int c, ret = 0, no_pg = 0, usage = 0;
+    int c, ret = 0, no_pg = 0, usage = 0, query_ncont = 0, fast_mode = 0;
     samFile *in;
     sam_global_args ga;
+    char *reg = NULL, *part = NULL;
 
     static const struct option lopts[] = {
-        SAM_OPT_GLOBAL_OPTIONS('-', '-', '-', 0, '-', '@'),
+        SAM_OPT_GLOBAL_OPTIONS('-', '-', '-', 0, '-', '-'),
         {"no-PG", no_argument, NULL, 1},
         { NULL, 0, NULL, 0 }
     };
@@ -474,21 +844,22 @@ int main_cat(int argc, char *argv[])
     char *arg_list = NULL;
 
     sam_global_args_init(&ga);
-
-    while ((c = getopt_long(argc, argv, "h:o:b:@:", lopts, NULL)) >= 0) {
+    while ((c = getopt_long(argc, argv, "h:o:b:r:p:qf", lopts, NULL)) >= 0) {
         switch (c) {
             case 'h': {
                 samFile *fph = sam_open(optarg, "r");
                 if (fph == 0) {
                     fprintf(samtools_stderr, "[%s] ERROR: fail to read the header from '%s'.\n", __func__, optarg);
-                    return 1;
+                    ret = 1;
+                    goto end;
                 }
                 h = sam_hdr_read(fph);
                 if (h == NULL) {
                     fprintf(samtools_stderr,
                             "[%s] ERROR: failed to read the header from '%s'.\n",
                             __func__, optarg);
-                    return 1;
+                    ret = 1;
+                    goto end;
                 }
                 sam_close(fph);
                 break;
@@ -514,6 +885,19 @@ int main_cat(int argc, char *argv[])
             case 1:
                 no_pg = 1;
                 break;
+            case 'r':
+                reg = optarg;
+                break;
+            case 'p':
+                part = optarg;
+                break;
+            case 'f':
+                fast_mode = 1;
+                break;
+            case 'q':
+                query_ncont=1;
+                break;
+
             default:
                 if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
                 /* else fall-through */
@@ -523,7 +907,8 @@ int main_cat(int argc, char *argv[])
 
     if (!no_pg && !(arg_list = stringify_argv(argc+1, argv-1))) {
         print_error("cat", "failed to create arg_list");
-        return 1;
+        ret = 1;
+        goto end;
     }
 
     // Append files specified in argv to the list.
@@ -543,34 +928,48 @@ int main_cat(int argc, char *argv[])
         fprintf(samtools_stderr, "         -h FILE  copy the header from FILE [default is 1st input file]\n");
         fprintf(samtools_stderr, "         -o FILE  output BAM/CRAM\n");
         fprintf(samtools_stderr, "         --no-PG  do not add a PG line\n");
-        sam_global_opt_help(samtools_stderr, "--..-@-.");
-        return 1;
+        fprintf(samtools_stderr, "\nCRAM only options for filtering:\n");
+        fprintf(samtools_stderr, "         -r REG   filter to region REG.\n");
+        fprintf(samtools_stderr, "                  REG can also be #:cstart-cend for specific container numbers\n");
+        fprintf(samtools_stderr, "         -p N/M   Specify part N of M (where N is 1 to M inclusive)\n");
+        fprintf(samtools_stderr, "         -f       Fast mode: don't filter containers to exactly match region\n");
+        fprintf(samtools_stderr, "         -q       Query the total number of indexed containers\n");
+        fprintf(samtools_stderr, "\nStandard options:\n");
+        sam_global_opt_help(samtools_stderr, "---.---.");
+        ret = 1;
+        goto end;
     }
 
     in = sam_open(infns[0], "r");
     if (!in) {
         print_error_errno("cat", "failed to open file '%s'", infns[0]);
-        return 1;
+        ret = 1;
+        goto end;
     }
 
     switch (hts_get_format(in)->format) {
     case bam:
-        sam_close(in);
-        if (bam_cat(infns_size+nargv_fns, infns, h, outfn? outfn : "-", arg_list, no_pg) < 0)
+        if (bam_cat(in, infns_size+nargv_fns, infns, h, outfn? outfn : "-", arg_list, no_pg) < 0)
             ret = 1;
         break;
 
     case cram:
-        sam_close(in);
-        if (cram_cat(infns_size+nargv_fns, infns, h, outfn? outfn : "-", &ga, arg_list, no_pg) < 0)
+        if (query_ncont) {
+            if (cram_query_ncont(infns_size+nargv_fns, infns, reg) < 0)
+                ret = 1;
+        } else {
+            if (cram_cat(in, infns_size+nargv_fns, infns, h,
+                         outfn? outfn : "-", &ga, arg_list, no_pg, reg,
+                         part, fast_mode) < 0)
             ret = 1;
+        }
         break;
 
     default:
-        sam_close(in);
         fprintf(samtools_stderr, "[%s] ERROR: input is not BAM or CRAM\n", __func__);
-        return 1;
+        ret = 1;
     }
+    sam_close(in);
 
  end:
     if (infns_size > 0) {
@@ -584,6 +983,7 @@ int main_cat(int argc, char *argv[])
     free(arg_list);
     if (h)
         sam_hdr_destroy(h);
+    sam_global_args_free(&ga);
 
     return ret;
 }
diff --git a/samtools/bam_consensus.c b/samtools/bam_consensus.c
index 3cbb24fa7..8572e0f03 100644
--- a/samtools/bam_consensus.c
+++ b/samtools/bam_consensus.c
@@ -1,7 +1,7 @@
 /*  bam_consensus.c -- consensus subcommand.
 
     Copyright (C) 1998-2001,2003 Medical Research Council (Gap4/5 source)
-    Copyright (C) 2003-2005,2007-2023 Genome Research Ltd.
+    Copyright (C) 2003-2005,2007-2024 Genome Research Ltd.
 
     Author: James Bonfield <jkb@sanger.ac.uk>
 
@@ -1917,8 +1917,8 @@ static int calculate_consensus_simple(const pileup_t *plp,
     // Ignore ambiguous bases in seq for now, so we don't treat R, Y, etc
     // as part of one base and part another.  Based on BAM seqi values.
     // We also use freq[16] as "*" for gap.
-    int freq[17] = {0};  // base frequency, aka depth
-    int score[17] = {0}; // summation of base qualities
+    int      freq[17]  = {0}; // base frequency, aka depth
+    uint64_t score[17] = {0}; // summation of base qualities
 
     // Accumulate
     for (; plp; plp = plp->next) {
@@ -1959,13 +1959,13 @@ static int calculate_consensus_simple(const pileup_t *plp,
     }
 
     // Total usable depth
-    int tscore = 0;
+    uint64_t tscore = 0;
     for (i = 0; i < 5; i++)
         tscore += score[1<<i];
 
     // Best and second best potential calls
-    int call1  = 15, call2 = 15;
-    int score1 = 0,  score2 = 0;
+    int      call1  = 15, call2  = 15;
+    uint64_t score1 = 0,  score2 = 0;
     for (i = 0; i < 5; i++) {
         int c = 1<<i; // A C G T *
         if (score1 < score[c]) {
@@ -1980,8 +1980,8 @@ static int calculate_consensus_simple(const pileup_t *plp,
     }
 
     // Work out which best and second best are usable as a call
-    int used_score = score1;
-    int used_base  = call1;
+    uint64_t used_score = score1;
+    int      used_base  = call1;
     if (score2 >= opts->het_fract * score1 && opts->ambig) {
         used_base  |= call2;
         used_score += score2;
@@ -2083,7 +2083,10 @@ static int basic_pileup(void *cd, samFile *fp, sam_hdr_t *h, pileup_t *p,
         calculate_consensus_gap5m(pos, opts->use_mqual ? CONS_MQUAL : 0,
                                   depth, p, opts, &cons, opts->default_qual,
                                   &cons_prob_recall, &cons_prob_precise);
-        if (cons.het_logodd > 0 && opts->ambig) {
+        if (cons.depth < opts->min_depth) {
+            cb = 'N';
+            cq = 0;
+        } else if (cons.het_logodd > 0 && opts->ambig) {
             cb = "AMRWa" // 5x5 matrix with ACGT* per row / col
                  "MCSYc"
                  "RSGKg"
@@ -2228,14 +2231,17 @@ static int basic_fasta(void *cd, samFile *fp, sam_hdr_t *h, pileup_t *p,
         calculate_consensus_gap5m(pos, opts->use_mqual ? CONS_MQUAL : 0,
                                   depth, p, opts, &cons, opts->default_qual,
                                   &cons_prob_recall, &cons_prob_precise);
-        if (cons.het_logodd > 0 && opts->ambig) {
+        if (cons.depth < opts->min_depth) {
+            cb = 'N';
+            cq = 0;
+        } else if (cons.het_logodd > 0 && opts->ambig) {
             cb = "AMRWa" // 5x5 matrix with ACGT* per row / col
                  "MCSYc"
                  "RSGKg"
                  "WYKTt"
                  "acgt*"[cons.het_call];
             cq = cons.het_logodd;
-        } else{
+        } else {
             cb = "ACGT*"[cons.call];
             cq = cons.phred;
         }
@@ -2319,10 +2325,10 @@ static void usage_exit(FILE *fp, int exit_status) {
     fprintf(fp, "  --show-ins yes/no     Whether to show insertions [yes]\n");
     fprintf(fp, "  --mark-ins            Add '+' before every inserted base/qual [off]\n");
     fprintf(fp, "  -A, --ambig           Enable IUPAC ambiguity codes [off]\n");
+    fprintf(fp, "  -d, --min-depth INT   Minimum depth of INT [1]\n");
     fprintf(fp, "\nFor simple consensus mode:\n");
     fprintf(fp, "  -q, --(no-)use-qual   Use quality values in calculation [off]\n");
     fprintf(fp, "  -c, --call-fract INT  At least INT portion of bases must agree [0.75]\n");
-    fprintf(fp, "  -d, --min-depth INT   Minimum depth of INT [2]\n");
     fprintf(fp, "  -H, --het-fract INT   Minimum fraction of 2nd-most to most common base [0.15]\n");
     fprintf(fp, "\nFor default \"Bayesian\" consensus mode:\n");
     fprintf(fp, "  -C, --cutoff C        Consensus cutoff quality C [10]\n");
diff --git a/samtools/bam_consensus.c.pysam.c b/samtools/bam_consensus.c.pysam.c
index b090a9a56..9c73233df 100644
--- a/samtools/bam_consensus.c.pysam.c
+++ b/samtools/bam_consensus.c.pysam.c
@@ -3,7 +3,7 @@
 /*  bam_consensus.c -- consensus subcommand.
 
     Copyright (C) 1998-2001,2003 Medical Research Council (Gap4/5 source)
-    Copyright (C) 2003-2005,2007-2023 Genome Research Ltd.
+    Copyright (C) 2003-2005,2007-2024 Genome Research Ltd.
 
     Author: James Bonfield <jkb@sanger.ac.uk>
 
@@ -1919,8 +1919,8 @@ static int calculate_consensus_simple(const pileup_t *plp,
     // Ignore ambiguous bases in seq for now, so we don't treat R, Y, etc
     // as part of one base and part another.  Based on BAM seqi values.
     // We also use freq[16] as "*" for gap.
-    int freq[17] = {0};  // base frequency, aka depth
-    int score[17] = {0}; // summation of base qualities
+    int      freq[17]  = {0}; // base frequency, aka depth
+    uint64_t score[17] = {0}; // summation of base qualities
 
     // Accumulate
     for (; plp; plp = plp->next) {
@@ -1961,13 +1961,13 @@ static int calculate_consensus_simple(const pileup_t *plp,
     }
 
     // Total usable depth
-    int tscore = 0;
+    uint64_t tscore = 0;
     for (i = 0; i < 5; i++)
         tscore += score[1<<i];
 
     // Best and second best potential calls
-    int call1  = 15, call2 = 15;
-    int score1 = 0,  score2 = 0;
+    int      call1  = 15, call2  = 15;
+    uint64_t score1 = 0,  score2 = 0;
     for (i = 0; i < 5; i++) {
         int c = 1<<i; // A C G T *
         if (score1 < score[c]) {
@@ -1982,8 +1982,8 @@ static int calculate_consensus_simple(const pileup_t *plp,
     }
 
     // Work out which best and second best are usable as a call
-    int used_score = score1;
-    int used_base  = call1;
+    uint64_t used_score = score1;
+    int      used_base  = call1;
     if (score2 >= opts->het_fract * score1 && opts->ambig) {
         used_base  |= call2;
         used_score += score2;
@@ -2085,7 +2085,10 @@ static int basic_pileup(void *cd, samFile *fp, sam_hdr_t *h, pileup_t *p,
         calculate_consensus_gap5m(pos, opts->use_mqual ? CONS_MQUAL : 0,
                                   depth, p, opts, &cons, opts->default_qual,
                                   &cons_prob_recall, &cons_prob_precise);
-        if (cons.het_logodd > 0 && opts->ambig) {
+        if (cons.depth < opts->min_depth) {
+            cb = 'N';
+            cq = 0;
+        } else if (cons.het_logodd > 0 && opts->ambig) {
             cb = "AMRWa" // 5x5 matrix with ACGT* per row / col
                  "MCSYc"
                  "RSGKg"
@@ -2230,14 +2233,17 @@ static int basic_fasta(void *cd, samFile *fp, sam_hdr_t *h, pileup_t *p,
         calculate_consensus_gap5m(pos, opts->use_mqual ? CONS_MQUAL : 0,
                                   depth, p, opts, &cons, opts->default_qual,
                                   &cons_prob_recall, &cons_prob_precise);
-        if (cons.het_logodd > 0 && opts->ambig) {
+        if (cons.depth < opts->min_depth) {
+            cb = 'N';
+            cq = 0;
+        } else if (cons.het_logodd > 0 && opts->ambig) {
             cb = "AMRWa" // 5x5 matrix with ACGT* per row / col
                  "MCSYc"
                  "RSGKg"
                  "WYKTt"
                  "acgt*"[cons.het_call];
             cq = cons.het_logodd;
-        } else{
+        } else {
             cb = "ACGT*"[cons.call];
             cq = cons.phred;
         }
@@ -2321,10 +2327,10 @@ static void usage_exit(FILE *fp, int exit_status) {
     fprintf(fp, "  --show-ins yes/no     Whether to show insertions [yes]\n");
     fprintf(fp, "  --mark-ins            Add '+' before every inserted base/qual [off]\n");
     fprintf(fp, "  -A, --ambig           Enable IUPAC ambiguity codes [off]\n");
+    fprintf(fp, "  -d, --min-depth INT   Minimum depth of INT [1]\n");
     fprintf(fp, "\nFor simple consensus mode:\n");
     fprintf(fp, "  -q, --(no-)use-qual   Use quality values in calculation [off]\n");
     fprintf(fp, "  -c, --call-fract INT  At least INT portion of bases must agree [0.75]\n");
-    fprintf(fp, "  -d, --min-depth INT   Minimum depth of INT [2]\n");
     fprintf(fp, "  -H, --het-fract INT   Minimum fraction of 2nd-most to most common base [0.15]\n");
     fprintf(fp, "\nFor default \"Bayesian\" consensus mode:\n");
     fprintf(fp, "  -C, --cutoff C        Consensus cutoff quality C [10]\n");
diff --git a/samtools/bam_fastq.c b/samtools/bam_fastq.c
index e4701b1e7..fadcccb76 100644
--- a/samtools/bam_fastq.c
+++ b/samtools/bam_fastq.c
@@ -1,6 +1,6 @@
 /*  bam_fastq.c -- FASTA and FASTQ file generation
 
-    Copyright (C) 2009-2017, 2019-2020, 2023 Genome Research Ltd.
+    Copyright (C) 2009-2017, 2019-2020, 2023-2024 Genome Research Ltd.
     Portions copyright (C) 2009, 2011, 2012 Broad Institute.
 
     Author: Heng Li <lh3@sanger.ac.uk>
@@ -40,9 +40,13 @@ DEALINGS IN THE SOFTWARE.  */
 #include "htslib/kstring.h"
 #include "htslib/bgzf.h"
 #include "htslib/thread_pool.h"
+#include "htslib/khash.h"
 #include "samtools.h"
 #include "sam_opts.h"
 
+KHASH_SET_INIT_STR(str)
+typedef khash_t(str) strhash_t;
+
 #define DEFAULT_BARCODE_TAG "BC"
 #define DEFAULT_QUALITY_TAG "QT"
 #define INDEX_SEPARATOR "+"
@@ -67,6 +71,8 @@ static void bam2fq_usage(FILE *to, const char *command)
 "               paired reads will be written to the -1 and -2 files.\n"
 "  -d, --tag TAG[:VAL]\n"
 "               only include reads containing TAG, optionally with value VAL\n"
+"  -D, --tag-file STR:FILE\n"
+"               only include reads containing TAG, with a value listed in FILE\n"
 "  -f, --require-flags INT\n"
 "               only include reads with all  of the FLAGs in INT present [0]\n"       //   F&x == x
 "  -F, --excl[ude]-flags INT\n"
@@ -150,9 +156,7 @@ typedef struct bam2fq_opts {
     char *extra_tags;
     char compression_level;
     const char *filter_tag;       // -d opt
-    const char *filter_value_str;
-    int64_t filter_value_int;
-    float filter_value_flt;
+    strhash_t *filter_tag_vals;
 } bam2fq_opts_t;
 
 typedef struct bam2fq_state {
@@ -171,6 +175,45 @@ typedef struct bam2fq_state {
     htsThreadPool p;
 } bam2fq_state_t;
 
+// Adds a single tag value to the filter tag value hash
+static int add_tag_value(bam2fq_opts_t *opts, char *val) {
+    if (!opts->filter_tag_vals) {
+        if (!(opts->filter_tag_vals = kh_init(str)))
+            return -1;
+    }
+
+    if (!(val = strdup(val)))
+        return -1;
+
+    int ret = 0;
+    kh_put(str, opts->filter_tag_vals, val, &ret);
+    if (ret <= 0)
+        free(val);
+
+    return ret < 0 ? -1 : 0;
+}
+
+// Adds multiple values, listed in a file
+static int add_tag_file(bam2fq_opts_t *opts, char *fn) {
+    FILE *fp;
+    kstring_t ks = {0,0};
+
+    if (!(fp = fopen(fn, "r"))) {
+        print_error_errno("fastq", "failed to open \"%s\" for reading", fn);
+        return -1;
+    }
+
+    while (ks.l = 0, kgetline(&ks, (kgets_func *)fgets, fp) >= 0) {
+        if (add_tag_value(opts, ks.s) < 0) {
+            ks_free(&ks);
+            return -1;
+        }
+    }
+
+    ks_free(&ks);
+    return 0;
+}
+
 static readpart which_readpart(const bam1_t *b)
 {
     if ((b->core.flag & BAM_FREAD1) && !(b->core.flag & BAM_FREAD2)) {
@@ -184,6 +227,14 @@ static readpart which_readpart(const bam1_t *b)
 
 static void free_opts(bam2fq_opts_t *opts)
 {
+    if (opts->filter_tag_vals) {
+        khint_t k;
+        for (k = 0; k < kh_end(opts->filter_tag_vals); k++)
+            if (kh_exist(opts->filter_tag_vals, k))
+                free((char *)kh_key(opts->filter_tag_vals, k));
+
+        kh_destroy(str, opts->filter_tag_vals);
+    }
     free(opts);
 }
 
@@ -230,9 +281,10 @@ static bool parse_opts(int argc, char *argv[], bam2fq_opts_t** opts_out)
         {"barcode-tag", required_argument, NULL, 'b'},
         {"quality-tag", required_argument, NULL, 'q'},
         {"tag", required_argument, NULL, 'd'},
+        {"tag-file", required_argument, NULL, 'D'},
         { NULL, 0, NULL, 0 }
     };
-    while ((c = getopt_long(argc, argv, "0:1:2:o:f:F:G:niNOs:c:tT:v:@:d:",
+    while ((c = getopt_long(argc, argv, "0:1:2:o:f:F:G:niNOs:c:tT:v:@:d:D:",
                             lopts, NULL)) > 0) {
         switch (c) {
             case 'b': opts->barcode_tag = optarg; break;
@@ -276,10 +328,44 @@ static bool parse_opts(int argc, char *argv[], bam2fq_opts_t** opts_out)
                     return false;
                 }
 
+                if (opts->filter_tag && memcmp(opts->filter_tag, optarg, 2)) {
+                    print_error("fastq", "Different tag type specified "
+                                "to before");
+                    free_opts(opts);
+                    return false;
+                }
+
+                if (strlen(optarg) >= 3)
+                    add_tag_value(opts, optarg+3);
+                opts->filter_tag = optarg;
+                break;
+
+            case 'D':
+                // Allow ";" as delimiter besides ":" to support MinGW CLI POSIX
+                // path translation as described at:
+                // http://www.mingw.org/wiki/Posix_path_conversion
+                if (strlen(optarg) < 4
+                    || (optarg[2] != ':' && optarg[2] != ';')) {
+                    print_error("view", "Invalid \"tag:file\" option: \"%s\"",
+                                optarg);
+                    free_opts(opts);
+                    return false;
+                }
+
+                if (opts->filter_tag && memcmp(opts->filter_tag, optarg, 2)) {
+                    print_error("fastq", "Different tag type specified "
+                                "to before");
+                    free_opts(opts);
+                    return false;
+                }
+
+                if (strlen(optarg) >= 3) {
+                    if (add_tag_file(opts, optarg+3) < 0) {
+                        free_opts(opts);
+                        return false;
+                    }
+                }
                 opts->filter_tag = optarg;
-                opts->filter_value_str = strlen(optarg) > 2 ? optarg+3 : NULL;
-                opts->filter_value_int = INT64_MAX; // fill out later
-                opts->filter_value_flt = FLT_MAX;
                 break;
 
             case '?':
@@ -630,45 +716,43 @@ static inline bool filter_it_out(const bam1_t *b, const bam2fq_state_t *state,
         if (!s)
             return true;
 
-        if (opts->filter_value_str) {
+        if (opts->filter_tag_vals) {
+            char t[32], *val = t;
             switch (*s) {
             case 'i': case 'I':
             case 's': case 'S':
             case 'c': case 'C':
-                if (opts->filter_value_int == INT64_MAX)
-                    // cache integer conversion for repeated use
-                    opts->filter_value_int =
-                        strtoll(opts->filter_value_str, NULL, 0);
-                if (opts->filter_value_int != bam_aux2i(s))
+                if (snprintf(t, 32, "%"PRId64, bam_aux2i(s)) <= 0)
                     return true;
                 break;
 
             case 'f':
-                if (opts->filter_value_flt == FLT_MAX)
-                    opts->filter_value_flt = atof(opts->filter_value_str);
                 // Comparing floats is hard.
                 // Eg (double)0.1 - (double)0.1f is -1.5e-9.
                 // Given BAM binary encoding is float however, just keep it.
                 // This means rounding errors will (hopefully) always be the
                 // same and basic equality still works.
-                if (opts->filter_value_flt != (float)bam_aux2f(s))
+                if (snprintf(t, 32, "%f", (float)bam_aux2f(s)) <= 0)
                     return true;
                 break;
 
             case 'A':
-                if (s[1] != *opts->filter_value_str)
-                    return true;
+                t[0] = s[1];
+                t[1] = 0;
                 break;
 
             case 'Z': case 'H':
-                if (strcmp((char *)s+1, opts->filter_value_str) != 0)
-                    return true;
+                val = (char *)s+1;
                 break;
 
             default:
                 // Anything unsupported fails the filter match too.
                 return true;
             }
+
+            khint_t k = kh_get(str, opts->filter_tag_vals, val);
+            if (k == kh_end(opts->filter_tag_vals))
+                return 1; // tag value not found
         }
     }
 
@@ -764,8 +848,13 @@ int output_index(bam1_t *b1, bam1_t *b2, bam2fq_state_t *state,
         }
 
         char *bc_end = bc, *qt_end = qt;
-        while (len ? *bc_end && rem-- : isalpha(*bc_end))
-            bc_end++, qt_end += qt != NULL;
+        if (qt) {
+            while (len ? *bc_end && rem-- : isalpha(*bc_end))
+                bc_end++, qt_end++;
+        } else {
+            while (len ? *bc_end && rem-- : isalpha(*bc_end))
+                bc_end++;
+        }
 
         switch (fc) {
         case 'n':
diff --git a/samtools/bam_fastq.c.pysam.c b/samtools/bam_fastq.c.pysam.c
index cd8fa2757..627f741af 100644
--- a/samtools/bam_fastq.c.pysam.c
+++ b/samtools/bam_fastq.c.pysam.c
@@ -2,7 +2,7 @@
 
 /*  bam_fastq.c -- FASTA and FASTQ file generation
 
-    Copyright (C) 2009-2017, 2019-2020, 2023 Genome Research Ltd.
+    Copyright (C) 2009-2017, 2019-2020, 2023-2024 Genome Research Ltd.
     Portions copyright (C) 2009, 2011, 2012 Broad Institute.
 
     Author: Heng Li <lh3@sanger.ac.uk>
@@ -42,9 +42,13 @@ DEALINGS IN THE SOFTWARE.  */
 #include "htslib/kstring.h"
 #include "htslib/bgzf.h"
 #include "htslib/thread_pool.h"
+#include "htslib/khash.h"
 #include "samtools.h"
 #include "sam_opts.h"
 
+KHASH_SET_INIT_STR(str)
+typedef khash_t(str) strhash_t;
+
 #define DEFAULT_BARCODE_TAG "BC"
 #define DEFAULT_QUALITY_TAG "QT"
 #define INDEX_SEPARATOR "+"
@@ -69,6 +73,8 @@ static void bam2fq_usage(FILE *to, const char *command)
 "               paired reads will be written to the -1 and -2 files.\n"
 "  -d, --tag TAG[:VAL]\n"
 "               only include reads containing TAG, optionally with value VAL\n"
+"  -D, --tag-file STR:FILE\n"
+"               only include reads containing TAG, with a value listed in FILE\n"
 "  -f, --require-flags INT\n"
 "               only include reads with all  of the FLAGs in INT present [0]\n"       //   F&x == x
 "  -F, --excl[ude]-flags INT\n"
@@ -152,9 +158,7 @@ typedef struct bam2fq_opts {
     char *extra_tags;
     char compression_level;
     const char *filter_tag;       // -d opt
-    const char *filter_value_str;
-    int64_t filter_value_int;
-    float filter_value_flt;
+    strhash_t *filter_tag_vals;
 } bam2fq_opts_t;
 
 typedef struct bam2fq_state {
@@ -173,6 +177,45 @@ typedef struct bam2fq_state {
     htsThreadPool p;
 } bam2fq_state_t;
 
+// Adds a single tag value to the filter tag value hash
+static int add_tag_value(bam2fq_opts_t *opts, char *val) {
+    if (!opts->filter_tag_vals) {
+        if (!(opts->filter_tag_vals = kh_init(str)))
+            return -1;
+    }
+
+    if (!(val = strdup(val)))
+        return -1;
+
+    int ret = 0;
+    kh_put(str, opts->filter_tag_vals, val, &ret);
+    if (ret <= 0)
+        free(val);
+
+    return ret < 0 ? -1 : 0;
+}
+
+// Adds multiple values, listed in a file
+static int add_tag_file(bam2fq_opts_t *opts, char *fn) {
+    FILE *fp;
+    kstring_t ks = {0,0};
+
+    if (!(fp = fopen(fn, "r"))) {
+        print_error_errno("fastq", "failed to open \"%s\" for reading", fn);
+        return -1;
+    }
+
+    while (ks.l = 0, kgetline(&ks, (kgets_func *)fgets, fp) >= 0) {
+        if (add_tag_value(opts, ks.s) < 0) {
+            ks_free(&ks);
+            return -1;
+        }
+    }
+
+    ks_free(&ks);
+    return 0;
+}
+
 static readpart which_readpart(const bam1_t *b)
 {
     if ((b->core.flag & BAM_FREAD1) && !(b->core.flag & BAM_FREAD2)) {
@@ -186,6 +229,14 @@ static readpart which_readpart(const bam1_t *b)
 
 static void free_opts(bam2fq_opts_t *opts)
 {
+    if (opts->filter_tag_vals) {
+        khint_t k;
+        for (k = 0; k < kh_end(opts->filter_tag_vals); k++)
+            if (kh_exist(opts->filter_tag_vals, k))
+                free((char *)kh_key(opts->filter_tag_vals, k));
+
+        kh_destroy(str, opts->filter_tag_vals);
+    }
     free(opts);
 }
 
@@ -232,9 +283,10 @@ static bool parse_opts(int argc, char *argv[], bam2fq_opts_t** opts_out)
         {"barcode-tag", required_argument, NULL, 'b'},
         {"quality-tag", required_argument, NULL, 'q'},
         {"tag", required_argument, NULL, 'd'},
+        {"tag-file", required_argument, NULL, 'D'},
         { NULL, 0, NULL, 0 }
     };
-    while ((c = getopt_long(argc, argv, "0:1:2:o:f:F:G:niNOs:c:tT:v:@:d:",
+    while ((c = getopt_long(argc, argv, "0:1:2:o:f:F:G:niNOs:c:tT:v:@:d:D:",
                             lopts, NULL)) > 0) {
         switch (c) {
             case 'b': opts->barcode_tag = optarg; break;
@@ -278,10 +330,44 @@ static bool parse_opts(int argc, char *argv[], bam2fq_opts_t** opts_out)
                     return false;
                 }
 
+                if (opts->filter_tag && memcmp(opts->filter_tag, optarg, 2)) {
+                    print_error("fastq", "Different tag type specified "
+                                "to before");
+                    free_opts(opts);
+                    return false;
+                }
+
+                if (strlen(optarg) >= 3)
+                    add_tag_value(opts, optarg+3);
+                opts->filter_tag = optarg;
+                break;
+
+            case 'D':
+                // Allow ";" as delimiter besides ":" to support MinGW CLI POSIX
+                // path translation as described at:
+                // http://www.mingw.org/wiki/Posix_path_conversion
+                if (strlen(optarg) < 4
+                    || (optarg[2] != ':' && optarg[2] != ';')) {
+                    print_error("view", "Invalid \"tag:file\" option: \"%s\"",
+                                optarg);
+                    free_opts(opts);
+                    return false;
+                }
+
+                if (opts->filter_tag && memcmp(opts->filter_tag, optarg, 2)) {
+                    print_error("fastq", "Different tag type specified "
+                                "to before");
+                    free_opts(opts);
+                    return false;
+                }
+
+                if (strlen(optarg) >= 3) {
+                    if (add_tag_file(opts, optarg+3) < 0) {
+                        free_opts(opts);
+                        return false;
+                    }
+                }
                 opts->filter_tag = optarg;
-                opts->filter_value_str = strlen(optarg) > 2 ? optarg+3 : NULL;
-                opts->filter_value_int = INT64_MAX; // fill out later
-                opts->filter_value_flt = FLT_MAX;
                 break;
 
             case '?':
@@ -632,45 +718,43 @@ static inline bool filter_it_out(const bam1_t *b, const bam2fq_state_t *state,
         if (!s)
             return true;
 
-        if (opts->filter_value_str) {
+        if (opts->filter_tag_vals) {
+            char t[32], *val = t;
             switch (*s) {
             case 'i': case 'I':
             case 's': case 'S':
             case 'c': case 'C':
-                if (opts->filter_value_int == INT64_MAX)
-                    // cache integer conversion for repeated use
-                    opts->filter_value_int =
-                        strtoll(opts->filter_value_str, NULL, 0);
-                if (opts->filter_value_int != bam_aux2i(s))
+                if (snprintf(t, 32, "%"PRId64, bam_aux2i(s)) <= 0)
                     return true;
                 break;
 
             case 'f':
-                if (opts->filter_value_flt == FLT_MAX)
-                    opts->filter_value_flt = atof(opts->filter_value_str);
                 // Comparing floats is hard.
                 // Eg (double)0.1 - (double)0.1f is -1.5e-9.
                 // Given BAM binary encoding is float however, just keep it.
                 // This means rounding errors will (hopefully) always be the
                 // same and basic equality still works.
-                if (opts->filter_value_flt != (float)bam_aux2f(s))
+                if (snprintf(t, 32, "%f", (float)bam_aux2f(s)) <= 0)
                     return true;
                 break;
 
             case 'A':
-                if (s[1] != *opts->filter_value_str)
-                    return true;
+                t[0] = s[1];
+                t[1] = 0;
                 break;
 
             case 'Z': case 'H':
-                if (strcmp((char *)s+1, opts->filter_value_str) != 0)
-                    return true;
+                val = (char *)s+1;
                 break;
 
             default:
                 // Anything unsupported fails the filter match too.
                 return true;
             }
+
+            khint_t k = kh_get(str, opts->filter_tag_vals, val);
+            if (k == kh_end(opts->filter_tag_vals))
+                return 1; // tag value not found
         }
     }
 
@@ -766,8 +850,13 @@ int output_index(bam1_t *b1, bam1_t *b2, bam2fq_state_t *state,
         }
 
         char *bc_end = bc, *qt_end = qt;
-        while (len ? *bc_end && rem-- : isalpha(*bc_end))
-            bc_end++, qt_end += qt != NULL;
+        if (qt) {
+            while (len ? *bc_end && rem-- : isalpha(*bc_end))
+                bc_end++, qt_end++;
+        } else {
+            while (len ? *bc_end && rem-- : isalpha(*bc_end))
+                bc_end++;
+        }
 
         switch (fc) {
         case 'n':
diff --git a/samtools/bam_import.c b/samtools/bam_import.c
index 079e04bf6..6a3b2585e 100644
--- a/samtools/bam_import.c
+++ b/samtools/bam_import.c
@@ -4,7 +4,7 @@
  *   samtools import a_1.fq a_2.fq
  *   samtools import a_interleaved.fq
  *
- * Copyright (C) 2020-2021, 2023 Genome Research Ltd.
+ * Copyright (C) 2020-2021, 2023-2024 Genome Research Ltd.
  *
  * Author: James Bonfield <jkb@sanger.ac.uk>
  */
@@ -63,6 +63,7 @@ static int usage(FILE *fp, int exit_status) {
     fprintf(fp, "  -u           Uncompressed output\n");
     fprintf(fp, "  --order TAG  Store Nth record count in TAG\n");
     fprintf(fp, "\n");
+    fprintf(fp, "      --no-PG  Do not add a PG line\n");
     sam_global_opt_help(fp, "-.O.-@--");
 
     fprintf(fp, "\nA single fastq file will be interpreted as -s, -0 or -1 depending on\n");
@@ -159,7 +160,7 @@ static int import_fastq(int argc, char **argv, opts_t *opts) {
     if (argc == 1)
         opts->fn[FQ_SINGLE] = argv[0];
     else
-        for (i = 0; i < 4; i++)
+        for (i = 0; i < 2; i++)
             if (argc > i)
                 opts->fn[FQ_R1+i] = argv[i];
 
@@ -259,6 +260,25 @@ static int import_fastq(int argc, char **argv, opts_t *opts) {
         goto err;
     }
 
+    if (!opts->no_pg) {
+        char *arg_list;
+        if (!(arg_list = stringify_argv(argc+1+optind, argv-1-optind))) {
+            print_error("view", "failed to create arg_list");
+            goto err;
+        }
+        if (sam_hdr_add_pg(hdr_out, "samtools",
+                           "VN", samtools_version(),
+                           arg_list ? "CL" : NULL,
+                           arg_list ? arg_list : NULL,
+                           NULL)) {
+            fprintf(stderr, "Failed to add PG line to the header");
+            free(arg_list);
+            goto err;
+        }
+
+        free(arg_list);
+    }
+
     // Read group
     if (opts->rg_line) {
         if (*opts->rg_line != '@')
@@ -290,7 +310,7 @@ static int import_fastq(int argc, char **argv, opts_t *opts) {
 
 
     // Interleave / combine from n files (ids[0..n-1]).
-    int res;
+    int res = 0;
     int eof = 0;
     do {
         idx_seq.l = idx_qual.l = 0;
diff --git a/samtools/bam_import.c.pysam.c b/samtools/bam_import.c.pysam.c
index f16a7811f..6725844b1 100644
--- a/samtools/bam_import.c.pysam.c
+++ b/samtools/bam_import.c.pysam.c
@@ -6,7 +6,7 @@
  *   samtools import a_1.fq a_2.fq
  *   samtools import a_interleaved.fq
  *
- * Copyright (C) 2020-2021, 2023 Genome Research Ltd.
+ * Copyright (C) 2020-2021, 2023-2024 Genome Research Ltd.
  *
  * Author: James Bonfield <jkb@sanger.ac.uk>
  */
@@ -65,6 +65,7 @@ static int usage(FILE *fp, int exit_status) {
     fprintf(fp, "  -u           Uncompressed output\n");
     fprintf(fp, "  --order TAG  Store Nth record count in TAG\n");
     fprintf(fp, "\n");
+    fprintf(fp, "      --no-PG  Do not add a PG line\n");
     sam_global_opt_help(fp, "-.O.-@--");
 
     fprintf(fp, "\nA single fastq file will be interpreted as -s, -0 or -1 depending on\n");
@@ -161,7 +162,7 @@ static int import_fastq(int argc, char **argv, opts_t *opts) {
     if (argc == 1)
         opts->fn[FQ_SINGLE] = argv[0];
     else
-        for (i = 0; i < 4; i++)
+        for (i = 0; i < 2; i++)
             if (argc > i)
                 opts->fn[FQ_R1+i] = argv[i];
 
@@ -261,6 +262,25 @@ static int import_fastq(int argc, char **argv, opts_t *opts) {
         goto err;
     }
 
+    if (!opts->no_pg) {
+        char *arg_list;
+        if (!(arg_list = stringify_argv(argc+1+optind, argv-1-optind))) {
+            print_error("view", "failed to create arg_list");
+            goto err;
+        }
+        if (sam_hdr_add_pg(hdr_out, "samtools",
+                           "VN", samtools_version(),
+                           arg_list ? "CL" : NULL,
+                           arg_list ? arg_list : NULL,
+                           NULL)) {
+            fprintf(samtools_stderr, "Failed to add PG line to the header");
+            free(arg_list);
+            goto err;
+        }
+
+        free(arg_list);
+    }
+
     // Read group
     if (opts->rg_line) {
         if (*opts->rg_line != '@')
@@ -292,7 +312,7 @@ static int import_fastq(int argc, char **argv, opts_t *opts) {
 
 
     // Interleave / combine from n files (ids[0..n-1]).
-    int res;
+    int res = 0;
     int eof = 0;
     do {
         idx_seq.l = idx_qual.l = 0;
diff --git a/samtools/bam_index.c b/samtools/bam_index.c
index 0803f3e42..a04de1a6e 100644
--- a/samtools/bam_index.c
+++ b/samtools/bam_index.c
@@ -1,6 +1,6 @@
 /*  bam_index.c -- index and idxstats subcommands.
 
-    Copyright (C) 2008-2011, 2013-2016, 2018, 2019, 2023  Genome Research Ltd.
+    Copyright (C) 2008-2011, 2013-2016, 2018, 2019, 2023-2024  Genome Research Ltd.
     Portions copyright (C) 2010 Broad Institute.
     Portions copyright (C) 2013 Peter Cock, The James Hutton Institute.
 
@@ -209,7 +209,8 @@ int slow_idxstats(samFile *fp, sam_hdr_t *header) {
 
 static void usage_exit(FILE *fp, int exit_status)
 {
-    fprintf(fp, "Usage: samtools idxstats [options] <in.bam>\n");
+    fprintf(fp, "Usage: samtools idxstats [options] <in.bam>\n"
+                "  -X           Include customized index file\n");
     sam_global_opt_help(fp, "-.---@-.");
     exit(exit_status);
 }
@@ -219,7 +220,8 @@ int bam_idxstats(int argc, char *argv[])
     hts_idx_t* idx;
     sam_hdr_t* header;
     samFile* fp;
-    int c;
+    int c, has_index_file = 0, file_names = 1;
+    char *index_name = NULL;
 
     sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
     static const struct option lopts[] = {
@@ -227,16 +229,22 @@ int bam_idxstats(int argc, char *argv[])
         {NULL, 0, NULL, 0}
     };
 
-    while ((c = getopt_long(argc, argv, "@:", lopts, NULL)) >= 0) {
+    while ((c = getopt_long(argc, argv, "@:X", lopts, NULL)) >= 0) {
         switch (c) {
-        default:  if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
-            /* else fall-through */
-        case '?':
-            usage_exit(stderr, EXIT_FAILURE);
+            case 'X': has_index_file=1; break;
+            default:  if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
+                /* else fall-through */
+            case '?':
+                usage_exit(stderr, EXIT_FAILURE);
         }
     }
 
-    if (argc != optind+1) {
+    if (has_index_file) {
+        file_names = 2;
+        index_name = argv[optind + 1];
+    }
+
+    if (argc != optind + file_names) {
         if (argc == optind) usage_exit(stdout, EXIT_SUCCESS);
         else usage_exit(stderr, EXIT_FAILURE);
     }
@@ -262,7 +270,7 @@ int bam_idxstats(int argc, char *argv[])
             return 1;
         }
     } else {
-        idx = sam_index_load(fp, argv[optind]);
+        idx = sam_index_load2(fp, argv[optind], index_name);
         if (idx == NULL) {
             print_error("idxstats", "fail to load index for \"%s\", "
                         "reverting to slow method", argv[optind]);
diff --git a/samtools/bam_index.c.pysam.c b/samtools/bam_index.c.pysam.c
index 3093c0138..09d01d470 100644
--- a/samtools/bam_index.c.pysam.c
+++ b/samtools/bam_index.c.pysam.c
@@ -2,7 +2,7 @@
 
 /*  bam_index.c -- index and idxstats subcommands.
 
-    Copyright (C) 2008-2011, 2013-2016, 2018, 2019, 2023  Genome Research Ltd.
+    Copyright (C) 2008-2011, 2013-2016, 2018, 2019, 2023-2024  Genome Research Ltd.
     Portions copyright (C) 2010 Broad Institute.
     Portions copyright (C) 2013 Peter Cock, The James Hutton Institute.
 
@@ -211,7 +211,8 @@ int slow_idxstats(samFile *fp, sam_hdr_t *header) {
 
 static void usage_exit(FILE *fp, int exit_status)
 {
-    fprintf(fp, "Usage: samtools idxstats [options] <in.bam>\n");
+    fprintf(fp, "Usage: samtools idxstats [options] <in.bam>\n"
+                "  -X           Include customized index file\n");
     sam_global_opt_help(fp, "-.---@-.");
     samtools_exit(exit_status);
 }
@@ -221,7 +222,8 @@ int bam_idxstats(int argc, char *argv[])
     hts_idx_t* idx;
     sam_hdr_t* header;
     samFile* fp;
-    int c;
+    int c, has_index_file = 0, file_names = 1;
+    char *index_name = NULL;
 
     sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
     static const struct option lopts[] = {
@@ -229,16 +231,22 @@ int bam_idxstats(int argc, char *argv[])
         {NULL, 0, NULL, 0}
     };
 
-    while ((c = getopt_long(argc, argv, "@:", lopts, NULL)) >= 0) {
+    while ((c = getopt_long(argc, argv, "@:X", lopts, NULL)) >= 0) {
         switch (c) {
-        default:  if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
-            /* else fall-through */
-        case '?':
-            usage_exit(samtools_stderr, EXIT_FAILURE);
+            case 'X': has_index_file=1; break;
+            default:  if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
+                /* else fall-through */
+            case '?':
+                usage_exit(samtools_stderr, EXIT_FAILURE);
         }
     }
 
-    if (argc != optind+1) {
+    if (has_index_file) {
+        file_names = 2;
+        index_name = argv[optind + 1];
+    }
+
+    if (argc != optind + file_names) {
         if (argc == optind) usage_exit(samtools_stdout, EXIT_SUCCESS);
         else usage_exit(samtools_stderr, EXIT_FAILURE);
     }
@@ -264,7 +272,7 @@ int bam_idxstats(int argc, char *argv[])
             return 1;
         }
     } else {
-        idx = sam_index_load(fp, argv[optind]);
+        idx = sam_index_load2(fp, argv[optind], index_name);
         if (idx == NULL) {
             print_error("idxstats", "fail to load index for \"%s\", "
                         "reverting to slow method", argv[optind]);
diff --git a/samtools/bam_markdup.c b/samtools/bam_markdup.c
index 677a47f82..17fedd58d 100644
--- a/samtools/bam_markdup.c
+++ b/samtools/bam_markdup.c
@@ -122,6 +122,7 @@ typedef struct {
     int opt;
     int beg;
     int end;
+    int len;
 } check_t;
 
 typedef struct {
@@ -875,7 +876,10 @@ static int is_optical_duplicate(md_param_t *param, bam1_t *ori, bam1_t *dup, lon
         return ret;
     }
 
-    if (strncmp(original + o_beg, duplicate + d_beg, o_end - o_beg) == 0) {
+    int o_len = o_end - o_beg;
+    int d_len = d_end - d_beg;
+
+    if ((o_len == d_len) && memcmp(original + o_beg, duplicate + d_beg, o_len) == 0) {
         long xdiff, ydiff;
 
         if (ox > dx) {
@@ -918,7 +922,10 @@ static int optical_duplicate_partial(md_param_t *param, const char *name, const
         return ret;
     }
 
-    if (strncmp(name + o_beg, duplicate + d_beg, o_end - o_beg) == 0) {
+    int o_len = o_end - o_beg;
+    int d_len = d_end - d_beg;
+
+    if ((o_len == d_len) && memcmp(name + o_beg, duplicate + d_beg, o_len) == 0) {
         // the initial parts match, look at the numbers
         long xdiff, ydiff;
 
@@ -945,6 +952,7 @@ static int optical_duplicate_partial(md_param_t *param, const char *name, const
     c->y = dy;
     c->beg = d_beg;
     c->end = d_end;
+    c->len = d_end - d_beg;
 
     return ret;
 }
@@ -1156,9 +1164,15 @@ static int check_chain_against_original(md_param_t *param, khash_t(duplicates) *
 }
 
 
-static int xcoord_sort(const void *a, const void *b) {
+static int chain_sort(const void *a, const void *b) {
     check_t *ac = (check_t *) a;
     check_t *bc = (check_t *) b;
+    int ret;
+
+    if ((ret = ac->len - bc->len))
+        return ret;
+    else if ((ret = memcmp(bam_get_qname(ac->b) + ac->beg, bam_get_qname(bc->b) + bc->beg, ac->len)))
+        return ret;
 
     return (ac->x - bc->x);
 }
@@ -1170,106 +1184,113 @@ static int check_duplicate_chain(md_param_t *param, khash_t(duplicates) *dup_has
     int ret = 0;
     size_t curr = 0;
 
-    qsort(list->c, list->length, sizeof(list->c[0]), xcoord_sort);
+    qsort(list->c, list->length, sizeof(list->c[0]), chain_sort);
 
     while (curr < list->length - 1) {
-        check_t *current = &list->c[curr];
-        size_t count = curr;
-        char *cur_name = bam_get_qname(current->b);
-        int current_paired = (current->b->core.flag & BAM_FPAIRED) && !(current->b->core.flag & BAM_FMUNMAP);
+        check_t *base = &list->c[curr];
+        char *base_name = bam_get_qname(base->b);
+        int end_name_match = curr;
 
-        while (++count < list->length && (list->c[count].x - current->x <= param->opt_dist)) {
-            // while close enough along the x coordinate
-            check_t *chk = &list->c[count];
+        // find the end of the matching name parts
+        while (++end_name_match < list->length) {
+            check_t *chk = &list->c[end_name_match];
 
-            if (current->opt && chk->opt)
-                continue;
+            if ((base->len == chk->len) && memcmp(base_name + base->beg, bam_get_qname(chk->b) + chk->beg, base->len) != 0)
+                break;
+        }
 
-            // if both are already optical duplicates there is no need to check again, otherwise...
+        while (curr < end_name_match) {
+            size_t count = curr;
+            check_t *current = &list->c[curr];
+            int current_paired = (current->b->core.flag & BAM_FPAIRED) && !(current->b->core.flag & BAM_FMUNMAP);
 
-            long ydiff;
+            while (++count < end_name_match && (list->c[count].x - current->x <= param->opt_dist)) {
+                // while close enough along the x coordinate
+                check_t *chk = &list->c[count];
 
-            if (current->y > chk->y) {
-                ydiff = current->y - chk->y;
-            } else {
-                ydiff = chk->y - current->y;
-            }
+                if (current->opt && chk->opt)
+                    continue;
 
-            if (ydiff > param->opt_dist)
-                continue;
+                long ydiff;
 
-            // the number are right, check the names
-            if (strncmp(cur_name + current->beg, bam_get_qname(chk->b) + chk->beg, current->end - current->beg) != 0)
-                continue;
+                if (current->y > chk->y) {
+                    ydiff = current->y - chk->y;
+                } else {
+                    ydiff = chk->y - current->y;
+                }
 
-            // optical duplicates
-            int chk_dup = 0;
-            int chk_paired = (chk->b->core.flag & BAM_FPAIRED) && !(chk->b->core.flag & BAM_FMUNMAP);
+                if (ydiff > param->opt_dist)
+                    continue;
 
-            if (current_paired != chk_paired) {
-                if (!chk_paired) {
-                    // chk is single vs pair, this is a dup.
-                    chk_dup = 1;
-                }
-            } else {
-                // do it by scores
-                int64_t cur_score, chk_score;
+                // optical duplicates
+                int chk_dup = 0;
+                int chk_paired = (chk->b->core.flag & BAM_FPAIRED) && !(chk->b->core.flag & BAM_FMUNMAP);
 
-                if ((current->b->core.flag & BAM_FQCFAIL) != (chk->b->core.flag & BAM_FQCFAIL)) {
-                    if (current->b->core.flag & BAM_FQCFAIL) {
-                        cur_score = 0;
-                        chk_score = 1;
-                    } else {
-                        cur_score = 1;
-                        chk_score = 0;
+                if (current_paired != chk_paired) {
+                    if (!chk_paired) {
+                        // chk is single vs pair, this is a dup.
+                        chk_dup = 1;
                     }
                 } else {
-                    cur_score = current->score;
-                    chk_score = chk->score;
+                    // do it by scores
+                    int64_t cur_score, chk_score;
 
-                    if (current_paired) {
-                        // they are pairs so add mate scores.
-                        chk_score += chk->mate_score;
-                        cur_score += current->mate_score;
+                    if ((current->b->core.flag & BAM_FQCFAIL) != (chk->b->core.flag & BAM_FQCFAIL)) {
+                        if (current->b->core.flag & BAM_FQCFAIL) {
+                            cur_score = 0;
+                            chk_score = 1;
+                        } else {
+                            cur_score = 1;
+                            chk_score = 0;
+                        }
+                    } else {
+                        cur_score = current->score;
+                        chk_score = chk->score;
+
+                        if (current_paired) {
+                            // they are pairs so add mate scores.
+                            chk_score += chk->mate_score;
+                            cur_score += current->mate_score;
+                        }
                     }
-                }
 
-                if (cur_score == chk_score) {
-                    if (strcmp(bam_get_qname(chk->b), cur_name) < 0) {
-                        chk_score++;
-                    } else {
-                        chk_score--;
+                    if (cur_score == chk_score) {
+                        if (strcmp(bam_get_qname(chk->b), bam_get_qname(current->b)) < 0) {
+                            chk_score++;
+                        } else {
+                            chk_score--;
+                        }
                     }
-                }
 
-                if (cur_score > chk_score) {
-                    chk_dup = 1;
+                    if (cur_score > chk_score) {
+                        chk_dup = 1;
+                    }
                 }
-            }
 
-            if (chk_dup) {
-                // the duplicate is the optical duplicate
-                if (!chk->opt) { // only change if not already an optical duplicate
-                    if (optical_retag(param, dup_hash, chk->b, chk_paired, stats)) {
-                        ret = -1;
-                        goto fail;
-                    }
+                if (chk_dup) {
+                    // the duplicate is the optical duplicate
+                    if (!chk->opt) { // only change if not already an optical duplicate
+                        if (optical_retag(param, dup_hash, chk->b, chk_paired, stats)) {
+                            ret = -1;
+                            goto fail;
+                        }
 
-                    chk->opt = 1;
-                }
-            } else {
-                if (!current->opt) {
-                    if (optical_retag(param, dup_hash, current->b, current_paired, stats)) {
-                        ret = -1;
-                        goto fail;
+                        chk->opt = 1;
                     }
+                } else {
+                    if (!current->opt) {
+                        if (optical_retag(param, dup_hash, current->b, current_paired, stats)) {
+                            ret = -1;
+                            goto fail;
+                        }
 
-                    current->opt = 1;
+                        current->opt = 1;
+                    }
                 }
             }
-        }
 
-        curr++;
+            curr++;
+        }
     }
 
  fail:
diff --git a/samtools/bam_markdup.c.pysam.c b/samtools/bam_markdup.c.pysam.c
index e8fea3d2a..cdf33c1a8 100644
--- a/samtools/bam_markdup.c.pysam.c
+++ b/samtools/bam_markdup.c.pysam.c
@@ -124,6 +124,7 @@ typedef struct {
     int opt;
     int beg;
     int end;
+    int len;
 } check_t;
 
 typedef struct {
@@ -877,7 +878,10 @@ static int is_optical_duplicate(md_param_t *param, bam1_t *ori, bam1_t *dup, lon
         return ret;
     }
 
-    if (strncmp(original + o_beg, duplicate + d_beg, o_end - o_beg) == 0) {
+    int o_len = o_end - o_beg;
+    int d_len = d_end - d_beg;
+
+    if ((o_len == d_len) && memcmp(original + o_beg, duplicate + d_beg, o_len) == 0) {
         long xdiff, ydiff;
 
         if (ox > dx) {
@@ -920,7 +924,10 @@ static int optical_duplicate_partial(md_param_t *param, const char *name, const
         return ret;
     }
 
-    if (strncmp(name + o_beg, duplicate + d_beg, o_end - o_beg) == 0) {
+    int o_len = o_end - o_beg;
+    int d_len = d_end - d_beg;
+
+    if ((o_len == d_len) && memcmp(name + o_beg, duplicate + d_beg, o_len) == 0) {
         // the initial parts match, look at the numbers
         long xdiff, ydiff;
 
@@ -947,6 +954,7 @@ static int optical_duplicate_partial(md_param_t *param, const char *name, const
     c->y = dy;
     c->beg = d_beg;
     c->end = d_end;
+    c->len = d_end - d_beg;
 
     return ret;
 }
@@ -1158,9 +1166,15 @@ static int check_chain_against_original(md_param_t *param, khash_t(duplicates) *
 }
 
 
-static int xcoord_sort(const void *a, const void *b) {
+static int chain_sort(const void *a, const void *b) {
     check_t *ac = (check_t *) a;
     check_t *bc = (check_t *) b;
+    int ret;
+
+    if ((ret = ac->len - bc->len))
+        return ret;
+    else if ((ret = memcmp(bam_get_qname(ac->b) + ac->beg, bam_get_qname(bc->b) + bc->beg, ac->len)))
+        return ret;
 
     return (ac->x - bc->x);
 }
@@ -1172,106 +1186,113 @@ static int check_duplicate_chain(md_param_t *param, khash_t(duplicates) *dup_has
     int ret = 0;
     size_t curr = 0;
 
-    qsort(list->c, list->length, sizeof(list->c[0]), xcoord_sort);
+    qsort(list->c, list->length, sizeof(list->c[0]), chain_sort);
 
     while (curr < list->length - 1) {
-        check_t *current = &list->c[curr];
-        size_t count = curr;
-        char *cur_name = bam_get_qname(current->b);
-        int current_paired = (current->b->core.flag & BAM_FPAIRED) && !(current->b->core.flag & BAM_FMUNMAP);
+        check_t *base = &list->c[curr];
+        char *base_name = bam_get_qname(base->b);
+        int end_name_match = curr;
 
-        while (++count < list->length && (list->c[count].x - current->x <= param->opt_dist)) {
-            // while close enough along the x coordinate
-            check_t *chk = &list->c[count];
+        // find the end of the matching name parts
+        while (++end_name_match < list->length) {
+            check_t *chk = &list->c[end_name_match];
 
-            if (current->opt && chk->opt)
-                continue;
+            if ((base->len == chk->len) && memcmp(base_name + base->beg, bam_get_qname(chk->b) + chk->beg, base->len) != 0)
+                break;
+        }
 
-            // if both are already optical duplicates there is no need to check again, otherwise...
+        while (curr < end_name_match) {
+            size_t count = curr;
+            check_t *current = &list->c[curr];
+            int current_paired = (current->b->core.flag & BAM_FPAIRED) && !(current->b->core.flag & BAM_FMUNMAP);
 
-            long ydiff;
+            while (++count < end_name_match && (list->c[count].x - current->x <= param->opt_dist)) {
+                // while close enough along the x coordinate
+                check_t *chk = &list->c[count];
 
-            if (current->y > chk->y) {
-                ydiff = current->y - chk->y;
-            } else {
-                ydiff = chk->y - current->y;
-            }
+                if (current->opt && chk->opt)
+                    continue;
 
-            if (ydiff > param->opt_dist)
-                continue;
+                long ydiff;
 
-            // the number are right, check the names
-            if (strncmp(cur_name + current->beg, bam_get_qname(chk->b) + chk->beg, current->end - current->beg) != 0)
-                continue;
+                if (current->y > chk->y) {
+                    ydiff = current->y - chk->y;
+                } else {
+                    ydiff = chk->y - current->y;
+                }
 
-            // optical duplicates
-            int chk_dup = 0;
-            int chk_paired = (chk->b->core.flag & BAM_FPAIRED) && !(chk->b->core.flag & BAM_FMUNMAP);
+                if (ydiff > param->opt_dist)
+                    continue;
 
-            if (current_paired != chk_paired) {
-                if (!chk_paired) {
-                    // chk is single vs pair, this is a dup.
-                    chk_dup = 1;
-                }
-            } else {
-                // do it by scores
-                int64_t cur_score, chk_score;
+                // optical duplicates
+                int chk_dup = 0;
+                int chk_paired = (chk->b->core.flag & BAM_FPAIRED) && !(chk->b->core.flag & BAM_FMUNMAP);
 
-                if ((current->b->core.flag & BAM_FQCFAIL) != (chk->b->core.flag & BAM_FQCFAIL)) {
-                    if (current->b->core.flag & BAM_FQCFAIL) {
-                        cur_score = 0;
-                        chk_score = 1;
-                    } else {
-                        cur_score = 1;
-                        chk_score = 0;
+                if (current_paired != chk_paired) {
+                    if (!chk_paired) {
+                        // chk is single vs pair, this is a dup.
+                        chk_dup = 1;
                     }
                 } else {
-                    cur_score = current->score;
-                    chk_score = chk->score;
+                    // do it by scores
+                    int64_t cur_score, chk_score;
 
-                    if (current_paired) {
-                        // they are pairs so add mate scores.
-                        chk_score += chk->mate_score;
-                        cur_score += current->mate_score;
+                    if ((current->b->core.flag & BAM_FQCFAIL) != (chk->b->core.flag & BAM_FQCFAIL)) {
+                        if (current->b->core.flag & BAM_FQCFAIL) {
+                            cur_score = 0;
+                            chk_score = 1;
+                        } else {
+                            cur_score = 1;
+                            chk_score = 0;
+                        }
+                    } else {
+                        cur_score = current->score;
+                        chk_score = chk->score;
+
+                        if (current_paired) {
+                            // they are pairs so add mate scores.
+                            chk_score += chk->mate_score;
+                            cur_score += current->mate_score;
+                        }
                     }
-                }
 
-                if (cur_score == chk_score) {
-                    if (strcmp(bam_get_qname(chk->b), cur_name) < 0) {
-                        chk_score++;
-                    } else {
-                        chk_score--;
+                    if (cur_score == chk_score) {
+                        if (strcmp(bam_get_qname(chk->b), bam_get_qname(current->b)) < 0) {
+                            chk_score++;
+                        } else {
+                            chk_score--;
+                        }
                     }
-                }
 
-                if (cur_score > chk_score) {
-                    chk_dup = 1;
+                    if (cur_score > chk_score) {
+                        chk_dup = 1;
+                    }
                 }
-            }
 
-            if (chk_dup) {
-                // the duplicate is the optical duplicate
-                if (!chk->opt) { // only change if not already an optical duplicate
-                    if (optical_retag(param, dup_hash, chk->b, chk_paired, stats)) {
-                        ret = -1;
-                        goto fail;
-                    }
+                if (chk_dup) {
+                    // the duplicate is the optical duplicate
+                    if (!chk->opt) { // only change if not already an optical duplicate
+                        if (optical_retag(param, dup_hash, chk->b, chk_paired, stats)) {
+                            ret = -1;
+                            goto fail;
+                        }
 
-                    chk->opt = 1;
-                }
-            } else {
-                if (!current->opt) {
-                    if (optical_retag(param, dup_hash, current->b, current_paired, stats)) {
-                        ret = -1;
-                        goto fail;
+                        chk->opt = 1;
                     }
+                } else {
+                    if (!current->opt) {
+                        if (optical_retag(param, dup_hash, current->b, current_paired, stats)) {
+                            ret = -1;
+                            goto fail;
+                        }
 
-                    current->opt = 1;
+                        current->opt = 1;
+                    }
                 }
             }
-        }
 
-        curr++;
+            curr++;
+        }
     }
 
  fail:
diff --git a/samtools/bam_mate.c b/samtools/bam_mate.c
index 1f2b576a7..8fb02cf64 100644
--- a/samtools/bam_mate.c
+++ b/samtools/bam_mate.c
@@ -1,6 +1,6 @@
 /*  bam_mate.c -- fix mate pairing information and clean up flags.
 
-    Copyright (C) 2009, 2011-2017, 2019, 2022 Genome Research Ltd.
+    Copyright (C) 2009, 2011-2017, 2019, 2022, 2024 Genome Research Ltd.
     Portions copyright (C) 2011 Broad Institute.
     Portions copyright (C) 2012 Peter Cock, The James Hutton Institute.
 
@@ -31,6 +31,7 @@ DEALINGS IN THE SOFTWARE.  */
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
+#include <ctype.h>
 #include "htslib/thread_pool.h"
 #include "sam_opts.h"
 #include "htslib/kstring.h"
@@ -90,7 +91,7 @@ static void bam_template_cigar(bam1_t *b1, bam1_t *b2, kstring_t *str)
  * single Reads:
  * -if pos == 0 (1 based), tid == -1, or UNMAPPED then set UNMAPPED, pos = 0,
  *  tid = -1
- * -clear bad flags (PAIRED, MREVERSE, PROPER_PAIR)
+ * -clear bad flags (MREVERSE, PROPER_PAIR)
  * -set mpos = 0 (1 based), mtid = -1 and isize = 0
  * -write to output
  * Paired Reads:
@@ -472,17 +473,495 @@ int bam_sanitize(sam_hdr_t *h, bam1_t *b, int flags) {
     return 0;
 }
 
+// Look for 3 tags in one pass, for efficiencies sake.
+// We also convert the draft tags Mm and Ml to MM and ML here.
+static inline void find_tags(bam1_t *b,
+                             char *t1, uint8_t **t1p,
+                             char *t2, uint8_t **t2p,
+                             char *t3, uint8_t **t3p) {
+    *t1p = *t2p = *t3p = NULL;
+    uint8_t *aux = bam_aux_first(b);
+
+    while (aux) {
+        if (aux[-2] == t1[0] && toupper(aux[-1]) == t1[1]) {
+            *t1p = aux;
+            if (islower(aux[-1]))
+                aux[-1] = t1[1];
+        } else if (aux[-2] == t2[0] && toupper(aux[-1]) == t2[1]) {
+            *t2p = aux;
+            if (islower(aux[-1]))
+                aux[-1] = t2[1];
+        } else if (aux[-2] == t3[0] && toupper(aux[-1]) == t3[1]) {
+            *t3p = aux;
+            if (islower(aux[-1]))
+                aux[-1] = t3[1];
+        }
+        aux = bam_aux_next(b, aux);
+    }
+}
+
+// Return 5' and 3' CIGAR hard-clip counts
+static inline void hard_clips(bam1_t *b, int *end5, int *end3) {
+    uint32_t *cigar = bam_get_cigar(b);
+    int ncigar = b->core.n_cigar;
+    int endL = 0, endR = 0, nh = 0;
+
+    if (ncigar && bam_cigar_op(cigar[0]) == BAM_CHARD_CLIP)
+        endL = bam_cigar_oplen(cigar[0]), nh=1;
+    if (ncigar > nh && bam_cigar_op(cigar[ncigar-1]) == BAM_CHARD_CLIP)
+        endR = bam_cigar_oplen(cigar[ncigar-1]);
+
+    if (b->core.flag & BAM_FREVERSE) {
+        *end5 = endR;
+        *end3 = endL;
+    } else {
+        *end5 = endL;
+        *end3 = endR;
+    }
+}
+
+// Get MM, ML and MN tags, and 5' and 3' hard-clip lengths.
+// MNi is integer copy of MN, or -1 if absent/invalid
+void get_mod_info(bam1_t *b, uint8_t **MM, uint8_t **ML, uint8_t **MN,
+                  int *MNi, int *end5, int *end3) {
+    find_tags(b, "MM", MM, "ML", ML, "MN", MN);
+    if (*MN) {
+        int save_errno = errno;
+        errno = 0;
+        *MNi = bam_aux2i(*MN);
+        if (errno == EINVAL)
+            *MNi = -1;
+        errno = save_errno;
+    } else {
+        *MNi = -1;
+    }
+
+    if (*MM)
+        hard_clips(b, end5, end3);
+    else
+        *end5 = *end3 = 0; // don't need if MM not found
+}
+
+typedef struct MM_state {
+    // tags found on "pre" BAM
+    uint8_t *MM, *ML, *MN;
+} MM_state;
+
+uint8_t *MN_enc(uint8_t *tag, uint32_t n) {
+    if (n > UINT16_MAX) {
+        tag[0] = 'I';
+        i32_to_le(n, tag+1);
+        tag += 5;
+    } else if (n > UINT8_MAX) {
+        tag[0] = 'S';
+        i16_to_le(n, tag+1);
+        tag += 3;
+    } else {
+        *tag++ = 'C';
+        *tag++ = n;
+    }
+
+    return tag;
+}
+
+// Trim 5'/3' bases off MM and ML tags, using a previous sequence as a guide.
+int trim_MM(bam1_t *pre, bam1_t *cur, int end5, int end3,
+            uint8_t *MM, uint8_t *ML, uint8_t *MN) {
+    // Count number of bases
+    int counts5[16] = {0}, counts3[16] = {0};
+
+    uint8_t *seq = bam_get_seq(pre);
+    int i;
+    for (i = 0; i < end5; i++)
+        counts5[bam_seqi(seq, i)]++;
+    memcpy(counts3, counts5, 16 * sizeof(*counts3));
+    for (; i < pre->core.l_qseq - end3; i++)
+        counts3[bam_seqi(seq, i)]++;
+
+    // "p" is position in pre.
+    // "q" is position in cur.
+    // Hence move up "p" to start and copy from there to "q".
+    uint8_t *MMp, *MLp, *MMq = NULL, *MLq = NULL;
+    if (ML && ML[0] == 'B' && ML[1] == 'C') {
+        MLp = ML+6;
+    } else {
+        ML = MLp = NULL;
+    }
+    MMq = MM+1;
+    MLq = MLp;
+    for (MMp = MM+1; *MMp; ) {
+        int fundamental = seq_nt16_table[*MMp];
+        while (*MMp && *MMp != ',')
+            *MMq++ = *MMp++;
+        if (*MMp)
+            *MMq++ = *MMp++;
+
+        // Now on comma separated list for MM and BC array for ML. Skip
+        int n = 0;
+        while (*MMp != ';' && n < counts5[fundamental]) {
+            char *endptr;
+            long delta = strtol((char *)MMp, &endptr, 10);
+            if (counts5[fundamental] - n > delta) {
+                // Skip entire delta in MM and ML.
+                // Eg counts[]=10, MM=3,10 ML=<10><20> => MM=10 ML=<20>
+                n += delta+1;
+                if(ML) MLp++;
+            } else if (counts3[fundamental] > counts5[fundamental]) {
+                // Shrink delta, writing MM and ML is unchanged.
+                // Eg counts[]=3, MM=10,4 ML=<10><20> => MM=7,4 ML=<10><20>
+                char num[50];
+                int l = sprintf(num, "%ld",
+                                delta - (counts5[fundamental]-n));
+                memcpy((char *)MMq, num, l);
+                MMq += l;
+                *MMq++ = *endptr;
+                n += delta+1;
+                if (ML)
+                    *MLq++ = *MLp++;
+            } else {
+                // next base mod is on boundary of 3' clip point
+                break;
+            }
+
+            MMp = (uint8_t *)endptr;
+            if (*MMp != ',')
+                // error?  if not ; also?
+                break;
+            MMp++;
+        }
+
+        // Copy
+        while (*MMp != ';' && n < counts3[fundamental]) {
+            char *endptr;
+            long delta = strtol((char *)MMp, &endptr, 10);
+            if (counts3[fundamental] - n > delta) {
+                // Copy entire delta in MM and ML including [,;]
+                memmove(MMq, MMp, (uint8_t *)endptr - MMp + 1);
+                MMq += (uint8_t *)endptr - MMp + 1;
+                n += delta+1;
+                if (ML)
+                    *MLq++ = *MLp++;
+            } else {
+                // Next mod is into 3' cutoff, so can terminate MM/ML now
+                n = counts3[fundamental];
+                if (ML)
+                    MLp++;
+            }
+
+            MMp = (uint8_t *)endptr;
+            if (*MMp != ',')
+                break;
+            MMp++;
+        }
+
+        // Skip
+        while (*MMp && *MMp != ';') {
+            while (*MMp && *MMp != ',' && *MMp != ';')
+                MMp++;
+            if (*MMp == ',')
+                MMp++;
+
+            if (ML)
+                MLp++;
+        }
+        MMq[-1] = ';'; // replaces , with ; if clipping right
+        if (*MMp)
+            MMp++;
+    }
+
+    MMp++; // skip nul
+    *MMq++ = 0;
+
+    // Adjust ML B array length
+    if (ML)
+        u32_to_le(MLq-(ML+6), ML+2);
+
+    // Move MM and ML down to include their MM:Z and ML:B bits
+    if (MM) MM-=2;
+    if (ML) ML-=2;
+
+    // Now MM/ML are start of tags, MMq/MLq are ends of edited tags,
+    // and MMp/MLp are ends of original tags.  Walk through tags taking up
+    // any gaps
+    //
+    // Eg XXXXXXmmmmm--YYYlllll-ZZ (m and l are edited MM and ML tags)
+    // => XXXXXXmmmmmYYYlllllZZ
+
+    uint8_t *tag = bam_get_aux(cur), *tag_end = cur->data + cur->l_data;
+    uint8_t *to = tag;
+    while (tag && tag < tag_end) {
+        if (tag[0] == 'M' && (tag[1] == 'M' || tag[1] == 'm')) {
+            // Slow but easy
+            memmove(to, MM, MMq-MM); // length of new tag
+            to += MMq-MM;
+            tag = MMp; // size of old tag
+        } else if (tag[0] == 'M' && (tag[1] == 'L' || tag[1] == 'l')) {
+            memmove(to, ML, MLq-ML);
+            to += MLq-ML;
+            tag = MLp;
+        } else if (tag[0] == 'M' && tag[1] == 'N') {
+            tag = bam_aux_next(cur, tag+2);
+            // Skip it as we'll overwrite this later, although this
+            // does change the tag order.  Instead we could do:
+            //
+            // *to++ = 'M';
+            // *to++ = 'N';
+            // to = MN_enc(to, cur->core.l_qseq);
+        } else {
+            // Want aux_skip, but it's private.
+            // So we use bam_aux_next with work-arounds. :(
+            uint8_t *from = tag;
+            tag = bam_aux_next(cur, tag+2);
+            tag = tag ? tag-2 : tag_end;
+            memmove(to, from, tag-from);
+            to += tag-from;
+        }
+    }
+    cur->l_data = to - cur->data;
+
+    return 0;
+}
+
+// Removes base modification tags: MM, ML and MN.
+// This is more efficient than a series of bam_aux_remove and
+// bam_aux_find calls, as the previous removes shuffle the tags we've
+// previously found.  However it's still not optimal.
+void delete_mod_tags(bam1_t *b) {
+    uint8_t *tag = bam_aux_first(b), *next;
+    uint8_t *to = tag;
+    while (tag) {
+        next = bam_aux_next(b, tag);
+        if (tag[-2] == 'M' &&
+            (tag[-1] == 'M' || tag[-1] == 'm' ||
+             tag[-1] == 'L' || tag[-1] == 'l' ||
+             tag[-1] == 'N')) {
+            // Skip. Equivalent to bam_aux_remove without multiple passes
+        } else {
+            // Copy.  All these +/-2s are an annoyance caused by the
+            // tag iterator pointing to the byte after the 2-letter code
+            uint8_t *end = next ? next : b->data + b->l_data + 2;
+            if (tag != to)
+                memmove(to-2, tag-2, end-tag);
+            to += end-tag;
+        }
+        tag = next;
+    }
+
+    b->l_data = (to-2) - b->data;
+}
+
+int validate_MM(bam1_t *b, hts_base_mod_state *state) {
+    hts_base_mod mods[10];
+    int n, pos;
+    while ((n = bam_next_basemod(b, state, mods, 10, &pos)) > 0) {
+        // bam_next_basemod will trigger MM out-of-bound checks
+    }
+    return n;
+}
+
+// Fix base modification tags MM, ML and MN.
+// For supplementary-style alignments we may have hard-clipped the sequence
+// and just duplicated the MM/ML tags.  Use the primary alignment to get the
+// clipped sequence so we can trim MM/ML accordingly.
+//
+// We call this first on primary reads with pre == NULL.  This caches
+// MM and ML data into MM_state.
+//
+// We then call it again on secondary and/or supplementary data with
+// pre == the primary record and pass in the associated state.  This then
+// validates MM/ML/MN match, and if not adjusts them if they have hard-clips
+// which yields consistent data.
+//
+// TODO: add sanity check on counts of base types and MM tag to ensure it's
+// possible. We can do this post-trimming, so we sanitize everything.
+//
+// Returns 0 on success,
+//        -1 on failure
+int fix_MM(bam1_t *pre, bam1_t *cur, MM_state *state) {
+    int end5, end3;
+    int MNi = 0; // MN of -1 is used as indicator for no valid mods
+
+    if (!pre && state) {
+        // First time we've see this name.
+        // Look for base modification tags and sanity check.
+        get_mod_info(cur, &state->MM, &state->ML, &state->MN, &MNi,
+                     &end5, &end3);
+        if (!state->MM) {
+            delete_mod_tags(cur);
+            return 0;
+        }
+
+        if (!end5 && !end3 && MNi <= 0) {
+            // No MN tag, but also no clipping.  Assume MM is valid
+            if (cur->core.l_qseq)
+                if (bam_aux_update_int(cur, "MN", cur->core.l_qseq) < 0)
+                    return -1;
+        } else if ((end3 || end5) && cur->core.l_qseq != MNi) {
+            // We have hard clips and MN tag, but the MN tag doesn't match
+            // observed sequence length so it appears the hard-clipping
+            // happened after base-mods called without updating.
+            // Fail as this is a primary read.
+            delete_mod_tags(cur);
+        }
+        // Otherwise we assume the base modifications are correct
+
+    } else if (state) {
+        // A supplementary or secondary alignment with known primary
+        uint8_t *cur_MM = NULL, *cur_ML = NULL, *cur_MN = NULL;
+        MNi = -1;
+        get_mod_info(cur, &cur_MM, &cur_ML, &cur_MN, &MNi, &end5, &end3);
+
+        if (!cur_MM) {
+            delete_mod_tags(cur);
+            return 0;
+        }
+
+        // Does MN match seq length?  If so, we believe it's already valid
+        if (MNi == cur->core.l_qseq)
+            goto validate;
+
+        // Length mismatch and/or no known length, so check vs full seq.
+        if (pre->core.l_qseq != cur->core.l_qseq + end3 + end5) {
+            delete_mod_tags(cur);
+            return 0;
+        } else if (end5 || end3) {
+             if (MNi < 0 || MNi == pre->core.l_qseq)
+                 trim_MM(pre, cur, end5, end3, cur_MM, cur_ML, cur_MN);
+        } // else no hard clips so MM is already valid
+
+        // Set MN so we've validated it, provided seq isn't "*".
+        // inefficient, but minimal compared to everything else
+        if (cur->core.l_qseq)
+            if (bam_aux_update_int(cur, "MN", cur->core.l_qseq) < 0)
+                return -1;
+    }
+
+ validate:
+    ;
+
+    // Also validate MM length matches sequence length.  This mirrors the
+    // logic in htslib/sam_mods.c.
+    // For now we take the inefficient approach of using bam_parse_basemod2.
+    // Inefficient, but robust.
+    hts_base_mod_state *mst = hts_base_mod_state_alloc();
+    if (!mst)
+        return -1;
+
+    enum htsLogLevel lvl = hts_get_log_level();
+    hts_set_log_level(HTS_LOG_OFF);
+    if (bam_parse_basemod(cur, mst) < 0)
+        // Maybe we want hts_log_warning still though?
+        delete_mod_tags(cur);
+    if (validate_MM(cur, mst) < 0)
+        delete_mod_tags(cur);
+    hts_set_log_level(lvl);
+    hts_base_mod_state_free(mst);
+
+    return 0;
+}
+
+// Ensure the b[] array is at least n.
+// Returns 0 on success,
+//        -1 on failure
+static int grow_b_array(bam1_t **b, int *ba, int n) {
+    if (n < *ba)
+        return 0;
+
+    bam1_t *bnew = realloc(*b, (n+=10) * sizeof(**b));
+    if (!bnew)
+        return -1;
+    *b = bnew;
+
+    // bam_init1 equivalent
+    int i;
+    for (i = *ba; i < n; i++)
+        memset(&(*b)[i], 0, sizeof(bam1_t));
+
+    *b = bnew;
+    *ba = n;
+
+    return 0;
+}
+
+// We have b[0]..b[bn-1] entries all from the same template (qname)
+typedef struct {
+    bam1_t *b;
+    int n, ba;  // number used and number allocated
+    int b_next; // b[b_next] for start of next set, -1 if unset
+    int eof;    // marker for having seen eof
+} bam_set;
+
+// Fetches a new batch of BAM records all containing the same name.
+// NB: we cache the last (non-matching) name in b[n], so we can use it to
+// start the next batch.
+// Returns the number of records on success,
+//         <0 on failure or EOF (sam_read1 return vals)
+static int next_template(samFile *in, sam_hdr_t *header, bam_set *bs,
+                         int sanitize_flags) {
+    int result;
+
+    if (bs->eof)
+        return -1;
+
+    // First time through, prime the template name
+    if (bs->b_next < 0) {
+        if (grow_b_array(&bs->b, &bs->ba, 1) < 0)
+            return -2;
+        result = sam_read1(in, header, &bs->b[0]);
+        if (result < 0)
+            return result;
+        if (bam_sanitize(header, &bs->b[0], sanitize_flags) < 0)
+            return -2;
+    } else {
+        // Otherwise use the previous template name read
+        bam1_t btmp = bs->b[0];
+        bs->b[0] = bs->b[bs->b_next];
+        bs->b[bs->b_next] = btmp; // For ->{,l_,m_}data
+    }
+    bs->n = 1;
+
+    // Now keep reading until we find a read that mismatches or we hit eof.
+    char *name = bam_get_qname(&bs->b[0]);
+    for (;;) {
+        if (grow_b_array(&bs->b, &bs->ba, bs->n+1) < 0)
+            return -2;
+
+        result = sam_read1(in, header, &bs->b[bs->n]);
+        if (result < -1)
+            return result;
+
+        if (result < 0) {
+            bs->eof = 1;
+            bs->b_next = -1;
+            break;
+        } else {
+            if (bam_sanitize(header, &bs->b[bs->n], sanitize_flags) < 0)
+                return -2;
+
+            bs->b_next = bs->n;
+            if (strcmp(name, bam_get_qname(&bs->b[bs->n])) != 0)
+                break;
+        }
+
+        bs->n++;
+    }
+
+    return bs->n;
+}
+
 // currently, this function ONLY works if each read has one hit
+//
+// Returns 0 on success,
+//        >0 on failure
 static int bam_mating_core(samFile *in, samFile *out, int remove_reads,
                            int proper_pair_check, int add_ct,
                            int do_mate_scoring, char *arg_list, int no_pg,
-                           int sanitize_flags)
+                           int sanitize_flags, int base_mods)
 {
     sam_hdr_t *header;
-    bam1_t *b[2] = { NULL, NULL };
-    int curr, has_prev, result;
-    hts_pos_t pre_end = 0, cur_end = 0;
+    int result, n;
     kstring_t str = KS_INITIALIZE;
+    bam_set bs = {NULL, 0, 0, -1, 0};
 
     header = sam_hdr_read(in);
     if (header == NULL) {
@@ -506,101 +985,140 @@ static int bam_mating_core(samFile *in, samFile *out, int remove_reads,
 
     if (sam_hdr_write(out, header) < 0) goto write_fail;
 
-    b[0] = bam_init1();
-    b[1] = bam_init1();
-    curr = 0; has_prev = 0;
-    while ((result = sam_read1(in, header, b[curr])) >= 0) {
-        bam1_t *cur = b[curr], *pre = b[1-curr];
-        if (bam_sanitize(header, cur, sanitize_flags) < 0)
-            goto fail;
-        if (cur->core.flag & BAM_FSECONDARY)
-        {
-            if ( !remove_reads ) {
-                if (sam_write1(out, header, cur) < 0) goto write_fail;
+    // Iterate template by template fetching bs->n records at a time
+    while ((result = next_template(in, header, &bs, sanitize_flags)) >= 0) {
+        bam1_t *cur = NULL, *pre = NULL, *rnum[2] = {NULL, NULL};
+        int prev = -1, curr = -1;
+        hts_pos_t pre_end = 0, cur_end = 0;
+
+        // Find and fix up primary alignments
+        MM_state state[2];
+        for (n = 0; n < bs.n; n++) {
+            int is_r2 = (bs.b[n].core.flag & BAM_FREAD2) != 0;
+            if (bs.b[n].core.flag & (BAM_FSECONDARY | BAM_FSUPPLEMENTARY))
+                continue;
+
+            if (base_mods)
+                if (fix_MM(NULL, &bs.b[n], &state[is_r2]) < 0)
+                    goto fail;
+
+            if (!pre) {
+                pre = &bs.b[prev = n];
+                rnum[(pre->core.flag & BAM_FREAD2) != 0] = pre;
+
+                pre_end = (pre->core.flag & BAM_FUNMAP) == 0
+                    ? bam_endpos(pre) : 0;
+                continue;
             }
-            continue; // skip secondary alignments
-        }
-        if (cur->core.flag & BAM_FSUPPLEMENTARY)
-        {
-            if (sam_write1(out, header, cur) < 0) goto write_fail;
-            continue; // pass supplementary alignments through unchanged (TODO:make them match read they came from)
-        }
-        if ((cur->core.flag&BAM_FUNMAP) == 0) // If mapped calculate end
-        {
-            cur_end = bam_endpos(cur);
-        }
 
-        if (has_prev) { // do we have a pair of reads to examine?
-            if (strcmp(bam_get_qname(cur), bam_get_qname(pre)) == 0) { // identical pair name
-                pre->core.flag |= BAM_FPAIRED;
-                cur->core.flag |= BAM_FPAIRED;
-                if (sync_mate(pre, cur)) goto fail;
-
-                if (pre->core.tid == cur->core.tid && !(cur->core.flag&(BAM_FUNMAP|BAM_FMUNMAP))
-                    && !(pre->core.flag&(BAM_FUNMAP|BAM_FMUNMAP))) // if safe set TLEN/ISIZE
-                {
-                    hts_pos_t cur5, pre5;
-                    cur5 = (cur->core.flag&BAM_FREVERSE)? cur_end : cur->core.pos;
-                    pre5 = (pre->core.flag&BAM_FREVERSE)? pre_end : pre->core.pos;
-                    cur->core.isize = pre5 - cur5; pre->core.isize = cur5 - pre5;
-                } else cur->core.isize = pre->core.isize = 0;
-                if (add_ct) bam_template_cigar(pre, cur, &str);
-                // TODO: Add code to properly check if read is in a proper pair based on ISIZE distribution
-                if (proper_pair_check && !plausibly_properly_paired(pre,cur)) {
-                    pre->core.flag &= ~BAM_FPROPER_PAIR;
-                    cur->core.flag &= ~BAM_FPROPER_PAIR;
-                }
+            // Note, more than 2 primary alignments will use 'curr' as last
+            cur = &bs.b[curr = n];
+            rnum[(cur->core.flag & BAM_FREAD2) != 0] = cur;
+            cur_end = (cur->core.flag & BAM_FUNMAP) == 0
+                ? bam_endpos(cur) : 0;
+
+            pre->core.flag |= BAM_FPAIRED;
+            cur->core.flag |= BAM_FPAIRED;
+            if (sync_mate(pre, cur))
+                goto fail;
+
+            // If safe set TLEN/ISIZE
+            if (pre->core.tid == cur->core.tid
+                && !(cur->core.flag & (BAM_FUNMAP | BAM_FMUNMAP))
+                && !(pre->core.flag & (BAM_FUNMAP | BAM_FMUNMAP))) {
+                hts_pos_t cur5, pre5;
+                cur5 = (cur->core.flag & BAM_FREVERSE)
+                    ? cur_end
+                    : cur->core.pos;
+                pre5 = (pre->core.flag & BAM_FREVERSE)
+                    ? pre_end
+                    : pre->core.pos;
+                cur->core.isize = pre5 - cur5;
+                pre->core.isize = cur5 - pre5;
+            } else {
+                cur->core.isize = pre->core.isize = 0;
+            }
 
-                if (do_mate_scoring) {
-                    if ((add_mate_score(pre, cur) == -1) || (add_mate_score(cur, pre) == -1)) {
-                        fprintf(stderr, "[bam_mating_core] ERROR: unable to add mate score.\n");
-                        goto fail;
-                    }
-                }
+            if (add_ct)
+                bam_template_cigar(pre, cur, &str);
 
-                // Write out result
-                if ( !remove_reads ) {
-                    if (sam_write1(out, header, pre) < 0) goto write_fail;
-                    if (sam_write1(out, header, cur) < 0) goto write_fail;
-                } else {
-                    // If we have to remove reads make sure we do it in a way that doesn't create orphans with bad flags
-                    if(pre->core.flag&BAM_FUNMAP) cur->core.flag &= ~(BAM_FPAIRED|BAM_FMREVERSE|BAM_FPROPER_PAIR);
-                    if(cur->core.flag&BAM_FUNMAP) pre->core.flag &= ~(BAM_FPAIRED|BAM_FMREVERSE|BAM_FPROPER_PAIR);
-                    if(!(pre->core.flag&BAM_FUNMAP)) {
-                        if (sam_write1(out, header, pre) < 0) goto write_fail;
-                    }
-                    if(!(cur->core.flag&BAM_FUNMAP)) {
-                        if (sam_write1(out, header, cur) < 0) goto write_fail;
-                    }
-                }
-                has_prev = 0;
-            } else { // unpaired?  clear bad info and write it out
-                pre->core.mtid = -1; pre->core.mpos = -1; pre->core.isize = 0;
-                pre->core.flag &= ~(BAM_FPAIRED|BAM_FMREVERSE|BAM_FPROPER_PAIR);
-                if ( !remove_reads || !(pre->core.flag&BAM_FUNMAP) ) {
-                    if (sam_write1(out, header, pre) < 0) goto write_fail;
+            // TODO: Add code to properly check if read is in a proper
+            // pair based on ISIZE distribution
+            if (proper_pair_check && !plausibly_properly_paired(pre,cur)) {
+                pre->core.flag &= ~BAM_FPROPER_PAIR;
+                cur->core.flag &= ~BAM_FPROPER_PAIR;
+            }
+
+            if (do_mate_scoring) {
+                if ((add_mate_score(pre, cur) == -1) ||
+                    (add_mate_score(cur, pre) == -1)) {
+                    fprintf(stderr, "[bam_mating_core] ERROR: "
+                            "unable to add mate score.\n");
+                    goto fail;
                 }
             }
-        } else has_prev = 1;
-        curr = 1 - curr;
-        pre_end = cur_end;
-    }
-    if (result < -1) goto read_fail;
-    if (has_prev && !remove_reads) { // If we still have a BAM in the buffer it must be unpaired
-        bam1_t *pre = b[1-curr];
-        if (pre->core.tid < 0 || pre->core.pos < 0 || pre->core.flag&BAM_FUNMAP) { // If unmapped
-            pre->core.flag |= BAM_FUNMAP;
-            pre->core.tid = -1;
-            pre->core.pos = -1;
+
+            // If we have to remove reads make sure we do it in a way that
+            // doesn't create orphans with bad flags
+            if (remove_reads) {
+                if (pre->core.flag&BAM_FUNMAP)
+                    cur->core.flag &=
+                        ~(BAM_FMREVERSE|BAM_FPROPER_PAIR);
+                if (cur->core.flag&BAM_FUNMAP)
+                    pre->core.flag &=
+                        ~(BAM_FMREVERSE|BAM_FPROPER_PAIR);
+            }
         }
-        pre->core.mtid = -1; pre->core.mpos = -1; pre->core.isize = 0;
-        pre->core.flag &= ~(BAM_FPAIRED|BAM_FMREVERSE|BAM_FPROPER_PAIR);
 
-        if (sam_write1(out, header, pre) < 0) goto write_fail;
+        // Handle unpaired primary data
+        if (!cur && pre) {
+            pre->core.mtid = -1;
+            pre->core.mpos = -1;
+            pre->core.isize = 0;
+            pre->core.flag &= ~(BAM_FMREVERSE|BAM_FPROPER_PAIR);
+        }
+
+        // Now process secondary and supplementary alignments
+        for (n = 0; n < bs.n; n++) {
+            if (!(bs.b[n].core.flag & (BAM_FSECONDARY|BAM_FSUPPLEMENTARY))) {
+                // primary
+                continue;
+            }
+
+            // Secondary or supplementary
+            int is_r2 = (bs.b[n].core.flag & BAM_FREAD2) != 0;
+            bam1_t *primary = rnum[is_r2];
+            if (primary) {
+                if (base_mods)
+                    fix_MM(primary, &bs.b[n], &state[is_r2]);
+            } else {
+                // Record with base modifications but no known primary
+                //fprintf(stderr, "Unpaired secondary or supplementary\n");
+                if (base_mods)
+                    fix_MM(NULL, &bs.b[n], NULL);
+            }
+        }
+
+        // Finally having curated everything, write out all records in their
+        // original ordering
+        for (n = 0; n < bs.n; n++) {
+            bam1_t *cur = &bs.b[n];
+            // We may remove unmapped and secondary alignments
+            if (remove_reads && (cur->core.flag & (BAM_FSECONDARY|BAM_FUNMAP)))
+                continue;
+
+            if (sam_write1(out, header, cur) < 0)
+                goto write_fail;
+        }
     }
+    if (result < -1)
+        goto read_fail;
+
     sam_hdr_destroy(header);
-    bam_destroy1(b[0]);
-    bam_destroy1(b[1]);
+
+    for (n = 0; n < bs.ba; n++)
+        free(bs.b[n].data);
+    free(bs.b);
     ks_free(&str);
     return 0;
 
@@ -612,8 +1130,9 @@ static int bam_mating_core(samFile *in, samFile *out, int remove_reads,
     print_error_errno("fixmate", "Couldn't write to output file");
  fail:
     sam_hdr_destroy(header);
-    bam_destroy1(b[0]);
-    bam_destroy1(b[1]);
+    for (n = 0; n < bs.ba; n++)
+        free(bs.b[n].data);
+    free(bs.b);
     ks_free(&str);
     return 1;
 }
@@ -630,6 +1149,7 @@ void usage(FILE* where)
 "  -u           Uncompressed output\n"
 "  -z, --sanitize FLAG[,FLAG]\n"
 "               Sanitize alignment fields [defaults to all types]\n"
+"  -M           Fix base modification tags (MM/ML/MN)\n"
 "  --no-PG      do not add a PG line\n");
 
     sam_global_opt_help(where, "-.O..@-.");
@@ -646,7 +1166,7 @@ int bam_mating(int argc, char *argv[])
     htsThreadPool p = {NULL, 0};
     samFile *in = NULL, *out = NULL;
     int c, remove_reads = 0, proper_pair_check = 1, add_ct = 0, res = 1,
-        mate_score = 0, no_pg = 0, sanitize_flags = FIX_ALL;
+        mate_score = 0, no_pg = 0, sanitize_flags = FIX_ALL, base_mods = 0;
     sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
     char wmode[4] = {'w', 'b', 0, 0};
     static const struct option lopts[] = {
@@ -658,12 +1178,13 @@ int bam_mating(int argc, char *argv[])
 
     // parse args
     if (argc == 1) { usage(stdout); return 0; }
-    while ((c = getopt_long(argc, argv, "rpcmO:@:uz:", lopts, NULL)) >= 0) {
+    while ((c = getopt_long(argc, argv, "rpcmMO:@:uz:", lopts, NULL)) >= 0) {
         switch (c) {
         case 'r': remove_reads = 1; break;
         case 'p': proper_pair_check = 0; break;
         case 'c': add_ct = 1; break;
         case 'm': mate_score = 1; break;
+        case 'M': base_mods = 1; break;
         case 'u': wmode[2] = '0'; break;
         case 1: no_pg = 1; break;
         default:  if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
@@ -702,7 +1223,8 @@ int bam_mating(int argc, char *argv[])
 
     // run
     res = bam_mating_core(in, out, remove_reads, proper_pair_check, add_ct,
-                          mate_score, arg_list, no_pg, sanitize_flags);
+                          mate_score, arg_list, no_pg, sanitize_flags,
+                          base_mods);
 
     // cleanup
     sam_close(in);
diff --git a/samtools/bam_mate.c.pysam.c b/samtools/bam_mate.c.pysam.c
index 1796f6e9f..a0ba82051 100644
--- a/samtools/bam_mate.c.pysam.c
+++ b/samtools/bam_mate.c.pysam.c
@@ -2,7 +2,7 @@
 
 /*  bam_mate.c -- fix mate pairing information and clean up flags.
 
-    Copyright (C) 2009, 2011-2017, 2019, 2022 Genome Research Ltd.
+    Copyright (C) 2009, 2011-2017, 2019, 2022, 2024 Genome Research Ltd.
     Portions copyright (C) 2011 Broad Institute.
     Portions copyright (C) 2012 Peter Cock, The James Hutton Institute.
 
@@ -33,6 +33,7 @@ DEALINGS IN THE SOFTWARE.  */
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
+#include <ctype.h>
 #include "htslib/thread_pool.h"
 #include "sam_opts.h"
 #include "htslib/kstring.h"
@@ -92,7 +93,7 @@ static void bam_template_cigar(bam1_t *b1, bam1_t *b2, kstring_t *str)
  * single Reads:
  * -if pos == 0 (1 based), tid == -1, or UNMAPPED then set UNMAPPED, pos = 0,
  *  tid = -1
- * -clear bad flags (PAIRED, MREVERSE, PROPER_PAIR)
+ * -clear bad flags (MREVERSE, PROPER_PAIR)
  * -set mpos = 0 (1 based), mtid = -1 and isize = 0
  * -write to output
  * Paired Reads:
@@ -474,17 +475,495 @@ int bam_sanitize(sam_hdr_t *h, bam1_t *b, int flags) {
     return 0;
 }
 
+// Look for 3 tags in one pass, for efficiencies sake.
+// We also convert the draft tags Mm and Ml to MM and ML here.
+static inline void find_tags(bam1_t *b,
+                             char *t1, uint8_t **t1p,
+                             char *t2, uint8_t **t2p,
+                             char *t3, uint8_t **t3p) {
+    *t1p = *t2p = *t3p = NULL;
+    uint8_t *aux = bam_aux_first(b);
+
+    while (aux) {
+        if (aux[-2] == t1[0] && toupper(aux[-1]) == t1[1]) {
+            *t1p = aux;
+            if (islower(aux[-1]))
+                aux[-1] = t1[1];
+        } else if (aux[-2] == t2[0] && toupper(aux[-1]) == t2[1]) {
+            *t2p = aux;
+            if (islower(aux[-1]))
+                aux[-1] = t2[1];
+        } else if (aux[-2] == t3[0] && toupper(aux[-1]) == t3[1]) {
+            *t3p = aux;
+            if (islower(aux[-1]))
+                aux[-1] = t3[1];
+        }
+        aux = bam_aux_next(b, aux);
+    }
+}
+
+// Return 5' and 3' CIGAR hard-clip counts
+static inline void hard_clips(bam1_t *b, int *end5, int *end3) {
+    uint32_t *cigar = bam_get_cigar(b);
+    int ncigar = b->core.n_cigar;
+    int endL = 0, endR = 0, nh = 0;
+
+    if (ncigar && bam_cigar_op(cigar[0]) == BAM_CHARD_CLIP)
+        endL = bam_cigar_oplen(cigar[0]), nh=1;
+    if (ncigar > nh && bam_cigar_op(cigar[ncigar-1]) == BAM_CHARD_CLIP)
+        endR = bam_cigar_oplen(cigar[ncigar-1]);
+
+    if (b->core.flag & BAM_FREVERSE) {
+        *end5 = endR;
+        *end3 = endL;
+    } else {
+        *end5 = endL;
+        *end3 = endR;
+    }
+}
+
+// Get MM, ML and MN tags, and 5' and 3' hard-clip lengths.
+// MNi is integer copy of MN, or -1 if absent/invalid
+void get_mod_info(bam1_t *b, uint8_t **MM, uint8_t **ML, uint8_t **MN,
+                  int *MNi, int *end5, int *end3) {
+    find_tags(b, "MM", MM, "ML", ML, "MN", MN);
+    if (*MN) {
+        int save_errno = errno;
+        errno = 0;
+        *MNi = bam_aux2i(*MN);
+        if (errno == EINVAL)
+            *MNi = -1;
+        errno = save_errno;
+    } else {
+        *MNi = -1;
+    }
+
+    if (*MM)
+        hard_clips(b, end5, end3);
+    else
+        *end5 = *end3 = 0; // don't need if MM not found
+}
+
+typedef struct MM_state {
+    // tags found on "pre" BAM
+    uint8_t *MM, *ML, *MN;
+} MM_state;
+
+uint8_t *MN_enc(uint8_t *tag, uint32_t n) {
+    if (n > UINT16_MAX) {
+        tag[0] = 'I';
+        i32_to_le(n, tag+1);
+        tag += 5;
+    } else if (n > UINT8_MAX) {
+        tag[0] = 'S';
+        i16_to_le(n, tag+1);
+        tag += 3;
+    } else {
+        *tag++ = 'C';
+        *tag++ = n;
+    }
+
+    return tag;
+}
+
+// Trim 5'/3' bases off MM and ML tags, using a previous sequence as a guide.
+int trim_MM(bam1_t *pre, bam1_t *cur, int end5, int end3,
+            uint8_t *MM, uint8_t *ML, uint8_t *MN) {
+    // Count number of bases
+    int counts5[16] = {0}, counts3[16] = {0};
+
+    uint8_t *seq = bam_get_seq(pre);
+    int i;
+    for (i = 0; i < end5; i++)
+        counts5[bam_seqi(seq, i)]++;
+    memcpy(counts3, counts5, 16 * sizeof(*counts3));
+    for (; i < pre->core.l_qseq - end3; i++)
+        counts3[bam_seqi(seq, i)]++;
+
+    // "p" is position in pre.
+    // "q" is position in cur.
+    // Hence move up "p" to start and copy from there to "q".
+    uint8_t *MMp, *MLp, *MMq = NULL, *MLq = NULL;
+    if (ML && ML[0] == 'B' && ML[1] == 'C') {
+        MLp = ML+6;
+    } else {
+        ML = MLp = NULL;
+    }
+    MMq = MM+1;
+    MLq = MLp;
+    for (MMp = MM+1; *MMp; ) {
+        int fundamental = seq_nt16_table[*MMp];
+        while (*MMp && *MMp != ',')
+            *MMq++ = *MMp++;
+        if (*MMp)
+            *MMq++ = *MMp++;
+
+        // Now on comma separated list for MM and BC array for ML. Skip
+        int n = 0;
+        while (*MMp != ';' && n < counts5[fundamental]) {
+            char *endptr;
+            long delta = strtol((char *)MMp, &endptr, 10);
+            if (counts5[fundamental] - n > delta) {
+                // Skip entire delta in MM and ML.
+                // Eg counts[]=10, MM=3,10 ML=<10><20> => MM=10 ML=<20>
+                n += delta+1;
+                if(ML) MLp++;
+            } else if (counts3[fundamental] > counts5[fundamental]) {
+                // Shrink delta, writing MM and ML is unchanged.
+                // Eg counts[]=3, MM=10,4 ML=<10><20> => MM=7,4 ML=<10><20>
+                char num[50];
+                int l = sprintf(num, "%ld",
+                                delta - (counts5[fundamental]-n));
+                memcpy((char *)MMq, num, l);
+                MMq += l;
+                *MMq++ = *endptr;
+                n += delta+1;
+                if (ML)
+                    *MLq++ = *MLp++;
+            } else {
+                // next base mod is on boundary of 3' clip point
+                break;
+            }
+
+            MMp = (uint8_t *)endptr;
+            if (*MMp != ',')
+                // error?  if not ; also?
+                break;
+            MMp++;
+        }
+
+        // Copy
+        while (*MMp != ';' && n < counts3[fundamental]) {
+            char *endptr;
+            long delta = strtol((char *)MMp, &endptr, 10);
+            if (counts3[fundamental] - n > delta) {
+                // Copy entire delta in MM and ML including [,;]
+                memmove(MMq, MMp, (uint8_t *)endptr - MMp + 1);
+                MMq += (uint8_t *)endptr - MMp + 1;
+                n += delta+1;
+                if (ML)
+                    *MLq++ = *MLp++;
+            } else {
+                // Next mod is into 3' cutoff, so can terminate MM/ML now
+                n = counts3[fundamental];
+                if (ML)
+                    MLp++;
+            }
+
+            MMp = (uint8_t *)endptr;
+            if (*MMp != ',')
+                break;
+            MMp++;
+        }
+
+        // Skip
+        while (*MMp && *MMp != ';') {
+            while (*MMp && *MMp != ',' && *MMp != ';')
+                MMp++;
+            if (*MMp == ',')
+                MMp++;
+
+            if (ML)
+                MLp++;
+        }
+        MMq[-1] = ';'; // replaces , with ; if clipping right
+        if (*MMp)
+            MMp++;
+    }
+
+    MMp++; // skip nul
+    *MMq++ = 0;
+
+    // Adjust ML B array length
+    if (ML)
+        u32_to_le(MLq-(ML+6), ML+2);
+
+    // Move MM and ML down to include their MM:Z and ML:B bits
+    if (MM) MM-=2;
+    if (ML) ML-=2;
+
+    // Now MM/ML are start of tags, MMq/MLq are ends of edited tags,
+    // and MMp/MLp are ends of original tags.  Walk through tags taking up
+    // any gaps
+    //
+    // Eg XXXXXXmmmmm--YYYlllll-ZZ (m and l are edited MM and ML tags)
+    // => XXXXXXmmmmmYYYlllllZZ
+
+    uint8_t *tag = bam_get_aux(cur), *tag_end = cur->data + cur->l_data;
+    uint8_t *to = tag;
+    while (tag && tag < tag_end) {
+        if (tag[0] == 'M' && (tag[1] == 'M' || tag[1] == 'm')) {
+            // Slow but easy
+            memmove(to, MM, MMq-MM); // length of new tag
+            to += MMq-MM;
+            tag = MMp; // size of old tag
+        } else if (tag[0] == 'M' && (tag[1] == 'L' || tag[1] == 'l')) {
+            memmove(to, ML, MLq-ML);
+            to += MLq-ML;
+            tag = MLp;
+        } else if (tag[0] == 'M' && tag[1] == 'N') {
+            tag = bam_aux_next(cur, tag+2);
+            // Skip it as we'll overwrite this later, although this
+            // does change the tag order.  Instead we could do:
+            //
+            // *to++ = 'M';
+            // *to++ = 'N';
+            // to = MN_enc(to, cur->core.l_qseq);
+        } else {
+            // Want aux_skip, but it's private.
+            // So we use bam_aux_next with work-arounds. :(
+            uint8_t *from = tag;
+            tag = bam_aux_next(cur, tag+2);
+            tag = tag ? tag-2 : tag_end;
+            memmove(to, from, tag-from);
+            to += tag-from;
+        }
+    }
+    cur->l_data = to - cur->data;
+
+    return 0;
+}
+
+// Removes base modification tags: MM, ML and MN.
+// This is more efficient than a series of bam_aux_remove and
+// bam_aux_find calls, as the previous removes shuffle the tags we've
+// previously found.  However it's still not optimal.
+void delete_mod_tags(bam1_t *b) {
+    uint8_t *tag = bam_aux_first(b), *next;
+    uint8_t *to = tag;
+    while (tag) {
+        next = bam_aux_next(b, tag);
+        if (tag[-2] == 'M' &&
+            (tag[-1] == 'M' || tag[-1] == 'm' ||
+             tag[-1] == 'L' || tag[-1] == 'l' ||
+             tag[-1] == 'N')) {
+            // Skip. Equivalent to bam_aux_remove without multiple passes
+        } else {
+            // Copy.  All these +/-2s are an annoyance caused by the
+            // tag iterator pointing to the byte after the 2-letter code
+            uint8_t *end = next ? next : b->data + b->l_data + 2;
+            if (tag != to)
+                memmove(to-2, tag-2, end-tag);
+            to += end-tag;
+        }
+        tag = next;
+    }
+
+    b->l_data = (to-2) - b->data;
+}
+
+int validate_MM(bam1_t *b, hts_base_mod_state *state) {
+    hts_base_mod mods[10];
+    int n, pos;
+    while ((n = bam_next_basemod(b, state, mods, 10, &pos)) > 0) {
+        // bam_next_basemod will trigger MM out-of-bound checks
+    }
+    return n;
+}
+
+// Fix base modification tags MM, ML and MN.
+// For supplementary-style alignments we may have hard-clipped the sequence
+// and just duplicated the MM/ML tags.  Use the primary alignment to get the
+// clipped sequence so we can trim MM/ML accordingly.
+//
+// We call this first on primary reads with pre == NULL.  This caches
+// MM and ML data into MM_state.
+//
+// We then call it again on secondary and/or supplementary data with
+// pre == the primary record and pass in the associated state.  This then
+// validates MM/ML/MN match, and if not adjusts them if they have hard-clips
+// which yields consistent data.
+//
+// TODO: add sanity check on counts of base types and MM tag to ensure it's
+// possible. We can do this post-trimming, so we sanitize everything.
+//
+// Returns 0 on success,
+//        -1 on failure
+int fix_MM(bam1_t *pre, bam1_t *cur, MM_state *state) {
+    int end5, end3;
+    int MNi = 0; // MN of -1 is used as indicator for no valid mods
+
+    if (!pre && state) {
+        // First time we've see this name.
+        // Look for base modification tags and sanity check.
+        get_mod_info(cur, &state->MM, &state->ML, &state->MN, &MNi,
+                     &end5, &end3);
+        if (!state->MM) {
+            delete_mod_tags(cur);
+            return 0;
+        }
+
+        if (!end5 && !end3 && MNi <= 0) {
+            // No MN tag, but also no clipping.  Assume MM is valid
+            if (cur->core.l_qseq)
+                if (bam_aux_update_int(cur, "MN", cur->core.l_qseq) < 0)
+                    return -1;
+        } else if ((end3 || end5) && cur->core.l_qseq != MNi) {
+            // We have hard clips and MN tag, but the MN tag doesn't match
+            // observed sequence length so it appears the hard-clipping
+            // happened after base-mods called without updating.
+            // Fail as this is a primary read.
+            delete_mod_tags(cur);
+        }
+        // Otherwise we assume the base modifications are correct
+
+    } else if (state) {
+        // A supplementary or secondary alignment with known primary
+        uint8_t *cur_MM = NULL, *cur_ML = NULL, *cur_MN = NULL;
+        MNi = -1;
+        get_mod_info(cur, &cur_MM, &cur_ML, &cur_MN, &MNi, &end5, &end3);
+
+        if (!cur_MM) {
+            delete_mod_tags(cur);
+            return 0;
+        }
+
+        // Does MN match seq length?  If so, we believe it's already valid
+        if (MNi == cur->core.l_qseq)
+            goto validate;
+
+        // Length mismatch and/or no known length, so check vs full seq.
+        if (pre->core.l_qseq != cur->core.l_qseq + end3 + end5) {
+            delete_mod_tags(cur);
+            return 0;
+        } else if (end5 || end3) {
+             if (MNi < 0 || MNi == pre->core.l_qseq)
+                 trim_MM(pre, cur, end5, end3, cur_MM, cur_ML, cur_MN);
+        } // else no hard clips so MM is already valid
+
+        // Set MN so we've validated it, provided seq isn't "*".
+        // inefficient, but minimal compared to everything else
+        if (cur->core.l_qseq)
+            if (bam_aux_update_int(cur, "MN", cur->core.l_qseq) < 0)
+                return -1;
+    }
+
+ validate:
+    ;
+
+    // Also validate MM length matches sequence length.  This mirrors the
+    // logic in htslib/sam_mods.c.
+    // For now we take the inefficient approach of using bam_parse_basemod2.
+    // Inefficient, but robust.
+    hts_base_mod_state *mst = hts_base_mod_state_alloc();
+    if (!mst)
+        return -1;
+
+    enum htsLogLevel lvl = hts_get_log_level();
+    hts_set_log_level(HTS_LOG_OFF);
+    if (bam_parse_basemod(cur, mst) < 0)
+        // Maybe we want hts_log_warning still though?
+        delete_mod_tags(cur);
+    if (validate_MM(cur, mst) < 0)
+        delete_mod_tags(cur);
+    hts_set_log_level(lvl);
+    hts_base_mod_state_free(mst);
+
+    return 0;
+}
+
+// Ensure the b[] array is at least n.
+// Returns 0 on success,
+//        -1 on failure
+static int grow_b_array(bam1_t **b, int *ba, int n) {
+    if (n < *ba)
+        return 0;
+
+    bam1_t *bnew = realloc(*b, (n+=10) * sizeof(**b));
+    if (!bnew)
+        return -1;
+    *b = bnew;
+
+    // bam_init1 equivalent
+    int i;
+    for (i = *ba; i < n; i++)
+        memset(&(*b)[i], 0, sizeof(bam1_t));
+
+    *b = bnew;
+    *ba = n;
+
+    return 0;
+}
+
+// We have b[0]..b[bn-1] entries all from the same template (qname)
+typedef struct {
+    bam1_t *b;
+    int n, ba;  // number used and number allocated
+    int b_next; // b[b_next] for start of next set, -1 if unset
+    int eof;    // marker for having seen eof
+} bam_set;
+
+// Fetches a new batch of BAM records all containing the same name.
+// NB: we cache the last (non-matching) name in b[n], so we can use it to
+// start the next batch.
+// Returns the number of records on success,
+//         <0 on failure or EOF (sam_read1 return vals)
+static int next_template(samFile *in, sam_hdr_t *header, bam_set *bs,
+                         int sanitize_flags) {
+    int result;
+
+    if (bs->eof)
+        return -1;
+
+    // First time through, prime the template name
+    if (bs->b_next < 0) {
+        if (grow_b_array(&bs->b, &bs->ba, 1) < 0)
+            return -2;
+        result = sam_read1(in, header, &bs->b[0]);
+        if (result < 0)
+            return result;
+        if (bam_sanitize(header, &bs->b[0], sanitize_flags) < 0)
+            return -2;
+    } else {
+        // Otherwise use the previous template name read
+        bam1_t btmp = bs->b[0];
+        bs->b[0] = bs->b[bs->b_next];
+        bs->b[bs->b_next] = btmp; // For ->{,l_,m_}data
+    }
+    bs->n = 1;
+
+    // Now keep reading until we find a read that mismatches or we hit eof.
+    char *name = bam_get_qname(&bs->b[0]);
+    for (;;) {
+        if (grow_b_array(&bs->b, &bs->ba, bs->n+1) < 0)
+            return -2;
+
+        result = sam_read1(in, header, &bs->b[bs->n]);
+        if (result < -1)
+            return result;
+
+        if (result < 0) {
+            bs->eof = 1;
+            bs->b_next = -1;
+            break;
+        } else {
+            if (bam_sanitize(header, &bs->b[bs->n], sanitize_flags) < 0)
+                return -2;
+
+            bs->b_next = bs->n;
+            if (strcmp(name, bam_get_qname(&bs->b[bs->n])) != 0)
+                break;
+        }
+
+        bs->n++;
+    }
+
+    return bs->n;
+}
+
 // currently, this function ONLY works if each read has one hit
+//
+// Returns 0 on success,
+//        >0 on failure
 static int bam_mating_core(samFile *in, samFile *out, int remove_reads,
                            int proper_pair_check, int add_ct,
                            int do_mate_scoring, char *arg_list, int no_pg,
-                           int sanitize_flags)
+                           int sanitize_flags, int base_mods)
 {
     sam_hdr_t *header;
-    bam1_t *b[2] = { NULL, NULL };
-    int curr, has_prev, result;
-    hts_pos_t pre_end = 0, cur_end = 0;
+    int result, n;
     kstring_t str = KS_INITIALIZE;
+    bam_set bs = {NULL, 0, 0, -1, 0};
 
     header = sam_hdr_read(in);
     if (header == NULL) {
@@ -508,101 +987,140 @@ static int bam_mating_core(samFile *in, samFile *out, int remove_reads,
 
     if (sam_hdr_write(out, header) < 0) goto write_fail;
 
-    b[0] = bam_init1();
-    b[1] = bam_init1();
-    curr = 0; has_prev = 0;
-    while ((result = sam_read1(in, header, b[curr])) >= 0) {
-        bam1_t *cur = b[curr], *pre = b[1-curr];
-        if (bam_sanitize(header, cur, sanitize_flags) < 0)
-            goto fail;
-        if (cur->core.flag & BAM_FSECONDARY)
-        {
-            if ( !remove_reads ) {
-                if (sam_write1(out, header, cur) < 0) goto write_fail;
+    // Iterate template by template fetching bs->n records at a time
+    while ((result = next_template(in, header, &bs, sanitize_flags)) >= 0) {
+        bam1_t *cur = NULL, *pre = NULL, *rnum[2] = {NULL, NULL};
+        int prev = -1, curr = -1;
+        hts_pos_t pre_end = 0, cur_end = 0;
+
+        // Find and fix up primary alignments
+        MM_state state[2];
+        for (n = 0; n < bs.n; n++) {
+            int is_r2 = (bs.b[n].core.flag & BAM_FREAD2) != 0;
+            if (bs.b[n].core.flag & (BAM_FSECONDARY | BAM_FSUPPLEMENTARY))
+                continue;
+
+            if (base_mods)
+                if (fix_MM(NULL, &bs.b[n], &state[is_r2]) < 0)
+                    goto fail;
+
+            if (!pre) {
+                pre = &bs.b[prev = n];
+                rnum[(pre->core.flag & BAM_FREAD2) != 0] = pre;
+
+                pre_end = (pre->core.flag & BAM_FUNMAP) == 0
+                    ? bam_endpos(pre) : 0;
+                continue;
             }
-            continue; // skip secondary alignments
-        }
-        if (cur->core.flag & BAM_FSUPPLEMENTARY)
-        {
-            if (sam_write1(out, header, cur) < 0) goto write_fail;
-            continue; // pass supplementary alignments through unchanged (TODO:make them match read they came from)
-        }
-        if ((cur->core.flag&BAM_FUNMAP) == 0) // If mapped calculate end
-        {
-            cur_end = bam_endpos(cur);
-        }
 
-        if (has_prev) { // do we have a pair of reads to examine?
-            if (strcmp(bam_get_qname(cur), bam_get_qname(pre)) == 0) { // identical pair name
-                pre->core.flag |= BAM_FPAIRED;
-                cur->core.flag |= BAM_FPAIRED;
-                if (sync_mate(pre, cur)) goto fail;
-
-                if (pre->core.tid == cur->core.tid && !(cur->core.flag&(BAM_FUNMAP|BAM_FMUNMAP))
-                    && !(pre->core.flag&(BAM_FUNMAP|BAM_FMUNMAP))) // if safe set TLEN/ISIZE
-                {
-                    hts_pos_t cur5, pre5;
-                    cur5 = (cur->core.flag&BAM_FREVERSE)? cur_end : cur->core.pos;
-                    pre5 = (pre->core.flag&BAM_FREVERSE)? pre_end : pre->core.pos;
-                    cur->core.isize = pre5 - cur5; pre->core.isize = cur5 - pre5;
-                } else cur->core.isize = pre->core.isize = 0;
-                if (add_ct) bam_template_cigar(pre, cur, &str);
-                // TODO: Add code to properly check if read is in a proper pair based on ISIZE distribution
-                if (proper_pair_check && !plausibly_properly_paired(pre,cur)) {
-                    pre->core.flag &= ~BAM_FPROPER_PAIR;
-                    cur->core.flag &= ~BAM_FPROPER_PAIR;
-                }
+            // Note, more than 2 primary alignments will use 'curr' as last
+            cur = &bs.b[curr = n];
+            rnum[(cur->core.flag & BAM_FREAD2) != 0] = cur;
+            cur_end = (cur->core.flag & BAM_FUNMAP) == 0
+                ? bam_endpos(cur) : 0;
+
+            pre->core.flag |= BAM_FPAIRED;
+            cur->core.flag |= BAM_FPAIRED;
+            if (sync_mate(pre, cur))
+                goto fail;
+
+            // If safe set TLEN/ISIZE
+            if (pre->core.tid == cur->core.tid
+                && !(cur->core.flag & (BAM_FUNMAP | BAM_FMUNMAP))
+                && !(pre->core.flag & (BAM_FUNMAP | BAM_FMUNMAP))) {
+                hts_pos_t cur5, pre5;
+                cur5 = (cur->core.flag & BAM_FREVERSE)
+                    ? cur_end
+                    : cur->core.pos;
+                pre5 = (pre->core.flag & BAM_FREVERSE)
+                    ? pre_end
+                    : pre->core.pos;
+                cur->core.isize = pre5 - cur5;
+                pre->core.isize = cur5 - pre5;
+            } else {
+                cur->core.isize = pre->core.isize = 0;
+            }
 
-                if (do_mate_scoring) {
-                    if ((add_mate_score(pre, cur) == -1) || (add_mate_score(cur, pre) == -1)) {
-                        fprintf(samtools_stderr, "[bam_mating_core] ERROR: unable to add mate score.\n");
-                        goto fail;
-                    }
-                }
+            if (add_ct)
+                bam_template_cigar(pre, cur, &str);
 
-                // Write out result
-                if ( !remove_reads ) {
-                    if (sam_write1(out, header, pre) < 0) goto write_fail;
-                    if (sam_write1(out, header, cur) < 0) goto write_fail;
-                } else {
-                    // If we have to remove reads make sure we do it in a way that doesn't create orphans with bad flags
-                    if(pre->core.flag&BAM_FUNMAP) cur->core.flag &= ~(BAM_FPAIRED|BAM_FMREVERSE|BAM_FPROPER_PAIR);
-                    if(cur->core.flag&BAM_FUNMAP) pre->core.flag &= ~(BAM_FPAIRED|BAM_FMREVERSE|BAM_FPROPER_PAIR);
-                    if(!(pre->core.flag&BAM_FUNMAP)) {
-                        if (sam_write1(out, header, pre) < 0) goto write_fail;
-                    }
-                    if(!(cur->core.flag&BAM_FUNMAP)) {
-                        if (sam_write1(out, header, cur) < 0) goto write_fail;
-                    }
-                }
-                has_prev = 0;
-            } else { // unpaired?  clear bad info and write it out
-                pre->core.mtid = -1; pre->core.mpos = -1; pre->core.isize = 0;
-                pre->core.flag &= ~(BAM_FPAIRED|BAM_FMREVERSE|BAM_FPROPER_PAIR);
-                if ( !remove_reads || !(pre->core.flag&BAM_FUNMAP) ) {
-                    if (sam_write1(out, header, pre) < 0) goto write_fail;
+            // TODO: Add code to properly check if read is in a proper
+            // pair based on ISIZE distribution
+            if (proper_pair_check && !plausibly_properly_paired(pre,cur)) {
+                pre->core.flag &= ~BAM_FPROPER_PAIR;
+                cur->core.flag &= ~BAM_FPROPER_PAIR;
+            }
+
+            if (do_mate_scoring) {
+                if ((add_mate_score(pre, cur) == -1) ||
+                    (add_mate_score(cur, pre) == -1)) {
+                    fprintf(samtools_stderr, "[bam_mating_core] ERROR: "
+                            "unable to add mate score.\n");
+                    goto fail;
                 }
             }
-        } else has_prev = 1;
-        curr = 1 - curr;
-        pre_end = cur_end;
-    }
-    if (result < -1) goto read_fail;
-    if (has_prev && !remove_reads) { // If we still have a BAM in the buffer it must be unpaired
-        bam1_t *pre = b[1-curr];
-        if (pre->core.tid < 0 || pre->core.pos < 0 || pre->core.flag&BAM_FUNMAP) { // If unmapped
-            pre->core.flag |= BAM_FUNMAP;
-            pre->core.tid = -1;
-            pre->core.pos = -1;
+
+            // If we have to remove reads make sure we do it in a way that
+            // doesn't create orphans with bad flags
+            if (remove_reads) {
+                if (pre->core.flag&BAM_FUNMAP)
+                    cur->core.flag &=
+                        ~(BAM_FMREVERSE|BAM_FPROPER_PAIR);
+                if (cur->core.flag&BAM_FUNMAP)
+                    pre->core.flag &=
+                        ~(BAM_FMREVERSE|BAM_FPROPER_PAIR);
+            }
         }
-        pre->core.mtid = -1; pre->core.mpos = -1; pre->core.isize = 0;
-        pre->core.flag &= ~(BAM_FPAIRED|BAM_FMREVERSE|BAM_FPROPER_PAIR);
 
-        if (sam_write1(out, header, pre) < 0) goto write_fail;
+        // Handle unpaired primary data
+        if (!cur && pre) {
+            pre->core.mtid = -1;
+            pre->core.mpos = -1;
+            pre->core.isize = 0;
+            pre->core.flag &= ~(BAM_FMREVERSE|BAM_FPROPER_PAIR);
+        }
+
+        // Now process secondary and supplementary alignments
+        for (n = 0; n < bs.n; n++) {
+            if (!(bs.b[n].core.flag & (BAM_FSECONDARY|BAM_FSUPPLEMENTARY))) {
+                // primary
+                continue;
+            }
+
+            // Secondary or supplementary
+            int is_r2 = (bs.b[n].core.flag & BAM_FREAD2) != 0;
+            bam1_t *primary = rnum[is_r2];
+            if (primary) {
+                if (base_mods)
+                    fix_MM(primary, &bs.b[n], &state[is_r2]);
+            } else {
+                // Record with base modifications but no known primary
+                //fprintf(samtools_stderr, "Unpaired secondary or supplementary\n");
+                if (base_mods)
+                    fix_MM(NULL, &bs.b[n], NULL);
+            }
+        }
+
+        // Finally having curated everything, write out all records in their
+        // original ordering
+        for (n = 0; n < bs.n; n++) {
+            bam1_t *cur = &bs.b[n];
+            // We may remove unmapped and secondary alignments
+            if (remove_reads && (cur->core.flag & (BAM_FSECONDARY|BAM_FUNMAP)))
+                continue;
+
+            if (sam_write1(out, header, cur) < 0)
+                goto write_fail;
+        }
     }
+    if (result < -1)
+        goto read_fail;
+
     sam_hdr_destroy(header);
-    bam_destroy1(b[0]);
-    bam_destroy1(b[1]);
+
+    for (n = 0; n < bs.ba; n++)
+        free(bs.b[n].data);
+    free(bs.b);
     ks_free(&str);
     return 0;
 
@@ -614,8 +1132,9 @@ static int bam_mating_core(samFile *in, samFile *out, int remove_reads,
     print_error_errno("fixmate", "Couldn't write to output file");
  fail:
     sam_hdr_destroy(header);
-    bam_destroy1(b[0]);
-    bam_destroy1(b[1]);
+    for (n = 0; n < bs.ba; n++)
+        free(bs.b[n].data);
+    free(bs.b);
     ks_free(&str);
     return 1;
 }
@@ -632,6 +1151,7 @@ void usage(FILE* where)
 "  -u           Uncompressed output\n"
 "  -z, --sanitize FLAG[,FLAG]\n"
 "               Sanitize alignment fields [defaults to all types]\n"
+"  -M           Fix base modification tags (MM/ML/MN)\n"
 "  --no-PG      do not add a PG line\n");
 
     sam_global_opt_help(where, "-.O..@-.");
@@ -648,7 +1168,7 @@ int bam_mating(int argc, char *argv[])
     htsThreadPool p = {NULL, 0};
     samFile *in = NULL, *out = NULL;
     int c, remove_reads = 0, proper_pair_check = 1, add_ct = 0, res = 1,
-        mate_score = 0, no_pg = 0, sanitize_flags = FIX_ALL;
+        mate_score = 0, no_pg = 0, sanitize_flags = FIX_ALL, base_mods = 0;
     sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
     char wmode[4] = {'w', 'b', 0, 0};
     static const struct option lopts[] = {
@@ -660,12 +1180,13 @@ int bam_mating(int argc, char *argv[])
 
     // parse args
     if (argc == 1) { usage(samtools_stdout); return 0; }
-    while ((c = getopt_long(argc, argv, "rpcmO:@:uz:", lopts, NULL)) >= 0) {
+    while ((c = getopt_long(argc, argv, "rpcmMO:@:uz:", lopts, NULL)) >= 0) {
         switch (c) {
         case 'r': remove_reads = 1; break;
         case 'p': proper_pair_check = 0; break;
         case 'c': add_ct = 1; break;
         case 'm': mate_score = 1; break;
+        case 'M': base_mods = 1; break;
         case 'u': wmode[2] = '0'; break;
         case 1: no_pg = 1; break;
         default:  if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
@@ -704,7 +1225,8 @@ int bam_mating(int argc, char *argv[])
 
     // run
     res = bam_mating_core(in, out, remove_reads, proper_pair_check, add_ct,
-                          mate_score, arg_list, no_pg, sanitize_flags);
+                          mate_score, arg_list, no_pg, sanitize_flags,
+                          base_mods);
 
     // cleanup
     sam_close(in);
diff --git a/samtools/bam_md.c b/samtools/bam_md.c
index b9182b6a8..55d02dc53 100644
--- a/samtools/bam_md.c
+++ b/samtools/bam_md.c
@@ -328,7 +328,7 @@ static void refs_destroy(ref_cache *cache) {
     }
 }
 
-int calmd_usage() {
+int calmd_usage(void) {
     fprintf(stderr,
 "Usage: samtools calmd [-eubrAESQ] <aln.bam> <ref.fasta>\n"
 "Options:\n"
diff --git a/samtools/bam_md.c.pysam.c b/samtools/bam_md.c.pysam.c
index 795eccb21..e1101475b 100644
--- a/samtools/bam_md.c.pysam.c
+++ b/samtools/bam_md.c.pysam.c
@@ -330,7 +330,7 @@ static void refs_destroy(ref_cache *cache) {
     }
 }
 
-int calmd_usage() {
+int calmd_usage(void) {
     fprintf(samtools_stderr,
 "Usage: samtools calmd [-eubrAESQ] <aln.bam> <ref.fasta>\n"
 "Options:\n"
diff --git a/samtools/bam_plbuf.h b/samtools/bam_plbuf.h
index 9a718e01c..ba76db18f 100644
--- a/samtools/bam_plbuf.h
+++ b/samtools/bam_plbuf.h
@@ -50,11 +50,6 @@ void bam_plbuf_destroy(bam_plbuf_t *buf);
 
 int bam_plbuf_push(const bam1_t *b, bam_plbuf_t *buf);
 
-/* Exported from bam_plcmd.c */
-int pileup_seq(FILE *fp, const bam_pileup1_t *p, hts_pos_t pos,
-               hts_pos_t ref_len, const char *ref, kstring_t *ks,
-               int rev_del, int no_ins, int no_ins_mods,
-               int no_del, int no_ends);
 #ifdef __cplusplus
 }
 #endif
diff --git a/samtools/bam_plcmd.c b/samtools/bam_plcmd.c
index 264a7f5a0..4880cd420 100644
--- a/samtools/bam_plcmd.c
+++ b/samtools/bam_plcmd.c
@@ -1,6 +1,6 @@
 /*  bam_plcmd.c -- mpileup subcommand.
 
-    Copyright (C) 2008-2015, 2019-2021 Genome Research Ltd.
+    Copyright (C) 2008-2015, 2019-2021, 2023-2024 Genome Research Ltd.
     Portions copyright (C) 2009-2012 Broad Institute.
 
     Author: Heng Li <lh3@sanger.ac.uk>
@@ -51,51 +51,43 @@ DEALINGS IN THE SOFTWARE.  */
 #define dummy_free(p)
 KLIST_INIT(auxlist, char *, dummy_free)
 
-static inline int printw(int c, FILE *fp)
-{
-    char buf[16];
-    int l, x;
-    if (c == 0) return fputc('0', fp);
-    for (l = 0, x = c < 0? -c : c; x > 0; x /= 10) buf[l++] = x%10 + '0';
-    if (c < 0) buf[l++] = '-';
-    buf[l] = 0;
-    for (x = 0; x < l/2; ++x) {
-        int y = buf[x]; buf[x] = buf[l-1-x]; buf[l-1-x] = y;
-    }
-    fputs(buf, fp);
-    return 0;
-}
-
-int pileup_seq(FILE *fp, const bam_pileup1_t *p, hts_pos_t pos,
-               hts_pos_t ref_len, const char *ref, kstring_t *ks,
+int pileup_seq(kstring_t *ks_seq, const bam_pileup1_t *p, hts_pos_t pos,
+               hts_pos_t ref_len, const char *ref, kstring_t *ks_mod,
                int rev_del, int no_ins, int no_ins_mods,
                int no_del, int no_ends)
 {
     no_ins_mods |= no_ins;
-    int j;
+    int j, err = 0;
     hts_base_mod_state *m = p->cd.p;
     if (!no_ends && p->is_head) {
-        putc('^', fp);
-        putc(p->b->core.qual > 93? 126 : p->b->core.qual + 33, fp);
+        err |= kputc_('^', ks_seq) < 0;
+        err |= kputc_(p->b->core.qual > 93 ? 126 : p->b->core.qual + 33,
+                      ks_seq) < 0;
     }
     if (!p->is_del) {
+        // See seq_nt16_str in htslib/hts.c
+        const char seq_nt_str_lc[] = ",acmgrsvtwyhkdbn"; // reverse strand
+        const char seq_nt_str_uc[] = ".ACMGRSVTWYHKDBN";
         int c = p->qpos < p->b->core.l_qseq
-            ? seq_nt16_str[bam_seqi(bam_get_seq(p->b), p->qpos)]
-            : 'N';
+            ? bam_seqi(bam_get_seq(p->b), p->qpos)
+            : 15 /*N*/;
         if (ref) {
-            int rb = pos < ref_len? ref[pos] : 'N';
-            if (c == '=' || seq_nt16_table[c] == seq_nt16_table[rb]) c = bam_is_rev(p->b)? ',' : '.';
-            else c = bam_is_rev(p->b)? tolower(c) : toupper(c);
-        } else {
-            if (c == '=') c = bam_is_rev(p->b)? ',' : '.';
-            else c = bam_is_rev(p->b)? tolower(c) : toupper(c);
+            int rb = pos < ref_len
+                ? seq_nt16_table[(uint8_t)(ref[pos])]
+                : 15/*N*/;
+            if (c == rb)
+                c = 0; // "=", which becomes . or ,
         }
-        putc(c, fp);
+        c = bam_is_rev(p->b)
+            ? seq_nt_str_lc[c]
+            : seq_nt_str_uc[c];
+        err |= kputc_(c, ks_seq) < 0;
+
         if (m) {
             int nm;
             hts_base_mod mod[256];
             if ((nm = bam_mods_at_qpos(p->b, p->qpos, m, mod, 256)) > 0) {
-                putc('[', fp);
+                err |= kputc_('[', ks_seq) < 0;
                 int j;
                 for (j = 0; j < nm && j < 256; j++) {
                     char qual[20];
@@ -105,61 +97,75 @@ int pileup_seq(FILE *fp, const bam_pileup1_t *p, hts_pos_t pos,
                         *qual = 0;
                     if (mod[j].modified_base < 0)
                         // ChEBI
-                        fprintf(fp, "%c(%d)%s", "+-"[mod[j].strand],
-                                -mod[j].modified_base, qual);
+                        err |= ksprintf(ks_seq, "%c(%d)%s",
+                                        "+-"[mod[j].strand],
+                                        -mod[j].modified_base, qual) < 0;
                     else
-                        fprintf(fp, "%c%c%s", "+-"[mod[j].strand],
-                                mod[j].modified_base, qual);
+                        err |= ksprintf(ks_seq, "%c%c%s", "+-"[mod[j].strand],
+                                        mod[j].modified_base, qual) < 0;
                 }
-                putc(']', fp);
+                err |= kputc_(']', ks_seq) < 0;
             }
         }
-    } else putc(p->is_refskip? (bam_is_rev(p->b)? '<' : '>') : ((bam_is_rev(p->b) && rev_del) ? '#' : '*'), fp);
+    } else {
+        err |= kputc_(p->is_refskip
+                      ? (bam_is_rev(p->b)? '<' : '>')
+                      : ((bam_is_rev(p->b) && rev_del) ? '#' : '*'),
+                      ks_seq) < 0;
+    }
+
     int del_len = -p->indel;
     if (p->indel > 0) {
         int len = bam_plp_insertion_mod(p, m && !no_ins_mods ? m : NULL,
-                                        ks, &del_len);
+                                        ks_mod, &del_len);
         if (len < 0) {
             print_error("mpileup", "bam_plp_insertion() failed");
             return -1;
         }
         if (no_ins < 2) {
-            putc('+', fp);
-            printw(len, fp);
+            err |= kputc_('+', ks_seq) < 0;
+            err |= kputuw(len, ks_seq) < 0;
         }
         if (!no_ins) {
+            kstring_t *ks = ks_mod;
             if (bam_is_rev(p->b)) {
                 char pad = rev_del ? '#' : '*';
                 int in_mod = 0;
                 for (j = 0; j < ks->l; j++) {
                     if (ks->s[j] == '[') in_mod = 1;
                     else if (ks->s[j] == ']') in_mod = 0;
-                    putc(ks->s[j] != '*'
-                         ? (in_mod ? ks->s[j] : tolower(ks->s[j]))
-                         : pad, fp);
+                    err |= kputc_(ks->s[j] != '*'
+                                  ? (in_mod ? ks->s[j] : tolower(ks->s[j]))
+                                  : pad, ks_seq) < 0;
                 }
             } else {
                 int in_mod = 0;
                 for (j = 0; j < ks->l; j++) {
                     if (ks->s[j] == '[') in_mod = 1;
                     if (ks->s[j] == ']') in_mod = 0;
-                    putc(in_mod ? ks->s[j] : toupper(ks->s[j]), fp);
+                    err |= kputc_(in_mod ? ks->s[j] : toupper(ks->s[j]),
+                                  ks_seq) < 0;
                 }
             }
         }
     }
+
     if (del_len > 0) {
         if (no_del < 2)
-            printw(-del_len, fp);
+            err |= kputw(-del_len, ks_seq) < 0;
         if (!no_del) {
             for (j = 1; j <= del_len; ++j) {
                 int c = (ref && (int)pos+j < ref_len)? ref[pos+j] : 'N';
-                putc(bam_is_rev(p->b)? tolower(c) : toupper(c), fp);
+                err |= kputc_(bam_is_rev(p->b)? tolower(c) : toupper(c),
+                              ks_seq) < 0;
             }
         }
     }
-    if (!no_ends && p->is_tail) putc('$', fp);
-    return 0;
+
+    if (!no_ends && p->is_tail)
+        err |= kputc_('$', ks_seq) < 0;
+
+    return -err;
 }
 
 #include "sample.h"
@@ -175,6 +181,7 @@ int pileup_seq(FILE *fp, const bam_pileup1_t *p, hts_pos_t pos,
 
 #define MPLP_PRINT_MAPQ_CHAR (1<<11)
 #define MPLP_PRINT_QPOS  (1<<12)
+// Start of struct active_cols elements
 #define MPLP_PRINT_QNAME (1<<13)
 #define MPLP_PRINT_FLAG  (1<<14)
 #define MPLP_PRINT_RNAME (1<<15)
@@ -186,10 +193,12 @@ int pileup_seq(FILE *fp, const bam_pileup1_t *p, hts_pos_t pos,
 #define MPLP_PRINT_TLEN  (1<<21)
 #define MPLP_PRINT_SEQ   (1<<22)
 #define MPLP_PRINT_QUAL  (1<<23)
-#define MPLP_PRINT_MODS  (1<<24)
-#define MPLP_PRINT_QPOS5 (1<<25)
+#define MPLP_PRINT_RLEN  (1<<24)
+// Must occur after struct active_cols element list
+#define MPLP_PRINT_MODS  (1<<25)
+#define MPLP_PRINT_QPOS5 (1<<26)
 
-#define MPLP_PRINT_LAST  (1<<26) // terminator for loop
+#define MPLP_PRINT_LAST  (1<<27) // terminator for loop
 
 #define MPLP_MAX_DEPTH 8000
 #define MPLP_MAX_INDEL_DEPTH 250
@@ -241,11 +250,13 @@ static int build_auxlist(mplp_conf_t *conf, char *optstring) {
         int supported;
     };
 
-    const struct active_cols colnames[11] = {
-            {"QNAME", 1}, {"FLAG", 1}, {"RNAME", 1}, {"POS", 1}, {"MAPQ", 1}, {"CIGAR", 0}, {"RNEXT", 1}, {"PNEXT", 1}, {"TLEN", 0}, {"SEQ", 0}, {"QUAL", 0}
+    const struct active_cols colnames[12] = {
+            {"QNAME", 1}, {"FLAG", 1}, {"RNAME", 1}, {"POS", 1}, {"MAPQ", 1},
+            {"CIGAR", 0}, {"RNEXT", 1}, {"PNEXT", 1}, {"TLEN", 0}, {"SEQ", 0},
+            {"QUAL", 0},  {"RLEN", 1},
     };
 
-    int i, f = MPLP_PRINT_QNAME, colno = 11;
+    int i, f = MPLP_PRINT_QNAME, colno = sizeof(colnames)/sizeof(*colnames);
     for (i = 0; i < colno; i++, f <<= 1)
         if (colnames[i].supported)
             khash_str2int_set(colhash, colnames[i].name, f);
@@ -440,7 +451,7 @@ static int mplp_func(void *data, bam1_t *b)
  * @param fn filenames
  * @param fn_idx index filenames
  */
-static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx)
+static int mpileup(mplp_conf_t *conf, int nfn, char **fn, char **fn_idx)
 {
     mplp_aux_t **data;
     int i, tid, *n_plp, tid0 = 0, max_depth;
@@ -458,19 +469,19 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx)
 
     memset(&gplp, 0, sizeof(mplp_pileup_t));
     memset(&buf, 0, sizeof(kstring_t));
-    data = calloc(n, sizeof(mplp_aux_t*));
-    plp = calloc(n, sizeof(bam_pileup1_t*));
-    n_plp = calloc(n, sizeof(int));
+    data = calloc(nfn, sizeof(mplp_aux_t*));
+    plp = calloc(nfn, sizeof(bam_pileup1_t*));
+    n_plp = calloc(nfn, sizeof(int));
     sm = bam_smpl_init();
 
-    if (n == 0) {
+    if (nfn == 0) {
         fprintf(stderr,"[%s] no input file/data given\n", __func__);
         exit(EXIT_FAILURE);
     }
 
     // read the header of each file in the list and initialize data
     refs_t *refs = NULL;
-    for (i = 0; i < n; ++i) {
+    for (i = 0; i < nfn; ++i) {
         sam_hdr_t *h_tmp;
         data[i] = calloc(1, sizeof(mplp_aux_t));
         data[i]->fp = sam_open_format(fn[i], "rb", &conf->ga.in);
@@ -540,7 +551,8 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx)
             data[i]->h = h;
         }
     }
-    fprintf(stderr, "[%s] %d samples in %d input files\n", __func__, sm->n, n);
+    fprintf(stderr, "[%s] %d samples in %d input files\n",
+            __func__, sm->n, nfn);
 
     pileup_fp = conf->output_fname? fopen(conf->output_fname, "w") : stdout;
 
@@ -550,7 +562,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx)
     }
 
     // init pileup
-    iter = bam_mplp_init(n, mplp_func, (void**)data);
+    iter = bam_mplp_init(nfn, mplp_func, (void**)data);
     if (conf->flag & MPLP_PRINT_MODS) {
         bam_mplp_constructor(iter, pileup_cd_create);
         bam_mplp_destructor(iter, pileup_cd_destroy);
@@ -561,7 +573,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx)
         fprintf(stderr, "[%s] Max depth set to maximum value (%d)\n", __func__, INT_MAX);
     } else {
         max_depth = conf->max_depth;
-        if ( max_depth * n > 1<<20 )
+        if ( max_depth * nfn > 1<<20 )
             fprintf(stderr, "[%s] Combined max depth is above 1M. Potential memory hog!\n", __func__);
     }
 
@@ -570,12 +582,15 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx)
     int ret;
     int last_tid = -1;
     hts_pos_t last_pos = -1;
+    int one_seq = 0;
 
     // begin pileup
+    kstring_t ks_seq = KS_INITIALIZE;
+    kstring_t ks_mod = KS_INITIALIZE;
+    kstring_t ks_qual = KS_INITIALIZE;
     while ( (ret=bam_mplp64_auto(iter, &tid, &pos, n_plp, plp)) > 0) {
+        one_seq = 1; // at least 1 output
         if (conf->reg && (pos < beg0 || pos >= end0)) continue; // out of the region requested
-        mplp_get_ref(data[0], tid, &ref, &ref_len);
-        //printf("tid=%d len=%d ref=%p/%s\n", tid, ref_len, ref, ref);
         if (conf->all) {
             // Deal with missing portions of previous tids
             while (tid > last_tid) {
@@ -583,22 +598,27 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx)
                     while (++last_pos < sam_hdr_tid2len(h, last_tid)) {
                         if (conf->bed && bed_overlap(conf->bed, sam_hdr_tid2name(h, last_tid), last_pos, last_pos + 1) == 0)
                             continue;
-                        print_empty_pileup(pileup_fp, conf, sam_hdr_tid2name(h, last_tid), last_pos, n, ref, ref_len);
+                        print_empty_pileup(pileup_fp, conf, sam_hdr_tid2name(h, last_tid), last_pos, nfn, ref, ref_len);
                     }
                 }
                 last_tid++;
                 last_pos = -1;
                 if (conf->all < 2)
                     break;
+                if (tid > last_tid)
+                    // multiple missing references and -aa used
+                    mplp_get_ref(data[0], last_tid, &ref, &ref_len);
             }
         }
+        mplp_get_ref(data[0], tid, &ref, &ref_len);
+
         if (conf->all) {
             // Deal with missing portion of current tid
             while (++last_pos < pos) {
                 if (conf->reg && last_pos < beg0) continue; // out of range; skip
                 if (conf->bed && bed_overlap(conf->bed, sam_hdr_tid2name(h, tid), last_pos, last_pos + 1) == 0)
                     continue;
-                print_empty_pileup(pileup_fp, conf, sam_hdr_tid2name(h, tid), last_pos, n, ref, ref_len);
+                print_empty_pileup(pileup_fp, conf, sam_hdr_tid2name(h, tid), last_pos, nfn, ref, ref_len);
             }
             last_tid = tid;
             last_pos = pos;
@@ -606,16 +626,34 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx)
         if (conf->bed && tid >= 0 && !bed_overlap(conf->bed, sam_hdr_tid2name(h, tid), pos, pos+1)) continue;
 
         fprintf(pileup_fp, "%s\t%"PRIhts_pos"\t%c", sam_hdr_tid2name(h, tid), pos + 1, (ref && pos < ref_len)? ref[pos] : 'N');
-        for (i = 0; i < n; ++i) {
-            int j, cnt;
+        for (i = 0; i < nfn; ++i) {
+            int j, cnt, err = 0;
+            ks_clear(&ks_seq);
+            ks_clear(&ks_qual);
+            ks_clear(&ks_mod);
             for (j = cnt = 0; j < n_plp[i]; ++j) {
                 const bam_pileup1_t *p = plp[i] + j;
                 int c = p->qpos < p->b->core.l_qseq
                     ? bam_get_qual(p->b)[p->qpos]
                     : 0;
-                if (c >= conf->min_baseQ) ++cnt;
+                if (c >= conf->min_baseQ) {
+                    // Build up seq
+                    err |= pileup_seq(&ks_seq, plp[i] + j, pos, ref_len,
+                                      ref, &ks_mod, conf->rev_del,
+                                      conf->no_ins, conf->no_ins_mods,
+                                      conf->no_del, conf->no_ends) < 0;
+
+                    // Build up qual
+                    err |= kputc_(c+33 < 126 ? c+33 : 126, &ks_qual) < 0;
+                    cnt++;
+                }
+            }
+            if (err) {
+                ret = 1;
+                goto fail;
             }
             fprintf(pileup_fp, "\t%d\t", cnt);
+
             if (n_plp[i] == 0) {
                 fputs("*\t*", pileup_fp);
                 int flag_value = MPLP_PRINT_MAPQ_CHAR;
@@ -631,49 +669,25 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx)
                         fputs("\t*", pileup_fp);
                 }
             } else {
-                int n = 0;
-                kstring_t ks = KS_INITIALIZE;
-                for (j = 0; j < n_plp[i]; ++j) {
-                    const bam_pileup1_t *p = plp[i] + j;
-                    int c = p->qpos < p->b->core.l_qseq
-                        ? bam_get_qual(p->b)[p->qpos]
-                        : 0;
-                    if (c >= conf->min_baseQ) {
-                        n++;
-                        if (pileup_seq(pileup_fp, plp[i] + j, pos, ref_len,
-                                       ref, &ks, conf->rev_del,
-                                       conf->no_ins, conf->no_ins_mods,
-                                       conf->no_del, conf->no_ends) < 0) {
-                            ret = 1;
-                            goto fail;
-                        }
-                    }
+                if (ks_seq.l) {
+                    fwrite(ks_seq.s, 1, ks_seq.l, pileup_fp);
+                } else {
+                    putc('*', pileup_fp);
                 }
-                if (!n) putc('*', pileup_fp);
-
-                /* Print base qualities */
-                n = 0;
-                ks_free(&ks);
                 putc('\t', pileup_fp);
-                for (j = 0; j < n_plp[i]; ++j) {
-                    const bam_pileup1_t *p = plp[i] + j;
-                    int c = p->qpos < p->b->core.l_qseq
-                        ? bam_get_qual(p->b)[p->qpos]
-                        : 0;
-                    if (c >= conf->min_baseQ) {
-                        c = c + 33 < 126? c + 33 : 126;
-                        putc(c, pileup_fp);
-                        n++;
-                    }
+
+                if (ks_qual.l) {
+                    fwrite(ks_qual.s, 1, ks_qual.l, pileup_fp);
+                } else {
+                    putc('*', pileup_fp);
                 }
-                if (!n) putc('*', pileup_fp);
 
                 /* Print selected columns */
                 int flag_value = MPLP_PRINT_MAPQ_CHAR;
                 while(flag_value < MPLP_PRINT_LAST) {
                     if (flag_value != MPLP_PRINT_MODS
                         && (conf->flag & flag_value)) {
-                        n = 0;
+                        int n = 0;
                         putc('\t', pileup_fp);
                         for (j = 0; j < n_plp[i]; ++j) {
                             const bam_pileup1_t *p = &plp[i][j];
@@ -729,6 +743,9 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx)
                             case MPLP_PRINT_PNEXT:
                                 fprintf(pileup_fp, "%"PRId64, (int64_t) p->b->core.mpos + 1);
                                 break;
+                            case MPLP_PRINT_RLEN:
+                                fprintf(pileup_fp, "%d", p->b->core.l_qseq);
+                                break;
                             }
                         }
                         if (!n) putc('*', pileup_fp);
@@ -741,7 +758,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx)
                 if (auxlist_p && auxlist_p->size) {
                     kliter_t(auxlist) *aux;
                     for (aux = kl_begin(auxlist_p); aux != kl_end(auxlist_p); aux = kl_next(aux)) {
-                        n = 0;
+                        int n = 0; // NB shadows outer loop
                         putc('\t', pileup_fp);
                         for (j = 0; j < n_plp[i]; ++j) {
                             const bam_pileup1_t *p = &plp[i][j];
@@ -799,6 +816,10 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx)
         putc('\n', pileup_fp);
     }
 
+    ks_free(&ks_seq);
+    ks_free(&ks_mod);
+    ks_free(&ks_qual);
+
     if (ret < 0) {
         print_error("mpileup", "error reading from input file");
         ret = EXIT_FAILURE;
@@ -811,13 +832,16 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx)
             last_tid = tid0;
             last_pos = beg0-1;
             mplp_get_ref(data[0], tid0, &ref, &ref_len);
+        } else if (last_tid < 0 && !one_seq && conf->all > 1) {
+            last_tid = 0; // --aa on a blank file
         }
-       while (last_tid >= 0 && last_tid < sam_hdr_nref(h)) {
+        while (last_tid >= 0 && last_tid < sam_hdr_nref(h)) {
+            mplp_get_ref(data[0], last_tid, &ref, &ref_len);
             while (++last_pos < sam_hdr_tid2len(h, last_tid)) {
                 if (last_pos >= end0) break;
                 if (conf->bed && bed_overlap(conf->bed, sam_hdr_tid2name(h, last_tid), last_pos, last_pos + 1) == 0)
                     continue;
-                print_empty_pileup(pileup_fp, conf, sam_hdr_tid2name(h, last_tid), last_pos, n, ref, ref_len);
+                print_empty_pileup(pileup_fp, conf, sam_hdr_tid2name(h, last_tid), last_pos, nfn, ref, ref_len);
             }
             last_tid++;
             last_pos = -1;
@@ -834,7 +858,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx)
     free(gplp.plp); free(gplp.n_plp); free(gplp.m_plp);
     bam_mplp_destroy(iter);
     sam_hdr_destroy(h);
-    for (i = 0; i < n; ++i) {
+    for (i = 0; i < nfn; ++i) {
         sam_close(data[i]->fp);
         if (data[i]->iter) hts_itr_destroy(data[i]->iter);
         free(data[i]);
diff --git a/samtools/bam_plcmd.c.pysam.c b/samtools/bam_plcmd.c.pysam.c
index 009867e07..2b6f01821 100644
--- a/samtools/bam_plcmd.c.pysam.c
+++ b/samtools/bam_plcmd.c.pysam.c
@@ -2,7 +2,7 @@
 
 /*  bam_plcmd.c -- mpileup subcommand.
 
-    Copyright (C) 2008-2015, 2019-2021 Genome Research Ltd.
+    Copyright (C) 2008-2015, 2019-2021, 2023-2024 Genome Research Ltd.
     Portions copyright (C) 2009-2012 Broad Institute.
 
     Author: Heng Li <lh3@sanger.ac.uk>
@@ -53,51 +53,43 @@ DEALINGS IN THE SOFTWARE.  */
 #define dummy_free(p)
 KLIST_INIT(auxlist, char *, dummy_free)
 
-static inline int printw(int c, FILE *fp)
-{
-    char buf[16];
-    int l, x;
-    if (c == 0) return fputc('0', fp);
-    for (l = 0, x = c < 0? -c : c; x > 0; x /= 10) buf[l++] = x%10 + '0';
-    if (c < 0) buf[l++] = '-';
-    buf[l] = 0;
-    for (x = 0; x < l/2; ++x) {
-        int y = buf[x]; buf[x] = buf[l-1-x]; buf[l-1-x] = y;
-    }
-    fputs(buf, fp);
-    return 0;
-}
-
-int pileup_seq(FILE *fp, const bam_pileup1_t *p, hts_pos_t pos,
-               hts_pos_t ref_len, const char *ref, kstring_t *ks,
+int pileup_seq(kstring_t *ks_seq, const bam_pileup1_t *p, hts_pos_t pos,
+               hts_pos_t ref_len, const char *ref, kstring_t *ks_mod,
                int rev_del, int no_ins, int no_ins_mods,
                int no_del, int no_ends)
 {
     no_ins_mods |= no_ins;
-    int j;
+    int j, err = 0;
     hts_base_mod_state *m = p->cd.p;
     if (!no_ends && p->is_head) {
-        putc('^', fp);
-        putc(p->b->core.qual > 93? 126 : p->b->core.qual + 33, fp);
+        err |= kputc_('^', ks_seq) < 0;
+        err |= kputc_(p->b->core.qual > 93 ? 126 : p->b->core.qual + 33,
+                      ks_seq) < 0;
     }
     if (!p->is_del) {
+        // See seq_nt16_str in htslib/hts.c
+        const char seq_nt_str_lc[] = ",acmgrsvtwyhkdbn"; // reverse strand
+        const char seq_nt_str_uc[] = ".ACMGRSVTWYHKDBN";
         int c = p->qpos < p->b->core.l_qseq
-            ? seq_nt16_str[bam_seqi(bam_get_seq(p->b), p->qpos)]
-            : 'N';
+            ? bam_seqi(bam_get_seq(p->b), p->qpos)
+            : 15 /*N*/;
         if (ref) {
-            int rb = pos < ref_len? ref[pos] : 'N';
-            if (c == '=' || seq_nt16_table[c] == seq_nt16_table[rb]) c = bam_is_rev(p->b)? ',' : '.';
-            else c = bam_is_rev(p->b)? tolower(c) : toupper(c);
-        } else {
-            if (c == '=') c = bam_is_rev(p->b)? ',' : '.';
-            else c = bam_is_rev(p->b)? tolower(c) : toupper(c);
+            int rb = pos < ref_len
+                ? seq_nt16_table[(uint8_t)(ref[pos])]
+                : 15/*N*/;
+            if (c == rb)
+                c = 0; // "=", which becomes . or ,
         }
-        putc(c, fp);
+        c = bam_is_rev(p->b)
+            ? seq_nt_str_lc[c]
+            : seq_nt_str_uc[c];
+        err |= kputc_(c, ks_seq) < 0;
+
         if (m) {
             int nm;
             hts_base_mod mod[256];
             if ((nm = bam_mods_at_qpos(p->b, p->qpos, m, mod, 256)) > 0) {
-                putc('[', fp);
+                err |= kputc_('[', ks_seq) < 0;
                 int j;
                 for (j = 0; j < nm && j < 256; j++) {
                     char qual[20];
@@ -107,61 +99,75 @@ int pileup_seq(FILE *fp, const bam_pileup1_t *p, hts_pos_t pos,
                         *qual = 0;
                     if (mod[j].modified_base < 0)
                         // ChEBI
-                        fprintf(fp, "%c(%d)%s", "+-"[mod[j].strand],
-                                -mod[j].modified_base, qual);
+                        err |= ksprintf(ks_seq, "%c(%d)%s",
+                                        "+-"[mod[j].strand],
+                                        -mod[j].modified_base, qual) < 0;
                     else
-                        fprintf(fp, "%c%c%s", "+-"[mod[j].strand],
-                                mod[j].modified_base, qual);
+                        err |= ksprintf(ks_seq, "%c%c%s", "+-"[mod[j].strand],
+                                        mod[j].modified_base, qual) < 0;
                 }
-                putc(']', fp);
+                err |= kputc_(']', ks_seq) < 0;
             }
         }
-    } else putc(p->is_refskip? (bam_is_rev(p->b)? '<' : '>') : ((bam_is_rev(p->b) && rev_del) ? '#' : '*'), fp);
+    } else {
+        err |= kputc_(p->is_refskip
+                      ? (bam_is_rev(p->b)? '<' : '>')
+                      : ((bam_is_rev(p->b) && rev_del) ? '#' : '*'),
+                      ks_seq) < 0;
+    }
+
     int del_len = -p->indel;
     if (p->indel > 0) {
         int len = bam_plp_insertion_mod(p, m && !no_ins_mods ? m : NULL,
-                                        ks, &del_len);
+                                        ks_mod, &del_len);
         if (len < 0) {
             print_error("mpileup", "bam_plp_insertion() failed");
             return -1;
         }
         if (no_ins < 2) {
-            putc('+', fp);
-            printw(len, fp);
+            err |= kputc_('+', ks_seq) < 0;
+            err |= kputuw(len, ks_seq) < 0;
         }
         if (!no_ins) {
+            kstring_t *ks = ks_mod;
             if (bam_is_rev(p->b)) {
                 char pad = rev_del ? '#' : '*';
                 int in_mod = 0;
                 for (j = 0; j < ks->l; j++) {
                     if (ks->s[j] == '[') in_mod = 1;
                     else if (ks->s[j] == ']') in_mod = 0;
-                    putc(ks->s[j] != '*'
-                         ? (in_mod ? ks->s[j] : tolower(ks->s[j]))
-                         : pad, fp);
+                    err |= kputc_(ks->s[j] != '*'
+                                  ? (in_mod ? ks->s[j] : tolower(ks->s[j]))
+                                  : pad, ks_seq) < 0;
                 }
             } else {
                 int in_mod = 0;
                 for (j = 0; j < ks->l; j++) {
                     if (ks->s[j] == '[') in_mod = 1;
                     if (ks->s[j] == ']') in_mod = 0;
-                    putc(in_mod ? ks->s[j] : toupper(ks->s[j]), fp);
+                    err |= kputc_(in_mod ? ks->s[j] : toupper(ks->s[j]),
+                                  ks_seq) < 0;
                 }
             }
         }
     }
+
     if (del_len > 0) {
         if (no_del < 2)
-            printw(-del_len, fp);
+            err |= kputw(-del_len, ks_seq) < 0;
         if (!no_del) {
             for (j = 1; j <= del_len; ++j) {
                 int c = (ref && (int)pos+j < ref_len)? ref[pos+j] : 'N';
-                putc(bam_is_rev(p->b)? tolower(c) : toupper(c), fp);
+                err |= kputc_(bam_is_rev(p->b)? tolower(c) : toupper(c),
+                              ks_seq) < 0;
             }
         }
     }
-    if (!no_ends && p->is_tail) putc('$', fp);
-    return 0;
+
+    if (!no_ends && p->is_tail)
+        err |= kputc_('$', ks_seq) < 0;
+
+    return -err;
 }
 
 #include "sample.h"
@@ -177,6 +183,7 @@ int pileup_seq(FILE *fp, const bam_pileup1_t *p, hts_pos_t pos,
 
 #define MPLP_PRINT_MAPQ_CHAR (1<<11)
 #define MPLP_PRINT_QPOS  (1<<12)
+// Start of struct active_cols elements
 #define MPLP_PRINT_QNAME (1<<13)
 #define MPLP_PRINT_FLAG  (1<<14)
 #define MPLP_PRINT_RNAME (1<<15)
@@ -188,10 +195,12 @@ int pileup_seq(FILE *fp, const bam_pileup1_t *p, hts_pos_t pos,
 #define MPLP_PRINT_TLEN  (1<<21)
 #define MPLP_PRINT_SEQ   (1<<22)
 #define MPLP_PRINT_QUAL  (1<<23)
-#define MPLP_PRINT_MODS  (1<<24)
-#define MPLP_PRINT_QPOS5 (1<<25)
+#define MPLP_PRINT_RLEN  (1<<24)
+// Must occur after struct active_cols element list
+#define MPLP_PRINT_MODS  (1<<25)
+#define MPLP_PRINT_QPOS5 (1<<26)
 
-#define MPLP_PRINT_LAST  (1<<26) // terminator for loop
+#define MPLP_PRINT_LAST  (1<<27) // terminator for loop
 
 #define MPLP_MAX_DEPTH 8000
 #define MPLP_MAX_INDEL_DEPTH 250
@@ -243,11 +252,13 @@ static int build_auxlist(mplp_conf_t *conf, char *optstring) {
         int supported;
     };
 
-    const struct active_cols colnames[11] = {
-            {"QNAME", 1}, {"FLAG", 1}, {"RNAME", 1}, {"POS", 1}, {"MAPQ", 1}, {"CIGAR", 0}, {"RNEXT", 1}, {"PNEXT", 1}, {"TLEN", 0}, {"SEQ", 0}, {"QUAL", 0}
+    const struct active_cols colnames[12] = {
+            {"QNAME", 1}, {"FLAG", 1}, {"RNAME", 1}, {"POS", 1}, {"MAPQ", 1},
+            {"CIGAR", 0}, {"RNEXT", 1}, {"PNEXT", 1}, {"TLEN", 0}, {"SEQ", 0},
+            {"QUAL", 0},  {"RLEN", 1},
     };
 
-    int i, f = MPLP_PRINT_QNAME, colno = 11;
+    int i, f = MPLP_PRINT_QNAME, colno = sizeof(colnames)/sizeof(*colnames);
     for (i = 0; i < colno; i++, f <<= 1)
         if (colnames[i].supported)
             khash_str2int_set(colhash, colnames[i].name, f);
@@ -442,7 +453,7 @@ static int mplp_func(void *data, bam1_t *b)
  * @param fn filenames
  * @param fn_idx index filenames
  */
-static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx)
+static int mpileup(mplp_conf_t *conf, int nfn, char **fn, char **fn_idx)
 {
     mplp_aux_t **data;
     int i, tid, *n_plp, tid0 = 0, max_depth;
@@ -460,19 +471,19 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx)
 
     memset(&gplp, 0, sizeof(mplp_pileup_t));
     memset(&buf, 0, sizeof(kstring_t));
-    data = calloc(n, sizeof(mplp_aux_t*));
-    plp = calloc(n, sizeof(bam_pileup1_t*));
-    n_plp = calloc(n, sizeof(int));
+    data = calloc(nfn, sizeof(mplp_aux_t*));
+    plp = calloc(nfn, sizeof(bam_pileup1_t*));
+    n_plp = calloc(nfn, sizeof(int));
     sm = bam_smpl_init();
 
-    if (n == 0) {
+    if (nfn == 0) {
         fprintf(samtools_stderr,"[%s] no input file/data given\n", __func__);
         samtools_exit(EXIT_FAILURE);
     }
 
     // read the header of each file in the list and initialize data
     refs_t *refs = NULL;
-    for (i = 0; i < n; ++i) {
+    for (i = 0; i < nfn; ++i) {
         sam_hdr_t *h_tmp;
         data[i] = calloc(1, sizeof(mplp_aux_t));
         data[i]->fp = sam_open_format(fn[i], "rb", &conf->ga.in);
@@ -542,7 +553,8 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx)
             data[i]->h = h;
         }
     }
-    fprintf(samtools_stderr, "[%s] %d samples in %d input files\n", __func__, sm->n, n);
+    fprintf(samtools_stderr, "[%s] %d samples in %d input files\n",
+            __func__, sm->n, nfn);
 
     pileup_fp = conf->output_fname? fopen(conf->output_fname, "w") : samtools_stdout;
 
@@ -552,7 +564,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx)
     }
 
     // init pileup
-    iter = bam_mplp_init(n, mplp_func, (void**)data);
+    iter = bam_mplp_init(nfn, mplp_func, (void**)data);
     if (conf->flag & MPLP_PRINT_MODS) {
         bam_mplp_constructor(iter, pileup_cd_create);
         bam_mplp_destructor(iter, pileup_cd_destroy);
@@ -563,7 +575,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx)
         fprintf(samtools_stderr, "[%s] Max depth set to maximum value (%d)\n", __func__, INT_MAX);
     } else {
         max_depth = conf->max_depth;
-        if ( max_depth * n > 1<<20 )
+        if ( max_depth * nfn > 1<<20 )
             fprintf(samtools_stderr, "[%s] Combined max depth is above 1M. Potential memory hog!\n", __func__);
     }
 
@@ -572,12 +584,15 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx)
     int ret;
     int last_tid = -1;
     hts_pos_t last_pos = -1;
+    int one_seq = 0;
 
     // begin pileup
+    kstring_t ks_seq = KS_INITIALIZE;
+    kstring_t ks_mod = KS_INITIALIZE;
+    kstring_t ks_qual = KS_INITIALIZE;
     while ( (ret=bam_mplp64_auto(iter, &tid, &pos, n_plp, plp)) > 0) {
+        one_seq = 1; // at least 1 output
         if (conf->reg && (pos < beg0 || pos >= end0)) continue; // out of the region requested
-        mplp_get_ref(data[0], tid, &ref, &ref_len);
-        //printf("tid=%d len=%d ref=%p/%s\n", tid, ref_len, ref, ref);
         if (conf->all) {
             // Deal with missing portions of previous tids
             while (tid > last_tid) {
@@ -585,22 +600,27 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx)
                     while (++last_pos < sam_hdr_tid2len(h, last_tid)) {
                         if (conf->bed && bed_overlap(conf->bed, sam_hdr_tid2name(h, last_tid), last_pos, last_pos + 1) == 0)
                             continue;
-                        print_empty_pileup(pileup_fp, conf, sam_hdr_tid2name(h, last_tid), last_pos, n, ref, ref_len);
+                        print_empty_pileup(pileup_fp, conf, sam_hdr_tid2name(h, last_tid), last_pos, nfn, ref, ref_len);
                     }
                 }
                 last_tid++;
                 last_pos = -1;
                 if (conf->all < 2)
                     break;
+                if (tid > last_tid)
+                    // multiple missing references and -aa used
+                    mplp_get_ref(data[0], last_tid, &ref, &ref_len);
             }
         }
+        mplp_get_ref(data[0], tid, &ref, &ref_len);
+
         if (conf->all) {
             // Deal with missing portion of current tid
             while (++last_pos < pos) {
                 if (conf->reg && last_pos < beg0) continue; // out of range; skip
                 if (conf->bed && bed_overlap(conf->bed, sam_hdr_tid2name(h, tid), last_pos, last_pos + 1) == 0)
                     continue;
-                print_empty_pileup(pileup_fp, conf, sam_hdr_tid2name(h, tid), last_pos, n, ref, ref_len);
+                print_empty_pileup(pileup_fp, conf, sam_hdr_tid2name(h, tid), last_pos, nfn, ref, ref_len);
             }
             last_tid = tid;
             last_pos = pos;
@@ -608,16 +628,34 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx)
         if (conf->bed && tid >= 0 && !bed_overlap(conf->bed, sam_hdr_tid2name(h, tid), pos, pos+1)) continue;
 
         fprintf(pileup_fp, "%s\t%"PRIhts_pos"\t%c", sam_hdr_tid2name(h, tid), pos + 1, (ref && pos < ref_len)? ref[pos] : 'N');
-        for (i = 0; i < n; ++i) {
-            int j, cnt;
+        for (i = 0; i < nfn; ++i) {
+            int j, cnt, err = 0;
+            ks_clear(&ks_seq);
+            ks_clear(&ks_qual);
+            ks_clear(&ks_mod);
             for (j = cnt = 0; j < n_plp[i]; ++j) {
                 const bam_pileup1_t *p = plp[i] + j;
                 int c = p->qpos < p->b->core.l_qseq
                     ? bam_get_qual(p->b)[p->qpos]
                     : 0;
-                if (c >= conf->min_baseQ) ++cnt;
+                if (c >= conf->min_baseQ) {
+                    // Build up seq
+                    err |= pileup_seq(&ks_seq, plp[i] + j, pos, ref_len,
+                                      ref, &ks_mod, conf->rev_del,
+                                      conf->no_ins, conf->no_ins_mods,
+                                      conf->no_del, conf->no_ends) < 0;
+
+                    // Build up qual
+                    err |= kputc_(c+33 < 126 ? c+33 : 126, &ks_qual) < 0;
+                    cnt++;
+                }
+            }
+            if (err) {
+                ret = 1;
+                goto fail;
             }
             fprintf(pileup_fp, "\t%d\t", cnt);
+
             if (n_plp[i] == 0) {
                 fputs("*\t*", pileup_fp);
                 int flag_value = MPLP_PRINT_MAPQ_CHAR;
@@ -633,49 +671,25 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx)
                         fputs("\t*", pileup_fp);
                 }
             } else {
-                int n = 0;
-                kstring_t ks = KS_INITIALIZE;
-                for (j = 0; j < n_plp[i]; ++j) {
-                    const bam_pileup1_t *p = plp[i] + j;
-                    int c = p->qpos < p->b->core.l_qseq
-                        ? bam_get_qual(p->b)[p->qpos]
-                        : 0;
-                    if (c >= conf->min_baseQ) {
-                        n++;
-                        if (pileup_seq(pileup_fp, plp[i] + j, pos, ref_len,
-                                       ref, &ks, conf->rev_del,
-                                       conf->no_ins, conf->no_ins_mods,
-                                       conf->no_del, conf->no_ends) < 0) {
-                            ret = 1;
-                            goto fail;
-                        }
-                    }
+                if (ks_seq.l) {
+                    fwrite(ks_seq.s, 1, ks_seq.l, pileup_fp);
+                } else {
+                    putc('*', pileup_fp);
                 }
-                if (!n) putc('*', pileup_fp);
-
-                /* Print base qualities */
-                n = 0;
-                ks_free(&ks);
                 putc('\t', pileup_fp);
-                for (j = 0; j < n_plp[i]; ++j) {
-                    const bam_pileup1_t *p = plp[i] + j;
-                    int c = p->qpos < p->b->core.l_qseq
-                        ? bam_get_qual(p->b)[p->qpos]
-                        : 0;
-                    if (c >= conf->min_baseQ) {
-                        c = c + 33 < 126? c + 33 : 126;
-                        putc(c, pileup_fp);
-                        n++;
-                    }
+
+                if (ks_qual.l) {
+                    fwrite(ks_qual.s, 1, ks_qual.l, pileup_fp);
+                } else {
+                    putc('*', pileup_fp);
                 }
-                if (!n) putc('*', pileup_fp);
 
                 /* Print selected columns */
                 int flag_value = MPLP_PRINT_MAPQ_CHAR;
                 while(flag_value < MPLP_PRINT_LAST) {
                     if (flag_value != MPLP_PRINT_MODS
                         && (conf->flag & flag_value)) {
-                        n = 0;
+                        int n = 0;
                         putc('\t', pileup_fp);
                         for (j = 0; j < n_plp[i]; ++j) {
                             const bam_pileup1_t *p = &plp[i][j];
@@ -731,6 +745,9 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx)
                             case MPLP_PRINT_PNEXT:
                                 fprintf(pileup_fp, "%"PRId64, (int64_t) p->b->core.mpos + 1);
                                 break;
+                            case MPLP_PRINT_RLEN:
+                                fprintf(pileup_fp, "%d", p->b->core.l_qseq);
+                                break;
                             }
                         }
                         if (!n) putc('*', pileup_fp);
@@ -743,7 +760,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx)
                 if (auxlist_p && auxlist_p->size) {
                     kliter_t(auxlist) *aux;
                     for (aux = kl_begin(auxlist_p); aux != kl_end(auxlist_p); aux = kl_next(aux)) {
-                        n = 0;
+                        int n = 0; // NB shadows outer loop
                         putc('\t', pileup_fp);
                         for (j = 0; j < n_plp[i]; ++j) {
                             const bam_pileup1_t *p = &plp[i][j];
@@ -801,6 +818,10 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx)
         putc('\n', pileup_fp);
     }
 
+    ks_free(&ks_seq);
+    ks_free(&ks_mod);
+    ks_free(&ks_qual);
+
     if (ret < 0) {
         print_error("mpileup", "error reading from input file");
         ret = EXIT_FAILURE;
@@ -813,13 +834,16 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx)
             last_tid = tid0;
             last_pos = beg0-1;
             mplp_get_ref(data[0], tid0, &ref, &ref_len);
+        } else if (last_tid < 0 && !one_seq && conf->all > 1) {
+            last_tid = 0; // --aa on a blank file
         }
-       while (last_tid >= 0 && last_tid < sam_hdr_nref(h)) {
+        while (last_tid >= 0 && last_tid < sam_hdr_nref(h)) {
+            mplp_get_ref(data[0], last_tid, &ref, &ref_len);
             while (++last_pos < sam_hdr_tid2len(h, last_tid)) {
                 if (last_pos >= end0) break;
                 if (conf->bed && bed_overlap(conf->bed, sam_hdr_tid2name(h, last_tid), last_pos, last_pos + 1) == 0)
                     continue;
-                print_empty_pileup(pileup_fp, conf, sam_hdr_tid2name(h, last_tid), last_pos, n, ref, ref_len);
+                print_empty_pileup(pileup_fp, conf, sam_hdr_tid2name(h, last_tid), last_pos, nfn, ref, ref_len);
             }
             last_tid++;
             last_pos = -1;
@@ -836,7 +860,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx)
     free(gplp.plp); free(gplp.n_plp); free(gplp.m_plp);
     bam_mplp_destroy(iter);
     sam_hdr_destroy(h);
-    for (i = 0; i < n; ++i) {
+    for (i = 0; i < nfn; ++i) {
         sam_close(data[i]->fp);
         if (data[i]->iter) hts_itr_destroy(data[i]->iter);
         free(data[i]);
diff --git a/samtools/bam_sort.c b/samtools/bam_sort.c
index b44bd665d..2ff6f7e95 100644
--- a/samtools/bam_sort.c
+++ b/samtools/bam_sort.c
@@ -1,6 +1,6 @@
 /*  bam_sort.c -- sorting and merging.
 
-    Copyright (C) 2008-2023 Genome Research Ltd.
+    Copyright (C) 2008-2024 Genome Research Ltd.
     Portions copyright (C) 2009-2012 Broad Institute.
 
     Author: Heng Li <lh3@sanger.ac.uk>
@@ -164,11 +164,15 @@ static template_coordinate_key_t* template_coordinate_key(bam1_t *b, template_co
 
 typedef enum {Coordinate, QueryName, TagCoordinate, TagQueryName, MinHash, TemplateCoordinate} SamOrder;
 static SamOrder g_sam_order = Coordinate;
+static int natural_sort = 1; // not ASCII, but alphanumeric: a12b > a7b
 static char g_sort_tag[2] = {0,0};
 
 #define is_digit(c) ((c)<='9' && (c)>='0')
 static int strnum_cmp(const char *_a, const char *_b)
 {
+    if (!natural_sort)
+        return strcmp(_a,_b);
+
     const unsigned char *a = (const unsigned char*)_a, *b = (const unsigned char*)_b;
     const unsigned char *pa = a, *pb = b;
     while (*pa && *pb) {
@@ -236,8 +240,12 @@ static inline int heap_lt(const heap1_t a, const heap1_t b)
         case QueryName:
             t = strnum_cmp(bam_get_qname(a.entry.bam_record), bam_get_qname(b.entry.bam_record));
             if (t != 0) return t > 0;
-            fa = a.entry.bam_record->core.flag & 0xc0;
-            fb = b.entry.bam_record->core.flag & 0xc0;
+            fa = a.entry.bam_record->core.flag;
+            fb = b.entry.bam_record->core.flag;
+            // Sort order is READ1, READ2, (PRIMARY), SUPPLEMENTARY, SECONDARY
+            // Get the bits in this order so sort is a natural a-b
+            fa = ((fa&0xc0)<<8)|((fa&0x100)<<3)|((fa&0x800)>>3);
+            fb = ((fb&0xc0)<<8)|((fb&0x100)<<3)|((fb&0x800)>>3);
             if (fa != fb) return fa > fb;
             break;
         case TagQueryName:
@@ -258,7 +266,7 @@ static inline int heap_lt(const heap1_t a, const heap1_t b)
             break;
     }
 
-    // This compares by position in the input file(s)
+    // This compares by position (i/idx'th read) in the input file(s)
     if (a.i != b.i) return a.i > b.i;
     return a.idx > b.idx;
 }
@@ -320,7 +328,7 @@ static void trans_tbl_destroy(trans_tbl_t *tbl) {
  *  Create a merged_header_t struct.
  */
 
-static merged_header_t * init_merged_header() {
+static merged_header_t * init_merged_header(void) {
     merged_header_t *merged_hdr;
 
     merged_hdr = calloc(1, sizeof(*merged_hdr));
@@ -397,7 +405,7 @@ static int gen_unique_id(char *prefix, khash_t(cset) *existing_ids,
 
     do {
         dest->l = 0;
-        ksprintf(dest, "%s-%0lX", prefix, lrand48());
+        ksprintf(dest, "%s-%08lX", prefix, lrand48());
         iter = kh_get(cset, existing_ids, ks_str(dest));
     } while (iter != kh_end(existing_ids));
 
@@ -1320,7 +1328,7 @@ int bam_merge_core2(SamOrder sam_order, char* sort_tag, const char *out, const c
     // Make sure that there's enough memory for template coordinate keys, one per file to read
     if (sam_order == TemplateCoordinate) {
         if ((keys = malloc(sizeof(template_coordinate_keys_t))) == NULL) {
-            print_error("sort", "could not allocate memory for the top-level keys");
+            print_error(cmd, "could not allocate memory for the top-level keys");
             goto mem_fail;
         }
         keys->n = 0;
@@ -1356,8 +1364,8 @@ int bam_merge_core2(SamOrder sam_order, char* sort_tag, const char *out, const c
                 h->entry.u.tag = bam_aux_get(h->entry.bam_record, g_sort_tag);
             } else if (g_sam_order == TemplateCoordinate) {
                 template_coordinate_key_t *key = template_coordinate_keys_get(keys, i); // get the next key to use
-                h->entry.u.key = template_coordinate_key(heap->entry.bam_record, key, hout, lib_lookup); // update the key
-                if (heap->entry.u.key == NULL) goto mem_fail; // key could not be created, error out
+                h->entry.u.key = template_coordinate_key(h->entry.bam_record, key, hout, lib_lookup); // update the key
+                if (h->entry.u.key == NULL) goto fail; // key could not be created, error out
             } else {
                 h->entry.u.tag = NULL;
             }
@@ -1431,7 +1439,7 @@ int bam_merge_core2(SamOrder sam_order, char* sort_tag, const char *out, const c
             } else if (g_sam_order == TemplateCoordinate) {
                 template_coordinate_key_t *key = template_coordinate_keys_get(keys, heap->i); // get the next key to use
                 heap->entry.u.key = template_coordinate_key(heap->entry.bam_record, key, hout, lib_lookup); // update the key
-                if (heap->entry.u.key == NULL) goto mem_fail; // key could not be created, error out
+                if (heap->entry.u.key == NULL) goto fail; // key could not be created, error out
             } else {
                 heap->entry.u.tag = NULL;
             }
@@ -1473,9 +1481,17 @@ int bam_merge_core2(SamOrder sam_order, char* sort_tag, const char *out, const c
     bed_destroy(hreg);
     free(RG); free(translation_tbl); free(fp); free(heap); free(iter); free(hdr);
     if (sam_close(fpout) < 0) {
-        print_error(cmd, "error closing output file");
+        print_error_errno(cmd, "error closing output file \"%s\"", out);
         return -1;
     }
+    if (keys != NULL) {
+        for (i = 0; i < keys->m; ++i) {
+            free(keys->buffers[i]);
+        }
+        free(keys->buffers);
+        free(keys);
+    }
+    lib_lookup_destroy(lib_lookup);
     return 0;
 
  mem_fail:
@@ -1535,7 +1551,8 @@ static void merge_usage(FILE *to)
 "   or: samtools merge [options] <out.bam> <in1.bam> ... <inN.bam>\n"
 "\n"
 "Options:\n"
-"  -n         Input files are sorted by read name\n"
+"  -n         Input files are sorted by read name (natural)\n"
+"  -N         Input files are sorted by read name (ASCII)\n"
 "  -t TAG     Input files are sorted by TAG value\n"
 "  -r         Attach RG tag (inferred from file names)\n"
 "  -u         Uncompressed BAM output\n"
@@ -1581,11 +1598,12 @@ int bam_merge(int argc, char *argv[])
         return 0;
     }
 
-    while ((c = getopt_long(argc, argv, "h:nru1R:o:f@:l:cps:b:O:t:XL:", lopts, NULL)) >= 0) {
+    while ((c = getopt_long(argc, argv, "h:nNru1R:o:f@:l:cps:b:O:t:XL:", lopts, NULL)) >= 0) {
         switch (c) {
         case 'r': flag |= MERGE_RG; break;
         case 'f': flag |= MERGE_FORCE; break;
         case 'h': fn_headers = optarg; break;
+        case 'N': natural_sort = 0; // fall through
         case 'n': sam_order = QueryName; break;
         case 'o': fnout = optarg; break;
         case 't': sort_tag = optarg; break;
@@ -1924,7 +1942,7 @@ static int bam_merge_simple(SamOrder sam_order, char *sort_tag, const char *out,
     }
 
     if (sam_close(fpout) < 0) {
-        print_error(cmd, "error closing output file");
+        print_error_errno(cmd, "error closing output file \"%s\"", out);
         return -1;
     }
     return 0;
@@ -1959,7 +1977,13 @@ static inline int bam1_cmp_core(const bam1_tag a, const bam1_tag b)
     if (g_sam_order == QueryName || g_sam_order == TagQueryName) {
         int t = strnum_cmp(bam_get_qname(a.bam_record), bam_get_qname(b.bam_record));
         if (t != 0) return t;
-        return (int) (a.bam_record->core.flag&0xc0) - (int) (b.bam_record->core.flag&0xc0);
+        int af = a.bam_record->core.flag;
+        int bf = b.bam_record->core.flag;
+        // Sort order is READ1, READ2, (PRIMARY), SUPPLEMENTARY, SECONDARY
+        // Get the bits in this order so sort is a natural a-b
+        af = ((af&0xc0)<<8)|((af&0x100)<<3)|((af&0x800)>>3);
+        bf = ((bf&0xc0)<<8)|((bf&0x100)<<3)|((bf&0x800)>>3);
+        return af - bf;
     } else {
         pa = a.bam_record->core.tid;
         pb = b.bam_record->core.tid;
@@ -3055,7 +3079,7 @@ static void *worker(void *data)
             break;
         case MinHash:
             worker_minhash(w);
-            // no break, go to merge sort
+            // fall-through
         default:
             ks_mergesort(sort, w->buf_len, w->buf, 0);
     }
@@ -3287,6 +3311,9 @@ int bam_sort_core_ext(SamOrder sam_order, char* sort_tag, int minimiser_kmer,
             break;
         case QueryName:
             new_so = "queryname";
+            new_ss = natural_sort
+                ? "queryname:natural"
+                : "queryname:lexicographical";
             break;
         case MinHash:
             new_so = "coordinate";
@@ -3605,7 +3632,8 @@ static void sort_usage(FILE *fp)
 "  -I FILE    Order minimisers by their position in FILE FASTA\n"
 "  -w INT     Window size for minimiser indexing via -I ref.fa [100]\n"
 "  -H         Squash homopolymers when computing minimiser\n"
-"  -n         Sort by read name (not compatible with samtools index command)\n"
+"  -n         Sort by read name (natural): cannot be used with samtools index\n"
+"  -N         Sort by read name (ASCII): cannot be used with samtools index\n"
 "  -t TAG     Sort by value of TAG. Uses position as secondary index (or read name if -n is set)\n"
 "  -o FILE    Write final output to FILE rather than standard output\n"
 "  -T PREFIX  Write temporary files to PREFIX.nnnn.bam\n"
@@ -3658,9 +3686,10 @@ int bam_sort(int argc, char *argv[])
         { NULL, 0, NULL, 0 }
     };
 
-    while ((c = getopt_long(argc, argv, "l:m:no:O:T:@:t:MI:K:uRw:H", lopts, NULL)) >= 0) {
+    while ((c = getopt_long(argc, argv, "l:m:nNo:O:T:@:t:MI:K:uRw:H", lopts, NULL)) >= 0) {
         switch (c) {
         case 'o': fnout = optarg; o_seen = 1; break;
+        case 'N': natural_sort = 0; // fall through
         case 'n': sam_order = QueryName; break;
         case 't': by_tag = true; sort_tag = optarg; break;
         case 'm': {
@@ -3732,7 +3761,7 @@ int bam_sort(int argc, char *argv[])
         goto sort_end;
     }
 
-    if (ga.write_index && (sam_order == QueryName || sam_order == TagQueryName || sam_order == TagCoordinate || sam_order == TemplateCoordinate)) {
+    if (ga.write_index && sam_order != Coordinate) {
         fprintf(stderr, "[W::bam_sort] Ignoring --write-index as it only works for position sorted files.\n");
         ga.write_index = 0;
     }
diff --git a/samtools/bam_sort.c.pysam.c b/samtools/bam_sort.c.pysam.c
index 80aa4d807..3aa3a49af 100644
--- a/samtools/bam_sort.c.pysam.c
+++ b/samtools/bam_sort.c.pysam.c
@@ -2,7 +2,7 @@
 
 /*  bam_sort.c -- sorting and merging.
 
-    Copyright (C) 2008-2023 Genome Research Ltd.
+    Copyright (C) 2008-2024 Genome Research Ltd.
     Portions copyright (C) 2009-2012 Broad Institute.
 
     Author: Heng Li <lh3@sanger.ac.uk>
@@ -166,11 +166,15 @@ static template_coordinate_key_t* template_coordinate_key(bam1_t *b, template_co
 
 typedef enum {Coordinate, QueryName, TagCoordinate, TagQueryName, MinHash, TemplateCoordinate} SamOrder;
 static SamOrder g_sam_order = Coordinate;
+static int natural_sort = 1; // not ASCII, but alphanumeric: a12b > a7b
 static char g_sort_tag[2] = {0,0};
 
 #define is_digit(c) ((c)<='9' && (c)>='0')
 static int strnum_cmp(const char *_a, const char *_b)
 {
+    if (!natural_sort)
+        return strcmp(_a,_b);
+
     const unsigned char *a = (const unsigned char*)_a, *b = (const unsigned char*)_b;
     const unsigned char *pa = a, *pb = b;
     while (*pa && *pb) {
@@ -238,8 +242,12 @@ static inline int heap_lt(const heap1_t a, const heap1_t b)
         case QueryName:
             t = strnum_cmp(bam_get_qname(a.entry.bam_record), bam_get_qname(b.entry.bam_record));
             if (t != 0) return t > 0;
-            fa = a.entry.bam_record->core.flag & 0xc0;
-            fb = b.entry.bam_record->core.flag & 0xc0;
+            fa = a.entry.bam_record->core.flag;
+            fb = b.entry.bam_record->core.flag;
+            // Sort order is READ1, READ2, (PRIMARY), SUPPLEMENTARY, SECONDARY
+            // Get the bits in this order so sort is a natural a-b
+            fa = ((fa&0xc0)<<8)|((fa&0x100)<<3)|((fa&0x800)>>3);
+            fb = ((fb&0xc0)<<8)|((fb&0x100)<<3)|((fb&0x800)>>3);
             if (fa != fb) return fa > fb;
             break;
         case TagQueryName:
@@ -260,7 +268,7 @@ static inline int heap_lt(const heap1_t a, const heap1_t b)
             break;
     }
 
-    // This compares by position in the input file(s)
+    // This compares by position (i/idx'th read) in the input file(s)
     if (a.i != b.i) return a.i > b.i;
     return a.idx > b.idx;
 }
@@ -322,7 +330,7 @@ static void trans_tbl_destroy(trans_tbl_t *tbl) {
  *  Create a merged_header_t struct.
  */
 
-static merged_header_t * init_merged_header() {
+static merged_header_t * init_merged_header(void) {
     merged_header_t *merged_hdr;
 
     merged_hdr = calloc(1, sizeof(*merged_hdr));
@@ -399,7 +407,7 @@ static int gen_unique_id(char *prefix, khash_t(cset) *existing_ids,
 
     do {
         dest->l = 0;
-        ksprintf(dest, "%s-%0lX", prefix, lrand48());
+        ksprintf(dest, "%s-%08lX", prefix, lrand48());
         iter = kh_get(cset, existing_ids, ks_str(dest));
     } while (iter != kh_end(existing_ids));
 
@@ -1322,7 +1330,7 @@ int bam_merge_core2(SamOrder sam_order, char* sort_tag, const char *out, const c
     // Make sure that there's enough memory for template coordinate keys, one per file to read
     if (sam_order == TemplateCoordinate) {
         if ((keys = malloc(sizeof(template_coordinate_keys_t))) == NULL) {
-            print_error("sort", "could not allocate memory for the top-level keys");
+            print_error(cmd, "could not allocate memory for the top-level keys");
             goto mem_fail;
         }
         keys->n = 0;
@@ -1358,8 +1366,8 @@ int bam_merge_core2(SamOrder sam_order, char* sort_tag, const char *out, const c
                 h->entry.u.tag = bam_aux_get(h->entry.bam_record, g_sort_tag);
             } else if (g_sam_order == TemplateCoordinate) {
                 template_coordinate_key_t *key = template_coordinate_keys_get(keys, i); // get the next key to use
-                h->entry.u.key = template_coordinate_key(heap->entry.bam_record, key, hout, lib_lookup); // update the key
-                if (heap->entry.u.key == NULL) goto mem_fail; // key could not be created, error out
+                h->entry.u.key = template_coordinate_key(h->entry.bam_record, key, hout, lib_lookup); // update the key
+                if (h->entry.u.key == NULL) goto fail; // key could not be created, error out
             } else {
                 h->entry.u.tag = NULL;
             }
@@ -1433,7 +1441,7 @@ int bam_merge_core2(SamOrder sam_order, char* sort_tag, const char *out, const c
             } else if (g_sam_order == TemplateCoordinate) {
                 template_coordinate_key_t *key = template_coordinate_keys_get(keys, heap->i); // get the next key to use
                 heap->entry.u.key = template_coordinate_key(heap->entry.bam_record, key, hout, lib_lookup); // update the key
-                if (heap->entry.u.key == NULL) goto mem_fail; // key could not be created, error out
+                if (heap->entry.u.key == NULL) goto fail; // key could not be created, error out
             } else {
                 heap->entry.u.tag = NULL;
             }
@@ -1475,9 +1483,17 @@ int bam_merge_core2(SamOrder sam_order, char* sort_tag, const char *out, const c
     bed_destroy(hreg);
     free(RG); free(translation_tbl); free(fp); free(heap); free(iter); free(hdr);
     if (sam_close(fpout) < 0) {
-        print_error(cmd, "error closing output file");
+        print_error_errno(cmd, "error closing output file \"%s\"", out);
         return -1;
     }
+    if (keys != NULL) {
+        for (i = 0; i < keys->m; ++i) {
+            free(keys->buffers[i]);
+        }
+        free(keys->buffers);
+        free(keys);
+    }
+    lib_lookup_destroy(lib_lookup);
     return 0;
 
  mem_fail:
@@ -1537,7 +1553,8 @@ static void merge_usage(FILE *to)
 "   or: samtools merge [options] <out.bam> <in1.bam> ... <inN.bam>\n"
 "\n"
 "Options:\n"
-"  -n         Input files are sorted by read name\n"
+"  -n         Input files are sorted by read name (natural)\n"
+"  -N         Input files are sorted by read name (ASCII)\n"
 "  -t TAG     Input files are sorted by TAG value\n"
 "  -r         Attach RG tag (inferred from file names)\n"
 "  -u         Uncompressed BAM output\n"
@@ -1583,11 +1600,12 @@ int bam_merge(int argc, char *argv[])
         return 0;
     }
 
-    while ((c = getopt_long(argc, argv, "h:nru1R:o:f@:l:cps:b:O:t:XL:", lopts, NULL)) >= 0) {
+    while ((c = getopt_long(argc, argv, "h:nNru1R:o:f@:l:cps:b:O:t:XL:", lopts, NULL)) >= 0) {
         switch (c) {
         case 'r': flag |= MERGE_RG; break;
         case 'f': flag |= MERGE_FORCE; break;
         case 'h': fn_headers = optarg; break;
+        case 'N': natural_sort = 0; // fall through
         case 'n': sam_order = QueryName; break;
         case 'o': fnout = optarg; break;
         case 't': sort_tag = optarg; break;
@@ -1926,7 +1944,7 @@ static int bam_merge_simple(SamOrder sam_order, char *sort_tag, const char *out,
     }
 
     if (sam_close(fpout) < 0) {
-        print_error(cmd, "error closing output file");
+        print_error_errno(cmd, "error closing output file \"%s\"", out);
         return -1;
     }
     return 0;
@@ -1961,7 +1979,13 @@ static inline int bam1_cmp_core(const bam1_tag a, const bam1_tag b)
     if (g_sam_order == QueryName || g_sam_order == TagQueryName) {
         int t = strnum_cmp(bam_get_qname(a.bam_record), bam_get_qname(b.bam_record));
         if (t != 0) return t;
-        return (int) (a.bam_record->core.flag&0xc0) - (int) (b.bam_record->core.flag&0xc0);
+        int af = a.bam_record->core.flag;
+        int bf = b.bam_record->core.flag;
+        // Sort order is READ1, READ2, (PRIMARY), SUPPLEMENTARY, SECONDARY
+        // Get the bits in this order so sort is a natural a-b
+        af = ((af&0xc0)<<8)|((af&0x100)<<3)|((af&0x800)>>3);
+        bf = ((bf&0xc0)<<8)|((bf&0x100)<<3)|((bf&0x800)>>3);
+        return af - bf;
     } else {
         pa = a.bam_record->core.tid;
         pb = b.bam_record->core.tid;
@@ -3057,7 +3081,7 @@ static void *worker(void *data)
             break;
         case MinHash:
             worker_minhash(w);
-            // no break, go to merge sort
+            // fall-through
         default:
             ks_mergesort(sort, w->buf_len, w->buf, 0);
     }
@@ -3289,6 +3313,9 @@ int bam_sort_core_ext(SamOrder sam_order, char* sort_tag, int minimiser_kmer,
             break;
         case QueryName:
             new_so = "queryname";
+            new_ss = natural_sort
+                ? "queryname:natural"
+                : "queryname:lexicographical";
             break;
         case MinHash:
             new_so = "coordinate";
@@ -3607,7 +3634,8 @@ static void sort_usage(FILE *fp)
 "  -I FILE    Order minimisers by their position in FILE FASTA\n"
 "  -w INT     Window size for minimiser indexing via -I ref.fa [100]\n"
 "  -H         Squash homopolymers when computing minimiser\n"
-"  -n         Sort by read name (not compatible with samtools index command)\n"
+"  -n         Sort by read name (natural): cannot be used with samtools index\n"
+"  -N         Sort by read name (ASCII): cannot be used with samtools index\n"
 "  -t TAG     Sort by value of TAG. Uses position as secondary index (or read name if -n is set)\n"
 "  -o FILE    Write final output to FILE rather than standard output\n"
 "  -T PREFIX  Write temporary files to PREFIX.nnnn.bam\n"
@@ -3660,9 +3688,10 @@ int bam_sort(int argc, char *argv[])
         { NULL, 0, NULL, 0 }
     };
 
-    while ((c = getopt_long(argc, argv, "l:m:no:O:T:@:t:MI:K:uRw:H", lopts, NULL)) >= 0) {
+    while ((c = getopt_long(argc, argv, "l:m:nNo:O:T:@:t:MI:K:uRw:H", lopts, NULL)) >= 0) {
         switch (c) {
         case 'o': fnout = optarg; o_seen = 1; break;
+        case 'N': natural_sort = 0; // fall through
         case 'n': sam_order = QueryName; break;
         case 't': by_tag = true; sort_tag = optarg; break;
         case 'm': {
@@ -3734,7 +3763,7 @@ int bam_sort(int argc, char *argv[])
         goto sort_end;
     }
 
-    if (ga.write_index && (sam_order == QueryName || sam_order == TagQueryName || sam_order == TagCoordinate || sam_order == TemplateCoordinate)) {
+    if (ga.write_index && sam_order != Coordinate) {
         fprintf(samtools_stderr, "[W::bam_sort] Ignoring --write-index as it only works for position sorted files.\n");
         ga.write_index = 0;
     }
diff --git a/samtools/bam_split.c b/samtools/bam_split.c
index e9f0fb591..a12bf7690 100644
--- a/samtools/bam_split.c
+++ b/samtools/bam_split.c
@@ -1,6 +1,6 @@
 /*  bam_split.c -- split subcommand.
 
-    Copyright (C) 2013-2016,2018-2019,2023 Genome Research Ltd.
+    Copyright (C) 2013-2016,2018-2019,2023,2024 Genome Research Ltd.
 
     Author: Martin Pollard <mp15@sanger.ac.uk>
 
@@ -48,11 +48,16 @@ struct parsed_opts {
     const char *unaccounted_header_name;
     const char *unaccounted_name;
     const char *output_format_string;
+    const char *tag;
+    long max_split;
     bool verbose;
     int no_pg;
+    int zero_pad;
     sam_global_args ga;
 };
 
+#define DEFAULT_MAX_SPLIT 100
+
 typedef struct parsed_opts parsed_opts_t;
 
 struct state {
@@ -60,13 +65,14 @@ struct state {
     sam_hdr_t* merged_input_header;
     samFile* unaccounted_file;
     sam_hdr_t* unaccounted_header;
+    char *unaccounted_idx_fn;
     size_t output_count;
-    char** rg_id;
-    char **rg_index_file_name;
-    char **rg_output_file_name;
-    samFile** rg_output_file;
-    sam_hdr_t** rg_output_header;
-    kh_c2i_t* rg_hash;
+    char **tag_vals;
+    char **index_file_name;
+    char **output_file_name;
+    samFile **output_file;
+    sam_hdr_t **output_header;
+    kh_c2i_t* tag_val_hash;
     htsThreadPool p;
     int write_index;
 };
@@ -82,19 +88,23 @@ static void usage(FILE *write_to)
 "Usage: samtools split [-u <unaccounted.bam>] [-h <unaccounted_header.sam>]\n"
 "                      [-f <format_string>] [-v] <merged.bam>\n"
 "Options:\n"
-"  -f STRING       output filename format string [\"%%*_%%#.%%.\"]\n"
-"  -u FILE1        put reads with no RG tag or an unrecognised RG tag in FILE1\n"
-"  -h FILE2        ... and override the header with FILE2 (-u file only)\n"
-"  -v              verbose output\n"
-"  --no-PG         do not add a PG line\n");
+"  -f STRING           output filename format string [\"%%*_%%#.%%.\"]\n"
+"  -u FILE1            put left-over reads in FILE1\n"
+"  -h FILE2            ... and override the header with FILE2 (-u file only)\n"
+"  -d TAG              split by TAG value. TAG value must be a string.\n"
+"  -p NUMBER           zero-pad numbers in filenames to NUMBER digits\n"
+"  -M,--max-split NUM  limit number of output files from -d to NUM [%d]\n"
+"  -v                  verbose output\n"
+"  --no-PG             do not add a PG line\n",
+            DEFAULT_MAX_SPLIT);
     sam_global_opt_help(write_to, "-....@..");
     fprintf(write_to,
 "\n"
 "Format string expansions:\n"
 "  %%%%     %%\n"
 "  %%*     basename\n"
-"  %%#     @RG index\n"
-"  %%!     @RG ID\n"
+"  %%#     index (of @RG in the header, or count of TAG values seen so far)\n"
+"  %%!     @RG ID or TAG value\n"
 "  %%.     filename extension for output format\n"
       );
 }
@@ -104,17 +114,21 @@ static parsed_opts_t* parse_args(int argc, char** argv)
 {
     if (argc == 1) { usage(stdout); return NULL; }
 
-    const char *optstring = "vf:h:u:@:";
+    const char *optstring = "vf:h:u:d:M:p:@:";
+    char *default_format_string = "%*_%#.%.";
 
     static const struct option lopts[] = {
         SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0, '@'),
         {"no-PG", no_argument, NULL, 1},
+        {"max-split", required_argument, NULL, 'M'},
+        {"zero-pad", required_argument, NULL, 'p'},
         { NULL, 0, NULL, 0 }
     };
 
     parsed_opts_t* retval = calloc(sizeof(parsed_opts_t), 1);
     if (! retval ) { perror("cannot allocate option parsing memory"); return NULL; }
 
+    retval->max_split = DEFAULT_MAX_SPLIT;
     sam_global_args_init(&retval->ga);
 
     int opt;
@@ -132,9 +146,37 @@ static parsed_opts_t* parse_args(int argc, char** argv)
         case 'u':
             retval->unaccounted_name = optarg;
             break;
+        case 'd':
+            retval->tag = optarg;
+            default_format_string = "%*_%!.%.";
+            break;
+        case 'M': {
+            char *end = optarg;
+            retval->max_split = strtol(optarg, &end, 10);
+            if (*optarg == '\0' || *end != '\0' || retval->max_split == 0) {
+                print_error("split", "Invalid -M argument: \"%s\"", optarg);
+                free(retval);
+                return NULL;
+            }
+            if (retval->max_split < 0) // No limit requested
+                retval->max_split = LONG_MAX;
+            break;
+        }
+        case 'p': {
+            char *end = optarg;
+            unsigned long val = strtoul(optarg, &end, 10);
+            if (*optarg == '\0' || *end != '\0' || val > 20) {
+                print_error("split", "Invalid -p argument: \"%s\"", optarg);
+                free(retval);
+                return NULL;
+            }
+            retval->zero_pad = (int) val;
+            break;
+        }
         case 1:
             retval->no_pg = 1;
             break;
+
         default:
             if (parse_sam_global_opt(opt, optarg, lopts, &retval->ga) == 0) break;
             /* else fall-through */
@@ -145,7 +187,7 @@ static parsed_opts_t* parse_args(int argc, char** argv)
         }
     }
 
-    if (retval->output_format_string == NULL) retval->output_format_string = "%*_%#.%.";
+    if (retval->output_format_string == NULL) retval->output_format_string = default_format_string;
 
     argc -= optind;
     argv += optind;
@@ -163,7 +205,10 @@ static parsed_opts_t* parse_args(int argc, char** argv)
 }
 
 // Expands a output filename format string
-static char* expand_format_string(const char* format_string, const char* basename, const char* rg_id, const int rg_idx, const htsFormat *format)
+static char* expand_format_string(const char* format_string,
+                                  const char* basename, const char* tag_val,
+                                  const int file_idx, const int zero_pad,
+                                  const htsFormat *format)
 {
     kstring_t str = { 0, 0, NULL };
     const char* pointer = format_string;
@@ -179,10 +224,15 @@ static char* expand_format_string(const char* format_string, const char* basenam
                 if (kputs(basename, &str) < 0) goto memfail;
                 break;
             case '#':
-                if (kputl(rg_idx, &str) < 0) goto memfail;
+                if (zero_pad == 0) {
+                    if (kputl(file_idx, &str) < 0) goto memfail;
+                } else {
+                    if (ksprintf(&str, "%0*d", zero_pad, file_idx) < 0)
+                        goto memfail;
+                }
                 break;
             case '!':
-                if (kputs(rg_id, &str) < 0) goto memfail;
+                if (kputs(tag_val, &str) < 0) goto memfail;
                 break;
             case '.':
                 // Only really need to cope with sam, bam, cram
@@ -275,6 +325,157 @@ static int header_compatible(sam_hdr_t *hdr1, sam_hdr_t *hdr2)
     return 0;
 }
 
+static int grow_output_lists(state_t *state, size_t count) {
+    char **new_list = realloc(state->tag_vals, count * sizeof(char *));
+    if (!new_list)
+        return -1;
+    state->tag_vals = new_list;
+    new_list = realloc(state->index_file_name, count * sizeof(char *));
+    if (!new_list)
+        return -1;
+    state->index_file_name = new_list;
+    new_list = realloc(state->output_file_name, count * sizeof(char *));
+    if (!new_list)
+        return -1;
+    state->output_file_name = new_list;
+    samFile **new_file = realloc(state->output_file,
+                                 count * sizeof(samFile *));
+    if (!new_file)
+        return -1;
+    state->output_file = new_file;
+    sam_hdr_t **new_hdr = realloc(state->output_header,
+                                  count * sizeof(sam_hdr_t *));
+    if (!new_hdr)
+        return -1;
+    state->output_header = new_hdr;
+    return 0;
+}
+
+static khiter_t prep_sam_file(parsed_opts_t *opts, state_t *state,
+                              const char *tag, const char *arg_list,
+                              int is_rg) {
+    char *input_base_name = NULL, *new_file_name = NULL, *tag_val = NULL;
+    char *new_idx_fn = NULL;
+    sam_hdr_t *new_hdr = NULL;
+    samFile *new_sam_file = NULL;
+
+    khiter_t i = kh_get_c2i(state->tag_val_hash, tag);
+    if (i != kh_end(state->tag_val_hash)) {
+        return i;
+    }
+    // create new file
+    if (grow_output_lists(state, state->output_count + 1) != 0) {
+        print_error_errno("split", "Couldn't grow output lists");
+        return kh_end(state->tag_val_hash);
+    }
+    tag_val = strdup(tag);
+    if (!tag_val) {
+        print_error_errno("split", "Couldn't copy tag value");
+        return kh_end(state->tag_val_hash);
+    }
+    char *dirsep = strrchr(opts->merged_input_name, '/');
+    input_base_name = strdup(dirsep? dirsep+1 : opts->merged_input_name);
+    if (!input_base_name) {
+        print_error_errno("split", "Filename parsing failed");
+        goto fail;
+    }
+
+    char* extension = strrchr(input_base_name, '.');
+    if (extension) *extension = '\0';
+
+    new_file_name = expand_format_string(opts->output_format_string,
+                                         input_base_name, tag,
+                                         kh_size(state->tag_val_hash),
+                                         opts->zero_pad, &opts->ga.out);
+    if (!new_file_name) {
+        print_error_errno("split", "Filename creation failed");
+        goto fail;
+    }
+
+    new_hdr = sam_hdr_dup(state->merged_input_header);
+    if (!new_hdr) {
+        print_error_errno("split", "Duplicating header for file \"%s\" failed", new_file_name);
+        goto fail;
+    }
+    if (!opts->no_pg && sam_hdr_add_pg(new_hdr, "samtools",
+                                       "VN", samtools_version(),
+                                       arg_list ? "CL": NULL,
+                                       arg_list ? arg_list : NULL,
+                                       NULL)) {
+        print_error_errno("split", "Adding PG line to file \"%s\" failed", new_file_name);
+        goto fail;
+    }
+
+    if (is_rg) {
+        // If here, we've found an RG:Z: tag without a corresponding @RG
+        // line in the header.
+        if (sam_hdr_remove_lines(new_hdr, "RG", "ID", NULL) != 0) {
+            print_error_errno("split",
+                              "Failed to remove @RG lines from file \"%s\"",
+                              new_file_name);
+            goto fail;
+        }
+        if (sam_hdr_add_line(new_hdr, "RG", "ID", tag_val, NULL) != 0) {
+            print_error_errno("split",
+                              "Failed to add @RG line to file \"%s\"",
+                              new_file_name);
+            goto fail;
+        }
+    }
+
+    char outmode[4] = "w";
+    sam_open_mode(outmode + 1, new_file_name, NULL);
+    new_sam_file = sam_open_format(new_file_name, outmode, &opts->ga.out);
+    if (!new_sam_file) {
+        print_error_errno("split", "Opening filename for writing \"%s\" failed", new_file_name);
+        goto fail;
+    }
+    if (state->p.pool)
+        hts_set_opt(new_sam_file, HTS_OPT_THREAD_POOL, &state->p);
+
+    if (sam_hdr_write(new_sam_file, new_hdr) != 0) {
+        print_error_errno("split", "Couldn't write header to \"%s\"",
+                          new_file_name);
+        goto fail;
+    }
+
+    if (state->write_index) {
+        new_idx_fn = auto_index(new_sam_file, new_file_name, new_hdr);
+        if (!new_idx_fn) {
+            print_error_errno("split", "Creating index file for file \"%s\" failed", new_file_name);
+            goto fail;
+        }
+    }
+
+    int ret = -1;
+    i = kh_put_c2i(state->tag_val_hash, tag_val, &ret);
+    if (ret < 0) {
+        print_error_errno("split", "Adding file \"%s\" failed", new_file_name);
+        goto fail;
+    }
+
+    kh_val(state->tag_val_hash, i) = state->output_count;
+    state->tag_vals[state->output_count] = tag_val;
+    state->index_file_name[state->output_count] = new_idx_fn;
+    state->output_file_name[state->output_count] = new_file_name;
+    state->output_file[state->output_count] = new_sam_file;
+    state->output_header[state->output_count] = new_hdr;
+    state->output_count++;
+    free(input_base_name);
+    return i;
+
+ fail:
+    free(input_base_name);
+    free(new_file_name);
+    free(tag_val);
+    free(new_idx_fn);
+    if (new_hdr)
+        sam_hdr_destroy(new_hdr);
+    if (new_sam_file)
+        sam_close(new_sam_file);
+    return kh_end(state->tag_val_hash);
+}
+
 // Set the initial state
 static state_t* init(parsed_opts_t* opts, const char *arg_list)
 {
@@ -306,6 +507,7 @@ static state_t* init(parsed_opts_t* opts, const char *arg_list)
         cleanup_state(retval, false);
         return NULL;
     }
+    retval->write_index = opts->ga.write_index;
 
     if (opts->unaccounted_name) {
         if (opts->unaccounted_header_name) {
@@ -331,10 +533,10 @@ static state_t* init(parsed_opts_t* opts, const char *arg_list)
         } else {
             retval->unaccounted_header = sam_hdr_dup(retval->merged_input_header);
             if (!opts->no_pg && sam_hdr_add_pg(retval->unaccounted_header, "samtools",
-                                    "VN", samtools_version(),
-                                    arg_list ? "CL": NULL,
-                                    arg_list ? arg_list : NULL,
-                                    NULL)) {
+                                               "VN", samtools_version(),
+                                               arg_list ? "CL": NULL,
+                                               arg_list ? arg_list : NULL,
+                                               NULL)) {
                 print_error("split", "Could not rewrite header for \"%s\"", opts->unaccounted_name);
                 cleanup_state(retval, false);
                 return NULL;
@@ -354,23 +556,40 @@ static state_t* init(parsed_opts_t* opts, const char *arg_list)
             hts_set_opt(retval->unaccounted_file, HTS_OPT_THREAD_POOL, &retval->p);
     }
 
-    // Open output files for RGs
-    if (!count_RG(retval->merged_input_header, &retval->output_count, &retval->rg_id)) return NULL;
-    if (opts->verbose) fprintf(stderr, "@RG's found %zu\n",retval->output_count);
+    int is_rg = !opts->tag || strcmp(opts->tag, "RG") == 0;
+    if (is_rg) {
+        if (!count_RG(retval->merged_input_header,
+                      &retval->output_count, &retval->tag_vals)) {
+            cleanup_state(retval, false);
+            return NULL;
+        }
+        if (opts->verbose)
+            fprintf(stderr, "@RG's found %zu\n",retval->output_count);
+    } else {
+        retval->output_count = 0;
+    }
+
     // Prevent calloc(0, size);
     size_t num = retval->output_count ? retval->output_count : 1;
-    retval->rg_index_file_name = (char **)calloc(num, sizeof(char *));
-    retval->rg_output_file_name = (char **)calloc(num, sizeof(char *));
-    retval->rg_output_file = (samFile**)calloc(num, sizeof(samFile*));
-    retval->rg_output_header = (sam_hdr_t**)calloc(num, sizeof(sam_hdr_t*));
-    retval->rg_hash = kh_init_c2i();
-    if (!retval->rg_output_file_name || !retval->rg_output_file || !retval->rg_output_header ||
-        !retval->rg_hash || !retval->rg_index_file_name) {
+    retval->index_file_name = (char **)calloc(num, sizeof(char *));
+    retval->output_file_name = (char **)calloc(num, sizeof(char *));
+    retval->output_file = (samFile**)calloc(num, sizeof(samFile*));
+    retval->output_header = (sam_hdr_t**)calloc(num, sizeof(sam_hdr_t*));
+    retval->tag_val_hash = kh_init_c2i();
+    if (!retval->output_file_name || !retval->output_file || !retval->output_header ||
+        !retval->tag_val_hash || !retval->index_file_name) {
         print_error_errno("split", "Could not initialise output file array");
         cleanup_state(retval, false);
         return NULL;
     }
+    if (!is_rg)
+        return retval;  // Done for this case - outputs will be opened later
 
+    // Adjust max_split if too small for the read-groups listed in the header
+    if (opts->max_split < retval->output_count)
+        opts->max_split = retval->output_count;
+
+    // Open output files for RGs
     char* dirsep = strrchr(opts->merged_input_name, '/');
     char* input_base_name = strdup(dirsep? dirsep+1 : opts->merged_input_name);
     if (!input_base_name) {
@@ -388,8 +607,8 @@ static state_t* init(parsed_opts_t* opts, const char *arg_list)
 
         output_filename = expand_format_string(opts->output_format_string,
                                                input_base_name,
-                                               retval->rg_id[i], i,
-                                               &opts->ga.out);
+                                               retval->tag_vals[i], i,
+                                               opts->zero_pad, &opts->ga.out);
 
         if ( output_filename == NULL ) {
             cleanup_state(retval, false);
@@ -397,40 +616,39 @@ static state_t* init(parsed_opts_t* opts, const char *arg_list)
             return NULL;
         }
 
-        retval->rg_output_file_name[i] = output_filename;
-
+        retval->output_file_name[i] = output_filename;
         sam_open_mode(outmode + 1, output_filename, NULL);
-        retval->rg_output_file[i] = sam_open_format(output_filename, outmode, &opts->ga.out);
+        retval->output_file[i] = sam_open_format(output_filename, outmode, &opts->ga.out);
 
-        if (retval->rg_output_file[i] == NULL) {
+        if (retval->output_file[i] == NULL) {
             print_error_errno("split", "Could not open \"%s\"", output_filename);
             cleanup_state(retval, false);
             free(input_base_name);
             return NULL;
         }
         if (retval->p.pool)
-            hts_set_opt(retval->rg_output_file[i], HTS_OPT_THREAD_POOL, &retval->p);
+            hts_set_opt(retval->output_file[i], HTS_OPT_THREAD_POOL, &retval->p);
 
         // Record index in hash
         int ret;
-        khiter_t iter = kh_put_c2i(retval->rg_hash, retval->rg_id[i], &ret);
+        khiter_t iter = kh_put_c2i(retval->tag_val_hash, retval->tag_vals[i], &ret);
         if (ret < 0) {
             print_error_errno("split", "Couldn't add @RG ID to look-up table");
             cleanup_state(retval, false);
             free(input_base_name);
             return NULL;
         }
-        kh_val(retval->rg_hash,iter) = i;
+        kh_val(retval->tag_val_hash,iter) = i;
 
         // Set and edit header
-        retval->rg_output_header[i] = sam_hdr_dup(retval->merged_input_header);
-        if (sam_hdr_remove_except(retval->rg_output_header[i], "RG", "ID", retval->rg_id[i]) ||
-           (!opts->no_pg &&
-            sam_hdr_add_pg(retval->rg_output_header[i], "samtools",
-                        "VN", samtools_version(),
-                        arg_list ? "CL": NULL,
-                        arg_list ? arg_list : NULL,
-                        NULL))) {
+        retval->output_header[i] = sam_hdr_dup(retval->merged_input_header);
+        if (sam_hdr_remove_except(retval->output_header[i], "RG", "ID", retval->tag_vals[i]) ||
+            (!opts->no_pg &&
+             sam_hdr_add_pg(retval->output_header[i], "samtools",
+                            "VN", samtools_version(),
+                            arg_list ? "CL": NULL,
+                            arg_list ? arg_list : NULL,
+                            NULL))) {
             print_error("split", "Could not rewrite header for \"%s\"", output_filename);
             cleanup_state(retval, false);
             free(input_base_name);
@@ -439,34 +657,30 @@ static state_t* init(parsed_opts_t* opts, const char *arg_list)
     }
 
     free(input_base_name);
-    retval->write_index = opts->ga.write_index;
 
     return retval;
 }
 
-static bool split(state_t* state)
+static bool split(state_t* state, parsed_opts_t *opts, char *arg_list)
 {
-    if (state->unaccounted_file && sam_hdr_write(state->unaccounted_file, state->unaccounted_header) != 0) {
-        print_error_errno("split", "Could not write output file header");
-        return false;
-    }
-    size_t i;
-    for (i = 0; i < state->output_count; i++) {
-        if (sam_hdr_write(state->rg_output_file[i], state->rg_output_header[i]) != 0) {
-            print_error_errno("split", "Could not write file header to \"%s\"", state->rg_output_file_name[i]);
+    int is_rg = !opts->tag || strcmp(opts->tag, "RG") == 0;
+    if (state->unaccounted_file) {
+        if (sam_hdr_write(state->unaccounted_file, state->unaccounted_header) != 0) {
+            print_error_errno("split", "Could not write output file header");
             return false;
         }
-        if (state->write_index) {
-            state->rg_index_file_name[i] = auto_index(state->rg_output_file[i],
-                                                      state->rg_output_file_name[i],
-                                                      state->rg_output_header[i]);
-            if (!state->rg_index_file_name[i]) {
-                print_error_errno("split", "Could not create index for file \"%s\"", state->rg_output_file_name[i]);
+        if (opts->ga.write_index) {
+            state->unaccounted_idx_fn = auto_index(state->unaccounted_file,
+                                                   opts->unaccounted_name,
+                                                   state->unaccounted_header);
+            if (!state->unaccounted_idx_fn) {
+                print_error_errno("split",
+                                  "Creating index file for file \"%s\" failed",
+                                  opts->unaccounted_name);
                 return false;
             }
         }
     }
-
     bam1_t* file_read = bam_init1();
     // Read the first record
     int r;
@@ -480,25 +694,79 @@ static bool split(state_t* state)
         }
     }
 
+    if (is_rg) {
+        size_t i;
+        for (i = 0; i < state->output_count; i++) {
+            if (sam_hdr_write(state->output_file[i], state->output_header[i]) != 0) {
+                print_error_errno("split", "Could not write file header to \"%s\"", state->output_file_name[i]);
+                goto error;
+            }
+            if (state->write_index) {
+                state->index_file_name[i] = auto_index(state->output_file[i],
+                        state->output_file_name[i],
+                        state->output_header[i]);
+                if (!state->index_file_name[i]) {
+                    print_error_errno("split", "Could not create index for file \"%s\"", state->output_file_name[i]);
+                    goto error;
+                }
+            }
+        }
+    }
     while (file_read != NULL) {
         // Get RG tag from read and look it up in hash to find file to output it to
-        uint8_t* tag = bam_aux_get(file_read, "RG");
+        uint8_t* tag = bam_aux_get(file_read, is_rg ? "RG" : opts->tag);
+        char *val = NULL;
+        char number[28];
         khiter_t iter;
-        if ( tag != NULL ) {
-            char* rg = bam_aux2Z(tag);
-            iter = kh_get_c2i(state->rg_hash, rg);
+        if (tag) {
+            switch (*tag) {
+            case 'Z': case 'H':
+                val = bam_aux2Z(tag);
+                break;
+            case 'c': case 'C': case 's': case 'S': case 'i': case 'I':
+                if (opts->zero_pad == 0) {
+                    snprintf(number, sizeof(number), "%"PRId64, bam_aux2i(tag));
+                } else {
+                    int64_t v = bam_aux2i(tag);
+                    snprintf(number, sizeof(number), "%0*"PRId64,
+                             v < 0 ? opts->zero_pad + 1 : opts->zero_pad, v);
+                }
+                val = number;
+                break;
+            default:
+                break;
+            }
+        }
+        if ( val != NULL ) {
+            iter = kh_get_c2i(state->tag_val_hash, val);
         } else {
-            iter = kh_end(state->rg_hash);
+            iter = kh_end(state->tag_val_hash);
+        }
+
+        // Check for opts->tag here instead of !is_rg so we open new
+        // files if the user specified '-d RG' and we find a RG:Z: value
+        // that wasn't listed in the header.  If the '-d' option is
+        // not used, we don't open a file to preserve existing behaviour.
+        if (opts->tag && val && iter == kh_end(state->tag_val_hash)
+            && state->output_count < opts->max_split) {
+            // Need to open a new output file
+            iter = prep_sam_file(opts, state, val, arg_list, is_rg);
+            if (iter == kh_end(state->tag_val_hash)) { // Open failed
+                print_error("split",
+                            "Could not create output file for tag \"%s:%s\"",
+                            opts->tag, bam_aux2Z(tag));
+                goto error;
+
+            }
         }
 
         // Write the read out to correct file
-        if (iter != kh_end(state->rg_hash)) {
+        if (iter != kh_end(state->tag_val_hash)) {
             // if found write to the appropriate untangled bam
-            int i = kh_val(state->rg_hash,iter);
-            if (sam_write1(state->rg_output_file[i], state->rg_output_header[i], file_read) < 0) {
-                print_error_errno("split", "Could not write to \"%s\"", state->rg_output_file_name[i]);
-                bam_destroy1(file_read);
-                return false;
+            int i = kh_val(state->tag_val_hash,iter);
+            if (sam_write1(state->output_file[i], state->output_header[i], file_read) < 0) {
+                print_error_errno("split", "Could not write to \"%s\"", state->output_file_name[i]);
+                goto error;
             }
         } else {
             // otherwise write to the unaccounted bam if there is one or fail
@@ -506,15 +774,14 @@ static bool split(state_t* state)
                 if (tag) {
                     fprintf(stderr, "Read \"%s\" with unaccounted for tag \"%s\".\n", bam_get_qname(file_read), bam_aux2Z(tag));
                 } else {
-                    fprintf(stderr, "Read \"%s\" has no RG tag.\n", bam_get_qname(file_read));
+                    fprintf(stderr, "Read \"%s\" has no %s tag.\n",
+                            bam_get_qname(file_read), is_rg ? "RG" : opts->tag);
                 }
-                bam_destroy1(file_read);
-                return false;
+                goto error;
             } else {
                 if (sam_write1(state->unaccounted_file, state->unaccounted_header, file_read) < 0) {
                     print_error_errno("split", "Could not write to unaccounted output file");
-                    bam_destroy1(file_read);
-                    return false;
+                    goto error;
                 }
             }
         }
@@ -532,16 +799,27 @@ static bool split(state_t* state)
     }
 
     if (state->write_index) {
+        size_t i;
         for (i = 0; i < state->output_count; i++) {
-            if (sam_idx_save(state->rg_output_file[i]) < 0) {
-                print_error_errno("split", "writing index failed");
+            if (sam_idx_save(state->output_file[i]) < 0) {
+                print_error_errno("split", "writing index \"%s\" failed",
+                                  state->index_file_name[i]);
+                return false;
+            }
+        }
+        if (state->unaccounted_file) {
+            if (sam_idx_save(state->unaccounted_file) < 0) {
+                print_error_errno("split", "writing index \"%s\" failed",
+                                  state->unaccounted_idx_fn);
                 return false;
             }
-            free(state->rg_index_file_name[i]);
         }
     }
 
     return true;
+error:
+    bam_destroy1(file_read);
+    return false;
 }
 
 static int cleanup_state(state_t* status, bool check_close)
@@ -559,25 +837,28 @@ static int cleanup_state(state_t* status, bool check_close)
     sam_close(status->merged_input_file);
     size_t i;
     for (i = 0; i < status->output_count; i++) {
-        if (status->rg_output_header && status->rg_output_header[i])
-            sam_hdr_destroy(status->rg_output_header[i]);
-        if (status->rg_output_file && status->rg_output_file[i]) {
-            if (sam_close(status->rg_output_file[i]) < 0 && check_close) {
-                print_error("split", "Error on closing output file \"%s\"", status->rg_output_file_name[i]);
+        if (status->output_header && status->output_header[i])
+            sam_hdr_destroy(status->output_header[i]);
+        if (status->output_file && status->output_file[i]) {
+            if (sam_close(status->output_file[i]) < 0 && check_close) {
+                print_error("split", "Error on closing output file \"%s\"", status->output_file_name[i]);
                 ret = -1;
             }
         }
-        if (status->rg_id) free(status->rg_id[i]);
-        if (status->rg_output_file_name) free(status->rg_output_file_name[i]);
+        if (status->tag_vals) free(status->tag_vals[i]);
+        if (status->output_file_name) free(status->output_file_name[i]);
+        if (status->index_file_name[i]) free(status->index_file_name[i]);
     }
     if (status->merged_input_header)
         sam_hdr_destroy(status->merged_input_header);
-    free(status->rg_output_header);
-    free(status->rg_output_file);
-    free(status->rg_output_file_name);
-    free(status->rg_index_file_name);
-    kh_destroy_c2i(status->rg_hash);
-    free(status->rg_id);
+    free(status->output_header);
+    free(status->output_file);
+    free(status->output_file_name);
+    free(status->index_file_name);
+    free(status->unaccounted_idx_fn);
+    kh_destroy_c2i(status->tag_val_hash);
+
+    free(status->tag_vals);
     if (status->p.pool)
         hts_tpool_destroy(status->p.pool);
     free(status);
@@ -603,7 +884,7 @@ int main_split(int argc, char** argv)
     state_t* status = init(opts, arg_list);
     if (!status) goto cleanup_opts;
 
-    if (!split(status)) {
+    if (!split(status, opts, arg_list)) {
         cleanup_state(status, false);
         goto cleanup_opts;
     }
diff --git a/samtools/bam_split.c.pysam.c b/samtools/bam_split.c.pysam.c
index 6c48466da..f1fd5f4d8 100644
--- a/samtools/bam_split.c.pysam.c
+++ b/samtools/bam_split.c.pysam.c
@@ -2,7 +2,7 @@
 
 /*  bam_split.c -- split subcommand.
 
-    Copyright (C) 2013-2016,2018-2019,2023 Genome Research Ltd.
+    Copyright (C) 2013-2016,2018-2019,2023,2024 Genome Research Ltd.
 
     Author: Martin Pollard <mp15@sanger.ac.uk>
 
@@ -50,11 +50,16 @@ struct parsed_opts {
     const char *unaccounted_header_name;
     const char *unaccounted_name;
     const char *output_format_string;
+    const char *tag;
+    long max_split;
     bool verbose;
     int no_pg;
+    int zero_pad;
     sam_global_args ga;
 };
 
+#define DEFAULT_MAX_SPLIT 100
+
 typedef struct parsed_opts parsed_opts_t;
 
 struct state {
@@ -62,13 +67,14 @@ struct state {
     sam_hdr_t* merged_input_header;
     samFile* unaccounted_file;
     sam_hdr_t* unaccounted_header;
+    char *unaccounted_idx_fn;
     size_t output_count;
-    char** rg_id;
-    char **rg_index_file_name;
-    char **rg_output_file_name;
-    samFile** rg_output_file;
-    sam_hdr_t** rg_output_header;
-    kh_c2i_t* rg_hash;
+    char **tag_vals;
+    char **index_file_name;
+    char **output_file_name;
+    samFile **output_file;
+    sam_hdr_t **output_header;
+    kh_c2i_t* tag_val_hash;
     htsThreadPool p;
     int write_index;
 };
@@ -84,19 +90,23 @@ static void usage(FILE *write_to)
 "Usage: samtools split [-u <unaccounted.bam>] [-h <unaccounted_header.sam>]\n"
 "                      [-f <format_string>] [-v] <merged.bam>\n"
 "Options:\n"
-"  -f STRING       output filename format string [\"%%*_%%#.%%.\"]\n"
-"  -u FILE1        put reads with no RG tag or an unrecognised RG tag in FILE1\n"
-"  -h FILE2        ... and override the header with FILE2 (-u file only)\n"
-"  -v              verbose output\n"
-"  --no-PG         do not add a PG line\n");
+"  -f STRING           output filename format string [\"%%*_%%#.%%.\"]\n"
+"  -u FILE1            put left-over reads in FILE1\n"
+"  -h FILE2            ... and override the header with FILE2 (-u file only)\n"
+"  -d TAG              split by TAG value. TAG value must be a string.\n"
+"  -p NUMBER           zero-pad numbers in filenames to NUMBER digits\n"
+"  -M,--max-split NUM  limit number of output files from -d to NUM [%d]\n"
+"  -v                  verbose output\n"
+"  --no-PG             do not add a PG line\n",
+            DEFAULT_MAX_SPLIT);
     sam_global_opt_help(write_to, "-....@..");
     fprintf(write_to,
 "\n"
 "Format string expansions:\n"
 "  %%%%     %%\n"
 "  %%*     basename\n"
-"  %%#     @RG index\n"
-"  %%!     @RG ID\n"
+"  %%#     index (of @RG in the header, or count of TAG values seen so far)\n"
+"  %%!     @RG ID or TAG value\n"
 "  %%.     filename extension for output format\n"
       );
 }
@@ -106,17 +116,21 @@ static parsed_opts_t* parse_args(int argc, char** argv)
 {
     if (argc == 1) { usage(samtools_stdout); return NULL; }
 
-    const char *optstring = "vf:h:u:@:";
+    const char *optstring = "vf:h:u:d:M:p:@:";
+    char *default_format_string = "%*_%#.%.";
 
     static const struct option lopts[] = {
         SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0, '@'),
         {"no-PG", no_argument, NULL, 1},
+        {"max-split", required_argument, NULL, 'M'},
+        {"zero-pad", required_argument, NULL, 'p'},
         { NULL, 0, NULL, 0 }
     };
 
     parsed_opts_t* retval = calloc(sizeof(parsed_opts_t), 1);
     if (! retval ) { perror("cannot allocate option parsing memory"); return NULL; }
 
+    retval->max_split = DEFAULT_MAX_SPLIT;
     sam_global_args_init(&retval->ga);
 
     int opt;
@@ -134,9 +148,37 @@ static parsed_opts_t* parse_args(int argc, char** argv)
         case 'u':
             retval->unaccounted_name = optarg;
             break;
+        case 'd':
+            retval->tag = optarg;
+            default_format_string = "%*_%!.%.";
+            break;
+        case 'M': {
+            char *end = optarg;
+            retval->max_split = strtol(optarg, &end, 10);
+            if (*optarg == '\0' || *end != '\0' || retval->max_split == 0) {
+                print_error("split", "Invalid -M argument: \"%s\"", optarg);
+                free(retval);
+                return NULL;
+            }
+            if (retval->max_split < 0) // No limit requested
+                retval->max_split = LONG_MAX;
+            break;
+        }
+        case 'p': {
+            char *end = optarg;
+            unsigned long val = strtoul(optarg, &end, 10);
+            if (*optarg == '\0' || *end != '\0' || val > 20) {
+                print_error("split", "Invalid -p argument: \"%s\"", optarg);
+                free(retval);
+                return NULL;
+            }
+            retval->zero_pad = (int) val;
+            break;
+        }
         case 1:
             retval->no_pg = 1;
             break;
+
         default:
             if (parse_sam_global_opt(opt, optarg, lopts, &retval->ga) == 0) break;
             /* else fall-through */
@@ -147,7 +189,7 @@ static parsed_opts_t* parse_args(int argc, char** argv)
         }
     }
 
-    if (retval->output_format_string == NULL) retval->output_format_string = "%*_%#.%.";
+    if (retval->output_format_string == NULL) retval->output_format_string = default_format_string;
 
     argc -= optind;
     argv += optind;
@@ -165,7 +207,10 @@ static parsed_opts_t* parse_args(int argc, char** argv)
 }
 
 // Expands a output filename format string
-static char* expand_format_string(const char* format_string, const char* basename, const char* rg_id, const int rg_idx, const htsFormat *format)
+static char* expand_format_string(const char* format_string,
+                                  const char* basename, const char* tag_val,
+                                  const int file_idx, const int zero_pad,
+                                  const htsFormat *format)
 {
     kstring_t str = { 0, 0, NULL };
     const char* pointer = format_string;
@@ -181,10 +226,15 @@ static char* expand_format_string(const char* format_string, const char* basenam
                 if (kputs(basename, &str) < 0) goto memfail;
                 break;
             case '#':
-                if (kputl(rg_idx, &str) < 0) goto memfail;
+                if (zero_pad == 0) {
+                    if (kputl(file_idx, &str) < 0) goto memfail;
+                } else {
+                    if (ksprintf(&str, "%0*d", zero_pad, file_idx) < 0)
+                        goto memfail;
+                }
                 break;
             case '!':
-                if (kputs(rg_id, &str) < 0) goto memfail;
+                if (kputs(tag_val, &str) < 0) goto memfail;
                 break;
             case '.':
                 // Only really need to cope with sam, bam, cram
@@ -277,6 +327,157 @@ static int header_compatible(sam_hdr_t *hdr1, sam_hdr_t *hdr2)
     return 0;
 }
 
+static int grow_output_lists(state_t *state, size_t count) {
+    char **new_list = realloc(state->tag_vals, count * sizeof(char *));
+    if (!new_list)
+        return -1;
+    state->tag_vals = new_list;
+    new_list = realloc(state->index_file_name, count * sizeof(char *));
+    if (!new_list)
+        return -1;
+    state->index_file_name = new_list;
+    new_list = realloc(state->output_file_name, count * sizeof(char *));
+    if (!new_list)
+        return -1;
+    state->output_file_name = new_list;
+    samFile **new_file = realloc(state->output_file,
+                                 count * sizeof(samFile *));
+    if (!new_file)
+        return -1;
+    state->output_file = new_file;
+    sam_hdr_t **new_hdr = realloc(state->output_header,
+                                  count * sizeof(sam_hdr_t *));
+    if (!new_hdr)
+        return -1;
+    state->output_header = new_hdr;
+    return 0;
+}
+
+static khiter_t prep_sam_file(parsed_opts_t *opts, state_t *state,
+                              const char *tag, const char *arg_list,
+                              int is_rg) {
+    char *input_base_name = NULL, *new_file_name = NULL, *tag_val = NULL;
+    char *new_idx_fn = NULL;
+    sam_hdr_t *new_hdr = NULL;
+    samFile *new_sam_file = NULL;
+
+    khiter_t i = kh_get_c2i(state->tag_val_hash, tag);
+    if (i != kh_end(state->tag_val_hash)) {
+        return i;
+    }
+    // create new file
+    if (grow_output_lists(state, state->output_count + 1) != 0) {
+        print_error_errno("split", "Couldn't grow output lists");
+        return kh_end(state->tag_val_hash);
+    }
+    tag_val = strdup(tag);
+    if (!tag_val) {
+        print_error_errno("split", "Couldn't copy tag value");
+        return kh_end(state->tag_val_hash);
+    }
+    char *dirsep = strrchr(opts->merged_input_name, '/');
+    input_base_name = strdup(dirsep? dirsep+1 : opts->merged_input_name);
+    if (!input_base_name) {
+        print_error_errno("split", "Filename parsing failed");
+        goto fail;
+    }
+
+    char* extension = strrchr(input_base_name, '.');
+    if (extension) *extension = '\0';
+
+    new_file_name = expand_format_string(opts->output_format_string,
+                                         input_base_name, tag,
+                                         kh_size(state->tag_val_hash),
+                                         opts->zero_pad, &opts->ga.out);
+    if (!new_file_name) {
+        print_error_errno("split", "Filename creation failed");
+        goto fail;
+    }
+
+    new_hdr = sam_hdr_dup(state->merged_input_header);
+    if (!new_hdr) {
+        print_error_errno("split", "Duplicating header for file \"%s\" failed", new_file_name);
+        goto fail;
+    }
+    if (!opts->no_pg && sam_hdr_add_pg(new_hdr, "samtools",
+                                       "VN", samtools_version(),
+                                       arg_list ? "CL": NULL,
+                                       arg_list ? arg_list : NULL,
+                                       NULL)) {
+        print_error_errno("split", "Adding PG line to file \"%s\" failed", new_file_name);
+        goto fail;
+    }
+
+    if (is_rg) {
+        // If here, we've found an RG:Z: tag without a corresponding @RG
+        // line in the header.
+        if (sam_hdr_remove_lines(new_hdr, "RG", "ID", NULL) != 0) {
+            print_error_errno("split",
+                              "Failed to remove @RG lines from file \"%s\"",
+                              new_file_name);
+            goto fail;
+        }
+        if (sam_hdr_add_line(new_hdr, "RG", "ID", tag_val, NULL) != 0) {
+            print_error_errno("split",
+                              "Failed to add @RG line to file \"%s\"",
+                              new_file_name);
+            goto fail;
+        }
+    }
+
+    char outmode[4] = "w";
+    sam_open_mode(outmode + 1, new_file_name, NULL);
+    new_sam_file = sam_open_format(new_file_name, outmode, &opts->ga.out);
+    if (!new_sam_file) {
+        print_error_errno("split", "Opening filename for writing \"%s\" failed", new_file_name);
+        goto fail;
+    }
+    if (state->p.pool)
+        hts_set_opt(new_sam_file, HTS_OPT_THREAD_POOL, &state->p);
+
+    if (sam_hdr_write(new_sam_file, new_hdr) != 0) {
+        print_error_errno("split", "Couldn't write header to \"%s\"",
+                          new_file_name);
+        goto fail;
+    }
+
+    if (state->write_index) {
+        new_idx_fn = auto_index(new_sam_file, new_file_name, new_hdr);
+        if (!new_idx_fn) {
+            print_error_errno("split", "Creating index file for file \"%s\" failed", new_file_name);
+            goto fail;
+        }
+    }
+
+    int ret = -1;
+    i = kh_put_c2i(state->tag_val_hash, tag_val, &ret);
+    if (ret < 0) {
+        print_error_errno("split", "Adding file \"%s\" failed", new_file_name);
+        goto fail;
+    }
+
+    kh_val(state->tag_val_hash, i) = state->output_count;
+    state->tag_vals[state->output_count] = tag_val;
+    state->index_file_name[state->output_count] = new_idx_fn;
+    state->output_file_name[state->output_count] = new_file_name;
+    state->output_file[state->output_count] = new_sam_file;
+    state->output_header[state->output_count] = new_hdr;
+    state->output_count++;
+    free(input_base_name);
+    return i;
+
+ fail:
+    free(input_base_name);
+    free(new_file_name);
+    free(tag_val);
+    free(new_idx_fn);
+    if (new_hdr)
+        sam_hdr_destroy(new_hdr);
+    if (new_sam_file)
+        sam_close(new_sam_file);
+    return kh_end(state->tag_val_hash);
+}
+
 // Set the initial state
 static state_t* init(parsed_opts_t* opts, const char *arg_list)
 {
@@ -308,6 +509,7 @@ static state_t* init(parsed_opts_t* opts, const char *arg_list)
         cleanup_state(retval, false);
         return NULL;
     }
+    retval->write_index = opts->ga.write_index;
 
     if (opts->unaccounted_name) {
         if (opts->unaccounted_header_name) {
@@ -333,10 +535,10 @@ static state_t* init(parsed_opts_t* opts, const char *arg_list)
         } else {
             retval->unaccounted_header = sam_hdr_dup(retval->merged_input_header);
             if (!opts->no_pg && sam_hdr_add_pg(retval->unaccounted_header, "samtools",
-                                    "VN", samtools_version(),
-                                    arg_list ? "CL": NULL,
-                                    arg_list ? arg_list : NULL,
-                                    NULL)) {
+                                               "VN", samtools_version(),
+                                               arg_list ? "CL": NULL,
+                                               arg_list ? arg_list : NULL,
+                                               NULL)) {
                 print_error("split", "Could not rewrite header for \"%s\"", opts->unaccounted_name);
                 cleanup_state(retval, false);
                 return NULL;
@@ -356,23 +558,40 @@ static state_t* init(parsed_opts_t* opts, const char *arg_list)
             hts_set_opt(retval->unaccounted_file, HTS_OPT_THREAD_POOL, &retval->p);
     }
 
-    // Open output files for RGs
-    if (!count_RG(retval->merged_input_header, &retval->output_count, &retval->rg_id)) return NULL;
-    if (opts->verbose) fprintf(samtools_stderr, "@RG's found %zu\n",retval->output_count);
+    int is_rg = !opts->tag || strcmp(opts->tag, "RG") == 0;
+    if (is_rg) {
+        if (!count_RG(retval->merged_input_header,
+                      &retval->output_count, &retval->tag_vals)) {
+            cleanup_state(retval, false);
+            return NULL;
+        }
+        if (opts->verbose)
+            fprintf(samtools_stderr, "@RG's found %zu\n",retval->output_count);
+    } else {
+        retval->output_count = 0;
+    }
+
     // Prevent calloc(0, size);
     size_t num = retval->output_count ? retval->output_count : 1;
-    retval->rg_index_file_name = (char **)calloc(num, sizeof(char *));
-    retval->rg_output_file_name = (char **)calloc(num, sizeof(char *));
-    retval->rg_output_file = (samFile**)calloc(num, sizeof(samFile*));
-    retval->rg_output_header = (sam_hdr_t**)calloc(num, sizeof(sam_hdr_t*));
-    retval->rg_hash = kh_init_c2i();
-    if (!retval->rg_output_file_name || !retval->rg_output_file || !retval->rg_output_header ||
-        !retval->rg_hash || !retval->rg_index_file_name) {
+    retval->index_file_name = (char **)calloc(num, sizeof(char *));
+    retval->output_file_name = (char **)calloc(num, sizeof(char *));
+    retval->output_file = (samFile**)calloc(num, sizeof(samFile*));
+    retval->output_header = (sam_hdr_t**)calloc(num, sizeof(sam_hdr_t*));
+    retval->tag_val_hash = kh_init_c2i();
+    if (!retval->output_file_name || !retval->output_file || !retval->output_header ||
+        !retval->tag_val_hash || !retval->index_file_name) {
         print_error_errno("split", "Could not initialise output file array");
         cleanup_state(retval, false);
         return NULL;
     }
+    if (!is_rg)
+        return retval;  // Done for this case - outputs will be opened later
 
+    // Adjust max_split if too small for the read-groups listed in the header
+    if (opts->max_split < retval->output_count)
+        opts->max_split = retval->output_count;
+
+    // Open output files for RGs
     char* dirsep = strrchr(opts->merged_input_name, '/');
     char* input_base_name = strdup(dirsep? dirsep+1 : opts->merged_input_name);
     if (!input_base_name) {
@@ -390,8 +609,8 @@ static state_t* init(parsed_opts_t* opts, const char *arg_list)
 
         output_filename = expand_format_string(opts->output_format_string,
                                                input_base_name,
-                                               retval->rg_id[i], i,
-                                               &opts->ga.out);
+                                               retval->tag_vals[i], i,
+                                               opts->zero_pad, &opts->ga.out);
 
         if ( output_filename == NULL ) {
             cleanup_state(retval, false);
@@ -399,40 +618,39 @@ static state_t* init(parsed_opts_t* opts, const char *arg_list)
             return NULL;
         }
 
-        retval->rg_output_file_name[i] = output_filename;
-
+        retval->output_file_name[i] = output_filename;
         sam_open_mode(outmode + 1, output_filename, NULL);
-        retval->rg_output_file[i] = sam_open_format(output_filename, outmode, &opts->ga.out);
+        retval->output_file[i] = sam_open_format(output_filename, outmode, &opts->ga.out);
 
-        if (retval->rg_output_file[i] == NULL) {
+        if (retval->output_file[i] == NULL) {
             print_error_errno("split", "Could not open \"%s\"", output_filename);
             cleanup_state(retval, false);
             free(input_base_name);
             return NULL;
         }
         if (retval->p.pool)
-            hts_set_opt(retval->rg_output_file[i], HTS_OPT_THREAD_POOL, &retval->p);
+            hts_set_opt(retval->output_file[i], HTS_OPT_THREAD_POOL, &retval->p);
 
         // Record index in hash
         int ret;
-        khiter_t iter = kh_put_c2i(retval->rg_hash, retval->rg_id[i], &ret);
+        khiter_t iter = kh_put_c2i(retval->tag_val_hash, retval->tag_vals[i], &ret);
         if (ret < 0) {
             print_error_errno("split", "Couldn't add @RG ID to look-up table");
             cleanup_state(retval, false);
             free(input_base_name);
             return NULL;
         }
-        kh_val(retval->rg_hash,iter) = i;
+        kh_val(retval->tag_val_hash,iter) = i;
 
         // Set and edit header
-        retval->rg_output_header[i] = sam_hdr_dup(retval->merged_input_header);
-        if (sam_hdr_remove_except(retval->rg_output_header[i], "RG", "ID", retval->rg_id[i]) ||
-           (!opts->no_pg &&
-            sam_hdr_add_pg(retval->rg_output_header[i], "samtools",
-                        "VN", samtools_version(),
-                        arg_list ? "CL": NULL,
-                        arg_list ? arg_list : NULL,
-                        NULL))) {
+        retval->output_header[i] = sam_hdr_dup(retval->merged_input_header);
+        if (sam_hdr_remove_except(retval->output_header[i], "RG", "ID", retval->tag_vals[i]) ||
+            (!opts->no_pg &&
+             sam_hdr_add_pg(retval->output_header[i], "samtools",
+                            "VN", samtools_version(),
+                            arg_list ? "CL": NULL,
+                            arg_list ? arg_list : NULL,
+                            NULL))) {
             print_error("split", "Could not rewrite header for \"%s\"", output_filename);
             cleanup_state(retval, false);
             free(input_base_name);
@@ -441,34 +659,30 @@ static state_t* init(parsed_opts_t* opts, const char *arg_list)
     }
 
     free(input_base_name);
-    retval->write_index = opts->ga.write_index;
 
     return retval;
 }
 
-static bool split(state_t* state)
+static bool split(state_t* state, parsed_opts_t *opts, char *arg_list)
 {
-    if (state->unaccounted_file && sam_hdr_write(state->unaccounted_file, state->unaccounted_header) != 0) {
-        print_error_errno("split", "Could not write output file header");
-        return false;
-    }
-    size_t i;
-    for (i = 0; i < state->output_count; i++) {
-        if (sam_hdr_write(state->rg_output_file[i], state->rg_output_header[i]) != 0) {
-            print_error_errno("split", "Could not write file header to \"%s\"", state->rg_output_file_name[i]);
+    int is_rg = !opts->tag || strcmp(opts->tag, "RG") == 0;
+    if (state->unaccounted_file) {
+        if (sam_hdr_write(state->unaccounted_file, state->unaccounted_header) != 0) {
+            print_error_errno("split", "Could not write output file header");
             return false;
         }
-        if (state->write_index) {
-            state->rg_index_file_name[i] = auto_index(state->rg_output_file[i],
-                                                      state->rg_output_file_name[i],
-                                                      state->rg_output_header[i]);
-            if (!state->rg_index_file_name[i]) {
-                print_error_errno("split", "Could not create index for file \"%s\"", state->rg_output_file_name[i]);
+        if (opts->ga.write_index) {
+            state->unaccounted_idx_fn = auto_index(state->unaccounted_file,
+                                                   opts->unaccounted_name,
+                                                   state->unaccounted_header);
+            if (!state->unaccounted_idx_fn) {
+                print_error_errno("split",
+                                  "Creating index file for file \"%s\" failed",
+                                  opts->unaccounted_name);
                 return false;
             }
         }
     }
-
     bam1_t* file_read = bam_init1();
     // Read the first record
     int r;
@@ -482,25 +696,79 @@ static bool split(state_t* state)
         }
     }
 
+    if (is_rg) {
+        size_t i;
+        for (i = 0; i < state->output_count; i++) {
+            if (sam_hdr_write(state->output_file[i], state->output_header[i]) != 0) {
+                print_error_errno("split", "Could not write file header to \"%s\"", state->output_file_name[i]);
+                goto error;
+            }
+            if (state->write_index) {
+                state->index_file_name[i] = auto_index(state->output_file[i],
+                        state->output_file_name[i],
+                        state->output_header[i]);
+                if (!state->index_file_name[i]) {
+                    print_error_errno("split", "Could not create index for file \"%s\"", state->output_file_name[i]);
+                    goto error;
+                }
+            }
+        }
+    }
     while (file_read != NULL) {
         // Get RG tag from read and look it up in hash to find file to output it to
-        uint8_t* tag = bam_aux_get(file_read, "RG");
+        uint8_t* tag = bam_aux_get(file_read, is_rg ? "RG" : opts->tag);
+        char *val = NULL;
+        char number[28];
         khiter_t iter;
-        if ( tag != NULL ) {
-            char* rg = bam_aux2Z(tag);
-            iter = kh_get_c2i(state->rg_hash, rg);
+        if (tag) {
+            switch (*tag) {
+            case 'Z': case 'H':
+                val = bam_aux2Z(tag);
+                break;
+            case 'c': case 'C': case 's': case 'S': case 'i': case 'I':
+                if (opts->zero_pad == 0) {
+                    snprintf(number, sizeof(number), "%"PRId64, bam_aux2i(tag));
+                } else {
+                    int64_t v = bam_aux2i(tag);
+                    snprintf(number, sizeof(number), "%0*"PRId64,
+                             v < 0 ? opts->zero_pad + 1 : opts->zero_pad, v);
+                }
+                val = number;
+                break;
+            default:
+                break;
+            }
+        }
+        if ( val != NULL ) {
+            iter = kh_get_c2i(state->tag_val_hash, val);
         } else {
-            iter = kh_end(state->rg_hash);
+            iter = kh_end(state->tag_val_hash);
+        }
+
+        // Check for opts->tag here instead of !is_rg so we open new
+        // files if the user specified '-d RG' and we find a RG:Z: value
+        // that wasn't listed in the header.  If the '-d' option is
+        // not used, we don't open a file to preserve existing behaviour.
+        if (opts->tag && val && iter == kh_end(state->tag_val_hash)
+            && state->output_count < opts->max_split) {
+            // Need to open a new output file
+            iter = prep_sam_file(opts, state, val, arg_list, is_rg);
+            if (iter == kh_end(state->tag_val_hash)) { // Open failed
+                print_error("split",
+                            "Could not create output file for tag \"%s:%s\"",
+                            opts->tag, bam_aux2Z(tag));
+                goto error;
+
+            }
         }
 
         // Write the read out to correct file
-        if (iter != kh_end(state->rg_hash)) {
+        if (iter != kh_end(state->tag_val_hash)) {
             // if found write to the appropriate untangled bam
-            int i = kh_val(state->rg_hash,iter);
-            if (sam_write1(state->rg_output_file[i], state->rg_output_header[i], file_read) < 0) {
-                print_error_errno("split", "Could not write to \"%s\"", state->rg_output_file_name[i]);
-                bam_destroy1(file_read);
-                return false;
+            int i = kh_val(state->tag_val_hash,iter);
+            if (sam_write1(state->output_file[i], state->output_header[i], file_read) < 0) {
+                print_error_errno("split", "Could not write to \"%s\"", state->output_file_name[i]);
+                goto error;
             }
         } else {
             // otherwise write to the unaccounted bam if there is one or fail
@@ -508,15 +776,14 @@ static bool split(state_t* state)
                 if (tag) {
                     fprintf(samtools_stderr, "Read \"%s\" with unaccounted for tag \"%s\".\n", bam_get_qname(file_read), bam_aux2Z(tag));
                 } else {
-                    fprintf(samtools_stderr, "Read \"%s\" has no RG tag.\n", bam_get_qname(file_read));
+                    fprintf(samtools_stderr, "Read \"%s\" has no %s tag.\n",
+                            bam_get_qname(file_read), is_rg ? "RG" : opts->tag);
                 }
-                bam_destroy1(file_read);
-                return false;
+                goto error;
             } else {
                 if (sam_write1(state->unaccounted_file, state->unaccounted_header, file_read) < 0) {
                     print_error_errno("split", "Could not write to unaccounted output file");
-                    bam_destroy1(file_read);
-                    return false;
+                    goto error;
                 }
             }
         }
@@ -534,16 +801,27 @@ static bool split(state_t* state)
     }
 
     if (state->write_index) {
+        size_t i;
         for (i = 0; i < state->output_count; i++) {
-            if (sam_idx_save(state->rg_output_file[i]) < 0) {
-                print_error_errno("split", "writing index failed");
+            if (sam_idx_save(state->output_file[i]) < 0) {
+                print_error_errno("split", "writing index \"%s\" failed",
+                                  state->index_file_name[i]);
+                return false;
+            }
+        }
+        if (state->unaccounted_file) {
+            if (sam_idx_save(state->unaccounted_file) < 0) {
+                print_error_errno("split", "writing index \"%s\" failed",
+                                  state->unaccounted_idx_fn);
                 return false;
             }
-            free(state->rg_index_file_name[i]);
         }
     }
 
     return true;
+error:
+    bam_destroy1(file_read);
+    return false;
 }
 
 static int cleanup_state(state_t* status, bool check_close)
@@ -561,25 +839,28 @@ static int cleanup_state(state_t* status, bool check_close)
     sam_close(status->merged_input_file);
     size_t i;
     for (i = 0; i < status->output_count; i++) {
-        if (status->rg_output_header && status->rg_output_header[i])
-            sam_hdr_destroy(status->rg_output_header[i]);
-        if (status->rg_output_file && status->rg_output_file[i]) {
-            if (sam_close(status->rg_output_file[i]) < 0 && check_close) {
-                print_error("split", "Error on closing output file \"%s\"", status->rg_output_file_name[i]);
+        if (status->output_header && status->output_header[i])
+            sam_hdr_destroy(status->output_header[i]);
+        if (status->output_file && status->output_file[i]) {
+            if (sam_close(status->output_file[i]) < 0 && check_close) {
+                print_error("split", "Error on closing output file \"%s\"", status->output_file_name[i]);
                 ret = -1;
             }
         }
-        if (status->rg_id) free(status->rg_id[i]);
-        if (status->rg_output_file_name) free(status->rg_output_file_name[i]);
+        if (status->tag_vals) free(status->tag_vals[i]);
+        if (status->output_file_name) free(status->output_file_name[i]);
+        if (status->index_file_name[i]) free(status->index_file_name[i]);
     }
     if (status->merged_input_header)
         sam_hdr_destroy(status->merged_input_header);
-    free(status->rg_output_header);
-    free(status->rg_output_file);
-    free(status->rg_output_file_name);
-    free(status->rg_index_file_name);
-    kh_destroy_c2i(status->rg_hash);
-    free(status->rg_id);
+    free(status->output_header);
+    free(status->output_file);
+    free(status->output_file_name);
+    free(status->index_file_name);
+    free(status->unaccounted_idx_fn);
+    kh_destroy_c2i(status->tag_val_hash);
+
+    free(status->tag_vals);
     if (status->p.pool)
         hts_tpool_destroy(status->p.pool);
     free(status);
@@ -605,7 +886,7 @@ int main_split(int argc, char** argv)
     state_t* status = init(opts, arg_list);
     if (!status) goto cleanup_opts;
 
-    if (!split(status)) {
+    if (!split(status, opts, arg_list)) {
         cleanup_state(status, false);
         goto cleanup_opts;
     }
diff --git a/samtools/bamtk.c b/samtools/bamtk.c
index e05ea1816..8c330bc2b 100644
--- a/samtools/bamtk.c
+++ b/samtools/bamtk.c
@@ -75,7 +75,7 @@ int main_reference(int argc, char *argv[]);
 int main_reset(int argc, char *argv[]);
 int main_cram_size(int argc, char *argv[]);
 
-const char *samtools_version()
+const char *samtools_version(void)
 {
     return SAMTOOLS_VERSION;
 }
@@ -103,7 +103,7 @@ const char *samtools_feature_string(void) {
 static void long_version(void) {
     printf("samtools %s\n"
            "Using htslib %s\n"
-           "Copyright (C) 2023 Genome Research Ltd.\n",
+           "Copyright (C) 2024 Genome Research Ltd.\n",
            samtools_version(), hts_version());
 
     printf("\nSamtools compilation details:\n");
@@ -303,5 +303,14 @@ int main(int argc, char *argv[])
         fprintf(stderr, "[main] unrecognized command '%s'\n", argv[1]);
         return 1;
     }
+
+    // For subcommands that may have produced substantial output on stdout,
+    // make a final check for delayed I/O errors. Ignore EBADF as other code
+    // may have already closed stdout.
+    if (fclose(stdout) != 0 && errno != EBADF) {
+        print_error_errno(argv[1], "closing standard output failed");
+        return 1;
+    }
+
     return ret;
 }
diff --git a/samtools/bamtk.c.pysam.c b/samtools/bamtk.c.pysam.c
index d95ec05bb..a76729ce6 100644
--- a/samtools/bamtk.c.pysam.c
+++ b/samtools/bamtk.c.pysam.c
@@ -78,7 +78,7 @@ int main_reference(int argc, char *argv[]);
 int main_reset(int argc, char *argv[]);
 int main_cram_size(int argc, char *argv[]);
 
-const char *samtools_version()
+const char *samtools_version(void)
 {
     return SAMTOOLS_VERSION;
 }
@@ -106,7 +106,7 @@ const char *samtools_feature_string(void) {
 static void long_version(void) {
     fprintf(samtools_stdout, "samtools %s\n"
            "Using htslib %s\n"
-           "Copyright (C) 2023 Genome Research Ltd.\n",
+           "Copyright (C) 2024 Genome Research Ltd.\n",
            samtools_version(), hts_version());
 
     fprintf(samtools_stdout, "\nSamtools compilation details:\n");
@@ -306,5 +306,14 @@ int samtools_main(int argc, char *argv[])
         fprintf(samtools_stderr, "[main] unrecognized command '%s'\n", argv[1]);
         return 1;
     }
+
+    // For subcommands that may have produced substantial output on samtools_stdout,
+    // make a final check for delayed I/O errors. Ignore EBADF as other code
+    // may have already closed samtools_stdout.
+    if (0) { //if (fclose(samtools_stdout) != 0 && errno != EBADF) {
+        print_error_errno(argv[1], "closing standard output failed");
+        return 1;
+    }
+
     return ret;
 }
diff --git a/samtools/bedcov.c b/samtools/bedcov.c
index 1bd46a0eb..10eeface6 100644
--- a/samtools/bedcov.c
+++ b/samtools/bedcov.c
@@ -1,7 +1,7 @@
 /*  bedcov.c -- bedcov subcommand.
 
     Copyright (C) 2012 Broad Institute.
-    Copyright (C) 2013-2014, 2018-2022 Genome Research Ltd.
+    Copyright (C) 2013-2014, 2018-2022, 2024 Genome Research Ltd.
 
     Author: Heng Li <lh3@sanger.ac.uk>
 
@@ -40,7 +40,7 @@ DEALINGS IN THE SOFTWARE.  */
 #include "htslib/kseq.h"
 KSTREAM_INIT(gzFile, gzread, 16384)
 
-#define DEFAULT_DEPTH 64000
+#define DEFAULT_DEPTH INT_MAX
 
 typedef struct {
     htsFile *fp;
@@ -85,21 +85,23 @@ int main_bedcov(int argc, char *argv[])
     const bam_pileup1_t **plp;
     int usage = 0, has_index_file = 0;
     uint32_t flags = (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP);
-    int tflags = 0, min_depth = -1;
+    int tflags = 0, min_depth = -1, max_depth = DEFAULT_DEPTH, print_header=0;
 
     sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
     static const struct option lopts[] = {
         {"min-MQ", required_argument, NULL, 'Q'},
         {"min-mq", required_argument, NULL, 'Q'},
+        {"max-depth", required_argument, NULL, 'd'+1000},
         SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0, '-'),
         { NULL, 0, NULL, 0 }
     };
 
-    while ((c = getopt_long(argc, argv, "Q:Xg:G:jd:c", lopts, NULL)) >= 0) {
+    while ((c = getopt_long(argc, argv, "Q:Xg:G:jd:Hc", lopts, NULL)) >= 0) {
         switch (c) {
         case 'Q': min_mapQ = atoi(optarg); break;
         case 'X': has_index_file = 1; break;
         case 'c': do_rcount = 1; break;
+        case 'H': print_header = 1; break;
         case 'g':
             tflags = bam_str2flag(optarg);
             if (tflags < 0 || tflags > ((BAM_FSUPPLEMENTARY << 1) - 1)) {
@@ -118,6 +120,7 @@ int main_bedcov(int argc, char *argv[])
             break;
         case 'j': skip_DN = 1; break;
         case 'd': min_depth = atoi(optarg); break;
+        case 'd'+1000: max_depth = atoi(optarg); break;
         default:  if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
                   /* else fall-through */
         case '?': usage = 1; break;
@@ -133,9 +136,11 @@ int main_bedcov(int argc, char *argv[])
         fprintf(stderr, "      -G <flags>          add the specified flags to the set used to filter out reads\n"
                         "                          The default set is UNMAP,SECONDARY,QCFAIL,DUP or 0x704\n");
         fprintf(stderr, "      -j                  do not include deletions (D) and ref skips (N) in bedcov computation\n");
-        fprintf(stderr, "      -d <int>            depth threshold. Number of reference bases with coverage above and"
+        fprintf(stderr, "      --max-depth <int>   sets the maximum depth used in the mpileup algorithm\n");
+        fprintf(stderr, "      -d <int>            depth threshold. Number of reference bases with coverage above and\n"
                         "                          including this value will be displayed in a separate column\n");
         fprintf(stderr, "      -c                  add an additional column showing read count\n");
+        fprintf(stderr, "      -H                  print a comment/header line with column information.\n");
         sam_global_opt_help(stderr, "-.--.--.");
         return 1;
     }
@@ -190,6 +195,23 @@ int main_bedcov(int argc, char *argv[])
         print_error_errno("bedcov", "can't open BED file '%s'", argv[optind]);
         return 2;
     }
+
+    if (print_header) {
+        printf("#chrom\tstart\tend");
+        for (i = 0; i < n; ++i) {
+            printf("\t%s_cov", argv[i+optind+1]);
+        }
+        if (min_depth >= 0) {
+            for (i = 0; i < n; ++i)
+                printf("\t%s_depth", argv[i+optind+1]);
+        }
+        if (do_rcount) {
+            for (i = 0; i < n; ++i)
+                printf("\t%s_count", argv[i+optind+1]);
+        }
+        putchar('\n');
+    }
+
     ks = ks_init(fp);
     n_plp = calloc(n, sizeof(int));
     plp = calloc(n, sizeof(bam_pileup1_t*));
@@ -220,10 +242,11 @@ int main_bedcov(int argc, char *argv[])
         }
 
         mplp = bam_mplp_init(n, read_bam, (void**)aux);
-        if (min_depth > DEFAULT_DEPTH)
+        if (min_depth > max_depth)
+            // NB: never happens given current DEFAULT_DEPTH of INT_MAX
             bam_mplp_set_maxcnt(mplp, min_depth);
         else
-            bam_mplp_set_maxcnt(mplp, DEFAULT_DEPTH);
+            bam_mplp_set_maxcnt(mplp, max_depth);
 
         memset(cnt, 0, sizeof(*cnt) * n);
         if (min_depth >= 0) memset(pcov, 0, sizeof(*pcov) * n);
diff --git a/samtools/bedcov.c.pysam.c b/samtools/bedcov.c.pysam.c
index f259cb185..c6f0cccd6 100644
--- a/samtools/bedcov.c.pysam.c
+++ b/samtools/bedcov.c.pysam.c
@@ -3,7 +3,7 @@
 /*  bedcov.c -- bedcov subcommand.
 
     Copyright (C) 2012 Broad Institute.
-    Copyright (C) 2013-2014, 2018-2022 Genome Research Ltd.
+    Copyright (C) 2013-2014, 2018-2022, 2024 Genome Research Ltd.
 
     Author: Heng Li <lh3@sanger.ac.uk>
 
@@ -42,7 +42,7 @@ DEALINGS IN THE SOFTWARE.  */
 #include "htslib/kseq.h"
 KSTREAM_INIT(gzFile, gzread, 16384)
 
-#define DEFAULT_DEPTH 64000
+#define DEFAULT_DEPTH INT_MAX
 
 typedef struct {
     htsFile *fp;
@@ -87,21 +87,23 @@ int main_bedcov(int argc, char *argv[])
     const bam_pileup1_t **plp;
     int usage = 0, has_index_file = 0;
     uint32_t flags = (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP);
-    int tflags = 0, min_depth = -1;
+    int tflags = 0, min_depth = -1, max_depth = DEFAULT_DEPTH, print_header=0;
 
     sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
     static const struct option lopts[] = {
         {"min-MQ", required_argument, NULL, 'Q'},
         {"min-mq", required_argument, NULL, 'Q'},
+        {"max-depth", required_argument, NULL, 'd'+1000},
         SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0, '-'),
         { NULL, 0, NULL, 0 }
     };
 
-    while ((c = getopt_long(argc, argv, "Q:Xg:G:jd:c", lopts, NULL)) >= 0) {
+    while ((c = getopt_long(argc, argv, "Q:Xg:G:jd:Hc", lopts, NULL)) >= 0) {
         switch (c) {
         case 'Q': min_mapQ = atoi(optarg); break;
         case 'X': has_index_file = 1; break;
         case 'c': do_rcount = 1; break;
+        case 'H': print_header = 1; break;
         case 'g':
             tflags = bam_str2flag(optarg);
             if (tflags < 0 || tflags > ((BAM_FSUPPLEMENTARY << 1) - 1)) {
@@ -120,6 +122,7 @@ int main_bedcov(int argc, char *argv[])
             break;
         case 'j': skip_DN = 1; break;
         case 'd': min_depth = atoi(optarg); break;
+        case 'd'+1000: max_depth = atoi(optarg); break;
         default:  if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
                   /* else fall-through */
         case '?': usage = 1; break;
@@ -135,9 +138,11 @@ int main_bedcov(int argc, char *argv[])
         fprintf(samtools_stderr, "      -G <flags>          add the specified flags to the set used to filter out reads\n"
                         "                          The default set is UNMAP,SECONDARY,QCFAIL,DUP or 0x704\n");
         fprintf(samtools_stderr, "      -j                  do not include deletions (D) and ref skips (N) in bedcov computation\n");
-        fprintf(samtools_stderr, "      -d <int>            depth threshold. Number of reference bases with coverage above and"
+        fprintf(samtools_stderr, "      --max-depth <int>   sets the maximum depth used in the mpileup algorithm\n");
+        fprintf(samtools_stderr, "      -d <int>            depth threshold. Number of reference bases with coverage above and\n"
                         "                          including this value will be displayed in a separate column\n");
         fprintf(samtools_stderr, "      -c                  add an additional column showing read count\n");
+        fprintf(samtools_stderr, "      -H                  print a comment/header line with column information.\n");
         sam_global_opt_help(samtools_stderr, "-.--.--.");
         return 1;
     }
@@ -192,6 +197,23 @@ int main_bedcov(int argc, char *argv[])
         print_error_errno("bedcov", "can't open BED file '%s'", argv[optind]);
         return 2;
     }
+
+    if (print_header) {
+        fprintf(samtools_stdout, "#chrom\tstart\tend");
+        for (i = 0; i < n; ++i) {
+            fprintf(samtools_stdout, "\t%s_cov", argv[i+optind+1]);
+        }
+        if (min_depth >= 0) {
+            for (i = 0; i < n; ++i)
+                fprintf(samtools_stdout, "\t%s_depth", argv[i+optind+1]);
+        }
+        if (do_rcount) {
+            for (i = 0; i < n; ++i)
+                fprintf(samtools_stdout, "\t%s_count", argv[i+optind+1]);
+        }
+        fputc('\n', samtools_stdout);
+    }
+
     ks = ks_init(fp);
     n_plp = calloc(n, sizeof(int));
     plp = calloc(n, sizeof(bam_pileup1_t*));
@@ -222,10 +244,11 @@ int main_bedcov(int argc, char *argv[])
         }
 
         mplp = bam_mplp_init(n, read_bam, (void**)aux);
-        if (min_depth > DEFAULT_DEPTH)
+        if (min_depth > max_depth)
+            // NB: never happens given current DEFAULT_DEPTH of INT_MAX
             bam_mplp_set_maxcnt(mplp, min_depth);
         else
-            bam_mplp_set_maxcnt(mplp, DEFAULT_DEPTH);
+            bam_mplp_set_maxcnt(mplp, max_depth);
 
         memset(cnt, 0, sizeof(*cnt) * n);
         if (min_depth >= 0) memset(pcov, 0, sizeof(*pcov) * n);
diff --git a/samtools/bedidx.c b/samtools/bedidx.c
index 6b22d4efc..0fe5e7262 100644
--- a/samtools/bedidx.c
+++ b/samtools/bedidx.c
@@ -1,7 +1,7 @@
 /*  bedidx.c -- BED file indexing.
 
     Copyright (C) 2011 Broad Institute.
-    Copyright (C) 2014, 2017-2019 Genome Research Ltd.
+    Copyright (C) 2014, 2017-2019, 2024 Genome Research Ltd.
 
     Author: Heng Li <lh3@sanger.ac.uk>
 
@@ -58,6 +58,7 @@ typedef struct {
     hts_pair_pos_t *a;
     int *idx;
     int filter;
+    hts_pos_t max_idx;
 } bed_reglist_t;
 
 #include "htslib/khash.h"
@@ -96,63 +97,68 @@ static void bed_print(void *reg_hash) {
 }
 #endif
 
-static int *bed_index_core(int n, hts_pair_pos_t *a)
+static int bed_index_core(bed_reglist_t *regions)
 {
-    int i, j, l, *idx, *new_idx;
-    l = 0; idx = 0;
-    for (i = 0; i < n; ++i) {
-        hts_pos_t beg, end;
-        beg = a[i].beg >> LIDX_SHIFT; end = a[i].end >> LIDX_SHIFT;
-        if (l < end + 1) {
-            int old_l = l;
-            l = end + 1;
-            kroundup32(l);
-            new_idx = realloc(idx, l * sizeof(*idx));
-            if (!new_idx) {
-                free(idx);
-                return NULL;
-            }
-            idx = new_idx;
-
-            for (j = old_l; j < l; ++j)
-                idx[j] = -1;
+    int i, *idx = NULL;
+    size_t idx_size = 0;
+    hts_pos_t last_end = 0;
+    hts_pair_pos_t *a = regions->a;
+
+    // Construct a linear index on regions, to allow rapid lookup of
+    // where to start searching for matches
+    for (i = 0; i < regions->n; ++i) {
+        hts_pos_t beg = a[i].beg >= 0 ? a[i].beg >> LIDX_SHIFT : 0;
+        hts_pos_t end = a[i].end >= 0 ? a[i].end >> LIDX_SHIFT : 0;
+        hts_pos_t j;
+        if (end < last_end)
+            continue;  // Can happen for a containment
+        if (end + 1 >= SIZE_MAX / sizeof(*idx)) { // Ensure no overflow
+            errno = ENOMEM;
+            free(idx);
+            return -1;
         }
-
-        for (j = beg; j < end+1; ++j)
-            if (idx[j] < 0)
-                idx[j] = i;
+        if (hts_resize(int, (size_t) end + 1, &idx_size, &idx, 0) < 0) {
+            free(idx);
+            return -1;
+        }
+        // Fill any gap prior to this region by pointing to the previous one
+        for (j = last_end; j < beg; j++)
+            idx[j] = i > 0 ? i - 1 : 0;
+        // Fill from max(last_end, beg) to `end` (inclusive) with current region
+        for (; j <= end; j++)
+            idx[j] = i;
+        // Remember where finished for the next gap
+        last_end = end + 1;
     }
-    return idx;
+    regions->idx = idx;
+    regions->max_idx = last_end;
+    return 0;
 }
 
-static void bed_index(void *_h)
+static int bed_index(reghash_t *h)
 {
-    reghash_t *h = (reghash_t*)_h;
     khint_t k;
     for (k = 0; k < kh_end(h); ++k) {
         if (kh_exist(h, k)) {
             bed_reglist_t *p = &kh_val(h, k);
-            if (p->idx) free(p->idx);
+            if (p->idx) {
+                free(p->idx);
+                p->idx = NULL;
+            }
             ks_introsort(hts_pair_pos_t, p->n, p->a);
-            p->idx = bed_index_core(p->n, p->a);
+            if (bed_index_core(p) != 0) {
+                return -1;
+            }
         }
     }
+    return 0;
 }
 
-static int bed_minoff(const bed_reglist_t *p, hts_pos_t beg, hts_pos_t end) {
-    int i, min_off=0;
-
-    if (p && p->idx) {
-        min_off = (beg>>LIDX_SHIFT >= p->n)? p->idx[p->n-1] : p->idx[beg>>LIDX_SHIFT];
-        if (min_off < 0) { // TODO: this block can be improved, but speed should not matter too much here
-            hts_pos_t n = beg>>LIDX_SHIFT;
-            if (n > p->n)
-                n = p->n;
-            for (i = n - 1; i >= 0; --i)
-                if (p->idx[i] >= 0)
-                    break;
-            min_off = i >= 0? p->idx[i] : 0;
-        }
+static int bed_minoff(const bed_reglist_t *p, hts_pos_t beg) {
+    int min_off=0;
+
+    if (p && p->idx && p->max_idx > 0 && beg >= 0) {
+        min_off = (beg>>LIDX_SHIFT >= p->max_idx)? p->idx[p->max_idx-1] : p->idx[beg>>LIDX_SHIFT];
     }
 
     return min_off;
@@ -162,7 +168,7 @@ static int bed_overlap_core(const bed_reglist_t *p, hts_pos_t beg, hts_pos_t end
 {
     int i, min_off;
     if (p->n == 0) return 0;
-    min_off = bed_minoff(p, beg, end);
+    min_off = bed_minoff(p, beg);
 
     for (i = min_off; i < p->n; ++i) {
         if (p->a[i].beg >= end) break; // out of range; no need to proceed
@@ -259,7 +265,7 @@ void *bed_read(const char *fn)
     if (NULL == h) return NULL;
     // read the list
     fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
-    if (fp == 0) return 0;
+    if (fp == 0) goto fail;
     ks = ks_init(fp);
     if (NULL == ks) goto fail;  // In case ks_init ever gets error checking...
     int ks_len;
@@ -339,9 +345,10 @@ void *bed_read(const char *fn)
         fp = NULL;
         goto fail;
     }
+    if (bed_index(h) != 0)
+        goto fail;
     ks_destroy(ks);
     free(str.s);
-    bed_index(h);
     //bed_unify(h);
     return h;
  fail:
@@ -457,7 +464,7 @@ static void *bed_filter(void *reg_hash, void *tmp_hash) {
             beg = q->a[i].beg;
             end = q->a[i].end;
 
-            min_off = bed_minoff(p, beg, end);
+            min_off = bed_minoff(p, beg);
             for (j = min_off; j < p->n; ++j) {
                 if (p->a[j].beg >= end) break; // out of range; no need to proceed
                 if (p->a[j].end > beg && p->a[j].beg < end) {
@@ -534,28 +541,43 @@ void *bed_hash_regions(void *reg_hash, char **regs, int first, int last, int *op
 
         //if op==1 insert reg to the bed hash table
         if (*op && !(bed_insert(h, reg, beg, end))) {
-            fprintf(stderr, "Error when inserting region='%s' in the bed hash table at address=%p!\n", regs[i], h);
+            fprintf(stderr, "Error when inserting region='%s' in the bed hash table at address=%p!\n", regs[i], (void *)h);
         }
         //if op==0, first insert the regions in the temporary hash table,
         //then filter the bed hash table using it
         if (!(*op) && !(bed_insert(t, reg, beg, end))) {
-            fprintf(stderr, "Error when inserting region='%s' in the temporary hash table at address=%p!\n", regs[i], t);
+            fprintf(stderr, "Error when inserting region='%s' in the temporary hash table at address=%p!\n", regs[i], (void *)t);
         }
     }
 
     if (!(*op)) {
-        bed_index(t);
+        if (bed_index(t) != 0)
+            goto fail;
         bed_unify(t);
-        h = bed_filter(h, t);
+        if (bed_filter(h, t) == NULL)
+            goto fail;
         bed_destroy(t);
+        t = NULL;
     }
 
     if (h) {
-        bed_index(h);
+        if (bed_index(h) != 0)
+            goto fail;
         bed_unify(h);
     }
 
     return h;
+
+ fail:
+    // Clean up whichever hash we made
+    if (reg_hash) {
+        if (t)
+            bed_destroy(t);
+    } else {
+        if (h)
+            bed_destroy(h);
+    }
+    return NULL;
 }
 
 const char* bed_get(void *reg_hash, int i, int filter) {
diff --git a/samtools/bedidx.c.pysam.c b/samtools/bedidx.c.pysam.c
index 533b42a92..e62ab5143 100644
--- a/samtools/bedidx.c.pysam.c
+++ b/samtools/bedidx.c.pysam.c
@@ -3,7 +3,7 @@
 /*  bedidx.c -- BED file indexing.
 
     Copyright (C) 2011 Broad Institute.
-    Copyright (C) 2014, 2017-2019 Genome Research Ltd.
+    Copyright (C) 2014, 2017-2019, 2024 Genome Research Ltd.
 
     Author: Heng Li <lh3@sanger.ac.uk>
 
@@ -60,6 +60,7 @@ typedef struct {
     hts_pair_pos_t *a;
     int *idx;
     int filter;
+    hts_pos_t max_idx;
 } bed_reglist_t;
 
 #include "htslib/khash.h"
@@ -98,63 +99,68 @@ static void bed_print(void *reg_hash) {
 }
 #endif
 
-static int *bed_index_core(int n, hts_pair_pos_t *a)
+static int bed_index_core(bed_reglist_t *regions)
 {
-    int i, j, l, *idx, *new_idx;
-    l = 0; idx = 0;
-    for (i = 0; i < n; ++i) {
-        hts_pos_t beg, end;
-        beg = a[i].beg >> LIDX_SHIFT; end = a[i].end >> LIDX_SHIFT;
-        if (l < end + 1) {
-            int old_l = l;
-            l = end + 1;
-            kroundup32(l);
-            new_idx = realloc(idx, l * sizeof(*idx));
-            if (!new_idx) {
-                free(idx);
-                return NULL;
-            }
-            idx = new_idx;
-
-            for (j = old_l; j < l; ++j)
-                idx[j] = -1;
+    int i, *idx = NULL;
+    size_t idx_size = 0;
+    hts_pos_t last_end = 0;
+    hts_pair_pos_t *a = regions->a;
+
+    // Construct a linear index on regions, to allow rapid lookup of
+    // where to start searching for matches
+    for (i = 0; i < regions->n; ++i) {
+        hts_pos_t beg = a[i].beg >= 0 ? a[i].beg >> LIDX_SHIFT : 0;
+        hts_pos_t end = a[i].end >= 0 ? a[i].end >> LIDX_SHIFT : 0;
+        hts_pos_t j;
+        if (end < last_end)
+            continue;  // Can happen for a containment
+        if (end + 1 >= SIZE_MAX / sizeof(*idx)) { // Ensure no overflow
+            errno = ENOMEM;
+            free(idx);
+            return -1;
         }
-
-        for (j = beg; j < end+1; ++j)
-            if (idx[j] < 0)
-                idx[j] = i;
+        if (hts_resize(int, (size_t) end + 1, &idx_size, &idx, 0) < 0) {
+            free(idx);
+            return -1;
+        }
+        // Fill any gap prior to this region by pointing to the previous one
+        for (j = last_end; j < beg; j++)
+            idx[j] = i > 0 ? i - 1 : 0;
+        // Fill from max(last_end, beg) to `end` (inclusive) with current region
+        for (; j <= end; j++)
+            idx[j] = i;
+        // Remember where finished for the next gap
+        last_end = end + 1;
     }
-    return idx;
+    regions->idx = idx;
+    regions->max_idx = last_end;
+    return 0;
 }
 
-static void bed_index(void *_h)
+static int bed_index(reghash_t *h)
 {
-    reghash_t *h = (reghash_t*)_h;
     khint_t k;
     for (k = 0; k < kh_end(h); ++k) {
         if (kh_exist(h, k)) {
             bed_reglist_t *p = &kh_val(h, k);
-            if (p->idx) free(p->idx);
+            if (p->idx) {
+                free(p->idx);
+                p->idx = NULL;
+            }
             ks_introsort(hts_pair_pos_t, p->n, p->a);
-            p->idx = bed_index_core(p->n, p->a);
+            if (bed_index_core(p) != 0) {
+                return -1;
+            }
         }
     }
+    return 0;
 }
 
-static int bed_minoff(const bed_reglist_t *p, hts_pos_t beg, hts_pos_t end) {
-    int i, min_off=0;
-
-    if (p && p->idx) {
-        min_off = (beg>>LIDX_SHIFT >= p->n)? p->idx[p->n-1] : p->idx[beg>>LIDX_SHIFT];
-        if (min_off < 0) { // TODO: this block can be improved, but speed should not matter too much here
-            hts_pos_t n = beg>>LIDX_SHIFT;
-            if (n > p->n)
-                n = p->n;
-            for (i = n - 1; i >= 0; --i)
-                if (p->idx[i] >= 0)
-                    break;
-            min_off = i >= 0? p->idx[i] : 0;
-        }
+static int bed_minoff(const bed_reglist_t *p, hts_pos_t beg) {
+    int min_off=0;
+
+    if (p && p->idx && p->max_idx > 0 && beg >= 0) {
+        min_off = (beg>>LIDX_SHIFT >= p->max_idx)? p->idx[p->max_idx-1] : p->idx[beg>>LIDX_SHIFT];
     }
 
     return min_off;
@@ -164,7 +170,7 @@ static int bed_overlap_core(const bed_reglist_t *p, hts_pos_t beg, hts_pos_t end
 {
     int i, min_off;
     if (p->n == 0) return 0;
-    min_off = bed_minoff(p, beg, end);
+    min_off = bed_minoff(p, beg);
 
     for (i = min_off; i < p->n; ++i) {
         if (p->a[i].beg >= end) break; // out of range; no need to proceed
@@ -261,7 +267,7 @@ void *bed_read(const char *fn)
     if (NULL == h) return NULL;
     // read the list
     fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
-    if (fp == 0) return 0;
+    if (fp == 0) goto fail;
     ks = ks_init(fp);
     if (NULL == ks) goto fail;  // In case ks_init ever gets error checking...
     int ks_len;
@@ -341,9 +347,10 @@ void *bed_read(const char *fn)
         fp = NULL;
         goto fail;
     }
+    if (bed_index(h) != 0)
+        goto fail;
     ks_destroy(ks);
     free(str.s);
-    bed_index(h);
     //bed_unify(h);
     return h;
  fail:
@@ -459,7 +466,7 @@ static void *bed_filter(void *reg_hash, void *tmp_hash) {
             beg = q->a[i].beg;
             end = q->a[i].end;
 
-            min_off = bed_minoff(p, beg, end);
+            min_off = bed_minoff(p, beg);
             for (j = min_off; j < p->n; ++j) {
                 if (p->a[j].beg >= end) break; // out of range; no need to proceed
                 if (p->a[j].end > beg && p->a[j].beg < end) {
@@ -536,28 +543,43 @@ void *bed_hash_regions(void *reg_hash, char **regs, int first, int last, int *op
 
         //if op==1 insert reg to the bed hash table
         if (*op && !(bed_insert(h, reg, beg, end))) {
-            fprintf(samtools_stderr, "Error when inserting region='%s' in the bed hash table at address=%p!\n", regs[i], h);
+            fprintf(samtools_stderr, "Error when inserting region='%s' in the bed hash table at address=%p!\n", regs[i], (void *)h);
         }
         //if op==0, first insert the regions in the temporary hash table,
         //then filter the bed hash table using it
         if (!(*op) && !(bed_insert(t, reg, beg, end))) {
-            fprintf(samtools_stderr, "Error when inserting region='%s' in the temporary hash table at address=%p!\n", regs[i], t);
+            fprintf(samtools_stderr, "Error when inserting region='%s' in the temporary hash table at address=%p!\n", regs[i], (void *)t);
         }
     }
 
     if (!(*op)) {
-        bed_index(t);
+        if (bed_index(t) != 0)
+            goto fail;
         bed_unify(t);
-        h = bed_filter(h, t);
+        if (bed_filter(h, t) == NULL)
+            goto fail;
         bed_destroy(t);
+        t = NULL;
     }
 
     if (h) {
-        bed_index(h);
+        if (bed_index(h) != 0)
+            goto fail;
         bed_unify(h);
     }
 
     return h;
+
+ fail:
+    // Clean up whichever hash we made
+    if (reg_hash) {
+        if (t)
+            bed_destroy(t);
+    } else {
+        if (h)
+            bed_destroy(h);
+    }
+    return NULL;
 }
 
 const char* bed_get(void *reg_hash, int i, int filter) {
diff --git a/samtools/coverage.c b/samtools/coverage.c
index dedaa8e99..92f497dc8 100644
--- a/samtools/coverage.c
+++ b/samtools/coverage.c
@@ -1,7 +1,7 @@
 /* coverage.c -- samtools coverage subcommand
 
     Copyright (C) 2018,2019 Florian Breitwieser
-    Portions copyright (C) 2019-2021 Genome Research Ltd.
+    Portions copyright (C) 2019-2021, 2023-2024 Genome Research Ltd.
 
     Author: Florian P Breitwieser <florian.bw@gmail.com>
 
@@ -60,6 +60,7 @@ typedef struct {  // auxiliary data structure to hold stats on coverage
     unsigned long long summed_coverage;
     unsigned long long summed_baseQ;
     unsigned long long summed_mapQ;
+    unsigned long long quality_bases;
     unsigned int n_reads;
     unsigned int n_selected_reads;
     bool covered;
@@ -105,7 +106,7 @@ static const char *const BLOCK_CHARS2[2] = {".", ":"};
 // in bam_plcmd.c
 int read_file_list(const char *file_list, int *n, char **argv[]);
 
-static int usage() {
+static int usage(void) {
     fprintf(stdout, "Usage: samtools coverage [options] in1.bam [in2.bam [...]]\n\n"
             "Input options:\n"
             "  -b, --bam-list FILE     list of input BAM filenames, one per line\n"
@@ -120,6 +121,7 @@ static int usage() {
             "                          effectively removing any depth limit.\n"
             "Output options:\n"
             "  -m, --histogram         show histogram instead of tabular output\n"
+            "  -D, --plot-depth        plot depth instead of tabular output\n"
             "  -A, --ascii             show only ASCII characters in histogram\n"
             "  -o, --output FILE       write output to FILE [stdout]\n"
             "  -H, --no-header         don't print a header in tabular mode\n"
@@ -203,13 +205,13 @@ void print_tabular_line(FILE *file_out, const sam_hdr_t *h, const stats_aux_t *s
             stats[tid].n_covered_bases,
             100.0 * stats[tid].n_covered_bases / region_len,
             stats[tid].summed_coverage / region_len,
-            stats[tid].summed_coverage > 0? stats[tid].summed_baseQ/(double) stats[tid].summed_coverage : 0,
+            stats[tid].quality_bases > 0? stats[tid].summed_baseQ/(double) stats[tid].quality_bases : 0,
             stats[tid].n_selected_reads > 0? stats[tid].summed_mapQ/(double) stats[tid].n_selected_reads : 0
            );
 }
 
 void print_hist(FILE *file_out, const sam_hdr_t *h, const stats_aux_t *stats, int tid, const uint32_t *hist,
-        const int hist_size, const bool full_utf) {
+        const int hist_size, const bool full_utf, const bool plot_coverage) {
     int i, col;
     bool show_percentiles = false;
     const int n_rows = 10;
@@ -221,7 +223,7 @@ void print_hist(FILE *file_out, const sam_hdr_t *h, const stats_aux_t *stats, in
     double hist_data[hist_size];
     double max_val = 0.0;
     for (i = 0; i < hist_size; ++i) {
-        hist_data[i] = 100 * hist[i] / (double) stats[tid].bin_width;
+        hist_data[i] = (plot_coverage?1:100) * hist[i] / (double) stats[tid].bin_width;
         if (hist_data[i] > max_val) max_val = hist_data[i];
     }
 
@@ -231,7 +233,9 @@ void print_hist(FILE *file_out, const sam_hdr_t *h, const stats_aux_t *stats, in
     double row_bin_size = max_val / (double) n_rows;
     for (i = n_rows-1; i >= 0; --i) {
         double current_bin = row_bin_size * i;
-        if (show_percentiles) {
+        if (plot_coverage) {
+            fprintf(file_out, ">%8.1f ",i*row_bin_size);
+        } else if (show_percentiles) {
             fprintf(file_out, ">%3i%% ", i*10);
         } else {
             fprintf(file_out, ">%7.2f%% ", current_bin);
@@ -260,12 +264,18 @@ void print_hist(FILE *file_out, const sam_hdr_t *h, const stats_aux_t *stats, in
             case 5: fprintf(file_out, "Mean coverage:   %.3gx",
                             stats[tid].summed_coverage / region_len); break;
             case 4: fprintf(file_out, "Mean baseQ:      %.3g",
-                            stats[tid].summed_baseQ/(double) stats[tid].summed_coverage); break;
+                            stats[tid].quality_bases > 0? stats[tid].summed_baseQ/(double) stats[tid].quality_bases : 0); break;
             case 3: fprintf(file_out, "Mean mapQ:       %.3g",
                             stats[tid].summed_mapQ/(double) stats[tid].n_selected_reads); break;
             case 1: fprintf(file_out, "Histo bin width: %sbp",
                             readable_bps(stats[tid].bin_width, buf)); break;
-            case 0: fprintf(file_out, "Histo max bin:   %.5g%%", max_val); break;
+            case 0: if (plot_coverage) {
+                        fprintf(file_out, "Histo max cov:   %.5g",  max_val);
+                    } else {
+                        fprintf(file_out, "Histo max bin:   %.5g%%", max_val);
+                    }
+                    break;
+
         };
         fputc('\n', file_out);
     }
@@ -306,6 +316,7 @@ int main_coverage(int argc, char *argv[]) {
     char **fn = NULL;
     int fail_flags = (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP); // Default fail flags
     int required_flags = 0;
+    int print_value_warning = 0;
 
     int *n_plp = NULL;
     sam_hdr_t *h = NULL; // BAM header of the 1st input
@@ -313,6 +324,7 @@ int main_coverage(int argc, char *argv[]) {
     bool opt_print_header = true;
     bool opt_print_tabular = true;
     bool opt_print_histogram = false;
+    bool opt_plot_coverage = false;
     bool opt_full_utf = true;
 
     FILE *file_out = stdout;
@@ -332,6 +344,7 @@ int main_coverage(int argc, char *argv[]) {
         {"min-bq", required_argument, NULL, 'Q'},
         {"histogram", no_argument, NULL, 'm'},
         {"ascii", no_argument, NULL, 'A'},
+        {"plot-depth", no_argument, NULL, 'D'},
         {"output", required_argument, NULL, 'o'},
         {"no-header", no_argument, NULL, 'H'},
         {"n-bins", required_argument, NULL, 'w'},
@@ -344,7 +357,7 @@ int main_coverage(int argc, char *argv[]) {
     // parse the command line
     int c;
     opterr = 0;
-    while ((c = getopt_long(argc, argv, "Ao:l:q:Q:hHw:r:b:md:", lopts, NULL)) != -1) {
+    while ((c = getopt_long(argc, argv, "Ao:l:q:Q:hHw:r:b:md:D", lopts, NULL)) != -1) {
         switch (c) {
             case 1:
                 if ((required_flags = bam_str2flag(optarg)) < 0) {
@@ -368,6 +381,10 @@ int main_coverage(int argc, char *argv[]) {
             case 'A': opt_full_utf = false;
                       opt_print_histogram = true; opt_print_tabular = false;
                       break;
+            case 'D': opt_print_histogram = true;
+                     opt_print_tabular = false;
+                     opt_plot_coverage = true;
+                     break;
             case 'H': opt_print_header = false; break;
             case 'h': return usage();
             default:  if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
@@ -555,12 +572,13 @@ int main_coverage(int argc, char *argv[]) {
         status = EXIT_FAILURE;
         goto coverage_end;
     }
+
     while ((ret=bam_mplp_auto(mplp, &tid, &pos, n_plp, plp)) > 0) { // come to the next covered position
 
         if (tid != old_tid) { // Next target sequence
             if (old_tid >= 0) {
                 if (opt_print_histogram) {
-                    print_hist(file_out, h, stats, old_tid, hist, n_bins, opt_full_utf);
+                    print_hist(file_out, h, stats, old_tid, hist, n_bins, opt_full_utf, opt_plot_coverage);
                     fputc('\n', file_out);
                 } else if (opt_print_tabular) {
                     print_tabular_line(file_out, h, stats, old_tid);
@@ -594,22 +612,32 @@ int main_coverage(int argc, char *argv[]) {
             for (j = 0; j < n_plp[i]; ++j) {
                 const bam_pileup1_t *p = plp[i] + j; // DON'T modify plp[][] unless you really know
 
-                if (p->is_del || p->is_refskip) --depth_at_pos; // having dels or refskips at tid:pos
-                else if (p->qpos < p->b->core.l_qseq &&
-                        bam_get_qual(p->b)[p->qpos] < opt_min_baseQ) --depth_at_pos; // low base quality
-                else
-                    stats[tid].summed_baseQ += bam_get_qual(p->b)[p->qpos];
+                if (p->is_del || p->is_refskip) {
+                    --depth_at_pos; // having dels or refskips at tid:pos
+                } else if (p->qpos < p->b->core.l_qseq) {
+                    if (bam_get_qual(p->b)[p->qpos] < opt_min_baseQ) {
+                        --depth_at_pos; // low base quality
+                    } else {
+                        stats[tid].summed_baseQ += bam_get_qual(p->b)[p->qpos];
+                        stats[tid].quality_bases++;
+                    }
+                } else {
+                    print_value_warning = 1; // no quality at position
+                }
             }
+
             if (depth_at_pos > 0) {
                 count_base = true;
                 stats[tid].summed_coverage += depth_at_pos;
             }
-            // hist[current_bin] += depth_at_pos;  // Add counts to the histogram here to have one based on coverage
-            //fprintf(file_out, "\t%d", n_plp[i] - m); // this the depth to output
+
+            if(current_bin < n_bins && opt_plot_coverage) {
+                hist[current_bin] += depth_at_pos;
+            }
         }
         if (count_base) {
             stats[tid].n_covered_bases++;
-            if (opt_print_histogram && current_bin < n_bins)
+            if (opt_print_histogram && current_bin < n_bins && !opt_plot_coverage)
                 ++(hist[current_bin]); // Histogram based on breadth of coverage
         }
     }
@@ -620,7 +648,7 @@ int main_coverage(int argc, char *argv[]) {
 
     if (tid < n_targets && tid >=0) {
         if (opt_print_histogram) {
-            print_hist(file_out, h, stats, tid, hist, n_bins, opt_full_utf);
+            print_hist(file_out, h, stats, tid, hist, n_bins, opt_full_utf, opt_plot_coverage);
         } else if (opt_print_tabular) {
             print_tabular_line(file_out, h, stats, tid);
         }
@@ -636,6 +664,10 @@ int main_coverage(int argc, char *argv[]) {
         }
     }
 
+    if (print_value_warning) {
+        print_error("coverage", "Warning:  Missing quality values in alignments.  Mean base quality calculated only on available values.");
+    }
+
     if (ret < 0) status = EXIT_FAILURE;
 
 coverage_end:
diff --git a/samtools/coverage.c.pysam.c b/samtools/coverage.c.pysam.c
index 894f4ac68..bfd6ec935 100644
--- a/samtools/coverage.c.pysam.c
+++ b/samtools/coverage.c.pysam.c
@@ -3,7 +3,7 @@
 /* coverage.c -- samtools coverage subcommand
 
     Copyright (C) 2018,2019 Florian Breitwieser
-    Portions copyright (C) 2019-2021 Genome Research Ltd.
+    Portions copyright (C) 2019-2021, 2023-2024 Genome Research Ltd.
 
     Author: Florian P Breitwieser <florian.bw@gmail.com>
 
@@ -62,6 +62,7 @@ typedef struct {  // auxiliary data structure to hold stats on coverage
     unsigned long long summed_coverage;
     unsigned long long summed_baseQ;
     unsigned long long summed_mapQ;
+    unsigned long long quality_bases;
     unsigned int n_reads;
     unsigned int n_selected_reads;
     bool covered;
@@ -107,7 +108,7 @@ static const char *const BLOCK_CHARS2[2] = {".", ":"};
 // in bam_plcmd.c
 int read_file_list(const char *file_list, int *n, char **argv[]);
 
-static int usage() {
+static int usage(void) {
     fprintf(samtools_stdout, "Usage: samtools coverage [options] in1.bam [in2.bam [...]]\n\n"
             "Input options:\n"
             "  -b, --bam-list FILE     list of input BAM filenames, one per line\n"
@@ -122,6 +123,7 @@ static int usage() {
             "                          effectively removing any depth limit.\n"
             "Output options:\n"
             "  -m, --histogram         show histogram instead of tabular output\n"
+            "  -D, --plot-depth        plot depth instead of tabular output\n"
             "  -A, --ascii             show only ASCII characters in histogram\n"
             "  -o, --output FILE       write output to FILE [samtools_stdout]\n"
             "  -H, --no-header         don't print a header in tabular mode\n"
@@ -205,13 +207,13 @@ void print_tabular_line(FILE *file_out, const sam_hdr_t *h, const stats_aux_t *s
             stats[tid].n_covered_bases,
             100.0 * stats[tid].n_covered_bases / region_len,
             stats[tid].summed_coverage / region_len,
-            stats[tid].summed_coverage > 0? stats[tid].summed_baseQ/(double) stats[tid].summed_coverage : 0,
+            stats[tid].quality_bases > 0? stats[tid].summed_baseQ/(double) stats[tid].quality_bases : 0,
             stats[tid].n_selected_reads > 0? stats[tid].summed_mapQ/(double) stats[tid].n_selected_reads : 0
            );
 }
 
 void print_hist(FILE *file_out, const sam_hdr_t *h, const stats_aux_t *stats, int tid, const uint32_t *hist,
-        const int hist_size, const bool full_utf) {
+        const int hist_size, const bool full_utf, const bool plot_coverage) {
     int i, col;
     bool show_percentiles = false;
     const int n_rows = 10;
@@ -223,7 +225,7 @@ void print_hist(FILE *file_out, const sam_hdr_t *h, const stats_aux_t *stats, in
     double hist_data[hist_size];
     double max_val = 0.0;
     for (i = 0; i < hist_size; ++i) {
-        hist_data[i] = 100 * hist[i] / (double) stats[tid].bin_width;
+        hist_data[i] = (plot_coverage?1:100) * hist[i] / (double) stats[tid].bin_width;
         if (hist_data[i] > max_val) max_val = hist_data[i];
     }
 
@@ -233,7 +235,9 @@ void print_hist(FILE *file_out, const sam_hdr_t *h, const stats_aux_t *stats, in
     double row_bin_size = max_val / (double) n_rows;
     for (i = n_rows-1; i >= 0; --i) {
         double current_bin = row_bin_size * i;
-        if (show_percentiles) {
+        if (plot_coverage) {
+            fprintf(file_out, ">%8.1f ",i*row_bin_size);
+        } else if (show_percentiles) {
             fprintf(file_out, ">%3i%% ", i*10);
         } else {
             fprintf(file_out, ">%7.2f%% ", current_bin);
@@ -262,12 +266,18 @@ void print_hist(FILE *file_out, const sam_hdr_t *h, const stats_aux_t *stats, in
             case 5: fprintf(file_out, "Mean coverage:   %.3gx",
                             stats[tid].summed_coverage / region_len); break;
             case 4: fprintf(file_out, "Mean baseQ:      %.3g",
-                            stats[tid].summed_baseQ/(double) stats[tid].summed_coverage); break;
+                            stats[tid].quality_bases > 0? stats[tid].summed_baseQ/(double) stats[tid].quality_bases : 0); break;
             case 3: fprintf(file_out, "Mean mapQ:       %.3g",
                             stats[tid].summed_mapQ/(double) stats[tid].n_selected_reads); break;
             case 1: fprintf(file_out, "Histo bin width: %sbp",
                             readable_bps(stats[tid].bin_width, buf)); break;
-            case 0: fprintf(file_out, "Histo max bin:   %.5g%%", max_val); break;
+            case 0: if (plot_coverage) {
+                        fprintf(file_out, "Histo max cov:   %.5g",  max_val);
+                    } else {
+                        fprintf(file_out, "Histo max bin:   %.5g%%", max_val);
+                    }
+                    break;
+
         };
         fputc('\n', file_out);
     }
@@ -308,6 +318,7 @@ int main_coverage(int argc, char *argv[]) {
     char **fn = NULL;
     int fail_flags = (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP); // Default fail flags
     int required_flags = 0;
+    int print_value_warning = 0;
 
     int *n_plp = NULL;
     sam_hdr_t *h = NULL; // BAM header of the 1st input
@@ -315,6 +326,7 @@ int main_coverage(int argc, char *argv[]) {
     bool opt_print_header = true;
     bool opt_print_tabular = true;
     bool opt_print_histogram = false;
+    bool opt_plot_coverage = false;
     bool opt_full_utf = true;
 
     FILE *file_out = samtools_stdout;
@@ -334,6 +346,7 @@ int main_coverage(int argc, char *argv[]) {
         {"min-bq", required_argument, NULL, 'Q'},
         {"histogram", no_argument, NULL, 'm'},
         {"ascii", no_argument, NULL, 'A'},
+        {"plot-depth", no_argument, NULL, 'D'},
         {"output", required_argument, NULL, 'o'},
         {"no-header", no_argument, NULL, 'H'},
         {"n-bins", required_argument, NULL, 'w'},
@@ -346,7 +359,7 @@ int main_coverage(int argc, char *argv[]) {
     // parse the command line
     int c;
     opterr = 0;
-    while ((c = getopt_long(argc, argv, "Ao:l:q:Q:hHw:r:b:md:", lopts, NULL)) != -1) {
+    while ((c = getopt_long(argc, argv, "Ao:l:q:Q:hHw:r:b:md:D", lopts, NULL)) != -1) {
         switch (c) {
             case 1:
                 if ((required_flags = bam_str2flag(optarg)) < 0) {
@@ -370,6 +383,10 @@ int main_coverage(int argc, char *argv[]) {
             case 'A': opt_full_utf = false;
                       opt_print_histogram = true; opt_print_tabular = false;
                       break;
+            case 'D': opt_print_histogram = true;
+                     opt_print_tabular = false;
+                     opt_plot_coverage = true;
+                     break;
             case 'H': opt_print_header = false; break;
             case 'h': return usage();
             default:  if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
@@ -557,12 +574,13 @@ int main_coverage(int argc, char *argv[]) {
         status = EXIT_FAILURE;
         goto coverage_end;
     }
+
     while ((ret=bam_mplp_auto(mplp, &tid, &pos, n_plp, plp)) > 0) { // come to the next covered position
 
         if (tid != old_tid) { // Next target sequence
             if (old_tid >= 0) {
                 if (opt_print_histogram) {
-                    print_hist(file_out, h, stats, old_tid, hist, n_bins, opt_full_utf);
+                    print_hist(file_out, h, stats, old_tid, hist, n_bins, opt_full_utf, opt_plot_coverage);
                     fputc('\n', file_out);
                 } else if (opt_print_tabular) {
                     print_tabular_line(file_out, h, stats, old_tid);
@@ -596,22 +614,32 @@ int main_coverage(int argc, char *argv[]) {
             for (j = 0; j < n_plp[i]; ++j) {
                 const bam_pileup1_t *p = plp[i] + j; // DON'T modify plp[][] unless you really know
 
-                if (p->is_del || p->is_refskip) --depth_at_pos; // having dels or refskips at tid:pos
-                else if (p->qpos < p->b->core.l_qseq &&
-                        bam_get_qual(p->b)[p->qpos] < opt_min_baseQ) --depth_at_pos; // low base quality
-                else
-                    stats[tid].summed_baseQ += bam_get_qual(p->b)[p->qpos];
+                if (p->is_del || p->is_refskip) {
+                    --depth_at_pos; // having dels or refskips at tid:pos
+                } else if (p->qpos < p->b->core.l_qseq) {
+                    if (bam_get_qual(p->b)[p->qpos] < opt_min_baseQ) {
+                        --depth_at_pos; // low base quality
+                    } else {
+                        stats[tid].summed_baseQ += bam_get_qual(p->b)[p->qpos];
+                        stats[tid].quality_bases++;
+                    }
+                } else {
+                    print_value_warning = 1; // no quality at position
+                }
             }
+
             if (depth_at_pos > 0) {
                 count_base = true;
                 stats[tid].summed_coverage += depth_at_pos;
             }
-            // hist[current_bin] += depth_at_pos;  // Add counts to the histogram here to have one based on coverage
-            //fprintf(file_out, "\t%d", n_plp[i] - m); // this the depth to output
+
+            if(current_bin < n_bins && opt_plot_coverage) {
+                hist[current_bin] += depth_at_pos;
+            }
         }
         if (count_base) {
             stats[tid].n_covered_bases++;
-            if (opt_print_histogram && current_bin < n_bins)
+            if (opt_print_histogram && current_bin < n_bins && !opt_plot_coverage)
                 ++(hist[current_bin]); // Histogram based on breadth of coverage
         }
     }
@@ -622,7 +650,7 @@ int main_coverage(int argc, char *argv[]) {
 
     if (tid < n_targets && tid >=0) {
         if (opt_print_histogram) {
-            print_hist(file_out, h, stats, tid, hist, n_bins, opt_full_utf);
+            print_hist(file_out, h, stats, tid, hist, n_bins, opt_full_utf, opt_plot_coverage);
         } else if (opt_print_tabular) {
             print_tabular_line(file_out, h, stats, tid);
         }
@@ -638,6 +666,10 @@ int main_coverage(int argc, char *argv[]) {
         }
     }
 
+    if (print_value_warning) {
+        print_error("coverage", "Warning:  Missing quality values in alignments.  Mean base quality calculated only on available values.");
+    }
+
     if (ret < 0) status = EXIT_FAILURE;
 
 coverage_end:
diff --git a/samtools/faidx.c b/samtools/faidx.c
index 63204d146..616d82529 100644
--- a/samtools/faidx.c
+++ b/samtools/faidx.c
@@ -1,6 +1,6 @@
 /*  faidx.c -- faidx subcommand.
 
-    Copyright (C) 2008, 2009, 2013, 2016, 2018-2020, 2022 Genome Research Ltd.
+    Copyright (C) 2008, 2009, 2013, 2016, 2018-2020, 2022, 2024 Genome Research Ltd.
     Portions copyright (C) 2011 Broad Institute.
 
     Author: Heng Li <lh3@sanger.ac.uk>
@@ -44,6 +44,9 @@ DEALINGS IN THE SOFTWARE.
 #include <htslib/hts.h>
 #include <htslib/hfile.h>
 #include <htslib/kstring.h>
+#include <htslib/bgzf.h>
+#include <htslib/thread_pool.h>
+#include "sam_opts.h"
 #include "samtools.h"
 
 // Negative indicates the same as input data
@@ -53,6 +56,15 @@ DEALINGS IN THE SOFTWARE.
 #   define ABS(x) ((x)>=0?(x):-(x))
 #endif
 
+//new params required for output creation
+typedef struct output {
+    int isbgzip;                //is bgzip or uncompressed file
+    FILE *fp;                   //uncompressed file pointer
+    BGZF *bgzf_fp;              //bgzf file pointer
+    sam_global_args *gopt;      //options
+    kstring_t buffer;
+} output;
+
 static unsigned char comp_base[256] = {
   0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,  15,
  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31,
@@ -98,8 +110,19 @@ static void reverse(char *str, const hts_pos_t len) {
     }
 }
 
+/// wrappedwrite  - wraps the fwrite and bgzf_write
+/** @param out    - pointer to data required to write output
+*   @param buffer - data to write
+*   @param length - data length
+* returns error or length written on success
+*/
+static inline size_t wrappedwrite(output *out, const char *buffer, size_t length)
+{
+    return out->isbgzip ? bgzf_write(out->bgzf_fp, buffer, length) :
+        fwrite(buffer, 1, length, out->fp);
+}
 
-static int write_line(faidx_t *faid, FILE *file, const char *line, const char *name,
+static int write_line(faidx_t *faid, output *out, const char *line, const char *name,
                       const int ignore, const hts_pos_t length, const hts_pos_t seq_len) {
     int id;
     hts_pos_t beg, end;
@@ -124,8 +147,8 @@ static int write_line(faidx_t *faid, FILE *file, const char *line, const char *n
     for (i = 0; i < seq_sz; i += length)
     {
         hts_pos_t len = i + length < seq_sz ? length : seq_sz - i;
-        if (fwrite(line + i, 1, len, file) < len ||
-            fputc('\n', file) == EOF) {
+        if (wrappedwrite(out, line + i, len) < len ||
+              wrappedwrite(out, "\n", 1) < 1) {
             print_error_errno("faidx", "failed to write output");
             return EXIT_FAILURE;
         }
@@ -135,58 +158,66 @@ static int write_line(faidx_t *faid, FILE *file, const char *line, const char *n
 }
 
 
-static int write_output(faidx_t *faid, FILE *file, const char *name, const int ignore,
+static int write_output(faidx_t *faid, output *out, const char *name, const int ignore,
                         const hts_pos_t length, const int rev,
                         const char *pos_strand_name, const char *neg_strand_name,
                         enum fai_format_options format) {
-    hts_pos_t seq_len, wrap_len = length;
+    hts_pos_t seq_len, wrap_len = length, len = 0;
+    char *seq =  NULL, *qual = NULL;
+    int ret = EXIT_FAILURE;
+
     if (wrap_len < 0)
         wrap_len = fai_line_length(faid, name);
     if (wrap_len <= 0)
         wrap_len = HTS_POS_MAX;
-    char *seq = fai_fetch64(faid, name, &seq_len);
-
-    if (format == FAI_FASTA) {
-        fprintf(file, ">%s%s\n", name, rev ? neg_strand_name : pos_strand_name);
-    } else {
-        fprintf(file, "@%s%s\n", name, rev ? neg_strand_name : pos_strand_name);
-    }
 
+    seq = fai_fetch64(faid, name, &seq_len);
     if (rev && seq_len > 0) {
         reverse_complement(seq, seq_len);
     }
-
-    if (write_line(faid, file, seq, name, ignore, wrap_len, seq_len)
-        == EXIT_FAILURE) {
-        free(seq);
-        return EXIT_FAILURE;
+    //write the name
+    len = ksprintf(&out->buffer, "%c%s%s\n", format == FAI_FASTA ? '>' : '@', name, rev ? neg_strand_name : pos_strand_name);
+    if (wrappedwrite(out, out->buffer.s, out->buffer.l) < len) {
+        fprintf(stderr,"[faidx] Failed to write buffer\n");
+        goto exit;
+    }
+    ks_clear(&out->buffer);
+    //write bases
+    if ((ret = write_line(faid, out, seq, name, ignore, wrap_len, seq_len) == EXIT_FAILURE)) {
+        goto exit;
     }
-
-    free(seq);
 
     if (format == FAI_FASTQ) {
-        fprintf(file, "+\n");
-
-        char *qual = fai_fetchqual64(faid, name, &seq_len);
-
+        //write quality
+        qual = fai_fetchqual64(faid, name, &seq_len);
         if (rev && seq_len > 0) {
             reverse(qual, seq_len);
         }
 
-        if (write_line(faid, file, qual, name, ignore, wrap_len, seq_len)
-            == EXIT_FAILURE) {
-            free(qual);
-            return EXIT_FAILURE;
+        len = ksprintf(&out->buffer, "+\n");
+        if (wrappedwrite(out, out->buffer.s, out->buffer.l) < len) {
+            fprintf(stderr,"[faidx] Failed to write buffer\n");
+            goto exit;
         }
+        ks_clear(&out->buffer);
+        if ((ret = write_line(faid, out, qual, name, ignore, wrap_len, seq_len) == EXIT_FAILURE)) {
+            goto exit;
+        }
+    }
+    ret = EXIT_SUCCESS;
 
+exit:
+    if (seq) {
+        free(seq);
+    }
+    if (qual) {
         free(qual);
     }
-
-    return EXIT_SUCCESS;
+    return ret;
 }
 
 
-static int read_regions_from_file(faidx_t *faid, hFILE *in_file, FILE *file, const int ignore,
+static int read_regions_from_file(faidx_t *faid, hFILE *in_file, output *out, const int ignore,
                                   const hts_pos_t length, const int rev,
                                   const char *pos_strand_name,
                                   const char *neg_strand_name,
@@ -195,7 +226,7 @@ static int read_regions_from_file(faidx_t *faid, hFILE *in_file, FILE *file, con
     int ret = EXIT_FAILURE;
 
     while (line.l = 0, kgetline(&line, (kgets_func *)hgets, in_file) >= 0) {
-        if ((ret = write_output(faid, file, line.s, ignore, length, rev, pos_strand_name, neg_strand_name, format)) == EXIT_FAILURE) {
+        if ((ret = write_output(faid, out, line.s, ignore, length, rev, pos_strand_name, neg_strand_name, format)) == EXIT_FAILURE) {
             break;
         }
     }
@@ -221,26 +252,27 @@ static int usage(FILE *fp, enum fai_format_options format, int exit_status)
 
     fprintf(fp, "Usage: samtools %s [<reg> [...]]\n", tool);
     fprintf(fp, "Option: \n"
-                " -o, --output FILE        Write %s to file.\n"
-                " -n, --length INT         Length of %s sequence line. [60]\n"
-                " -c, --continue           Continue after trying to retrieve missing region.\n"
-                " -r, --region-file FILE   File of regions.  Format is chr:from-to. One per line.\n"
-                " -i, --reverse-complement Reverse complement sequences.\n"
-                "     --mark-strand TYPE   Add strand indicator to sequence name\n"
-                "                          TYPE = rc   for /rc on negative strand (default)\n"
-                "                                 no   for no strand indicator\n"
-                "                                 sign for (+) / (-)\n"
-                "                                 custom,<pos>,<neg> for custom indicator\n"
-                "     --fai-idx      FILE  name of the index file (default %s.fai).\n"
-                "     --gzi-idx      FILE  name of compressed file index (default %s.gz.gzi).\n",
+                "  -o, --output FILE        Write %s to file.\n"
+                "  -n, --length INT         Length of %s sequence line. [60]\n"
+                "  -c, --continue           Continue after trying to retrieve missing region.\n"
+                "  -r, --region-file FILE   File of regions.  Format is chr:from-to. One per line.\n"
+                "  -i, --reverse-complement Reverse complement sequences.\n"
+                "      --mark-strand TYPE   Add strand indicator to sequence name\n"
+                "                           TYPE = rc   for /rc on negative strand (default)\n"
+                "                                  no   for no strand indicator\n"
+                "                                  sign for (+) / (-)\n"
+                "                                  custom,<pos>,<neg> for custom indicator\n"
+                "      --fai-idx      FILE  name of the index file (default %s.fai).\n"
+                "      --gzi-idx      FILE  name of compressed file index (default %s.gz.gzi).\n",
                 file_type, file_type, index_name, index_name);
 
 
     if (format == FAI_FASTA) {
-       fprintf(fp, " -f, --fastq              File and index in FASTQ format.\n");
+       fprintf(fp, "  -f, --fastq              File and index in FASTQ format.\n");
     }
 
-    fprintf(fp, " -h, --help               This message.\n");
+    fprintf(fp, "  -h, --help               This message.\n");
+    sam_global_opt_help(fp, "---.-@--");
 
     return exit_status;
 }
@@ -256,9 +288,14 @@ int faidx_core(int argc, char *argv[], enum fai_format_options format)
     char *strand_names = NULL; // Used for custom strand annotation
     char *fai_name = NULL; // specified index name
     char *gzi_name = NULL; // specified compressed index name
-    FILE* file_out = stdout;/* output stream */
+    sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
+    int exit_status = EXIT_FAILURE, flushed = 0;
+    struct output out = { 0, stdout, NULL, &ga, KS_INITIALIZE}; //data required for output writing
+    faidx_t *fai = NULL;
+    hts_tpool *pool = NULL;
 
     static const struct option lopts[] = {
+        SAM_OPT_GLOBAL_OPTIONS('-', '-', '-', 0, '-', '@'),     //output format opt and thread count - long options
         { "output", required_argument,       NULL, 'o' },
         { "help",   no_argument,             NULL, 'h' },
         { "length", required_argument,       NULL, 'n' },
@@ -272,9 +309,20 @@ int faidx_core(int argc, char *argv[], enum fai_format_options format)
         { NULL, 0, NULL, 0 }
     };
 
-    while ((c = getopt_long(argc, argv, "ho:n:cr:fi", lopts, NULL)) >= 0) {
+    while ((c = getopt_long(argc, argv, "ho:n:cr:fi@:", lopts, NULL)) >= 0) {
         switch (c) {
-            case 'o': output_file = optarg; break;
+            case 'o':
+                {
+                    output_file = optarg;
+                    char *ext = strrchr(output_file, '.');
+                    if (!ext) {
+                        break;
+                    }
+                    if (!strcmp(ext, ".gz") || !strcmp(ext, ".bgz") || !strcmp(ext, ".bgzf")) {
+                        out.isbgzip = 1;        //bgzip output
+                    }
+                    break;
+                }
             case 'n': line_len = strtol(optarg, NULL, 10);
                       if (line_len < 0) {
                         fprintf(stderr,"[faidx] bad line length '%s', using default:%d\n",optarg,ABS(DEFAULT_FASTA_LINE_LEN));
@@ -285,8 +333,14 @@ int faidx_core(int argc, char *argv[], enum fai_format_options format)
             case 'r': region_file = optarg; break;
             case 'f': format = FAI_FASTQ; break;
             case 'i': rev = 1; break;
-            case '?': return usage(stderr, format, EXIT_FAILURE);
-            case 'h': return usage(stdout, format, EXIT_SUCCESS);
+            case '?':
+                exit_status = usage(stderr, format, EXIT_FAILURE);
+                goto exit2;
+                break;
+            case 'h':
+                exit_status = usage(stdout, format, EXIT_SUCCESS);
+                goto exit2;
+                break;
             case 1000:
                 if (strcmp(optarg, "no") == 0) {
                     pos_strand_name = neg_strand_name = "";
@@ -303,7 +357,7 @@ int faidx_core(int argc, char *argv[], enum fai_format_options format)
                     strand_names = pos_strand_name = malloc(len + 2);
                     if (!strand_names) {
                         fprintf(stderr, "[faidx] Out of memory\n");
-                        return EXIT_FAILURE;
+                        goto exit2;
                     }
                     neg_strand_name = pos_strand_name + comma + 1;
                     memcpy(pos_strand_name, optarg + 7, comma);
@@ -314,17 +368,26 @@ int faidx_core(int argc, char *argv[], enum fai_format_options format)
                     neg_strand_name[len - comma] = '\0';
                 } else {
                     fprintf(stderr, "[faidx] Unknown --mark-strand option \"%s\"\n", optarg);
-                    return usage(stderr, format, EXIT_FAILURE);
+                    exit_status = usage(stderr, format, EXIT_FAILURE);
+                    goto exit2;
                 }
                 break;
             case 1001: fai_name = optarg; break;
             case 1002: gzi_name = optarg; break;
-            default:  break;
+            // handle standard samtools options like thread count, compression level...
+            default:
+                if (parse_sam_global_opt(c, optarg, lopts, &ga)) {
+                    fprintf(stderr, "[faidx] Invalid option \"%s\"\n", optarg);
+                    goto exit2;
+                }
+                break;
         }
     }
 
-    if ( argc==optind )
-        return usage(stdout, format, EXIT_SUCCESS);
+    if ( argc==optind ) {
+        exit_status = usage(stdout, format, EXIT_SUCCESS);
+        goto exit2;
+    }
 
     if (optind+1 == argc && !region_file) {
         if (output_file && !fai_name)
@@ -341,73 +404,127 @@ int faidx_core(int argc, char *argv[], enum fai_format_options format)
             else
                 fprintf(stderr, "\n");
 
-            return EXIT_FAILURE;
+            goto exit2;
         }
+        exit_status = EXIT_SUCCESS;
+        goto exit2;
+    }
 
-        return 0;
+    if (out.gopt->nthreads > 0) {       //setup thread pool
+        if (!(pool = hts_tpool_init(out.gopt->nthreads))) {
+            fprintf(stderr, "Failed to setup thread pool\n");
+        }
     }
 
-    faidx_t *fai = fai_load3_format(argv[optind], fai_name, gzi_name, FAI_CREATE, format);
+    fai = fai_load3_format(argv[optind], fai_name, gzi_name, FAI_CREATE, format);
 
     if (!fai) {
         if (fai_name)
             fprintf(stderr, "[faidx] Could not load fai index %s", fai_name);
         else
-            fprintf(stderr, "[faidx] Could not build fai index %s.fai", argv[optind]);
+            fprintf(stderr, "[faidx] Could not load fai index %s.fai", argv[optind]);
 
         if (gzi_name)
             fprintf(stderr, " or compressed index %s\n", gzi_name);
         else
             fprintf(stderr, "\n");
 
-        return EXIT_FAILURE;
+        goto exit2;
+    }
+
+    if (pool) {                         //use thread pool if set
+        if (fai_thread_pool(fai, pool, 0)) {
+            fprintf(stderr, "Failed to set thread pool for reading\n");
+        }
     }
 
     /** output file provided by user */
     if( output_file != NULL ) {
         if( strcmp( output_file, argv[optind] ) == 0 ) {
             fprintf(stderr,"[faidx] Same input/output : %s\n", output_file);
-            return EXIT_FAILURE;
+            goto exit2;
+        }
+        if (!out.isbgzip) {
+            out.fp = fopen( output_file, "w" );
+        } else {
+            hts_opt *opts = (hts_opt *)(out.gopt->out.specific);
+            char mode[13] = "w";
+            int level = 4;                                      //default compression level
+            while (opts) {
+                if (opts->opt == HTS_OPT_COMPRESSION_LEVEL) {   //compression level
+                    level = opts->val.i;
+                    break;
+                }
+                opts = opts->next;
+            }
+            if (level >= 0) {
+                snprintf(mode, sizeof(mode), "w%d", level);     //pass compression with mode
+            }
+            out.bgzf_fp = bgzf_open(output_file, mode);
         }
 
-        file_out = fopen( output_file, "w" );
-
-        if( file_out == NULL) {
+        if( (!out.isbgzip && out.fp == NULL) || (out.isbgzip && out.bgzf_fp == NULL)) {
             fprintf(stderr,"[faidx] Cannot open \"%s\" for writing :%s.\n", output_file, strerror(errno) );
-            return EXIT_FAILURE;
+            goto exit2;
+        }
+        if (out.isbgzip && pool) {                              //use thread pool if set
+            if (bgzf_thread_pool(out.bgzf_fp, pool, 0)) {
+                fprintf(stderr, "Failed to set thread pool for writing\n");
+            }
         }
     }
 
-    int exit_status = EXIT_SUCCESS;
-
     if (region_file) {
         hFILE *rf;
 
         if ((rf = hopen(region_file, "r"))) {
-            exit_status = read_regions_from_file(fai, rf, file_out, ignore_error, line_len, rev, pos_strand_name, neg_strand_name, format);
+            exit_status = read_regions_from_file(fai, rf, &out, ignore_error, line_len, rev, pos_strand_name, neg_strand_name, format);
 
             if (hclose(rf) != 0) {
                 fprintf(stderr, "[faidx] Warning: failed to close %s", region_file);
             }
+            if (exit_status == EXIT_FAILURE) {
+                goto exit1;
+            }
         } else {
             fprintf(stderr, "[faidx] Failed to open \"%s\" for reading.\n", region_file);
-            exit_status = EXIT_FAILURE;
+            goto exit1;
         }
     }
 
+    exit_status = EXIT_SUCCESS;
     while ( ++optind<argc && exit_status == EXIT_SUCCESS) {
-        exit_status = write_output(fai, file_out, argv[optind], ignore_error, line_len, rev, pos_strand_name, neg_strand_name, format);
+        exit_status = write_output(fai, &out, argv[optind], ignore_error, line_len, rev, pos_strand_name, neg_strand_name, format);
     }
 
-    fai_destroy(fai);
-
-    if (fflush(file_out) == EOF) {
-        print_error_errno("faidx", "failed to flush output");
+    flushed = out.isbgzip ? bgzf_flush(out.bgzf_fp) : fflush(out.fp);
+    if (flushed == EOF) {
+        print_error_errno("faidx", "Failed to flush output\n");
         exit_status = EXIT_FAILURE;
     }
 
-    if( output_file != NULL) fclose(file_out);
-    free(strand_names);
+exit1:
+    if( output_file != NULL && !out.isbgzip) {
+        fclose(out.fp);     //no need to check result as already flushed
+    } else if( output_file != NULL && out.isbgzip) {
+        if (bgzf_close(out.bgzf_fp) < 0) {
+            print_error_errno("faidx", "Failed to close output\n");
+            exit_status = EXIT_FAILURE;
+        }
+    }
+
+exit2:
+    if (strand_names) {
+        free(strand_names);
+    }
+    if (fai) {
+        fai_destroy(fai);
+    }
+    if (pool) {
+        hts_tpool_destroy(pool);
+    }
+    sam_global_args_free(&ga);
+    ks_free(&out.buffer);
 
     return exit_status;
 }
diff --git a/samtools/faidx.c.pysam.c b/samtools/faidx.c.pysam.c
index 6160661ea..f40209b41 100644
--- a/samtools/faidx.c.pysam.c
+++ b/samtools/faidx.c.pysam.c
@@ -2,7 +2,7 @@
 
 /*  faidx.c -- faidx subcommand.
 
-    Copyright (C) 2008, 2009, 2013, 2016, 2018-2020, 2022 Genome Research Ltd.
+    Copyright (C) 2008, 2009, 2013, 2016, 2018-2020, 2022, 2024 Genome Research Ltd.
     Portions copyright (C) 2011 Broad Institute.
 
     Author: Heng Li <lh3@sanger.ac.uk>
@@ -46,6 +46,9 @@ DEALINGS IN THE SOFTWARE.
 #include <htslib/hts.h>
 #include <htslib/hfile.h>
 #include <htslib/kstring.h>
+#include <htslib/bgzf.h>
+#include <htslib/thread_pool.h>
+#include "sam_opts.h"
 #include "samtools.h"
 
 // Negative indicates the same as input data
@@ -55,6 +58,15 @@ DEALINGS IN THE SOFTWARE.
 #   define ABS(x) ((x)>=0?(x):-(x))
 #endif
 
+//new params required for output creation
+typedef struct output {
+    int isbgzip;                //is bgzip or uncompressed file
+    FILE *fp;                   //uncompressed file pointer
+    BGZF *bgzf_fp;              //bgzf file pointer
+    sam_global_args *gopt;      //options
+    kstring_t buffer;
+} output;
+
 static unsigned char comp_base[256] = {
   0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,  15,
  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31,
@@ -100,8 +112,19 @@ static void reverse(char *str, const hts_pos_t len) {
     }
 }
 
+/// wrappedwrite  - wraps the fwrite and bgzf_write
+/** @param out    - pointer to data required to write output
+*   @param buffer - data to write
+*   @param length - data length
+* returns error or length written on success
+*/
+static inline size_t wrappedwrite(output *out, const char *buffer, size_t length)
+{
+    return out->isbgzip ? bgzf_write(out->bgzf_fp, buffer, length) :
+        fwrite(buffer, 1, length, out->fp);
+}
 
-static int write_line(faidx_t *faid, FILE *file, const char *line, const char *name,
+static int write_line(faidx_t *faid, output *out, const char *line, const char *name,
                       const int ignore, const hts_pos_t length, const hts_pos_t seq_len) {
     int id;
     hts_pos_t beg, end;
@@ -126,8 +149,8 @@ static int write_line(faidx_t *faid, FILE *file, const char *line, const char *n
     for (i = 0; i < seq_sz; i += length)
     {
         hts_pos_t len = i + length < seq_sz ? length : seq_sz - i;
-        if (fwrite(line + i, 1, len, file) < len ||
-            fputc('\n', file) == EOF) {
+        if (wrappedwrite(out, line + i, len) < len ||
+              wrappedwrite(out, "\n", 1) < 1) {
             print_error_errno("faidx", "failed to write output");
             return EXIT_FAILURE;
         }
@@ -137,58 +160,66 @@ static int write_line(faidx_t *faid, FILE *file, const char *line, const char *n
 }
 
 
-static int write_output(faidx_t *faid, FILE *file, const char *name, const int ignore,
+static int write_output(faidx_t *faid, output *out, const char *name, const int ignore,
                         const hts_pos_t length, const int rev,
                         const char *pos_strand_name, const char *neg_strand_name,
                         enum fai_format_options format) {
-    hts_pos_t seq_len, wrap_len = length;
+    hts_pos_t seq_len, wrap_len = length, len = 0;
+    char *seq =  NULL, *qual = NULL;
+    int ret = EXIT_FAILURE;
+
     if (wrap_len < 0)
         wrap_len = fai_line_length(faid, name);
     if (wrap_len <= 0)
         wrap_len = HTS_POS_MAX;
-    char *seq = fai_fetch64(faid, name, &seq_len);
-
-    if (format == FAI_FASTA) {
-        fprintf(file, ">%s%s\n", name, rev ? neg_strand_name : pos_strand_name);
-    } else {
-        fprintf(file, "@%s%s\n", name, rev ? neg_strand_name : pos_strand_name);
-    }
 
+    seq = fai_fetch64(faid, name, &seq_len);
     if (rev && seq_len > 0) {
         reverse_complement(seq, seq_len);
     }
-
-    if (write_line(faid, file, seq, name, ignore, wrap_len, seq_len)
-        == EXIT_FAILURE) {
-        free(seq);
-        return EXIT_FAILURE;
+    //write the name
+    len = ksprintf(&out->buffer, "%c%s%s\n", format == FAI_FASTA ? '>' : '@', name, rev ? neg_strand_name : pos_strand_name);
+    if (wrappedwrite(out, out->buffer.s, out->buffer.l) < len) {
+        fprintf(samtools_stderr,"[faidx] Failed to write buffer\n");
+        goto exit;
+    }
+    ks_clear(&out->buffer);
+    //write bases
+    if ((ret = write_line(faid, out, seq, name, ignore, wrap_len, seq_len) == EXIT_FAILURE)) {
+        goto exit;
     }
-
-    free(seq);
 
     if (format == FAI_FASTQ) {
-        fprintf(file, "+\n");
-
-        char *qual = fai_fetchqual64(faid, name, &seq_len);
-
+        //write quality
+        qual = fai_fetchqual64(faid, name, &seq_len);
         if (rev && seq_len > 0) {
             reverse(qual, seq_len);
         }
 
-        if (write_line(faid, file, qual, name, ignore, wrap_len, seq_len)
-            == EXIT_FAILURE) {
-            free(qual);
-            return EXIT_FAILURE;
+        len = ksprintf(&out->buffer, "+\n");
+        if (wrappedwrite(out, out->buffer.s, out->buffer.l) < len) {
+            fprintf(samtools_stderr,"[faidx] Failed to write buffer\n");
+            goto exit;
         }
+        ks_clear(&out->buffer);
+        if ((ret = write_line(faid, out, qual, name, ignore, wrap_len, seq_len) == EXIT_FAILURE)) {
+            goto exit;
+        }
+    }
+    ret = EXIT_SUCCESS;
 
+exit:
+    if (seq) {
+        free(seq);
+    }
+    if (qual) {
         free(qual);
     }
-
-    return EXIT_SUCCESS;
+    return ret;
 }
 
 
-static int read_regions_from_file(faidx_t *faid, hFILE *in_file, FILE *file, const int ignore,
+static int read_regions_from_file(faidx_t *faid, hFILE *in_file, output *out, const int ignore,
                                   const hts_pos_t length, const int rev,
                                   const char *pos_strand_name,
                                   const char *neg_strand_name,
@@ -197,7 +228,7 @@ static int read_regions_from_file(faidx_t *faid, hFILE *in_file, FILE *file, con
     int ret = EXIT_FAILURE;
 
     while (line.l = 0, kgetline(&line, (kgets_func *)hgets, in_file) >= 0) {
-        if ((ret = write_output(faid, file, line.s, ignore, length, rev, pos_strand_name, neg_strand_name, format)) == EXIT_FAILURE) {
+        if ((ret = write_output(faid, out, line.s, ignore, length, rev, pos_strand_name, neg_strand_name, format)) == EXIT_FAILURE) {
             break;
         }
     }
@@ -223,26 +254,27 @@ static int usage(FILE *fp, enum fai_format_options format, int exit_status)
 
     fprintf(fp, "Usage: samtools %s [<reg> [...]]\n", tool);
     fprintf(fp, "Option: \n"
-                " -o, --output FILE        Write %s to file.\n"
-                " -n, --length INT         Length of %s sequence line. [60]\n"
-                " -c, --continue           Continue after trying to retrieve missing region.\n"
-                " -r, --region-file FILE   File of regions.  Format is chr:from-to. One per line.\n"
-                " -i, --reverse-complement Reverse complement sequences.\n"
-                "     --mark-strand TYPE   Add strand indicator to sequence name\n"
-                "                          TYPE = rc   for /rc on negative strand (default)\n"
-                "                                 no   for no strand indicator\n"
-                "                                 sign for (+) / (-)\n"
-                "                                 custom,<pos>,<neg> for custom indicator\n"
-                "     --fai-idx      FILE  name of the index file (default %s.fai).\n"
-                "     --gzi-idx      FILE  name of compressed file index (default %s.gz.gzi).\n",
+                "  -o, --output FILE        Write %s to file.\n"
+                "  -n, --length INT         Length of %s sequence line. [60]\n"
+                "  -c, --continue           Continue after trying to retrieve missing region.\n"
+                "  -r, --region-file FILE   File of regions.  Format is chr:from-to. One per line.\n"
+                "  -i, --reverse-complement Reverse complement sequences.\n"
+                "      --mark-strand TYPE   Add strand indicator to sequence name\n"
+                "                           TYPE = rc   for /rc on negative strand (default)\n"
+                "                                  no   for no strand indicator\n"
+                "                                  sign for (+) / (-)\n"
+                "                                  custom,<pos>,<neg> for custom indicator\n"
+                "      --fai-idx      FILE  name of the index file (default %s.fai).\n"
+                "      --gzi-idx      FILE  name of compressed file index (default %s.gz.gzi).\n",
                 file_type, file_type, index_name, index_name);
 
 
     if (format == FAI_FASTA) {
-       fprintf(fp, " -f, --fastq              File and index in FASTQ format.\n");
+       fprintf(fp, "  -f, --fastq              File and index in FASTQ format.\n");
     }
 
-    fprintf(fp, " -h, --help               This message.\n");
+    fprintf(fp, "  -h, --help               This message.\n");
+    sam_global_opt_help(fp, "---.-@--");
 
     return exit_status;
 }
@@ -258,9 +290,14 @@ int faidx_core(int argc, char *argv[], enum fai_format_options format)
     char *strand_names = NULL; // Used for custom strand annotation
     char *fai_name = NULL; // specified index name
     char *gzi_name = NULL; // specified compressed index name
-    FILE* file_out = samtools_stdout;/* output stream */
+    sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
+    int exit_status = EXIT_FAILURE, flushed = 0;
+    struct output out = { 0, samtools_stdout, NULL, &ga, KS_INITIALIZE}; //data required for output writing
+    faidx_t *fai = NULL;
+    hts_tpool *pool = NULL;
 
     static const struct option lopts[] = {
+        SAM_OPT_GLOBAL_OPTIONS('-', '-', '-', 0, '-', '@'),     //output format opt and thread count - long options
         { "output", required_argument,       NULL, 'o' },
         { "help",   no_argument,             NULL, 'h' },
         { "length", required_argument,       NULL, 'n' },
@@ -274,9 +311,20 @@ int faidx_core(int argc, char *argv[], enum fai_format_options format)
         { NULL, 0, NULL, 0 }
     };
 
-    while ((c = getopt_long(argc, argv, "ho:n:cr:fi", lopts, NULL)) >= 0) {
+    while ((c = getopt_long(argc, argv, "ho:n:cr:fi@:", lopts, NULL)) >= 0) {
         switch (c) {
-            case 'o': output_file = optarg; break;
+            case 'o':
+                {
+                    output_file = optarg;
+                    char *ext = strrchr(output_file, '.');
+                    if (!ext) {
+                        break;
+                    }
+                    if (!strcmp(ext, ".gz") || !strcmp(ext, ".bgz") || !strcmp(ext, ".bgzf")) {
+                        out.isbgzip = 1;        //bgzip output
+                    }
+                    break;
+                }
             case 'n': line_len = strtol(optarg, NULL, 10);
                       if (line_len < 0) {
                         fprintf(samtools_stderr,"[faidx] bad line length '%s', using default:%d\n",optarg,ABS(DEFAULT_FASTA_LINE_LEN));
@@ -287,8 +335,14 @@ int faidx_core(int argc, char *argv[], enum fai_format_options format)
             case 'r': region_file = optarg; break;
             case 'f': format = FAI_FASTQ; break;
             case 'i': rev = 1; break;
-            case '?': return usage(samtools_stderr, format, EXIT_FAILURE);
-            case 'h': return usage(samtools_stdout, format, EXIT_SUCCESS);
+            case '?':
+                exit_status = usage(samtools_stderr, format, EXIT_FAILURE);
+                goto exit2;
+                break;
+            case 'h':
+                exit_status = usage(samtools_stdout, format, EXIT_SUCCESS);
+                goto exit2;
+                break;
             case 1000:
                 if (strcmp(optarg, "no") == 0) {
                     pos_strand_name = neg_strand_name = "";
@@ -305,7 +359,7 @@ int faidx_core(int argc, char *argv[], enum fai_format_options format)
                     strand_names = pos_strand_name = malloc(len + 2);
                     if (!strand_names) {
                         fprintf(samtools_stderr, "[faidx] Out of memory\n");
-                        return EXIT_FAILURE;
+                        goto exit2;
                     }
                     neg_strand_name = pos_strand_name + comma + 1;
                     memcpy(pos_strand_name, optarg + 7, comma);
@@ -316,17 +370,26 @@ int faidx_core(int argc, char *argv[], enum fai_format_options format)
                     neg_strand_name[len - comma] = '\0';
                 } else {
                     fprintf(samtools_stderr, "[faidx] Unknown --mark-strand option \"%s\"\n", optarg);
-                    return usage(samtools_stderr, format, EXIT_FAILURE);
+                    exit_status = usage(samtools_stderr, format, EXIT_FAILURE);
+                    goto exit2;
                 }
                 break;
             case 1001: fai_name = optarg; break;
             case 1002: gzi_name = optarg; break;
-            default:  break;
+            // handle standard samtools options like thread count, compression level...
+            default:
+                if (parse_sam_global_opt(c, optarg, lopts, &ga)) {
+                    fprintf(samtools_stderr, "[faidx] Invalid option \"%s\"\n", optarg);
+                    goto exit2;
+                }
+                break;
         }
     }
 
-    if ( argc==optind )
-        return usage(samtools_stdout, format, EXIT_SUCCESS);
+    if ( argc==optind ) {
+        exit_status = usage(samtools_stdout, format, EXIT_SUCCESS);
+        goto exit2;
+    }
 
     if (optind+1 == argc && !region_file) {
         if (output_file && !fai_name)
@@ -343,73 +406,127 @@ int faidx_core(int argc, char *argv[], enum fai_format_options format)
             else
                 fprintf(samtools_stderr, "\n");
 
-            return EXIT_FAILURE;
+            goto exit2;
         }
+        exit_status = EXIT_SUCCESS;
+        goto exit2;
+    }
 
-        return 0;
+    if (out.gopt->nthreads > 0) {       //setup thread pool
+        if (!(pool = hts_tpool_init(out.gopt->nthreads))) {
+            fprintf(samtools_stderr, "Failed to setup thread pool\n");
+        }
     }
 
-    faidx_t *fai = fai_load3_format(argv[optind], fai_name, gzi_name, FAI_CREATE, format);
+    fai = fai_load3_format(argv[optind], fai_name, gzi_name, FAI_CREATE, format);
 
     if (!fai) {
         if (fai_name)
             fprintf(samtools_stderr, "[faidx] Could not load fai index %s", fai_name);
         else
-            fprintf(samtools_stderr, "[faidx] Could not build fai index %s.fai", argv[optind]);
+            fprintf(samtools_stderr, "[faidx] Could not load fai index %s.fai", argv[optind]);
 
         if (gzi_name)
             fprintf(samtools_stderr, " or compressed index %s\n", gzi_name);
         else
             fprintf(samtools_stderr, "\n");
 
-        return EXIT_FAILURE;
+        goto exit2;
+    }
+
+    if (pool) {                         //use thread pool if set
+        if (fai_thread_pool(fai, pool, 0)) {
+            fprintf(samtools_stderr, "Failed to set thread pool for reading\n");
+        }
     }
 
     /** output file provided by user */
     if( output_file != NULL ) {
         if( strcmp( output_file, argv[optind] ) == 0 ) {
             fprintf(samtools_stderr,"[faidx] Same input/output : %s\n", output_file);
-            return EXIT_FAILURE;
+            goto exit2;
+        }
+        if (!out.isbgzip) {
+            out.fp = fopen( output_file, "w" );
+        } else {
+            hts_opt *opts = (hts_opt *)(out.gopt->out.specific);
+            char mode[13] = "w";
+            int level = 4;                                      //default compression level
+            while (opts) {
+                if (opts->opt == HTS_OPT_COMPRESSION_LEVEL) {   //compression level
+                    level = opts->val.i;
+                    break;
+                }
+                opts = opts->next;
+            }
+            if (level >= 0) {
+                snprintf(mode, sizeof(mode), "w%d", level);     //pass compression with mode
+            }
+            out.bgzf_fp = bgzf_open(output_file, mode);
         }
 
-        file_out = fopen( output_file, "w" );
-
-        if( file_out == NULL) {
+        if( (!out.isbgzip && out.fp == NULL) || (out.isbgzip && out.bgzf_fp == NULL)) {
             fprintf(samtools_stderr,"[faidx] Cannot open \"%s\" for writing :%s.\n", output_file, strerror(errno) );
-            return EXIT_FAILURE;
+            goto exit2;
+        }
+        if (out.isbgzip && pool) {                              //use thread pool if set
+            if (bgzf_thread_pool(out.bgzf_fp, pool, 0)) {
+                fprintf(samtools_stderr, "Failed to set thread pool for writing\n");
+            }
         }
     }
 
-    int exit_status = EXIT_SUCCESS;
-
     if (region_file) {
         hFILE *rf;
 
         if ((rf = hopen(region_file, "r"))) {
-            exit_status = read_regions_from_file(fai, rf, file_out, ignore_error, line_len, rev, pos_strand_name, neg_strand_name, format);
+            exit_status = read_regions_from_file(fai, rf, &out, ignore_error, line_len, rev, pos_strand_name, neg_strand_name, format);
 
             if (hclose(rf) != 0) {
                 fprintf(samtools_stderr, "[faidx] Warning: failed to close %s", region_file);
             }
+            if (exit_status == EXIT_FAILURE) {
+                goto exit1;
+            }
         } else {
             fprintf(samtools_stderr, "[faidx] Failed to open \"%s\" for reading.\n", region_file);
-            exit_status = EXIT_FAILURE;
+            goto exit1;
         }
     }
 
+    exit_status = EXIT_SUCCESS;
     while ( ++optind<argc && exit_status == EXIT_SUCCESS) {
-        exit_status = write_output(fai, file_out, argv[optind], ignore_error, line_len, rev, pos_strand_name, neg_strand_name, format);
+        exit_status = write_output(fai, &out, argv[optind], ignore_error, line_len, rev, pos_strand_name, neg_strand_name, format);
     }
 
-    fai_destroy(fai);
-
-    if (fflush(file_out) == EOF) {
-        print_error_errno("faidx", "failed to flush output");
+    flushed = out.isbgzip ? bgzf_flush(out.bgzf_fp) : fflush(out.fp);
+    if (flushed == EOF) {
+        print_error_errno("faidx", "Failed to flush output\n");
         exit_status = EXIT_FAILURE;
     }
 
-    if( output_file != NULL) fclose(file_out);
-    free(strand_names);
+exit1:
+    if( output_file != NULL && !out.isbgzip) {
+        fclose(out.fp);     //no need to check result as already flushed
+    } else if( output_file != NULL && out.isbgzip) {
+        if (bgzf_close(out.bgzf_fp) < 0) {
+            print_error_errno("faidx", "Failed to close output\n");
+            exit_status = EXIT_FAILURE;
+        }
+    }
+
+exit2:
+    if (strand_names) {
+        free(strand_names);
+    }
+    if (fai) {
+        fai_destroy(fai);
+    }
+    if (pool) {
+        hts_tpool_destroy(pool);
+    }
+    sam_global_args_free(&ga);
+    ks_free(&out.buffer);
 
     return exit_status;
 }
diff --git a/samtools/lz4/lz4.c b/samtools/lz4/lz4.c
index 41c0a28ba..35c9358b0 100644
--- a/samtools/lz4/lz4.c
+++ b/samtools/lz4/lz4.c
@@ -411,7 +411,7 @@ typedef enum { full = 0, partial = 1 } earlyEnd_directive;
 int LZ4_versionNumber (void) { return LZ4_VERSION_NUMBER; }
 const char* LZ4_versionString(void) { return LZ4_VERSION_STRING; }
 int LZ4_compressBound(int isize)  { return LZ4_COMPRESSBOUND(isize); }
-int LZ4_sizeofState() { return LZ4_STREAMSIZE; }
+int LZ4_sizeofState(void) { return LZ4_STREAMSIZE; }
 
 
 /*-******************************
@@ -1434,7 +1434,7 @@ int LZ4_uncompress_unknownOutputSize (const char* source, char* dest, int isize,
 
 /* Obsolete Streaming functions */
 
-int LZ4_sizeofStreamState() { return LZ4_STREAMSIZE; }
+int LZ4_sizeofStreamState(void) { return LZ4_STREAMSIZE; }
 
 static void LZ4_init(LZ4_stream_t* lz4ds, BYTE* base)
 {
diff --git a/samtools/lz4/lz4.c.pysam.c b/samtools/lz4/lz4.c.pysam.c
index a4a8ab43d..74897045d 100644
--- a/samtools/lz4/lz4.c.pysam.c
+++ b/samtools/lz4/lz4.c.pysam.c
@@ -413,7 +413,7 @@ typedef enum { full = 0, partial = 1 } earlyEnd_directive;
 int LZ4_versionNumber (void) { return LZ4_VERSION_NUMBER; }
 const char* LZ4_versionString(void) { return LZ4_VERSION_STRING; }
 int LZ4_compressBound(int isize)  { return LZ4_COMPRESSBOUND(isize); }
-int LZ4_sizeofState() { return LZ4_STREAMSIZE; }
+int LZ4_sizeofState(void) { return LZ4_STREAMSIZE; }
 
 
 /*-******************************
@@ -1436,7 +1436,7 @@ int LZ4_uncompress_unknownOutputSize (const char* source, char* dest, int isize,
 
 /* Obsolete Streaming functions */
 
-int LZ4_sizeofStreamState() { return LZ4_STREAMSIZE; }
+int LZ4_sizeofStreamState(void) { return LZ4_STREAMSIZE; }
 
 static void LZ4_init(LZ4_stream_t* lz4ds, BYTE* base)
 {
diff --git a/samtools/phase.c b/samtools/phase.c
index a4a735136..62a278f22 100644
--- a/samtools/phase.c
+++ b/samtools/phase.c
@@ -116,11 +116,21 @@ static void count1(int l, const uint8_t *seq, int *cnt)
 static int **count_all(int l, int vpos, nseq_t *hash)
 {
     khint_t k;
-    int i, j, **cnt;
-    uint8_t *seq;
+    int i, j, **cnt = NULL;
+    uint8_t *seq = NULL;
+    size_t cnt_sz = ((size_t)1) << l;
+    if (cnt_sz > SSIZE_MAX / sizeof(int) / vpos) {
+        errno = ENOMEM;
+        goto fail;
+    }
     seq = calloc(l, 1);
+    if (!seq) goto fail;
     cnt = calloc(vpos, sizeof(int*));
-    for (i = 0; i < vpos; ++i) cnt[i] = calloc(1<<l, sizeof(int));
+    if (!cnt) goto fail;
+    for (i = 0; i < vpos; ++i) {
+        cnt[i] = calloc(cnt_sz, sizeof(int));
+        if (!cnt[i]) goto fail;
+    }
     for (k = 0; k < kh_end(hash); ++k) {
         if (kh_exist(hash, k)) {
             frag_t *f = &kh_val(hash, k);
@@ -138,6 +148,15 @@ static int **count_all(int l, int vpos, nseq_t *hash)
     }
     free(seq);
     return cnt;
+ fail:
+    free(seq);
+    if (cnt) {
+        for (i = 0; i < vpos; i++)
+            free(cnt[i]);
+        free(cnt);
+    }
+    print_error_errno("phase", "Couldn't allocate memory for counts");
+    return NULL;
 }
 
 // phasing
@@ -413,6 +432,7 @@ static int phase(phaseg_t *g, const char *chr, int vpos, uint64_t *cns, nseq_t *
         printf("PS\t%s\t%d\t%d\n", chr, (int)(cns[0]>>32) + 1, (int)(cns[vpos-1]>>32) + 1);
         sitemask = calloc(vpos, 1);
         cnt = count_all(g->k, vpos, hash);
+        if (!cnt) return -1;
         path = dynaprog(g->k, vpos, cnt);
         for (i = 0; i < vpos; ++i) free(cnt[i]);
         free(cnt);
diff --git a/samtools/phase.c.pysam.c b/samtools/phase.c.pysam.c
index b0b525767..c239d232a 100644
--- a/samtools/phase.c.pysam.c
+++ b/samtools/phase.c.pysam.c
@@ -118,11 +118,21 @@ static void count1(int l, const uint8_t *seq, int *cnt)
 static int **count_all(int l, int vpos, nseq_t *hash)
 {
     khint_t k;
-    int i, j, **cnt;
-    uint8_t *seq;
+    int i, j, **cnt = NULL;
+    uint8_t *seq = NULL;
+    size_t cnt_sz = ((size_t)1) << l;
+    if (cnt_sz > SSIZE_MAX / sizeof(int) / vpos) {
+        errno = ENOMEM;
+        goto fail;
+    }
     seq = calloc(l, 1);
+    if (!seq) goto fail;
     cnt = calloc(vpos, sizeof(int*));
-    for (i = 0; i < vpos; ++i) cnt[i] = calloc(1<<l, sizeof(int));
+    if (!cnt) goto fail;
+    for (i = 0; i < vpos; ++i) {
+        cnt[i] = calloc(cnt_sz, sizeof(int));
+        if (!cnt[i]) goto fail;
+    }
     for (k = 0; k < kh_end(hash); ++k) {
         if (kh_exist(hash, k)) {
             frag_t *f = &kh_val(hash, k);
@@ -140,6 +150,15 @@ static int **count_all(int l, int vpos, nseq_t *hash)
     }
     free(seq);
     return cnt;
+ fail:
+    free(seq);
+    if (cnt) {
+        for (i = 0; i < vpos; i++)
+            free(cnt[i]);
+        free(cnt);
+    }
+    print_error_errno("phase", "Couldn't allocate memory for counts");
+    return NULL;
 }
 
 // phasing
@@ -415,6 +434,7 @@ static int phase(phaseg_t *g, const char *chr, int vpos, uint64_t *cns, nseq_t *
         fprintf(samtools_stdout, "PS\t%s\t%d\t%d\n", chr, (int)(cns[0]>>32) + 1, (int)(cns[vpos-1]>>32) + 1);
         sitemask = calloc(vpos, 1);
         cnt = count_all(g->k, vpos, hash);
+        if (!cnt) return -1;
         path = dynaprog(g->k, vpos, cnt);
         for (i = 0; i < vpos; ++i) free(cnt[i]);
         free(cnt);
diff --git a/samtools/reset.c b/samtools/reset.c
index 4e522cddf..8f76a564e 100644
--- a/samtools/reset.c
+++ b/samtools/reset.c
@@ -1,7 +1,7 @@
 /*  reset.c --  removes aligner updates and reference data from input sam /
                 bam / cram file and makes read data raw for new processing
 
-    Copyright (C) 2022, 2023 Genome Research Ltd.
+    Copyright (C) 2022 - 2024 Genome Research Ltd.
 
     Author: Vasudeva Sarma <vasudeva.sarma@sanger.ac.uk>
 
@@ -38,10 +38,13 @@ DEALINGS IN THE SOFTWARE
 #define TAGNUM(X) (((X)[0] << 8) | (X)[1])  //to create key for aux tags, like type key in htslib
 #define LONG_OPT(X) (128 + (X))             //to handle long and short options with same char
 
+#define RESET_KEEPDUPFLAG   1               //keep dup flag as such, as in initial implementation
+
 typedef struct conf_data
 {
     int keepRGs;                    //RG line handling
     int noPGentry;                  //PG line for reset op or not
+    int ctrlFlags;                  //control flags
     auxhash_t aux_keep;             //SET that holds the aux tags to be retained
     auxhash_t aux_remove;           //SET that holds the aux tags to be removed
     char *pgid;                     //PG id onwards which to be removed
@@ -62,49 +65,74 @@ static void usage(FILE *fp)
       --reject-PG ID\n\
                Removes PG line with ID matching to input and succeeding PG lines\n\
       --no-RG  To have RG lines or not\n\
-      --no-PG  To have PG entry or not for reset operation\n");
+      --no-PG  To have PG entry or not for reset operation\n\
+      --dupflag\n\
+               Keeps the duplicate flag as it is\n");
 
     sam_global_opt_help(fp, "--O--@--");
     return;
 }
 
-/// removeauxtags - remove aux tags in bam data which are not present in acceptable tag set
-/** @param bamdata - pointer to the bamdata from which needs the filtering
- *  @param config - pointer to conf_data
+/// update_aux_conf - update the user given aux tag configuration with defaults
+/** @param config - pointer to conf_data
 returns nothing
 */
-void removeauxtags(bam1_t *bamdata, conf_data *config)
+void update_aux_conf(conf_data *config)
 {
-    uint8_t *auxdata = NULL;
-    const char *tag = NULL, rg[] = "RG";
+    const char rg[] = "RG";
+    const char *default_tags[] = {"AS", "CC", "CG", "CP", "H1", "H2", "HI", "H0", "IH",
+                                    "MC", "MD", "MQ", "NM", "SA", "TS"};
     khint_t iter = 0;
-    int ret = 0;
+    int ret = 0, i = 0;
 
-    if (!bamdata || !config || (!config->aux_keep && !config->aux_remove && config->keepRGs))
+    if (!config)
         return;
 
-    //remove RG tags from bamdata if keepRG is false
-    if (!config->keepRGs) {
-        if (!config->aux_keep && !config->aux_remove) {
-            //none of aux tag filter in use, create remove filter
-            config->aux_remove = kh_init(aux_exists);
-        }
-
-        if (config->aux_keep) {
-            //keep set in use, remove RG if present
+    if (!config->aux_keep && !config->aux_remove) {
+        //none of aux tag filter in use, create remove filter
+        config->aux_remove = kh_init(aux_exists);
+    }
+    if (config->aux_keep) {
+        //keep set in use, remove RG if present
+        if (!config->keepRGs) {
             iter = kh_get(aux_exists, config->aux_keep, TAGNUM(rg));
             if (iter != kh_end(config->aux_keep)) {
                 kh_del(aux_exists, config->aux_keep, iter);
             }
         }
-        if (config->aux_remove) {
+    }
+    if (config->aux_remove) {
+        if (!config->keepRGs) {
             //remove set in use, add RG if not present
             iter = kh_get(aux_exists, config->aux_remove, TAGNUM(rg));
             if (iter == kh_end(config->aux_remove)) {
                 kh_put(aux_exists, config->aux_remove, TAGNUM(rg), &ret);
             }
         }
+        //add the default tags if not present in remove set
+        //note, keep has priority and this may not be honoured
+        for (i = 0; i < sizeof(default_tags) / sizeof(default_tags[0]); ++i) {
+            iter = kh_get(aux_exists, config->aux_remove, TAGNUM(default_tags[i]));
+            if (iter == kh_end(config->aux_remove)) {
+                kh_put(aux_exists, config->aux_remove, TAGNUM(default_tags[i]), &ret);
+            }
+        }
     }
+}
+
+/// removeauxtags - remove aux tags in bam data which are not present in acceptable tag set
+/** @param bamdata - pointer to the bamdata from which needs the filtering
+ *  @param config - pointer to conf_data
+returns nothing
+*/
+void removeauxtags(bam1_t *bamdata, conf_data *config)
+{
+    uint8_t *auxdata = NULL;
+    const char *tag = NULL;
+    khint_t iter = 0;
+
+    if (!bamdata || !config)
+        return;
 
     for (auxdata = bam_aux_first(bamdata); auxdata; ) {
         tag = bam_aux_tag(auxdata);
@@ -322,7 +350,6 @@ int reset(samFile *infile, samFile *outfile, conf_data *config, char *args)
         if (bamdata->core.flag & BAM_FSECONDARY || bamdata->core.flag & BAM_FSUPPLEMENTARY) {
             continue;
         }
-
         //update flags
         uint16_t flags = bamdata->core.flag & ~BAM_FPROPER_PAIR;    //reset pair info
         flags |= BAM_FUNMAP;                                        //mark as unmapped
@@ -330,6 +357,9 @@ int reset(samFile *infile, samFile *outfile, conf_data *config, char *args)
             flags |= BAM_FMUNMAP;                                   //mark mate as unmapped, if it was a pair
         }
         flags &= ~BAM_FMREVERSE;                                    //reset mate orientation
+        if (!(config->ctrlFlags & RESET_KEEPDUPFLAG)) {
+            flags &= ~BAM_FDUP;                                     //reset dup flag from alignment
+        }
 
         if (0 > ks_resize(&querydata, bamdata->core.l_qseq) ||
             0 > ks_resize(&qualdata, bamdata->core.l_qseq)) {
@@ -439,6 +469,7 @@ int main_reset(int argc, char *argv[])
         //reject PG lines from input, default is to keep them (i.e. option not given); without optional filename, all PGs removed and those given in file are filtered when optional filename is given
         {"reject-PG", required_argument, NULL, 'p'},                //reject entries from this PG onwards
         {"no-PG", no_argument, NULL, 2},                            //do not add PG entry for reset operation, default is to add it
+        {"dupflag", no_argument, NULL, 3},                          //keep the dup flag as it is - as in initial reset implementation
         {NULL, 0, NULL, 0}
     };
     samFile *infile = NULL, *outfile = NULL;
@@ -447,8 +478,7 @@ int main_reset(int argc, char *argv[])
     const char *inname = NULL, *outname = NULL;
     int c = 0, ret = EXIT_FAILURE;
     char outmode[4] = "w", *args = NULL;
-    conf_data resetconf = {1, 0, NULL, NULL, NULL};                //keep RGs and PGs by default
-
+    conf_data resetconf = {1, 0, 0, NULL, NULL, NULL};              //keep RGs and PGs by default, ctrlflags = 0
 
     //samtools reset -o outfile -x/--remove-tag ... --keep-tag ... --threads=n --output-fmt=fmt --no-RG --reject-PG pgid --no-PG [<infile>]
     while ((c = getopt_long(argc, argv, "o:@:x:O:", lopts, NULL)) >= 0)
@@ -469,6 +499,9 @@ int main_reset(int argc, char *argv[])
             }
             resetconf.noPGentry = 1;
             break;
+        case 3:                             //keep dup flag as it is / no reset
+            resetconf.ctrlFlags |= RESET_KEEPDUPFLAG;
+            break;
         case 'p':                           //--reject-PG=<id>
             if (resetconf.pgid) {
                 usage(stderr);              //already given!
@@ -498,7 +531,7 @@ int main_reset(int argc, char *argv[])
                 }
             }
             break;
-        case LONG_OPT('x'):                  //keep aux tags
+        case LONG_OPT('x'):                 //keep aux tags
             if (parse_aux_list(&resetconf.aux_keep, optarg, "main_reset")) {
                 usage(stderr);
                 goto exit;
@@ -542,6 +575,8 @@ int main_reset(int argc, char *argv[])
         inname = "-";
     }
 
+    //update aux tag configuration
+    update_aux_conf(&resetconf);
     //set output file format based on name
     sam_open_mode(outmode + 1, outname, NULL);
 
diff --git a/samtools/reset.c.pysam.c b/samtools/reset.c.pysam.c
index c98946f02..db1dba7da 100644
--- a/samtools/reset.c.pysam.c
+++ b/samtools/reset.c.pysam.c
@@ -3,7 +3,7 @@
 /*  reset.c --  removes aligner updates and reference data from input sam /
                 bam / cram file and makes read data raw for new processing
 
-    Copyright (C) 2022, 2023 Genome Research Ltd.
+    Copyright (C) 2022 - 2024 Genome Research Ltd.
 
     Author: Vasudeva Sarma <vasudeva.sarma@sanger.ac.uk>
 
@@ -40,10 +40,13 @@ DEALINGS IN THE SOFTWARE
 #define TAGNUM(X) (((X)[0] << 8) | (X)[1])  //to create key for aux tags, like type key in htslib
 #define LONG_OPT(X) (128 + (X))             //to handle long and short options with same char
 
+#define RESET_KEEPDUPFLAG   1               //keep dup flag as such, as in initial implementation
+
 typedef struct conf_data
 {
     int keepRGs;                    //RG line handling
     int noPGentry;                  //PG line for reset op or not
+    int ctrlFlags;                  //control flags
     auxhash_t aux_keep;             //SET that holds the aux tags to be retained
     auxhash_t aux_remove;           //SET that holds the aux tags to be removed
     char *pgid;                     //PG id onwards which to be removed
@@ -64,49 +67,74 @@ static void usage(FILE *fp)
       --reject-PG ID\n\
                Removes PG line with ID matching to input and succeeding PG lines\n\
       --no-RG  To have RG lines or not\n\
-      --no-PG  To have PG entry or not for reset operation\n");
+      --no-PG  To have PG entry or not for reset operation\n\
+      --dupflag\n\
+               Keeps the duplicate flag as it is\n");
 
     sam_global_opt_help(fp, "--O--@--");
     return;
 }
 
-/// removeauxtags - remove aux tags in bam data which are not present in acceptable tag set
-/** @param bamdata - pointer to the bamdata from which needs the filtering
- *  @param config - pointer to conf_data
+/// update_aux_conf - update the user given aux tag configuration with defaults
+/** @param config - pointer to conf_data
 returns nothing
 */
-void removeauxtags(bam1_t *bamdata, conf_data *config)
+void update_aux_conf(conf_data *config)
 {
-    uint8_t *auxdata = NULL;
-    const char *tag = NULL, rg[] = "RG";
+    const char rg[] = "RG";
+    const char *default_tags[] = {"AS", "CC", "CG", "CP", "H1", "H2", "HI", "H0", "IH",
+                                    "MC", "MD", "MQ", "NM", "SA", "TS"};
     khint_t iter = 0;
-    int ret = 0;
+    int ret = 0, i = 0;
 
-    if (!bamdata || !config || (!config->aux_keep && !config->aux_remove && config->keepRGs))
+    if (!config)
         return;
 
-    //remove RG tags from bamdata if keepRG is false
-    if (!config->keepRGs) {
-        if (!config->aux_keep && !config->aux_remove) {
-            //none of aux tag filter in use, create remove filter
-            config->aux_remove = kh_init(aux_exists);
-        }
-
-        if (config->aux_keep) {
-            //keep set in use, remove RG if present
+    if (!config->aux_keep && !config->aux_remove) {
+        //none of aux tag filter in use, create remove filter
+        config->aux_remove = kh_init(aux_exists);
+    }
+    if (config->aux_keep) {
+        //keep set in use, remove RG if present
+        if (!config->keepRGs) {
             iter = kh_get(aux_exists, config->aux_keep, TAGNUM(rg));
             if (iter != kh_end(config->aux_keep)) {
                 kh_del(aux_exists, config->aux_keep, iter);
             }
         }
-        if (config->aux_remove) {
+    }
+    if (config->aux_remove) {
+        if (!config->keepRGs) {
             //remove set in use, add RG if not present
             iter = kh_get(aux_exists, config->aux_remove, TAGNUM(rg));
             if (iter == kh_end(config->aux_remove)) {
                 kh_put(aux_exists, config->aux_remove, TAGNUM(rg), &ret);
             }
         }
+        //add the default tags if not present in remove set
+        //note, keep has priority and this may not be honoured
+        for (i = 0; i < sizeof(default_tags) / sizeof(default_tags[0]); ++i) {
+            iter = kh_get(aux_exists, config->aux_remove, TAGNUM(default_tags[i]));
+            if (iter == kh_end(config->aux_remove)) {
+                kh_put(aux_exists, config->aux_remove, TAGNUM(default_tags[i]), &ret);
+            }
+        }
     }
+}
+
+/// removeauxtags - remove aux tags in bam data which are not present in acceptable tag set
+/** @param bamdata - pointer to the bamdata from which needs the filtering
+ *  @param config - pointer to conf_data
+returns nothing
+*/
+void removeauxtags(bam1_t *bamdata, conf_data *config)
+{
+    uint8_t *auxdata = NULL;
+    const char *tag = NULL;
+    khint_t iter = 0;
+
+    if (!bamdata || !config)
+        return;
 
     for (auxdata = bam_aux_first(bamdata); auxdata; ) {
         tag = bam_aux_tag(auxdata);
@@ -324,7 +352,6 @@ int reset(samFile *infile, samFile *outfile, conf_data *config, char *args)
         if (bamdata->core.flag & BAM_FSECONDARY || bamdata->core.flag & BAM_FSUPPLEMENTARY) {
             continue;
         }
-
         //update flags
         uint16_t flags = bamdata->core.flag & ~BAM_FPROPER_PAIR;    //reset pair info
         flags |= BAM_FUNMAP;                                        //mark as unmapped
@@ -332,6 +359,9 @@ int reset(samFile *infile, samFile *outfile, conf_data *config, char *args)
             flags |= BAM_FMUNMAP;                                   //mark mate as unmapped, if it was a pair
         }
         flags &= ~BAM_FMREVERSE;                                    //reset mate orientation
+        if (!(config->ctrlFlags & RESET_KEEPDUPFLAG)) {
+            flags &= ~BAM_FDUP;                                     //reset dup flag from alignment
+        }
 
         if (0 > ks_resize(&querydata, bamdata->core.l_qseq) ||
             0 > ks_resize(&qualdata, bamdata->core.l_qseq)) {
@@ -441,6 +471,7 @@ int main_reset(int argc, char *argv[])
         //reject PG lines from input, default is to keep them (i.e. option not given); without optional filename, all PGs removed and those given in file are filtered when optional filename is given
         {"reject-PG", required_argument, NULL, 'p'},                //reject entries from this PG onwards
         {"no-PG", no_argument, NULL, 2},                            //do not add PG entry for reset operation, default is to add it
+        {"dupflag", no_argument, NULL, 3},                          //keep the dup flag as it is - as in initial reset implementation
         {NULL, 0, NULL, 0}
     };
     samFile *infile = NULL, *outfile = NULL;
@@ -449,8 +480,7 @@ int main_reset(int argc, char *argv[])
     const char *inname = NULL, *outname = NULL;
     int c = 0, ret = EXIT_FAILURE;
     char outmode[4] = "w", *args = NULL;
-    conf_data resetconf = {1, 0, NULL, NULL, NULL};                //keep RGs and PGs by default
-
+    conf_data resetconf = {1, 0, 0, NULL, NULL, NULL};              //keep RGs and PGs by default, ctrlflags = 0
 
     //samtools reset -o outfile -x/--remove-tag ... --keep-tag ... --threads=n --output-fmt=fmt --no-RG --reject-PG pgid --no-PG [<infile>]
     while ((c = getopt_long(argc, argv, "o:@:x:O:", lopts, NULL)) >= 0)
@@ -471,6 +501,9 @@ int main_reset(int argc, char *argv[])
             }
             resetconf.noPGentry = 1;
             break;
+        case 3:                             //keep dup flag as it is / no reset
+            resetconf.ctrlFlags |= RESET_KEEPDUPFLAG;
+            break;
         case 'p':                           //--reject-PG=<id>
             if (resetconf.pgid) {
                 usage(samtools_stderr);              //already given!
@@ -500,7 +533,7 @@ int main_reset(int argc, char *argv[])
                 }
             }
             break;
-        case LONG_OPT('x'):                  //keep aux tags
+        case LONG_OPT('x'):                 //keep aux tags
             if (parse_aux_list(&resetconf.aux_keep, optarg, "main_reset")) {
                 usage(samtools_stderr);
                 goto exit;
@@ -544,6 +577,8 @@ int main_reset(int argc, char *argv[])
         inname = "-";
     }
 
+    //update aux tag configuration
+    update_aux_conf(&resetconf);
     //set output file format based on name
     sam_open_mode(outmode + 1, outname, NULL);
 
diff --git a/samtools/sam_utils.c b/samtools/sam_utils.c
index d7178b2e5..d12a1e656 100644
--- a/samtools/sam_utils.c
+++ b/samtools/sam_utils.c
@@ -42,7 +42,8 @@ void release_autoflush(htsFile *fp) {
     if (samtools_stdout == fp) samtools_stdout = NULL;
 }
 
-static void vprint_error_core(const char *subcommand, const char *format, va_list args, const char *extra)
+static void HTS_FORMAT(HTS_PRINTF_FMT, 2, 0)
+vprint_error_core(const char *subcommand, const char *format, va_list args, const char *extra)
 {
     fflush(stdout);
     if (samtools_stdout) hts_flush(samtools_stdout);
diff --git a/samtools/sam_utils.c.pysam.c b/samtools/sam_utils.c.pysam.c
index 304dd3827..a32921993 100644
--- a/samtools/sam_utils.c.pysam.c
+++ b/samtools/sam_utils.c.pysam.c
@@ -44,7 +44,8 @@ void release_autoflush(htsFile *fp) {
     if (samtools_stdout_internal == fp) samtools_stdout_internal = NULL;
 }
 
-static void vprint_error_core(const char *subcommand, const char *format, va_list args, const char *extra)
+static void HTS_FORMAT(HTS_PRINTF_FMT, 2, 0)
+vprint_error_core(const char *subcommand, const char *format, va_list args, const char *extra)
 {
     fflush(samtools_stdout);
     if (samtools_stdout_internal) hts_flush(samtools_stdout_internal);
diff --git a/samtools/sam_view.c b/samtools/sam_view.c
index aa5b92310..6afd653f2 100644
--- a/samtools/sam_view.c
+++ b/samtools/sam_view.c
@@ -1,6 +1,6 @@
 /*  sam_view.c -- SAM<->BAM<->CRAM conversion.
 
-    Copyright (C) 2009-2023 Genome Research Ltd.
+    Copyright (C) 2009-2024 Genome Research Ltd.
     Portions copyright (C) 2009, 2011, 2012 Broad Institute.
 
     Author: Heng Li <lh3@sanger.ac.uk>
@@ -54,6 +54,8 @@ typedef struct samview_settings {
     strhash_t rnhash;
     strhash_t tvhash;
     int min_mapQ;
+    int rghash_discard; // 0 keep, 1 discard
+    int rnhash_discard; // 0 keep, 1 discard
 
     // Described here in the same terms as the usage statement.
     // The code however always negates to "reject if"         keep if:
@@ -176,7 +178,8 @@ static int process_aln(const sam_hdr_t *h, bam1_t *b, samview_settings_t* settin
         uint8_t *s = bam_aux_get(b, "RG");
         if (s) {
             khint_t k = kh_get(str, settings->rghash, (char*)(s + 1));
-            if (k == kh_end(settings->rghash)) return 1;
+            if ((k == kh_end(settings->rghash)) != settings->rghash_discard)
+                return 1;
         }
     }
     if (settings->tag) {
@@ -204,9 +207,11 @@ static int process_aln(const sam_hdr_t *h, bam1_t *b, samview_settings_t* settin
     }
     if (settings->rnhash) {
         const char* rn = bam_get_qname(b);
-        if (!rn || kh_get(str, settings->rnhash, rn) == kh_end(settings->rnhash)) {
+        strhash_t h = settings->rnhash;
+        if (!rn && !settings->rnhash_discard)
+            return 1;
+        if ((kh_get(str, h, rn) == kh_end(h)) != settings->rnhash_discard)
             return 1;
-        }
     }
     if (settings->library) {
         const char *p = bam_get_library((sam_hdr_t*)h, b);
@@ -305,7 +310,12 @@ static int add_read_group_single(const char *subcmd, samview_settings_t *setting
     if (settings->rghash == NULL) {
         settings->rghash = kh_init(str);
         if (settings->rghash == NULL) goto err;
+    } else if (settings->rghash_discard == 1) {
+        print_error("view", "cannot mix include and exclude read-group files in the same command line");
+        free(d);
+        return -1;
     }
+    settings->rghash_discard = 0;
 
     kh_put(str, settings->rghash, d, &ret);
     if (ret == -1) goto err;
@@ -326,8 +336,14 @@ static int add_read_names_file(const char *subcmd, samview_settings_t *settings,
             perror(NULL);
             return -1;
         }
+    } else if ((settings->rnhash_discard == 0 && *fn == '^') ||
+        (settings->rnhash_discard == 1 && *fn != '^')) {
+        print_error("view", "cannot mix include and exclude read-name files in the same command line");
+        return -1;
     }
-    return populate_lookup_from_file(subcmd, settings->rnhash, fn);
+    settings->rnhash_discard = (*fn == '^');
+    return populate_lookup_from_file(subcmd, settings->rnhash,
+                                     fn + (*fn == '^'));
 }
 
 static int add_read_groups_file(const char *subcmd, samview_settings_t *settings, char *fn)
@@ -338,8 +354,14 @@ static int add_read_groups_file(const char *subcmd, samview_settings_t *settings
             perror(NULL);
             return -1;
         }
+    } else if ((settings->rghash_discard == 0 && *fn == '^') ||
+        (settings->rghash_discard == 1 && *fn != '^')) {
+        print_error("view", "cannot mix include and exclude read-group files in the same command line");
+        return -1;
     }
-    return populate_lookup_from_file(subcmd, settings->rghash, fn);
+    settings->rghash_discard = (*fn == '^');
+    return populate_lookup_from_file(subcmd, settings->rghash,
+                                     fn + (*fn == '^'));
 }
 
 static int add_tag_value_single(const char *subcmd, samview_settings_t *settings, char *name)
@@ -533,12 +555,20 @@ hts_itr_multi_t *multi_region_init(samview_settings_t *conf, char **regs, int nr
     int filter_state = ALL;
     if ( nregs ) {
         int filter_op = 0;
-        conf->bed = bed_hash_regions(conf->bed, regs, 0, nregs, &filter_op); // insert(1) or filter out(0) the regions from the command line in the same hash table as the bed file
+        void *bed = bed_hash_regions(conf->bed, regs, 0, nregs, &filter_op); // insert(1) or filter out(0) the regions from the command line in the same hash table as the bed file
+        if (!bed) {
+            print_error_errno("view", "Couldn't %s region list",
+                              filter_op ? "build" : "filter");
+            return NULL;
+        }
+        conf->bed = bed;
         if ( !filter_op )
             filter_state = FILTERED;
     }
     else
         bed_unify(conf->bed);
+
+    // This check is probably redundant, but left just in case
     if ( !conf->bed) { // index is unavailable or no regions have been specified
         print_error("view", "No regions or BED file have been provided. Aborting.");
         return NULL;
@@ -609,8 +639,9 @@ static int fetch_pairs_collect_mates(samview_settings_t *conf, hts_itr_multi_t *
             }
         }
 
-        if ( rec->core.mtid < 0 || (rec->core.flag & BAM_FMUNMAP) ) nunmap = 1;
-        if ( rec->core.mtid >= 0 ) {
+        if ( rec->core.mtid < 0 ) {
+            nunmap = 1;
+        } else {
             if (_reglist_push(&conf->reglist, &conf->nreglist, rec->core.mtid, rec->core.mpos,rec->core.mpos+1) != 0)
                 goto out;
         }
@@ -755,13 +786,13 @@ static int multi_region_view(samview_settings_t *conf, hts_itr_multi_t *iter)
     while ((result = sam_itr_multi_next(conf->in, iter, b)) >= 0) {
         if (process_one_record(conf, b, &write_error) < 0) break;
     }
-    hts_itr_multi_destroy(iter);
     bam_destroy1(b);
 
     if (result < -1) {
         print_error("view", "retrieval of region #%d failed", iter->curr_tid);
-        return 1;
+        write_error = 1;
     }
+    hts_itr_multi_destroy(iter);
     return write_error;
 }
 
@@ -853,6 +884,7 @@ int main_samview(int argc, char *argv[])
         {"use-index", no_argument, NULL, 'M'},
         {"with-header", no_argument, NULL, 'h'},
         {"sanitize", required_argument, NULL, 'z'},
+        {NULL, 0, NULL, 0}
     };
 
     /* parse command-line options */
@@ -868,6 +900,8 @@ int main_samview(int argc, char *argv[])
     opterr = 0;
 
     char *tmp;
+    int tmp_flag;
+
     while ((c = getopt_long(argc, argv,
                             "SbBcCt:h1Ho:O:q:f:F:G:ul:r:T:R:N:d:D:L:s:@:m:x:U:MXe:pPz:",
                             lopts, NULL)) >= 0) {
@@ -912,19 +946,47 @@ int main_samview(int argc, char *argv[])
         case 'U': settings.fn_un_out = strdup(optarg); break;
         case 'X': has_index_file = 1; break;
         case 'f':
-            settings.flag_on |= bam_str2flag(optarg);
+            tmp_flag = bam_str2flag(optarg);
+
+            if (tmp_flag < 0) {
+                print_error("view", "Unknown flag '%s'", optarg);
+                return 1;
+            }
+
+            settings.flag_on |= tmp_flag;
             settings.count_rf |= SAM_FLAG | SAM_RNEXT;
             break;
         case 'F':
-            settings.flag_off |= bam_str2flag(optarg);
+            tmp_flag = bam_str2flag(optarg);
+
+            if (tmp_flag < 0) {
+                print_error("view", "Unknown flag '%s'", optarg);
+                return 1;
+            }
+
+            settings.flag_off |= tmp_flag;
             settings.count_rf |= SAM_FLAG | SAM_RNEXT;
             break;
         case LONGOPT('g'):
-            settings.flag_anyon |= bam_str2flag(optarg);
+            tmp_flag = bam_str2flag(optarg);
+
+            if (tmp_flag < 0) {
+                print_error("view", "Unknown flag '%s'", optarg);
+                return 1;
+            }
+
+            settings.flag_anyon |= tmp_flag;
             settings.count_rf |= SAM_FLAG | SAM_RNEXT;
             break;
         case 'G':
-            settings.flag_alloff |= bam_str2flag(optarg);
+            tmp_flag = bam_str2flag(optarg);
+
+            if (tmp_flag < 0) {
+                print_error("view", "Unknown flag '%s'", optarg);
+                return 1;
+            }
+
+            settings.flag_alloff |= tmp_flag;
             settings.count_rf |= SAM_FLAG | SAM_RNEXT;
             break;
         case 'q':
@@ -1084,8 +1146,27 @@ int main_samview(int argc, char *argv[])
             }
             settings.count_rf = INT_MAX; // no way to know what we need
             break;
-        case LONGOPT('r'): settings.remove_flag |= bam_str2flag(optarg); break;
-        case LONGOPT('a'): settings.add_flag |= bam_str2flag(optarg); break;
+        case LONGOPT('r'):
+            tmp_flag = bam_str2flag(optarg);
+
+            if (tmp_flag < 0) {
+                print_error("view", "Unknown flag '%s'", optarg);
+                return 1;
+            }
+
+            settings.remove_flag |= tmp_flag;
+            break;
+
+        case LONGOPT('a'):
+            tmp_flag = bam_str2flag(optarg);
+
+            if (tmp_flag < 0) {
+                print_error("view", "Unknown flag '%s'", optarg);
+                return 1;
+            }
+
+            settings.add_flag |= tmp_flag;
+            break;
 
         case 'x':
             if (*optarg == '^') {
@@ -1354,8 +1435,6 @@ int main_samview(int argc, char *argv[])
         }
     }
 
-    if ( settings.hts_idx ) hts_idx_destroy(settings.hts_idx);
-
     if (ga.write_index) {
         if (sam_idx_save(settings.out) < 0) {
             print_error_errno("view", "writing index failed");
@@ -1368,6 +1447,8 @@ int main_samview(int argc, char *argv[])
     }
 
 view_end:
+    if ( settings.hts_idx ) hts_idx_destroy(settings.hts_idx);
+
     if (settings.is_count && ret == 0) {
         if (fprintf(settings.fn_out? fp_out : stdout, "%" PRId64 "\n", settings.count) < 0) {
             if (settings.fn_out) print_error_errno("view", "writing to \"%s\" failed", settings.fn_out);
@@ -1458,9 +1539,10 @@ static int usage(FILE *fp, int exit_status, int is_long_help)
 "\n"
 "Filtering options (Only include in output reads that...):\n"
 "  -L, --target[s]-file FILE  ...overlap (BED) regions in FILE\n"
+"  -N, --qname-file [^]FILE   ...whose read name is listed in FILE (\"^\" negates)\n"
 "  -r, --read-group STR       ...are in read group STR\n"
-"  -R, --read-group-file FILE ...are in a read group listed in FILE\n"
-"  -N, --qname-file FILE      ...whose read name is listed in FILE\n"
+"  -R, --read-group-file [^]FILE\n"
+"                             ...are in a read group listed in FILE\n"
 "  -d, --tag STR1[:STR2]      ...have a tag STR1 (with associated value STR2)\n"
 "  -D, --tag-file STR:FILE    ...have a tag STR whose value is listed in FILE\n"
 "  -q, --min-MQ INT           ...have mapping quality >= INT\n"
@@ -1634,7 +1716,7 @@ int main_head(int argc, char *argv[])
     if (nrecords > 0) {
         b = bam_init1();
         uint64_t n;
-        int r;
+        int r = 0;
         for (n = 0; n < nrecords && (r = sam_read1(fp, hdr, b)) >= 0; n++) {
             if (sam_format1(hdr, b, &str) < 0) {
                 print_error_errno("head", "couldn't format record");
diff --git a/samtools/sam_view.c.pysam.c b/samtools/sam_view.c.pysam.c
index e1b681b82..d1f55ee9f 100644
--- a/samtools/sam_view.c.pysam.c
+++ b/samtools/sam_view.c.pysam.c
@@ -2,7 +2,7 @@
 
 /*  sam_view.c -- SAM<->BAM<->CRAM conversion.
 
-    Copyright (C) 2009-2023 Genome Research Ltd.
+    Copyright (C) 2009-2024 Genome Research Ltd.
     Portions copyright (C) 2009, 2011, 2012 Broad Institute.
 
     Author: Heng Li <lh3@sanger.ac.uk>
@@ -56,6 +56,8 @@ typedef struct samview_settings {
     strhash_t rnhash;
     strhash_t tvhash;
     int min_mapQ;
+    int rghash_discard; // 0 keep, 1 discard
+    int rnhash_discard; // 0 keep, 1 discard
 
     // Described here in the same terms as the usage statement.
     // The code however always negates to "reject if"         keep if:
@@ -178,7 +180,8 @@ static int process_aln(const sam_hdr_t *h, bam1_t *b, samview_settings_t* settin
         uint8_t *s = bam_aux_get(b, "RG");
         if (s) {
             khint_t k = kh_get(str, settings->rghash, (char*)(s + 1));
-            if (k == kh_end(settings->rghash)) return 1;
+            if ((k == kh_end(settings->rghash)) != settings->rghash_discard)
+                return 1;
         }
     }
     if (settings->tag) {
@@ -206,9 +209,11 @@ static int process_aln(const sam_hdr_t *h, bam1_t *b, samview_settings_t* settin
     }
     if (settings->rnhash) {
         const char* rn = bam_get_qname(b);
-        if (!rn || kh_get(str, settings->rnhash, rn) == kh_end(settings->rnhash)) {
+        strhash_t h = settings->rnhash;
+        if (!rn && !settings->rnhash_discard)
+            return 1;
+        if ((kh_get(str, h, rn) == kh_end(h)) != settings->rnhash_discard)
             return 1;
-        }
     }
     if (settings->library) {
         const char *p = bam_get_library((sam_hdr_t*)h, b);
@@ -307,7 +312,12 @@ static int add_read_group_single(const char *subcmd, samview_settings_t *setting
     if (settings->rghash == NULL) {
         settings->rghash = kh_init(str);
         if (settings->rghash == NULL) goto err;
+    } else if (settings->rghash_discard == 1) {
+        print_error("view", "cannot mix include and exclude read-group files in the same command line");
+        free(d);
+        return -1;
     }
+    settings->rghash_discard = 0;
 
     kh_put(str, settings->rghash, d, &ret);
     if (ret == -1) goto err;
@@ -328,8 +338,14 @@ static int add_read_names_file(const char *subcmd, samview_settings_t *settings,
             perror(NULL);
             return -1;
         }
+    } else if ((settings->rnhash_discard == 0 && *fn == '^') ||
+        (settings->rnhash_discard == 1 && *fn != '^')) {
+        print_error("view", "cannot mix include and exclude read-name files in the same command line");
+        return -1;
     }
-    return populate_lookup_from_file(subcmd, settings->rnhash, fn);
+    settings->rnhash_discard = (*fn == '^');
+    return populate_lookup_from_file(subcmd, settings->rnhash,
+                                     fn + (*fn == '^'));
 }
 
 static int add_read_groups_file(const char *subcmd, samview_settings_t *settings, char *fn)
@@ -340,8 +356,14 @@ static int add_read_groups_file(const char *subcmd, samview_settings_t *settings
             perror(NULL);
             return -1;
         }
+    } else if ((settings->rghash_discard == 0 && *fn == '^') ||
+        (settings->rghash_discard == 1 && *fn != '^')) {
+        print_error("view", "cannot mix include and exclude read-group files in the same command line");
+        return -1;
     }
-    return populate_lookup_from_file(subcmd, settings->rghash, fn);
+    settings->rghash_discard = (*fn == '^');
+    return populate_lookup_from_file(subcmd, settings->rghash,
+                                     fn + (*fn == '^'));
 }
 
 static int add_tag_value_single(const char *subcmd, samview_settings_t *settings, char *name)
@@ -535,12 +557,20 @@ hts_itr_multi_t *multi_region_init(samview_settings_t *conf, char **regs, int nr
     int filter_state = ALL;
     if ( nregs ) {
         int filter_op = 0;
-        conf->bed = bed_hash_regions(conf->bed, regs, 0, nregs, &filter_op); // insert(1) or filter out(0) the regions from the command line in the same hash table as the bed file
+        void *bed = bed_hash_regions(conf->bed, regs, 0, nregs, &filter_op); // insert(1) or filter out(0) the regions from the command line in the same hash table as the bed file
+        if (!bed) {
+            print_error_errno("view", "Couldn't %s region list",
+                              filter_op ? "build" : "filter");
+            return NULL;
+        }
+        conf->bed = bed;
         if ( !filter_op )
             filter_state = FILTERED;
     }
     else
         bed_unify(conf->bed);
+
+    // This check is probably redundant, but left just in case
     if ( !conf->bed) { // index is unavailable or no regions have been specified
         print_error("view", "No regions or BED file have been provided. Aborting.");
         return NULL;
@@ -611,8 +641,9 @@ static int fetch_pairs_collect_mates(samview_settings_t *conf, hts_itr_multi_t *
             }
         }
 
-        if ( rec->core.mtid < 0 || (rec->core.flag & BAM_FMUNMAP) ) nunmap = 1;
-        if ( rec->core.mtid >= 0 ) {
+        if ( rec->core.mtid < 0 ) {
+            nunmap = 1;
+        } else {
             if (_reglist_push(&conf->reglist, &conf->nreglist, rec->core.mtid, rec->core.mpos,rec->core.mpos+1) != 0)
                 goto out;
         }
@@ -757,13 +788,13 @@ static int multi_region_view(samview_settings_t *conf, hts_itr_multi_t *iter)
     while ((result = sam_itr_multi_next(conf->in, iter, b)) >= 0) {
         if (process_one_record(conf, b, &write_error) < 0) break;
     }
-    hts_itr_multi_destroy(iter);
     bam_destroy1(b);
 
     if (result < -1) {
         print_error("view", "retrieval of region #%d failed", iter->curr_tid);
-        return 1;
+        write_error = 1;
     }
+    hts_itr_multi_destroy(iter);
     return write_error;
 }
 
@@ -855,6 +886,7 @@ int main_samview(int argc, char *argv[])
         {"use-index", no_argument, NULL, 'M'},
         {"with-header", no_argument, NULL, 'h'},
         {"sanitize", required_argument, NULL, 'z'},
+        {NULL, 0, NULL, 0}
     };
 
     /* parse command-line options */
@@ -870,6 +902,8 @@ int main_samview(int argc, char *argv[])
     opterr = 0;
 
     char *tmp;
+    int tmp_flag;
+
     while ((c = getopt_long(argc, argv,
                             "SbBcCt:h1Ho:O:q:f:F:G:ul:r:T:R:N:d:D:L:s:@:m:x:U:MXe:pPz:",
                             lopts, NULL)) >= 0) {
@@ -914,19 +948,47 @@ int main_samview(int argc, char *argv[])
         case 'U': settings.fn_un_out = strdup(optarg); break;
         case 'X': has_index_file = 1; break;
         case 'f':
-            settings.flag_on |= bam_str2flag(optarg);
+            tmp_flag = bam_str2flag(optarg);
+
+            if (tmp_flag < 0) {
+                print_error("view", "Unknown flag '%s'", optarg);
+                return 1;
+            }
+
+            settings.flag_on |= tmp_flag;
             settings.count_rf |= SAM_FLAG | SAM_RNEXT;
             break;
         case 'F':
-            settings.flag_off |= bam_str2flag(optarg);
+            tmp_flag = bam_str2flag(optarg);
+
+            if (tmp_flag < 0) {
+                print_error("view", "Unknown flag '%s'", optarg);
+                return 1;
+            }
+
+            settings.flag_off |= tmp_flag;
             settings.count_rf |= SAM_FLAG | SAM_RNEXT;
             break;
         case LONGOPT('g'):
-            settings.flag_anyon |= bam_str2flag(optarg);
+            tmp_flag = bam_str2flag(optarg);
+
+            if (tmp_flag < 0) {
+                print_error("view", "Unknown flag '%s'", optarg);
+                return 1;
+            }
+
+            settings.flag_anyon |= tmp_flag;
             settings.count_rf |= SAM_FLAG | SAM_RNEXT;
             break;
         case 'G':
-            settings.flag_alloff |= bam_str2flag(optarg);
+            tmp_flag = bam_str2flag(optarg);
+
+            if (tmp_flag < 0) {
+                print_error("view", "Unknown flag '%s'", optarg);
+                return 1;
+            }
+
+            settings.flag_alloff |= tmp_flag;
             settings.count_rf |= SAM_FLAG | SAM_RNEXT;
             break;
         case 'q':
@@ -1086,8 +1148,27 @@ int main_samview(int argc, char *argv[])
             }
             settings.count_rf = INT_MAX; // no way to know what we need
             break;
-        case LONGOPT('r'): settings.remove_flag |= bam_str2flag(optarg); break;
-        case LONGOPT('a'): settings.add_flag |= bam_str2flag(optarg); break;
+        case LONGOPT('r'):
+            tmp_flag = bam_str2flag(optarg);
+
+            if (tmp_flag < 0) {
+                print_error("view", "Unknown flag '%s'", optarg);
+                return 1;
+            }
+
+            settings.remove_flag |= tmp_flag;
+            break;
+
+        case LONGOPT('a'):
+            tmp_flag = bam_str2flag(optarg);
+
+            if (tmp_flag < 0) {
+                print_error("view", "Unknown flag '%s'", optarg);
+                return 1;
+            }
+
+            settings.add_flag |= tmp_flag;
+            break;
 
         case 'x':
             if (*optarg == '^') {
@@ -1356,8 +1437,6 @@ int main_samview(int argc, char *argv[])
         }
     }
 
-    if ( settings.hts_idx ) hts_idx_destroy(settings.hts_idx);
-
     if (ga.write_index) {
         if (sam_idx_save(settings.out) < 0) {
             print_error_errno("view", "writing index failed");
@@ -1370,6 +1449,8 @@ int main_samview(int argc, char *argv[])
     }
 
 view_end:
+    if ( settings.hts_idx ) hts_idx_destroy(settings.hts_idx);
+
     if (settings.is_count && ret == 0) {
         if (fprintf(settings.fn_out? fp_out : samtools_stdout, "%" PRId64 "\n", settings.count) < 0) {
             if (settings.fn_out) print_error_errno("view", "writing to \"%s\" failed", settings.fn_out);
@@ -1460,9 +1541,10 @@ static int usage(FILE *fp, int exit_status, int is_long_help)
 "\n"
 "Filtering options (Only include in output reads that...):\n"
 "  -L, --target[s]-file FILE  ...overlap (BED) regions in FILE\n"
+"  -N, --qname-file [^]FILE   ...whose read name is listed in FILE (\"^\" negates)\n"
 "  -r, --read-group STR       ...are in read group STR\n"
-"  -R, --read-group-file FILE ...are in a read group listed in FILE\n"
-"  -N, --qname-file FILE      ...whose read name is listed in FILE\n"
+"  -R, --read-group-file [^]FILE\n"
+"                             ...are in a read group listed in FILE\n"
 "  -d, --tag STR1[:STR2]      ...have a tag STR1 (with associated value STR2)\n"
 "  -D, --tag-file STR:FILE    ...have a tag STR whose value is listed in FILE\n"
 "  -q, --min-MQ INT           ...have mapping quality >= INT\n"
@@ -1636,7 +1718,7 @@ int main_head(int argc, char *argv[])
     if (nrecords > 0) {
         b = bam_init1();
         uint64_t n;
-        int r;
+        int r = 0;
         for (n = 0; n < nrecords && (r = sam_read1(fp, hdr, b)) >= 0; n++) {
             if (sam_format1(hdr, b, &str) < 0) {
                 print_error_errno("head", "couldn't format record");
diff --git a/samtools/samtools.pysam.h b/samtools/samtools.pysam.h
index cb63b60aa..5fedbea27 100644
--- a/samtools/samtools.pysam.h
+++ b/samtools/samtools.pysam.h
@@ -69,6 +69,12 @@ extern int samtools_main(int argc, char *argv[]);
 #define bam_smpl_destroy samtools_bam_smpl_destroy
 #define read_file_list samtools_read_file_list
 
+/*! A non-static error() function name is used in bcftools, which collides
+    with glibc's error() function and leads to the wrong function being called
+    on some platforms. #define this name with a prefix to avoid this collision.
+ */
+#define error samtools_error
+
 #endif
 
 #endif
diff --git a/samtools/stats.c b/samtools/stats.c
index 44783a974..eebfd6775 100644
--- a/samtools/stats.c
+++ b/samtools/stats.c
@@ -1,6 +1,6 @@
 /*  stats.c -- This is the former bamcheck integrated into samtools/htslib.
 
-    Copyright (C) 2012-2022 Genome Research Ltd.
+    Copyright (C) 2012-2024 Genome Research Ltd.
 
     Author: Petr Danecek <pd3@sanger.ac.uk>
     Author: Sam Nicholls <sam@samnicholls.net>
@@ -766,6 +766,9 @@ static void collect_barcode_stats(bam1_t* bam_line, stats_t* stats) {
             continue;
 
         uint32_t barcode_len = strlen(barcode);
+        if (!barcode_len) {
+            continue;        //consider 0 size barcode same as no barcode - avoids issues with realloc below
+        }
         if (!stats->tags_barcode[tag].nbases) { // tag seen for the first time
             uint32_t offset = 0;
             for (i = 0; i < stats->ntags; i++)
@@ -2086,7 +2089,8 @@ static void init_group_id(stats_t *stats, stats_info_t *info, const char *id)
 }
 
 
-static void HTS_NORETURN error(const char *format, ...)
+static void  HTS_FORMAT(HTS_PRINTF_FMT, 1, 2) HTS_NORETURN
+error(const char *format, ...)
 {
     if ( !format )
     {
@@ -2129,7 +2133,7 @@ static void HTS_NORETURN error(const char *format, ...)
 
 void cleanup_stats_info(stats_info_t* info){
     if (info->fai) fai_destroy(info->fai);
-    sam_close(info->sam);
+    if (info->sam) sam_close(info->sam);
     free(info);
 }
 
@@ -2249,7 +2253,7 @@ int init_stat_info_fname(stats_info_t* info, const char* bam_fname, const htsFor
     return 0;
 }
 
-stats_t* stats_init()
+stats_t* stats_init(void)
 {
     stats_t *stats = calloc(1,sizeof(stats_t));
     if (!stats)
@@ -2437,14 +2441,34 @@ int main_stats(int argc, char *argv[])
         {"cov-threshold", required_argument, NULL, 'g'},
         {NULL, 0, NULL, 0}
     };
-    int opt;
+    int opt, tmp_flag;
 
     while ( (opt=getopt_long(argc,argv,"?hdsXxpr:c:l:i:t:m:q:f:F:g:I:S:P:@:",loptions,NULL))>0 )
     {
         switch (opt)
         {
-            case 'f': info->flag_require = bam_str2flag(optarg); break;
-            case 'F': info->flag_filter |= bam_str2flag(optarg); break;
+            case 'f':
+                tmp_flag = bam_str2flag(optarg);
+
+                if (tmp_flag < 0) {
+                    print_error("stats", "Unknown flag '%s'", optarg);
+                    return 1;
+                }
+
+                info->flag_require = tmp_flag;
+                break;
+
+            case 'F':
+                tmp_flag = bam_str2flag(optarg);
+
+                if (tmp_flag < 0) {
+                    print_error("stats", "Unknown flag '%s'", optarg);
+                    return 1;
+                }
+
+                info->flag_filter |= tmp_flag;
+                break;
+
             case 'd': info->flag_filter |= BAM_FDUP; break;
             case 'X': has_index_file = 1; break;
             case 's': break;
diff --git a/samtools/stats.c.pysam.c b/samtools/stats.c.pysam.c
index b3462ccb2..6bd7946ba 100644
--- a/samtools/stats.c.pysam.c
+++ b/samtools/stats.c.pysam.c
@@ -2,7 +2,7 @@
 
 /*  stats.c -- This is the former bamcheck integrated into samtools/htslib.
 
-    Copyright (C) 2012-2022 Genome Research Ltd.
+    Copyright (C) 2012-2024 Genome Research Ltd.
 
     Author: Petr Danecek <pd3@sanger.ac.uk>
     Author: Sam Nicholls <sam@samnicholls.net>
@@ -768,6 +768,9 @@ static void collect_barcode_stats(bam1_t* bam_line, stats_t* stats) {
             continue;
 
         uint32_t barcode_len = strlen(barcode);
+        if (!barcode_len) {
+            continue;        //consider 0 size barcode same as no barcode - avoids issues with realloc below
+        }
         if (!stats->tags_barcode[tag].nbases) { // tag seen for the first time
             uint32_t offset = 0;
             for (i = 0; i < stats->ntags; i++)
@@ -2088,7 +2091,8 @@ static void init_group_id(stats_t *stats, stats_info_t *info, const char *id)
 }
 
 
-static void HTS_NORETURN error(const char *format, ...)
+static void  HTS_FORMAT(HTS_PRINTF_FMT, 1, 2) HTS_NORETURN
+error(const char *format, ...)
 {
     if ( !format )
     {
@@ -2131,7 +2135,7 @@ static void HTS_NORETURN error(const char *format, ...)
 
 void cleanup_stats_info(stats_info_t* info){
     if (info->fai) fai_destroy(info->fai);
-    sam_close(info->sam);
+    if (info->sam) sam_close(info->sam);
     free(info);
 }
 
@@ -2251,7 +2255,7 @@ int init_stat_info_fname(stats_info_t* info, const char* bam_fname, const htsFor
     return 0;
 }
 
-stats_t* stats_init()
+stats_t* stats_init(void)
 {
     stats_t *stats = calloc(1,sizeof(stats_t));
     if (!stats)
@@ -2439,14 +2443,34 @@ int main_stats(int argc, char *argv[])
         {"cov-threshold", required_argument, NULL, 'g'},
         {NULL, 0, NULL, 0}
     };
-    int opt;
+    int opt, tmp_flag;
 
     while ( (opt=getopt_long(argc,argv,"?hdsXxpr:c:l:i:t:m:q:f:F:g:I:S:P:@:",loptions,NULL))>0 )
     {
         switch (opt)
         {
-            case 'f': info->flag_require = bam_str2flag(optarg); break;
-            case 'F': info->flag_filter |= bam_str2flag(optarg); break;
+            case 'f':
+                tmp_flag = bam_str2flag(optarg);
+
+                if (tmp_flag < 0) {
+                    print_error("stats", "Unknown flag '%s'", optarg);
+                    return 1;
+                }
+
+                info->flag_require = tmp_flag;
+                break;
+
+            case 'F':
+                tmp_flag = bam_str2flag(optarg);
+
+                if (tmp_flag < 0) {
+                    print_error("stats", "Unknown flag '%s'", optarg);
+                    return 1;
+                }
+
+                info->flag_filter |= tmp_flag;
+                break;
+
             case 'd': info->flag_filter |= BAM_FDUP; break;
             case 'X': has_index_file = 1; break;
             case 's': break;
diff --git a/samtools/tmp_file.c b/samtools/tmp_file.c
index 123f425e1..81aebbfd6 100644
--- a/samtools/tmp_file.c
+++ b/samtools/tmp_file.c
@@ -43,7 +43,8 @@ DEALINGS IN THE SOFTWARE
 #include "htslib/sam.h"
 
 
-static void tmp_print_error(tmp_file_t *tmp, const char *fmt, ...) {
+static void HTS_FORMAT(HTS_PRINTF_FMT, 2, 3)
+tmp_print_error(tmp_file_t *tmp, const char *fmt, ...) {
     va_list argp;
 
     if (tmp->verbose) {
@@ -253,8 +254,8 @@ int tmp_file_write(tmp_file_t *tmp, bam1_t *inbam) {
         int ret;
 
         if ((ret = tmp_file_grow_ring_buffer(tmp, (tmp->offset + tmp->input_size + sizeof(bam1_t) + inbam->l_data) * 2))) {
-            tmp_print_error(tmp, "[tmp_file] Error: input line too big. (%ld).\n",
-                (tmp->input_size + inbam->l_data));
+            tmp_print_error(tmp, "[tmp_file] Error: input line too big. (%zu).\n",
+                            (tmp->input_size + inbam->l_data));
 
             return ret;
         }
@@ -405,7 +406,7 @@ int tmp_file_read(tmp_file_t *tmp, bam1_t *inbam) {
     tmp->entry_number++;
 
     if (tmp->read_size > tmp->output_size) {
-        tmp_print_error(tmp, "[tmp_file] Error: wrong size of data returned RS:%ld OS:%ld EN:%ld GS:%ld.\n",
+        tmp_print_error(tmp, "[tmp_file] Error: wrong size of data returned RS:%zu OS:%zu EN:%zu GS:%zu.\n",
             tmp->read_size, tmp->output_size, tmp->entry_number, tmp->group_size);
         return TMP_SAM_LZ4_ERROR;
     }
diff --git a/samtools/tmp_file.c.pysam.c b/samtools/tmp_file.c.pysam.c
index 096933248..6cab29379 100644
--- a/samtools/tmp_file.c.pysam.c
+++ b/samtools/tmp_file.c.pysam.c
@@ -45,7 +45,8 @@ DEALINGS IN THE SOFTWARE
 #include "htslib/sam.h"
 
 
-static void tmp_print_error(tmp_file_t *tmp, const char *fmt, ...) {
+static void HTS_FORMAT(HTS_PRINTF_FMT, 2, 3)
+tmp_print_error(tmp_file_t *tmp, const char *fmt, ...) {
     va_list argp;
 
     if (tmp->verbose) {
@@ -255,8 +256,8 @@ int tmp_file_write(tmp_file_t *tmp, bam1_t *inbam) {
         int ret;
 
         if ((ret = tmp_file_grow_ring_buffer(tmp, (tmp->offset + tmp->input_size + sizeof(bam1_t) + inbam->l_data) * 2))) {
-            tmp_print_error(tmp, "[tmp_file] Error: input line too big. (%ld).\n",
-                (tmp->input_size + inbam->l_data));
+            tmp_print_error(tmp, "[tmp_file] Error: input line too big. (%zu).\n",
+                            (tmp->input_size + inbam->l_data));
 
             return ret;
         }
@@ -407,7 +408,7 @@ int tmp_file_read(tmp_file_t *tmp, bam1_t *inbam) {
     tmp->entry_number++;
 
     if (tmp->read_size > tmp->output_size) {
-        tmp_print_error(tmp, "[tmp_file] Error: wrong size of data returned RS:%ld OS:%ld EN:%ld GS:%ld.\n",
+        tmp_print_error(tmp, "[tmp_file] Error: wrong size of data returned RS:%zu OS:%zu EN:%zu GS:%zu.\n",
             tmp->read_size, tmp->output_size, tmp->entry_number, tmp->group_size);
         return TMP_SAM_LZ4_ERROR;
     }
diff --git a/samtools/version.sh b/samtools/version.sh
index 7d17aee18..618a098a6 100755
--- a/samtools/version.sh
+++ b/samtools/version.sh
@@ -24,7 +24,7 @@
 # DEALINGS IN THE SOFTWARE.
 
 # Master version, for use in tarballs or non-git source copies
-VERSION=1.18
+VERSION=1.21
 
 # If we have a git clone, then check against the current tag
 if [ -e .git ]
diff --git a/setup.py b/setup.py
index dbef939fa..5c2632576 100644
--- a/setup.py
+++ b/setup.py
@@ -28,9 +28,9 @@
 from setuptools.extension import Extension
 
 try:
-    from setuptools.errors import LinkError
+    from setuptools.errors import CompileError, LinkError
 except ImportError:
-    from distutils.errors import LinkError
+    from distutils.errors import CompileError, LinkError
 
 try:
     from Cython.Distutils import build_ext
@@ -78,7 +78,7 @@ def run_make(targets):
 
 
 def run_make_print_config():
-    stdout = subprocess.check_output(["make", "-s", "print-config"], encoding="ascii")
+    stdout = subprocess.check_output([os.environ.get("MAKE", "make"), "-s", "print-config"], encoding="ascii")
 
     make_print_config = {}
     for line in stdout.splitlines():
@@ -99,11 +99,14 @@ def run_nm_defined_symbols(objfile):
         if symtype not in "UFNWw":
             if IS_DARWIN:
                 # On macOS, all symbols have a leading underscore
-                symbols.add(sym.lstrip('_'))
+                symbols.add(sym[1:] if sym.startswith("_") else sym)
             else:
                 # Ignore symbols such as _edata (present in all shared objects)
                 if sym[0] not in "_$.@": symbols.add(sym)
 
+    # Work around Cython 3.1.2 bug whereby this function is not static
+    symbols.discard("__pyx_CommonTypesMetaclass_get_module")
+
     return symbols
 
 
@@ -160,7 +163,7 @@ def write_configvars_header(filename, ext, prefix):
     log.info("creating %s for '%s' extension", filename, ext.name)
     with open(filename, "w") as outf:
         for var, value in config.items():
-            outf.write('#define {}_{} "{}"\n'.format(prefix, var, value))
+            outf.write(f'#define {prefix}_{var} "{value}"\n')
 
 
 @contextmanager
@@ -170,12 +173,12 @@ def set_compiler_envvars():
         if var in os.environ:
             if var == 'CFLAGS' and 'CCSHARED' in sysconfig.get_config_vars():
                 os.environ[var] += ' ' + sysconfig.get_config_var('CCSHARED')
-            print("# pysam: (env) {}={}".format(var, os.environ[var]))
+            print(f"# pysam: (env) {var}={os.environ[var]}")
         elif var in sysconfig.get_config_vars():
             value = sysconfig.get_config_var(var)
             if var == 'CFLAGS' and 'CCSHARED' in sysconfig.get_config_vars():
                 value += ' ' + sysconfig.get_config_var('CCSHARED')
-            print("# pysam: (sysconfig) {}={}".format(var, value))
+            print(f"# pysam: (sysconfig) {var}={value}")
             os.environ[var] = value
             tmp_vars += [var]
 
@@ -186,6 +189,10 @@ def set_compiler_envvars():
             del os.environ[var]
 
 
+def format_macro_option(name, value):
+    return f"-D{name}={value}" if value is not None else f"-D{name}"
+
+
 def configure_library(library_dir, env_options=None, options=[]):
 
     configure_script = os.path.join(library_dir, "configure")
@@ -196,8 +203,7 @@ def configure_library(library_dir, env_options=None, options=[]):
         env_options = "--disable-bz2"
 
     if not os.path.exists(configure_script):
-        raise ValueError(
-            "configure script {} does not exist".format(configure_script))
+        raise ValueError(f"configure script {configure_script!r} does not exist")
 
     with changedir(library_dir), set_compiler_envvars():
         if env_options is not None:
@@ -269,6 +275,22 @@ def check_ext_symbol_conflicts(self):
 
         if errors > 0: raise LinkError("symbols defined in multiple extensions")
 
+    def c99_compile_args(self):
+        """Determines whether any compiler flags are needed to ensure C99 compilation."""
+        compiler = getattr(self.compiler, "compiler", "C compiler")
+        if isinstance(compiler, list): compiler = compiler[0]
+        log.info("checking for %s option to enable C99 features...", compiler)
+        for flags in [None, ["-std=c99"], ["-std=gnu99"]]:
+            try:
+                self.compiler.compile(["pysam/conftest_cstd.c"], output_dir=self.build_temp, extra_preargs=flags)
+                log.info("%s option to enable C99 features: %s", compiler, " ".join(flags) if flags else "none needed")
+                return flags
+            except CompileError:
+                log.info("(ignoring errors from test probes)")
+
+        log.error("%s cannot compile C99 source code", compiler)
+        return None
+
     def run(self):
         if sys.platform == 'darwin':
             ldshared = os.environ.get('LDSHARED', sysconfig.get_config_var('LDSHARED'))
@@ -283,6 +305,19 @@ def run(self):
         except subprocess.CalledProcessError:
             log.warning("skipping symbol collision check (invoking nm failed)")
 
+    def build_extensions(self):
+        c99_flags = self.c99_compile_args()
+        if c99_flags:
+            executables = {}
+            for executable in ["compiler", "compiler_so"]:
+                command = getattr(self.compiler, executable, None)
+                if command:
+                    if isinstance(command, list):  executables[executable] = command + c99_flags
+                    elif isinstance(command, str): executables[executable] = f"{command} {' '.join(c99_flags)}"
+            self.compiler.set_executables(**executables)
+
+        super().build_extensions()
+
     def build_extension(self, ext):
 
         if isinstance(ext, CyExtension) and ext._init_func:
@@ -307,7 +342,7 @@ def build_extension(self, ext):
             ext.extra_link_args += ['-dynamiclib',
                                     '-rpath', '@loader_path',
                                     '-Wl,-headerpad_max_install_names',
-                                    '-Wl,-install_name,%s' % library_path,
+                                    f'-Wl,-install_name,{library_path}',
                                     '-Wl,-x']
         else:
             if not ext.extra_link_args:
@@ -389,24 +424,21 @@ def run(self):
 # the .pyx files. If no cython is available, the C-files included in the
 # distribution will be used.
 if HAVE_CYTHON:
-    print("# pysam: cython is available - using cythonize if necessary")
+    print(f"# pysam: Cython {cython.__version__} is available - using cythonize if necessary")
     source_pattern = "pysam/libc%s.pyx"
 else:
-    print("# pysam: no cython available - using pre-compiled C")
+    print("# pysam: no Cython available - using pre-compiled C")
     source_pattern = "pysam/libc%s.c"
 
 # Exit if there are no pre-compiled files and no cython available
 fn = source_pattern % "htslib"
 if not os.path.exists(fn):
     raise ValueError(
-        "no cython installed, but can not find {}."
-        "Make sure that cython is installed when building "
-        "from the repository"
-        .format(fn))
-
-print("# pysam: htslib mode is {}".format(HTSLIB_MODE))
-print("# pysam: HTSLIB_CONFIGURE_OPTIONS={}".format(
-    HTSLIB_CONFIGURE_OPTIONS))
+        f"no Cython installed, but cannot find {fn}. "
+        "Make sure that Cython is installed when building from the repository")
+
+print(f"# pysam: htslib mode is {HTSLIB_MODE}")
+print(f"# pysam: HTSLIB_CONFIGURE_OPTIONS={HTSLIB_CONFIGURE_OPTIONS}")
 htslib_configure_options = None
 
 if HTSLIB_MODE in ['shared', 'separate']:
@@ -421,8 +453,7 @@ def run(self):
          "--disable-libcurl"])
 
     HTSLIB_SOURCE = "builtin"
-    print("# pysam: htslib configure options: {}".format(
-        str(htslib_configure_options)))
+    print(f"# pysam: htslib configure options: {htslib_configure_options}")
 
     config_headers += ["htslib/config.h"]
     if htslib_configure_options is None:
@@ -437,7 +468,7 @@ def run(self):
         htslib_make_options = run_make_print_config()
 
     for key, value in htslib_make_options.items():
-        print("# pysam: htslib_config {}={}".format(key, value))
+        print(f"# pysam: htslib_config {key}={value}")
 
     external_htslib_libraries = ['z']
     if "LIBS" in htslib_make_options:
@@ -474,11 +505,11 @@ def run(self):
     htslib_library_dirs = ["."] # when using setup.py develop?
     htslib_include_dirs = ['htslib']
 else:
-    raise ValueError("unknown HTSLIB value '%s'" % HTSLIB_MODE)
+    raise ValueError(f"unknown HTSLIB value {HTSLIB_MODE!r}")
 
 # build config.py
 with open(os.path.join("pysam", "config.py"), "w") as outf:
-    outf.write('HTSLIB = "{}"\n'.format(HTSLIB_SOURCE))
+    outf.write(f'HTSLIB = "{HTSLIB_SOURCE}"\n')
     config_values = collections.defaultdict(int)
 
     if HTSLIB_SOURCE == "builtin":
@@ -498,8 +529,8 @@ def run(self):
                         "HAVE_LIBDEFLATE",
                         "HAVE_LIBLZMA",
                         "HAVE_MMAP"]:
-                outf.write("{} = {}\n".format(key, config_values[key]))
-                print("# pysam: config_option: {}={}".format(key, config_values[key]))
+                outf.write(f"{key} = {config_values[key]}\n")
+                print(f"# pysam: config_option: {key}={config_values[key]}")
 
 # create empty config.h files if they have not been created automatically
 # or created by the user:
@@ -532,16 +563,21 @@ def run(self):
 
 define_macros = []
 
+if os.environ.get("CIBUILDWHEEL", "0") == "1":
+    define_macros.append(("BUILDING_WHEEL", None))
+
 suffix = sysconfig.get_config_var('EXT_SUFFIX')
 
 internal_htslib_libraries = [
-    os.path.splitext("chtslib{}".format(suffix))[0]]
+    os.path.splitext(f"chtslib{suffix}")[0],
+    ]
 internal_samtools_libraries = [
-    os.path.splitext("csamtools{}".format(suffix))[0],
-    os.path.splitext("cbcftools{}".format(suffix))[0],
+    os.path.splitext(f"csamtools{suffix}")[0],
+    os.path.splitext(f"cbcftools{suffix}")[0],
     ]
 internal_pysamutil_libraries = [
-    os.path.splitext("cutils{}".format(suffix))[0]]
+    os.path.splitext(f"cutils{suffix}")[0],
+    ]
 
 libraries_for_pysam_module = external_htslib_libraries + internal_htslib_libraries + internal_pysamutil_libraries
 
@@ -567,7 +603,8 @@ def prebuild_libchtslib(ext, force):
             # TODO Eventually by running configure here, we can set these
             # extra flags for configure instead of hacking on ALL_CPPFLAGS.
             args = " ".join(ext.extra_compile_args)
-            run_make(["ALL_CPPFLAGS=-I. " + args + " $(CPPFLAGS)", "lib-static"])
+            defines = " ".join([format_macro_option(*pair) for pair in ext.define_macros])
+            run_make(["ALL_CPPFLAGS=-I. " + args + " " + defines + " $(CPPFLAGS)", "lib-static"])
     else:
         log.warning("skipping 'libhts.a' (already built)")
 
@@ -593,7 +630,7 @@ def prebuild_libcsamtools(ext, force):
          extra_objects=separate_htslib_objects,
          libraries=external_htslib_libraries + internal_htslib_libraries),
     dict(name="pysam.libcutils",
-         sources=[source_pattern % "utils", "pysam/pysam_util.c"] + os_c_files,
+         sources=[source_pattern % "utils"] + os_c_files,
          extra_objects=separate_htslib_objects,
          libraries=external_htslib_libraries + internal_htslib_libraries + internal_samtools_libraries),
     dict(name="pysam.libcalignmentfile",
diff --git a/tests/AlignedSegment_test.py b/tests/AlignedSegment_test.py
index 2669df2f6..0e850600a 100644
--- a/tests/AlignedSegment_test.py
+++ b/tests/AlignedSegment_test.py
@@ -6,6 +6,7 @@
 import struct
 import copy
 import array
+from pysam import CDEL, CDIFF, CEQUAL, CINS, CMATCH, CPAD, CREF_SKIP, CSOFT_CLIP
 
 from TestUtils import (
     checkFieldEqual,
@@ -49,6 +50,38 @@ class TestAlignedSegment(ReadTest):
     and manipulated.
     """
 
+    def check_get_aligned_pairs_combos(self, a, exp):
+        def positions(exp):  return [(ppos, rpos)        for ppos, rpos, base, cigar in exp]
+        def with_seq(exp):   return [(ppos, rpos, base)  for ppos, rpos, base, cigar in exp]
+        def with_cigar(exp): return [(ppos, rpos, cigar) for ppos, rpos, base, cigar in exp]
+
+        self.assertEqual(a.get_aligned_pairs(), positions(exp))
+        self.assertEqual(a.get_aligned_pairs(with_seq=True), with_seq(exp))
+        self.assertEqual(a.get_aligned_pairs(with_cigar=True), with_cigar(exp))
+        self.assertEqual(a.get_aligned_pairs(with_seq=True, with_cigar=True), exp)
+
+        exp = [(ppos, rpos, base, cigar) for ppos, rpos, base, cigar in exp if ppos is not None and rpos is not None]
+
+        self.assertEqual(a.get_aligned_pairs(matches_only=True), positions(exp))
+        self.assertEqual(a.get_aligned_pairs(matches_only=True, with_seq=True), with_seq(exp))
+        self.assertEqual(a.get_aligned_pairs(matches_only=True, with_cigar=True), with_cigar(exp))
+        self.assertEqual(a.get_aligned_pairs(matches_only=True, with_seq=True, with_cigar=True), exp)
+
+    def check_get_aligned_pairs_combos_without_MD(self, a, exp):
+        def positions(exp): return [(ppos, rpos) for ppos, rpos, cigar in exp]
+
+        self.assertEqual(a.get_aligned_pairs(), positions(exp))
+        with self.assertRaises(ValueError): a.get_aligned_pairs(with_seq=True)
+        self.assertEqual(a.get_aligned_pairs(with_cigar=True), exp)
+        with self.assertRaises(ValueError): a.get_aligned_pairs(with_seq=True, with_cigar=True)
+
+        exp = [(ppos, rpos, cigar) for ppos, rpos, cigar in exp if ppos is not None and rpos is not None]
+
+        self.assertEqual(a.get_aligned_pairs(matches_only=True), positions(exp))
+        with self.assertRaises(ValueError): a.get_aligned_pairs(matches_only=True, with_seq=True)
+        self.assertEqual(a.get_aligned_pairs(matches_only=True, with_cigar=True), exp)
+        with self.assertRaises(ValueError): a.get_aligned_pairs(matches_only=True, with_seq=True, with_cigar=True)
+
     def testEmpty(self):
 
         a = pysam.AlignedSegment()
@@ -203,6 +236,42 @@ def testUpdate2(self):
 
         self.assertEqual(pysam.qualities_to_qualitystring(a.query_qualities), s[5:10])
 
+    def testClearSequence(self):
+        a = pysam.AlignedSegment()
+        a.query_sequence = "ATGC"
+        self.assertEqual(a.query_sequence, "ATGC")
+        a.query_sequence = None
+        self.assertEqual(a.query_length, 0)
+
+        a.query_sequence = "ATGC"
+        self.assertEqual(a.query_sequence, "ATGC")
+        a.query_sequence = ""
+        self.assertEqual(a.query_length, 0)
+
+        a.query_sequence = "ATGC"
+        self.assertEqual(a.query_sequence, "ATGC")
+        a.query_sequence = "*"
+        self.assertEqual(a.query_length, 0)
+
+    def testUpdateSequenceEffects1(self):
+        a = self.build_read()
+        a.query_sequence = "ATGCATGC"
+        a.cigarstring = "1S5M2S"
+        self.assertEqual(a.query_alignment_sequence, "TGCAT")
+
+        a.query_sequence = "AATTGGCC"
+        self.assertEqual(a.query_alignment_sequence, "ATTGG")
+
+    def testUpdateSequenceEffects2(self):
+        a = self.build_read()
+        a.query_sequence = "ATGCATGC"
+        a.cigarstring = "1S5M2S"
+        self.assertEqual(a.query_alignment_sequence, "TGCAT")
+
+        a.query_sequence = "*"
+        self.assertIsNone(a.query_sequence)
+        self.assertIsNone(a.query_alignment_sequence)
+
     def testUpdateQual(self):
         """Ensure SEQ and QUAL updates leading to absent QUAL set all bytes to 0xff"""
 
@@ -240,6 +309,80 @@ def testUpdateQual(self):
 
                     self.assertEqual(qual, b'\xff' * l_seq)
 
+    def testClearQual(self):
+        a = pysam.AlignedSegment()
+        a.query_sequence = "ATGC"
+        a.query_qualities = pysam.qualitystring_to_array("qrst")
+        a.query_qualities = None
+        self.assertIsNone(a.query_qualities)
+
+    def testClearQualStr(self):
+        a = pysam.AlignedSegment()
+        a.query_sequence = "ATGC"
+        a.query_qualities_str = "qrst"
+        self.assertEqual(a.query_qualities, pysam.qualitystring_to_array("qrst"))
+        self.assertEqual(a.query_qualities_str, "qrst")
+
+        a.query_qualities_str = None
+        self.assertIsNone(a.query_qualities)
+        self.assertIsNone(a.query_qualities_str)
+
+        a.query_qualities_str = "qrst"
+        a.query_qualities_str = ""
+        self.assertIsNone(a.query_qualities)
+        self.assertIsNone(a.query_qualities_str)
+
+        a.query_qualities_str = "qrst"
+        a.query_qualities_str = "*"
+        self.assertIsNone(a.query_qualities)
+        self.assertIsNone(a.query_qualities_str)
+
+    def testUpdateQualArrayB(self):
+        a = pysam.AlignedSegment()
+        a.query_sequence = "ATGC"
+        a.query_qualities = array.array('B', [80, 81, 82, 83])
+        self.assertEqual(len(a.query_qualities), 4)
+        self.assertEqual(a.query_qualities_str, "qrst")
+
+    def testUpdateQualArrayI(self):
+        a = pysam.AlignedSegment()
+        a.query_sequence = "ATGC"
+        a.query_qualities = array.array('I', [80, 81, 82, 83])
+        self.assertEqual(len(a.query_qualities), 4)
+        self.assertEqual(a.query_qualities_str, "qrst")
+
+    def testUpdateQualList(self):
+        a = pysam.AlignedSegment()
+        a.query_sequence = "ATGC"
+        qual = [80, 81, 82, 83]
+        a.query_qualities = qual
+        qual.pop()
+        self.assertEqual(len(a.query_qualities), 4)
+        self.assertEqual(a.query_qualities_str, "qrst")
+
+    def testUpdateQualString(self):
+        a = pysam.AlignedSegment()
+        a.query_sequence = "ATGC"
+        a.query_qualities = "qrst"
+        self.assertEqual(len(a.query_qualities), 4)
+        self.assertEqual(a.query_qualities_str, "qrst")
+        self.assertEqual(a.qual, "qrst")
+
+    def testUpdateQualString2(self):
+        a = pysam.AlignedSegment()
+        a.query_sequence = "ATGC"
+        a.query_qualities_str = "qrst"
+        self.assertEqual(len(a.query_qualities), 4)
+        self.assertEqual(a.query_qualities_str, "qrst")
+        self.assertEqual(a.qual, "qrst")
+
+    def testUpdateQualTuple(self):
+        a = pysam.AlignedSegment()
+        a.query_sequence = "ATGC"
+        a.query_qualities = (80, 81, 82, 83)
+        self.assertEqual(len(a.query_qualities), 4)
+        self.assertEqual(a.query_qualities_str, "qrst")
+
     def testLargeRead(self):
         """build an example read."""
 
@@ -313,50 +456,50 @@ def testPositions(self):
             ],
         )
 
-        self.assertEqual(
-            a.get_aligned_pairs(),
+        self.check_get_aligned_pairs_combos_without_MD(
+            a,
             [
-                (0, 20),
-                (1, 21),
-                (2, 22),
-                (3, 23),
-                (4, 24),
-                (5, 25),
-                (6, 26),
-                (7, 27),
-                (8, 28),
-                (9, 29),
-                (None, 30),
-                (10, 31),
-                (11, 32),
-                (12, 33),
-                (13, 34),
-                (14, 35),
-                (15, 36),
-                (16, 37),
-                (17, 38),
-                (18, 39),
-                (19, None),
-                (20, 40),
-                (21, 41),
-                (22, 42),
-                (23, 43),
-                (24, 44),
-                (25, 45),
-                (26, 46),
-                (27, 47),
-                (28, 48),
-                (29, 49),
-                (30, 50),
-                (31, 51),
-                (32, 52),
-                (33, 53),
-                (34, 54),
-                (35, 55),
-                (36, 56),
-                (37, 57),
-                (38, 58),
-                (39, 59),
+                (0, 20, CMATCH),
+                (1, 21, CMATCH),
+                (2, 22, CMATCH),
+                (3, 23, CMATCH),
+                (4, 24, CMATCH),
+                (5, 25, CMATCH),
+                (6, 26, CMATCH),
+                (7, 27, CMATCH),
+                (8, 28, CMATCH),
+                (9, 29, CMATCH),
+                (None, 30, CDEL),
+                (10, 31, CMATCH),
+                (11, 32, CMATCH),
+                (12, 33, CMATCH),
+                (13, 34, CMATCH),
+                (14, 35, CMATCH),
+                (15, 36, CMATCH),
+                (16, 37, CMATCH),
+                (17, 38, CMATCH),
+                (18, 39, CMATCH),
+                (19, None, CINS),
+                (20, 40, CMATCH),
+                (21, 41, CMATCH),
+                (22, 42, CMATCH),
+                (23, 43, CMATCH),
+                (24, 44, CMATCH),
+                (25, 45, CMATCH),
+                (26, 46, CMATCH),
+                (27, 47, CMATCH),
+                (28, 48, CMATCH),
+                (29, 49, CMATCH),
+                (30, 50, CMATCH),
+                (31, 51, CMATCH),
+                (32, 52, CMATCH),
+                (33, 53, CMATCH),
+                (34, 54, CMATCH),
+                (35, 55, CMATCH),
+                (36, 56, CMATCH),
+                (37, 57, CMATCH),
+                (38, 58, CMATCH),
+                (39, 59, CMATCH),
             ],
         )
 
@@ -435,40 +578,24 @@ def test_infer_read_length(self):
     def test_get_aligned_pairs_soft_clipping(self):
         a = self.build_read()
         a.cigartuples = ((4, 2), (0, 35), (4, 3))
-        self.assertEqual(
-            a.get_aligned_pairs(),
-            [(0, None), (1, None)]
+        self.check_get_aligned_pairs_combos_without_MD(
+            a,
+            [(0, None, CSOFT_CLIP), (1, None, CSOFT_CLIP)]
             + [
-                (qpos, refpos)
+                (qpos, refpos, CMATCH)
                 for (qpos, refpos) in zip(range(2, 2 + 35), range(20, 20 + 35))
             ]
-            + [(37, None), (38, None), (39, None)],
-        )
-        self.assertEqual(
-            a.get_aligned_pairs(True),
-            # [(0, None), (1, None)] +
-            [
-                (qpos, refpos)
-                for (qpos, refpos) in zip(range(2, 2 + 35), range(20, 20 + 35))
-            ]
-            # [(37, None), (38, None), (39, None)]
+            + [(37, None, CSOFT_CLIP), (38, None, CSOFT_CLIP), (39, None, CSOFT_CLIP)],
         )
 
     def test_get_aligned_pairs_hard_clipping(self):
         a = self.build_read()
         a.cigartuples = ((5, 2), (0, 35), (5, 3))
-        self.assertEqual(
-            a.get_aligned_pairs(),
+        self.check_get_aligned_pairs_combos_without_MD(
+            a,
             # No seq, no seq pos
             [
-                (qpos, refpos)
-                for (qpos, refpos) in zip(range(0, 0 + 35), range(20, 20 + 35))
-            ],
-        )
-        self.assertEqual(
-            a.get_aligned_pairs(True),
-            [
-                (qpos, refpos)
+                (qpos, refpos, CMATCH)
                 for (qpos, refpos) in zip(range(0, 0 + 35), range(20, 20 + 35))
             ],
         )
@@ -476,23 +603,12 @@ def test_get_aligned_pairs_hard_clipping(self):
     def test_get_aligned_pairs_skip(self):
         a = self.build_read()
         a.cigarstring = "2M100D38M"
-        self.assertEqual(
-            a.get_aligned_pairs(),
-            [(0, 20), (1, 21)]
-            + [(None, refpos) for refpos in range(22, 22 + 100)]
+        self.check_get_aligned_pairs_combos_without_MD(
+            a,
+            [(0, 20, CMATCH), (1, 21, CMATCH)]
+            + [(None, refpos, CDEL) for refpos in range(22, 22 + 100)]
             + [
-                (qpos, refpos)
-                for (qpos, refpos) in zip(
-                    range(2, 2 + 38), range(20 + 2 + 100, 20 + 2 + 100 + 38)
-                )
-            ],
-        )
-        self.assertEqual(
-            a.get_aligned_pairs(True),
-            [(0, 20), (1, 21)] +
-            # [(None, refpos) for refpos in range(21, 21+100)] +
-            [
-                (qpos, refpos)
+                (qpos, refpos, CMATCH)
                 for (qpos, refpos) in zip(
                     range(2, 2 + 38), range(20 + 2 + 100, 20 + 2 + 100 + 38)
                 )
@@ -502,17 +618,10 @@ def test_get_aligned_pairs_skip(self):
     def test_get_aligned_pairs_match_mismatch(self):
         a = self.build_read()
         a.cigartuples = ((7, 20), (8, 20))
-        self.assertEqual(
-            a.get_aligned_pairs(),
+        self.check_get_aligned_pairs_combos_without_MD(
+            a,
             [
-                (qpos, refpos)
-                for (qpos, refpos) in zip(range(0, 0 + 40), range(20, 20 + 40))
-            ],
-        )
-        self.assertEqual(
-            a.get_aligned_pairs(True),
-            [
-                (qpos, refpos)
+                (qpos, refpos, CEQUAL if qpos < 20 else CDIFF)
                 for (qpos, refpos) in zip(range(0, 0 + 40), range(20, 20 + 40))
             ],
         )
@@ -522,8 +631,16 @@ def test_get_aligned_pairs_padding(self):
         a.cigartuples = ((0, 1), (6, 1), (0, 1))
         # The padding operation is like an insertion into the reference.
         # See comment in test_get_aligned_pairs_padding_with_seq (below).
-        self.assertEqual(a.get_aligned_pairs(),
-                         [(0, 20), (1, None), (2, 21)])
+        self.check_get_aligned_pairs_combos_without_MD(a,
+                         [(0, 20, CMATCH), (1, None, CPAD), (2, 21, CMATCH)])
+
+    def test_get_aligned_pairs_padding_via_cigarstring(self):
+        a = self.build_read()
+        a.cigarstring = "1M1P1M"
+        # The padding operation is like an insertion into the reference.
+        # See comment in test_get_aligned_pairs_padding_with_seq (below).
+        self.check_get_aligned_pairs_combos_without_MD(a,
+                         [(0, 20, CMATCH), (1, None, CPAD), (2, 21, CMATCH)])
 
     def test_get_aligned_pairs_padding_with_seq(self):
         a = self.build_read()
@@ -550,80 +667,80 @@ def test_get_aligned_pairs_padding_with_seq(self):
         # string: "Alternatively, to describe the same alignments, we can
         # modify the reference sequence to contain pads that make room for
         # sequences inserted relative to the reference."
-        self.assertEqual(a.get_aligned_pairs(with_seq=True),
-                         [(0, 20, 'A'), (1, None, None), (2, 21, 'T')])
+        self.check_get_aligned_pairs_combos(a,
+            [(0, 20, "A", CMATCH), (1, None, None, CPAD), (2, 21, "T", CMATCH)])
 
     def test_get_aligned_pairs(self):
         a = self.build_read()
         a.query_sequence = "A" * 9
         a.cigarstring = "9M"
         a.set_tag("MD", "9")
-        self.assertEqual(
-            a.get_aligned_pairs(with_seq=True),
+        self.check_get_aligned_pairs_combos(
+            a,
             [
-                (0, 20, "A"),
-                (1, 21, "A"),
-                (2, 22, "A"),
-                (3, 23, "A"),
-                (4, 24, "A"),
-                (5, 25, "A"),
-                (6, 26, "A"),
-                (7, 27, "A"),
-                (8, 28, "A"),
+                (0, 20, "A", CMATCH),
+                (1, 21, "A", CMATCH),
+                (2, 22, "A", CMATCH),
+                (3, 23, "A", CMATCH),
+                (4, 24, "A", CMATCH),
+                (5, 25, "A", CMATCH),
+                (6, 26, "A", CMATCH),
+                (7, 27, "A", CMATCH),
+                (8, 28, "A", CMATCH),
             ],
         )
 
         a.set_tag("MD", "4C4")
-        self.assertEqual(
-            a.get_aligned_pairs(with_seq=True),
+        self.check_get_aligned_pairs_combos(
+            a,
             [
-                (0, 20, "A"),
-                (1, 21, "A"),
-                (2, 22, "A"),
-                (3, 23, "A"),
-                (4, 24, "c"),
-                (5, 25, "A"),
-                (6, 26, "A"),
-                (7, 27, "A"),
-                (8, 28, "A"),
+                (0, 20, "A", CMATCH),
+                (1, 21, "A", CMATCH),
+                (2, 22, "A", CMATCH),
+                (3, 23, "A", CMATCH),
+                (4, 24, "c", CMATCH),
+                (5, 25, "A", CMATCH),
+                (6, 26, "A", CMATCH),
+                (7, 27, "A", CMATCH),
+                (8, 28, "A", CMATCH),
             ],
         )
 
         a.cigarstring = "5M2D4M"
         a.set_tag("MD", "4C^TT4")
-        self.assertEqual(
-            a.get_aligned_pairs(with_seq=True),
+        self.check_get_aligned_pairs_combos(
+            a,
             [
-                (0, 20, "A"),
-                (1, 21, "A"),
-                (2, 22, "A"),
-                (3, 23, "A"),
-                (4, 24, "c"),
-                (None, 25, "T"),
-                (None, 26, "T"),
-                (5, 27, "A"),
-                (6, 28, "A"),
-                (7, 29, "A"),
-                (8, 30, "A"),
+                (0, 20, "A", CMATCH),
+                (1, 21, "A", CMATCH),
+                (2, 22, "A", CMATCH),
+                (3, 23, "A", CMATCH),
+                (4, 24, "c", CMATCH),
+                (None, 25, "T", CDEL),
+                (None, 26, "T", CDEL),
+                (5, 27, "A", CMATCH),
+                (6, 28, "A", CMATCH),
+                (7, 29, "A", CMATCH),
+                (8, 30, "A", CMATCH),
             ],
         )
 
         a.cigarstring = "5M2D2I2M"
         a.set_tag("MD", "4C^TT2")
-        self.assertEqual(
-            a.get_aligned_pairs(with_seq=True),
+        self.check_get_aligned_pairs_combos(
+            a,
             [
-                (0, 20, "A"),
-                (1, 21, "A"),
-                (2, 22, "A"),
-                (3, 23, "A"),
-                (4, 24, "c"),
-                (None, 25, "T"),
-                (None, 26, "T"),
-                (5, None, None),
-                (6, None, None),
-                (7, 27, "A"),
-                (8, 28, "A"),
+                (0, 20, "A", CMATCH),
+                (1, 21, "A", CMATCH),
+                (2, 22, "A", CMATCH),
+                (3, 23, "A", CMATCH),
+                (4, 24, "c", CMATCH),
+                (None, 25, "T", CDEL),
+                (None, 26, "T", CDEL),
+                (5, None, None, CINS),
+                (6, None, None, CINS),
+                (7, 27, "A", CMATCH),
+                (8, 28, "A", CMATCH),
             ],
         )
 
@@ -643,54 +760,21 @@ def test_get_aligned_pairs_skip_reference(self):
         a.cigarstring = "5M1N5M"
         a.set_tag("MD", "10")
 
-        self.assertEqual(
-            a.get_aligned_pairs(with_seq=True),
+        self.check_get_aligned_pairs_combos(
+            a,
             [
-                (0, 20, "A"),
-                (1, 21, "A"),
-                (2, 22, "A"),
-                (3, 23, "A"),
-                (4, 24, "A"),
-                (None, 25, None),
-                (5, 26, "A"),
-                (6, 27, "A"),
-                (7, 28, "A"),
-                (8, 29, "A"),
-                (9, 30, "A"),
-            ],
-        )
-
-        self.assertEqual(
-            a.get_aligned_pairs(with_seq=False),
-            [
-                (0, 20),
-                (1, 21),
-                (2, 22),
-                (3, 23),
-                (4, 24),
-                (None, 25),
-                (5, 26),
-                (6, 27),
-                (7, 28),
-                (8, 29),
-                (9, 30),
-            ],
-        )
-
-        self.assertEqual(
-            a.get_aligned_pairs(matches_only=True, with_seq=False),
-            [
-                (0, 20),
-                (1, 21),
-                (2, 22),
-                (3, 23),
-                (4, 24),
-                (5, 26),
-                (6, 27),
-                (7, 28),
-                (8, 29),
-                (9, 30),
-            ],
+                (0, 20, "A", CMATCH),
+                (1, 21, "A", CMATCH),
+                (2, 22, "A", CMATCH),
+                (3, 23, "A", CMATCH),
+                (4, 24, "A", CMATCH),
+                (None, 25, None, CREF_SKIP),
+                (5, 26, "A", CMATCH),
+                (6, 27, "A", CMATCH),
+                (7, 28, "A", CMATCH),
+                (8, 29, "A", CMATCH),
+                (9, 30, "A", CMATCH),
+            ]
         )
 
     def test_equivalence_matches_only_and_with_seq(self):
@@ -698,32 +782,22 @@ def test_equivalence_matches_only_and_with_seq(self):
         a.query_sequence = "ACGT" * 2
         a.cigarstring = "4M1D4M"
         a.set_tag("MD", "4^x4")
-        full = (
-            list(zip(range(0, 4), range(20, 24), "ACGT"))
-            + [(None, 24, "x")]
-            + list(zip(range(4, 8), range(25, 29), "ACGT"))
-        )
-        self.assertEqual(a.get_aligned_pairs(matches_only=False, with_seq=True), full)
-
-        self.assertEqual(
-            a.get_aligned_pairs(matches_only=True, with_seq=True),
-            [x for x in full if x[0] is not None and x[1] is not None],
+        self.check_get_aligned_pairs_combos(
+            a,
+            list(zip(range(0, 4), range(20, 24), "ACGT", [CMATCH] * 4))
+            + [(None, 24, "x", CDEL)]
+            + list(zip(range(4, 8), range(25, 29), "ACGT", [CMATCH] * 4)),
         )
 
         a = self.build_read()
         a.query_sequence = "ACGT" * 2
         a.cigarstring = "4M1N4M"
         a.set_tag("MD", "8")
-        full = (
-            list(zip(range(0, 4), range(20, 24), "ACGT"))
-            + [(None, 24, None)]
-            + list(zip(range(4, 8), range(25, 29), "ACGT"))
-        )
-        self.assertEqual(a.get_aligned_pairs(matches_only=False, with_seq=True), full)
-
-        self.assertEqual(
-            a.get_aligned_pairs(matches_only=True, with_seq=True),
-            [x for x in full if x[0] is not None and x[1] is not None],
+        self.check_get_aligned_pairs_combos(
+            a,
+            list(zip(range(0, 4), range(20, 24), "ACGT", [CMATCH] * 4))
+            + [(None, 24, None, 3)]
+            + list(zip(range(4, 8), range(25, 29), "ACGT", [CMATCH] * 4)),
         )
 
     def test_get_aligned_pairs_lowercase_md(self):
@@ -991,6 +1065,16 @@ def testCigarString(self):
         r.cigarstring = None
         self.assertEqual(r.cigarstring, None)
 
+        r.cigarstring = "40M"
+        self.assertEqual(r.cigartuples, [(0, 40)])
+        r.cigarstring = ""
+        self.assertEqual(r.cigarstring, None)
+
+        r.cigarstring = "40M"
+        self.assertEqual(r.cigartuples, [(0, 40)])
+        r.cigarstring = "*"
+        self.assertEqual(r.cigarstring, None)
+
     def testCigar(self):
         r = self.build_read()
         self.assertEqual(r.cigartuples, [(0, 10), (2, 1), (0, 9), (1, 1), (0, 20)])
@@ -1028,6 +1112,16 @@ def testStats(self):
             expected[1][i] = 1
             self.assertEqual([list(x) for x in a.get_cigar_stats()], expected)
 
+        for i in range(1, 100):
+            cigarstring = "".join("10{}".format(x)
+                                  for x in iter("MIDNSHP=X")) * i
+            a.cigarstring = cigarstring
+            self.assertEqual(a.cigarstring, cigarstring)
+            expected = [[i * 10 for j in range(len("MIDNSHP=X"))] + [0, 0],
+                        [i for j in range(len("MIDNSHP=X"))] + [0, 0]]
+            obtained = [list(x) for x in a.get_cigar_stats()]
+            self.assertEqual(obtained, expected)
+
         a.cigarstring = "10M"
         a.set_tag("NM", 5)
         self.assertEqual(
@@ -1069,7 +1163,7 @@ def testChebi(self):
         expect = {
             ("C", 0, "m"): [(6, 102), (17, 128), (20, 153), (31, 179), (34, 204)],
             ("N", 0, "n"): [(15, 212)],
-            ("C", 0, 76792): [(19, 161), (34, 187)],
+            ("C", 0, 76792): [(19, 161), (34, 33)],
         }
 
         with pysam.AlignmentFile(filename, check_sq=False) as inf:
@@ -1090,6 +1184,21 @@ def testDouble(self):
             r = next(iter(inf))
             self.assertDictEqual(r.modified_bases, expect)
 
+    def testExplicit(self):
+        """reference bases should always be the same nucleotide
+        """
+        filename = os.path.join(BAM_DATADIR, "MM-explicit.bam")
+        expected_output = [
+            {("C", 0, "m"): [(9, 200), (10, 50), (14, 160)], ("C", 0, "h"): [(9, 10), (10, 170), (14, 20)]},
+            {("C", 0, "m"): [(9, 200), (10, 50), (13, 10), (14, 160), (16, 10)],
+             ("C", 0, "h"): [(9, 10), (10, 170), (13, 5), (14, 20), (16, 5)]},
+            {("C", 0, "m"): [(9, 200), (14, 160)], ("C", 0, "h"): [(9, 10), (10, 170), (13, 5), (14, 20), (16, 5)]},
+        ]
+
+        with pysam.AlignmentFile(filename, check_sq=False) as inf:
+            for r, expected in zip(inf, expected_output):
+                self.assertDictEqual(r.modified_bases, expected)
+
     def testMulti(self):
         """reference bases should always be the same nucleotide
         """
@@ -1609,6 +1718,18 @@ def test_set_tag_with_automated_type_detection(self):
             alt_value_type="I",
         )
 
+    def test_set_tag_invalid_value_type(self):
+        with self.assertRaises(ValueError):
+            self.check_tag("TT", "abc", value_type="#")
+
+    def test_set_array_tag_invalid_value_type(self):
+        with self.assertRaises(ValueError):
+            self.check_tag("TT", array.array('I', range(4)), value_type='#')
+
+    def test_set_array_tag_invalid_typecode(self):
+        with self.assertRaises(ValueError):
+            self.check_tag("TT", array.array('L', range(4)), value_type=None)
+
 
 class TestSetTagsGetTag(TestSetTagGetTag):
     def check_tag(self, tag, value, value_type, alt_value_type=None):
diff --git a/tests/AlignmentFileHeader_test.py b/tests/AlignmentFileHeader_test.py
index 91e044c20..a3d971487 100644
--- a/tests/AlignmentFileHeader_test.py
+++ b/tests/AlignmentFileHeader_test.py
@@ -286,7 +286,6 @@ def compare_headers(self, a, header_b):
     def check_read_write(self, flag_write, header):
 
         fn = get_temp_filename()
-        print(fn)
         with pysam.AlignmentFile(
                 fn,
                 flag_write,
@@ -294,12 +293,13 @@ def check_read_write(self, flag_write, header):
                 reference_filename=os.path.join(BAM_DATADIR, "ex1.fa")) as outf:
             a = pysam.AlignedSegment()
             a.query_name = "abc"
+            a.flag = pysam.FUNMAP
             outf.write(a)
 
         with pysam.AlignmentFile(fn) as inf:
             read_header = inf.header
 
-        # os.unlink(fn)
+        os.unlink(fn)
         self.compare_headers(header, read_header)
         expected_lengths = dict([(x["SN"], x["LN"]) for x in header["SQ"]])
         self.assertEqual(expected_lengths,
diff --git a/tests/TestUtils.py b/tests/TestUtils.py
index cde197e6d..11944c26a 100644
--- a/tests/TestUtils.py
+++ b/tests/TestUtils.py
@@ -5,6 +5,8 @@
 import inspect
 import subprocess
 import tempfile
+import time
+
 import pysam
 
 WORKDIR = os.path.abspath(os.path.join(os.path.dirname(__file__),
@@ -239,15 +241,27 @@ def get_temp_context(suffix="", keep=False):
 
 
 def make_data_files(directory):
-    what = None
+    if os.path.exists(os.path.join(directory, 'all.stamp')):
+        return
+
+    make = os.environ.get('MAKE', 'make')
+
+    for attempt in range(1, 6):
+        try:
+            os.mkdir(os.path.join(directory, 'all.lock'), 0o700)
+            break
+        except FileExistsError:
+            time.sleep(attempt)
+            continue
+    else:
+        raise RuntimeError(f'Directory {directory!r} already locked: try `{make} clean` there')
+
     try:
-        if not os.path.exists(os.path.join(directory, "all.stamp")):
-            subprocess.check_output(["make", "-C", directory], stderr=subprocess.STDOUT)
+        subprocess.check_output([make, '-C', directory], stderr=subprocess.STDOUT, encoding='ascii')
     except subprocess.CalledProcessError as e:
-        what = "Making test data in '%s' failed:\n%s" % (directory, force_str(e.output))
-
-    if what is not None:
-        raise RuntimeError(what)
+        raise RuntimeError(f'Making test data in {directory!r} failed:\n{e.output}') from None
+    finally:
+        os.rmdir(os.path.join(directory, 'all.lock'))
 
 
 def load_and_convert(filename, encode=True):
diff --git a/tests/VariantFile_bench.py b/tests/VariantFile_bench.py
index 9663ea6a2..ba67a4945 100644
--- a/tests/VariantFile_bench.py
+++ b/tests/VariantFile_bench.py
@@ -23,7 +23,7 @@ def genomes_data():
     fn_small = "small.vcf.gz"
     if not os.path.exists(fn_small):
         os.system("bcftools view {} | head -n 10000 | bgzip > {}".format(fn, fn_small))
-        os.system("tabix -p vcf {}".format(fn_small))
+        os.system("tabix -f -p vcf {}".format(fn_small))
         
     return fn_small
 
diff --git a/tests/VariantRecord_test.py b/tests/VariantRecord_test.py
index 310b83839..9b59fb9f7 100644
--- a/tests/VariantRecord_test.py
+++ b/tests/VariantRecord_test.py
@@ -88,3 +88,23 @@ def test_set_sample_alleles(vcf_header):
 
     with pytest.raises(ValueError, match='Use .allele_indices to set integer allele indices'):
         record.samples['sample1'].alleles = (1, 0)
+
+
+def test_repeated_new_record(vcf_header):
+    vcf_header.formats.add('GT', 1, 'String', "Genotype")
+    vcf_header.formats.add("AA", 1, "String", "An annotation")
+    vcf_header.formats.add("BB", 1, "String", "Another annotation")
+
+    data = {'id': 'INS_1', 'contig': '1', 'start': 10, 'stop': 15, 'alleles': ['A', 'TCGA'],
+            'samples': [{'AA': ('one'), 'GT': (0, 1), 'BB': ('two')},
+                        {'GT': (1, 0), 'BB': ('three')}]}
+
+    record1 = vcf_header.new_record(**data)
+    assert '\tGT:' in str(record1)  # Verify that GT is output first
+    assert record1.samples['sample1'].alleles == ('A', 'TCGA')
+    assert record1.samples['sample2'].alleles == ('TCGA', 'A')
+
+    record2 = vcf_header.new_record(**data)
+    assert '\tGT:' in str(record2)  # Verify that GT is actually emitted and is output first
+    assert record2.samples['sample1'].alleles == ('A', 'TCGA')
+    assert record2.samples['sample2'].alleles == ('TCGA', 'A')
diff --git a/tests/cbcf_data/Makefile b/tests/cbcf_data/Makefile
index 9c3fe7573..8a155fcad 100644
--- a/tests/cbcf_data/Makefile
+++ b/tests/cbcf_data/Makefile
@@ -11,15 +11,15 @@ all.stamp: $(VCFGZ) $(BCF)
 
 %.vcf.gz: %.vcf
 	bgzip < $< > $@
-	tabix -p vcf $@    # create tbi index
-	bcftools index $@  # create csi index
+	tabix -f -p vcf $@    # create tbi index
+	bcftools index -f $@  # create csi index
 
 %.bcf: %.vcf.gz
 	bcftools view -O b $< -o $@
-	bcftools index $@
+	bcftools index -f $@
 
 example_empty.bcf: example_empty.vcf.gz
 	touch $@
 
 clean:
-	-rm -f all.stamp *.gz *.tbi *.csi *.bcf
+	-rm -rf all.lock all.stamp *.gz *.tbi *.csi *.bcf
diff --git a/tests/compile_test.py b/tests/compile_test.py
index 2ea3343a4..4ce3d2e49 100644
--- a/tests/compile_test.py
+++ b/tests/compile_test.py
@@ -10,11 +10,12 @@
 import os
 import pytest
 import pysam
-from TestUtils import make_data_files, BAM_DATADIR, TABIX_DATADIR
+from TestUtils import make_data_files, BAM_DATADIR, CBCF_DATADIR, TABIX_DATADIR
 
 
 def setUpModule():
     make_data_files(BAM_DATADIR)
+    make_data_files(CBCF_DATADIR)
     make_data_files(TABIX_DATADIR)
 
 
@@ -50,3 +51,30 @@ def test_gtf():
     nread = _compile_test.testCountGTF(
         pysam.Tabixfile(input_filename))
     assert nread == 237
+
+
+class TestBinaryCompatibility:
+    def test_alignments(self):
+        fp = pysam.AlignmentFile(os.path.join(BAM_DATADIR, "ex1.bam"))
+        hdr = pysam.AlignmentHeader()
+        aln = pysam.AlignedSegment()
+
+        assert fp.__sizeof__() == 120
+        assert hdr.__sizeof__() == 24
+        assert aln.__sizeof__() == 72
+
+    def test_tabix(self):
+        gzit = pysam.GZIterator(os.path.join(TABIX_DATADIR, "example.gtf.gz"))
+
+        with open(os.path.join(TABIX_DATADIR, "example.gtf.gz")) as fp:
+            tfit = pysam.tabix_file_iterator(fp, pysam.asTuple())
+
+        assert gzit.__sizeof__() == 80
+        assert tfit.__sizeof__() == 96
+
+    def test_variants(self):
+        fp = pysam.VariantFile(os.path.join(CBCF_DATADIR, "example_vcf43.vcf"))
+        hdr = pysam.VariantHeader()
+
+        assert fp.__sizeof__() == 120
+        assert hdr.__sizeof__() == 32
diff --git a/tests/pysam_data/MM-chebi.sam b/tests/pysam_data/MM-chebi.sam
index 62920ecc1..475a7d599 100644
--- a/tests/pysam_data/MM-chebi.sam
+++ b/tests/pysam_data/MM-chebi.sam
@@ -1,2 +1,2 @@
 @CO	Separate m, h and N modifications
-*	0	*	0	0	*	*	0	0	AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA	*	Mm:Z:C+m,2,2,1,4,1;C+76792,6,7;N+n,15;	Ml:B:C,102,128,153,179,204,161,187,212,169
+*	0	*	0	0	*	*	0	0	AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA	*	Mm:Z:C+m,2,2,1,4,1;C+76792,6,7;N+n,15;	Ml:B:C,102,128,153,179,204,161,33,212
diff --git a/tests/pysam_data/MM-explicit.sam b/tests/pysam_data/MM-explicit.sam
new file mode 100644
index 000000000..c230a9d82
--- /dev/null
+++ b/tests/pysam_data/MM-explicit.sam
@@ -0,0 +1,27 @@
+@CO	Testing explicit vs implicit base modifications.
+@CO	This covers the case where a lack of a signal could be either
+@CO	implicitly assumed to be no-mod (default) or assumed to be
+@CO	unchecked and require an explicit statement to indicate it was
+@CO	looked at and no base modification was observed.
+@CO	
+@CO	ATCATCATTCCTACCGCTATAGCCT  r1; implicit
+@CO	  -  -   ..  -. -     --
+@CO	         Mm   M
+@CO	  -  -   ..  -. -     --
+@CO	         hH   h
+@CO	
+@CO	ATCATCATTCCTACCGCTATAGCCT  r2; explicit to a small region
+@CO	  -  -   ??  ?? ?     --
+@CO	         Mm  mM m
+@CO	  -  -   ??  ?? ?     --
+@CO	         hH  hh h
+@CO	
+@CO	ATCATCATTCCTACCGCTATAGCCT  r3; mixture
+@CO	  -  -   .   -. -     --
+@CO	         M    M
+@CO	  -  -   ??  ?? ?     --
+@CO	         hH  hh h     --
+@CO	
+r1	0	*	0	0	*	*	0	0	ATCATCATTCCTACCGCTATAGCCT	*	Mm:Z:C+mh,2,0,1;	Ml:B:C,200,10,50,170,160,20
+r2	0	*	0	0	*	*	0	0	ATCATCATTCCTACCGCTATAGCCT	*	Mm:Z:C+mh?,2,0,0,0,0;	Ml:B:C,200,10,50,170,10,5,160,20,10,5
+r3	0	*	0	0	*	*	0	0	ATCATCATTCCTACCGCTATAGCCT	*	Mm:Z:C+m.,2,2;C+h?,2,0,0,0,0;	Ml:B:C,200,160,10,170,5,20,5
diff --git a/tests/pysam_data/MM-multi.sam b/tests/pysam_data/MM-multi.sam
index b2259a09e..ac2831bc1 100644
--- a/tests/pysam_data/MM-multi.sam
+++ b/tests/pysam_data/MM-multi.sam
@@ -3,5 +3,5 @@
 @CO	r2 has them combined together, for example as produced by
 @CO	a joint basecaller which assigns probabilities to all
 @CO	trained events simultaneously.
-r1	0	*	0	0	*	*	0	0	AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA	*	Mm:Z:C+m,2,2,1,4,1;C+h,6,7;N+n,15,2;	Ml:B:C,128,153,179,204,230,159,6,215,240
+r1	0	*	0	0	*	*	0	0	AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA	*	Mm:Z:C+m,2,2,1,4,1;C+h,6,7;N+n,15,2;	Ml:B:C,128,153,179,204,230,159,6,215,240	MN:i:36
 r2	0	*	0	0	*	*	0	0	AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA	*	Mm:Z:C+mh,2,2,0,0,4,1;N+n,15;	Ml:B:C,77,159,103,133,128,108,154,82,179,57,204,31,240
diff --git a/tests/pysam_data/Makefile b/tests/pysam_data/Makefile
index d87044064..16d533adf 100644
--- a/tests/pysam_data/Makefile
+++ b/tests/pysam_data/Makefile
@@ -100,8 +100,8 @@ explicit_index.cram: ex1.cram
 	cp ex1.cram $@
 
 clean:
-	rm -fr [a-z]*.bam *.bai *.csi *.fai *.gzi *.pileup* [a-z]*.cram *.crai \
-	all.stamp *~ calDepth *.dSYM pysam_*.sam \
+	rm -fr [A-Za-z]*.bam *.bai *.csi *.fai *.gzi *.pileup* [A-Za-z]*.cram *.crai \
+	all.lock all.stamp *~ calDepth *.dSYM pysam_*.sam \
 	ex2.sam ex2.sam.gz ex1.sam ex1.fa.gz \
 	with_md.sam.gz \
 	*.fq.gz
diff --git a/tests/refactoring.pl b/tests/refactoring.pl
deleted file mode 100644
index 33001114e..000000000
--- a/tests/refactoring.pl
+++ /dev/null
@@ -1,60 +0,0 @@
-while (<STDIN>) {
-    # Samfile refactoring
-    s/Samfile/AlignmentFile/g;
-
-    # AlignedRead refactoring
-    s/AlignedRead/AlignedSegment/g;
-    
-    # Do these patterns first as they match
-    # the new names
-    s/\.query/\.query_alignment_sequence/g;
-    s/\.positions/\.getReferencePositions()/g;
-
-    # Tabixfile, etc
-    s/Tabixfile/TabixFile/g;
-    s/Fastafile/FastaFile/g;
-    s/Fastqfile/FastqFile/g;
-
-    # basic attributes
-    s/\.qname/\.query_name/g;
-    s/\.tid/\.reference_id/g;
-    s/\.pos/\.reference_start/g;
-    s/\.mapq/\.mapping_quality/g;
-    s/\.rnext/\.next_reference_id/g;
-    s/\.pnext/\.next_reference_start/g;
-    s/\.tlen/\.query_length/g;
-    s/\.seq/\.query_sequence/g;
-    if (/\.qual =/) {
-	s/([[\].0-9a-zA-Z]*)\.qual = (\S*)/$1.query_qualities = pysam.fromQualityString($2)/g;
-    } else {
-	s/([[\].0-9a-zA-Z]*)\.qual/pysam.toQualityString($1\.query_qualities)/g;
-    }
-    s/\.alen/\.reference_length/g;
-    s/\.aend/\.reference_end/g;
-    s/\.rlen/\.query_alignment_length/g;
-    s/([[\].0-9a-zA-Z]*)\.qqual/pysam.toQualityString($1\.query_alignment_qualities)/g;
-    s/\.qstart/\.query_alignment_start/g;
-    s/\.qend/\.query_alignment_end/g;
-    s/\.qlen/\.query_alignment_length/g;
-    s/\.mrnm/\.next_reference_id/g;
-    s/\.rnext/\.next_reference_id/g;
-    s/\.mpos/\.next_reference_start/g;
-    s/\.rname/\.reference_id/g;
-    s/\.isize/\.query_length/g;
-    s/\.cigar/\.cigartuples/g unless (/\.cigarstring/);
-
-    s/\.blocks/\.getBlocks()/g;
-    s/\.aligned_pairs/\.getAlignedPairs()/g;
-    s/\.inferred_length/\.getInferredQueryLength()/g;
-
-    s/\.overlap()/\.getOverlap()/g;
-
-    # PileupProxy
-    s/\.n([^a-zA-Z])/\.nsegments$1/g;
-    
-    # if (/\.mrnm/ || /\.rnext/ || /\.mpos/ || /\.rname/)
-    # { 
-    #     warn "Deprecated tag $& at line $.\n";
-    # }
-    print;
-}
diff --git a/tests/samtools_test.py b/tests/samtools_test.py
index 6d49a94f3..fc31d8c06 100644
--- a/tests/samtools_test.py
+++ b/tests/samtools_test.py
@@ -83,7 +83,8 @@ class SamtoolsTest(unittest.TestCase):
         "index ex1.bam %(out)s_ex1.bam.fai",
         "index -@2 ex1.bam %(out)s_ex1.bam.fai",
         "idxstats ex1.bam > %(out)s_ex1.idxstats",
-        "fixmate ex1.bam %(out)s_ex1.fixmate.bam",
+        # TODO: fixmate behaviour changed in 1.21
+        #"fixmate ex1.bam %(out)s_ex1.fixmate.bam",
         "flagstat ex1.bam > %(out)s_ex1.flagstat",
         "calmd ex1.bam ex1.fa > %(out)s_ex1.calmd.bam",
         # use -s option, otherwise the following error in samtools 1.2:
@@ -178,8 +179,6 @@ def check_statement(self, statement):
 
         command = self.get_command(statement)
 
-        # self.assertTrue(command in pysam.SAMTOOLS_DISPATCH)
-
         targets = [x for x in parts if "%(out)s" in x]
         samtools_targets = [x % r_samtools for x in targets]
         pysam_targets = [x % r_pysam for x in targets]
@@ -292,6 +291,16 @@ def testEmptyIndexWithExtraArg(self):
                           "exdoesntexist.bam")
 
 
+class ExerciseSubcommands(unittest.TestCase):
+    def testFailingSamtools(self):
+        with self.assertRaises(pysam.SamtoolsError):
+            pysam.samtools.view("nonexistent.bam")
+
+    def testFailingBCFtools(self):
+        with self.assertRaises(pysam.SamtoolsError):
+            pysam.bcftools.view("nonexistent.vcf")
+
+
 if sys.platform != "darwin":
     # fails with segfault with htslib 1.5 on Osx, an issue with flockfile
     # issue seems to be with repeated calls to interface
diff --git a/tests/tabix_data/Makefile b/tests/tabix_data/Makefile
index 19812df06..4927586b7 100644
--- a/tests/tabix_data/Makefile
+++ b/tests/tabix_data/Makefile
@@ -25,16 +25,16 @@ all.stamp: $(DERIVED_FILES)
 	bgzip -c $< > $@
 
 %.gff2.gz.tbi: %.gff2.gz
-	tabix -p gff $<
+	tabix -f -p gff $<
 
 %.gff3.gz.tbi: %.gff3.gz
-	tabix -p gff $<
+	tabix -f -p gff $<
 
 %.gtf.gz.tbi: %.gtf.gz
-	tabix -p gff $<
+	tabix -f -p gff $<
 
 %.gz.tbi: %.gz
-	tabix -p $(subst .,,$(suffix $*)) $<
+	tabix -f -p $(subst .,,$(suffix $*)) $<
 
 clean:
-	-rm -f all.stamp $(DERIVED_FILES)
+	-rm -rf all.lock all.stamp $(DERIVED_FILES)
diff --git a/tests/typechecking_test.py b/tests/typechecking_test.py
new file mode 100644
index 000000000..5dce4856f
--- /dev/null
+++ b/tests/typechecking_test.py
@@ -0,0 +1,135 @@
+import inspect
+import os
+import pytest
+import re
+from typing import TYPE_CHECKING
+
+import pysam
+import pysam.samtools
+import pysam.bcftools
+
+from TestUtils import BAM_DATADIR, make_data_files
+
+try:
+    import mypy.api
+except ImportError:
+    pytest.skip('mypy API not available', allow_module_level=True)
+
+PREAMBLE = """
+import re
+from typing import TYPE_CHECKING
+
+import pysam
+import pysam.samtools
+import pysam.bcftools
+
+def typecheck(check_locals = True): pass
+"""
+
+MYPY_OPTIONS = ['--no-incremental', '--no-error-summary', '--follow-imports', 'silent']
+
+
+def typecheck(check_locals: bool = True):
+    myframe = inspect.currentframe()
+    if not myframe: pytest.skip('current stack frame not available')
+    caller = myframe.f_back
+    if not caller: pytest.skip('caller stack frame not available')
+
+    code = inspect.getsource(caller)
+    stdout, stderr, status = mypy.api.run(MYPY_OPTIONS + ['--command', PREAMBLE + code])
+    assert status == 0, f'mypy failed:\n{stdout}{stderr}'
+
+    types = {}
+    for line in stdout.splitlines():
+        m = re.search(r'note:   *(\w+): ([\w.]*)', line)
+        if m: types[m.group(1)] = m.group(2)
+
+    def _plain(s):
+        s = re.sub(r"<class '([^']*)'>", r'\1', s)
+        s = re.sub(r'builtins\.', '', s)
+        return s
+
+    if check_locals:
+        for var, vartype in types.items():
+            assert _plain(vartype) == _plain(repr(type(caller.f_locals[var]))), f'Incorrect type for {var!r}'
+
+    return types
+
+
+def setUpModule():
+    make_data_files(BAM_DATADIR)
+
+
+@pytest.fixture
+def aln():
+    header = pysam.AlignmentHeader.from_references(['chr1', 'chr2'], [50000, 20000])
+    a = pysam.AlignedSegment(header)
+    a.query_name = 'read_one'
+    a.flag = pysam.FPAIRED | pysam.FREAD1
+    a.reference_id = 0
+    a.reference_start = 1000
+    a.mapping_quality = 20
+    a.next_reference_id = 1
+    a.next_reference_start = 5000
+    a.template_length = 0
+    a.cigartuples = ((pysam.CMATCH, 6), (pysam.CINS, 4), (pysam.CMATCH, 2))
+    a.query_sequence = 'ATGCATGCATGC'
+    a.query_qualities = pysam.qualitystring_to_array('abcdefghijkl')
+    return a
+
+
+@pytest.fixture
+def sam_fname():
+    return os.path.join(BAM_DATADIR, 'ex3.sam')
+
+
+@pytest.fixture
+def bam_fname():
+    return os.path.join(BAM_DATADIR, 'ex1.bam')
+
+
+def test_AlignedSegment_is_mapped(aln: pysam.AlignedSegment) -> None:
+    rmap = aln.is_mapped
+    mmap = aln.mate_is_mapped
+    rfwd = aln.is_forward
+    mfwd = aln.mate_is_forward
+
+    if TYPE_CHECKING: reveal_locals()
+    types = typecheck()
+
+
+def test_AlignmentHeader_get_reference_length(sam_fname: str) -> None:
+    inf = pysam.AlignmentFile(sam_fname)
+    n1 = inf.get_reference_length('chr1')
+    hdr = inf.header
+    n2 = hdr.get_reference_length('chr1')
+
+    if TYPE_CHECKING: reveal_locals()
+    types = typecheck()
+
+
+def test_pileup_iterator_column(bam_fname: str) -> None:
+    inf = pysam.AlignmentFile(bam_fname)
+    pu = inf.pileup('chr1')
+    for p in pu:
+        pid = p.reference_id
+        ppos = p.reference_pos
+
+    if TYPE_CHECKING: reveal_locals()
+    types = typecheck(check_locals=False)
+    assert re.search(r'\.IteratorColumn(|All|AllRefs|Region)$', types['pu'])
+    assert types['p'].endswith('PileupColumn')
+    assert types['pid'] == 'builtins.int'
+    assert types['ppos'] == 'builtins.int'
+
+
+def test_samtools_subcommands() -> None:
+    p1_samtools_faidx = pysam.samtools.faidx
+    p2_faidx = pysam.faidx
+    p3_view = pysam.view
+    p4_bcftools_view = pysam.bcftools.view
+
+    if TYPE_CHECKING: reveal_locals()
+    types = typecheck()
+    for var, vartype in types.items():
+        assert vartype.endswith('PysamDispatcher'), f'{var!r} is not a dispatcher'