diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 15314a4fc..36323d1fe 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -39,25 +39,28 @@ jobs: matrix: # Tests [amd64] # - os: [ubuntu-latest, macos-latest, windows-2019] + os: [ubuntu-22.04, macos-latest, windows-2022] python-version: - - "3.6" - - "3.7" - "3.8" - "3.9" - "3.10" # quotes to avoid being interpreted as the number 3.1 - "3.11" - "3.12" - - "3.13-dev" + - "3.13" + - "3.14-dev" env: [{ STATIC_DEPS: true }, { STATIC_DEPS: false }] include: + #- os: ubuntu-22.04 + # python-version: "3.14-dev" + # allowed_failure: true + - os: ubuntu-latest python-version: "3.9" env: {STATIC_DEPS: true, WITH_REFNANNY: true} extra_hash: "-refnanny" - os: ubuntu-latest - python-version: "3.12" + python-version: "3.13" env: {STATIC_DEPS: true, WITH_REFNANNY: true} extra_hash: "-refnanny" @@ -73,7 +76,7 @@ jobs: # Old library setup with minimum version requirements - os: ubuntu-latest - python-version: "3.10" + python-version: "3.12" env: { STATIC_DEPS: true, LIBXML2_VERSION: 2.9.2, @@ -81,7 +84,7 @@ jobs: } extra_hash: "-oldlibs29" - os: ubuntu-latest - python-version: "3.10" + python-version: "3.12" env: { STATIC_DEPS: true, LIBXML2_VERSION: 2.10.3, @@ -89,7 +92,7 @@ jobs: } extra_hash: "-oldlibs210" - os: ubuntu-latest - python-version: "3.10" + python-version: "3.12" env: { STATIC_DEPS: true, LIBXML2_VERSION: 2.11.7, @@ -97,19 +100,40 @@ jobs: } extra_hash: "-oldlibs211" + - os: ubuntu-latest + python-version: "3.12" + #allowed_failure: true + env: { + STATIC_DEPS: true, + LIBXML2_VERSION: "", + LIBXSLT_VERSION: "", + } + extra_hash: "-latestlibs" + + - os: ubuntu-latest + python-version: "3.12" + #allowed_failure: true + env: { + STATIC_DEPS: "true", + LIBXML2_VERSION: "", + LIBXSLT_VERSION: "", + WITHOUT_ZLIB: "true", + } + extra_hash: "-nozlib" + # Ubuntu sub-jobs: # ================ # Pypy - os: ubuntu-latest - python-version: pypy-3.8 + python-version: pypy-3.9 env: { 
STATIC_DEPS: false } allowed_failure: true - os: ubuntu-latest - python-version: pypy-3.9 + python-version: pypy-3.10 env: { STATIC_DEPS: false } allowed_failure: true - os: ubuntu-latest - python-version: pypy-3.10 + python-version: pypy-3.11 env: { STATIC_DEPS: false } allowed_failure: true @@ -118,30 +142,32 @@ jobs: #- os: macos-latest # allowed_failure: true # Unicode parsing fails in Py3 - - os: ubuntu-20.04 - python-version: "3.6" - env: { STATIC_DEPS: true } # only static + # Legacy jobs + # =========== + #- os: ubuntu-22.04 + # python-version: "3.7" + # env: { STATIC_DEPS: true } + #- os: ubuntu-22.04 + # python-version: "3.7" + # env: { STATIC_DEPS: false } exclude: - - os: ubuntu-latest - python-version: "3.6" - # Windows sub-jobs # ============== - - os: windows-2019 + - os: windows-2022 env: { STATIC_DEPS: false } # always static # This defaults to 360 minutes (6h) which is way too long and if a test gets stuck, it can block other pipelines. - # From testing, the runs tend to take 3-8 minutes, so a limit of 30 minutes should be enough. - timeout-minutes: 30 + # From testing, the runs tend to take 8-20 minutes, so a limit of 45 minutes should be enough. 
+ timeout-minutes: 45 runs-on: ${{ matrix.os }} env: OS_NAME: ${{ matrix.os }} PYTHON_VERSION: ${{ matrix.python-version }} MACOSX_DEPLOYMENT_TARGET: 11.0 - LIBXML2_VERSION: 2.12.6 - LIBXSLT_VERSION: 1.1.39 + LIBXML2_VERSION: 2.14.3 + LIBXSLT_VERSION: 1.1.43 COVERAGE: false GCC_VERSION: 9 USE_CCACHE: 1 @@ -156,7 +182,7 @@ jobs: fetch-depth: 1 - name: Setup Python - uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 with: python-version: ${{ matrix.python-version }} @@ -166,22 +192,23 @@ jobs: brew install automake libtool ccache ln -s /usr/local/bin/glibtoolize /usr/local/bin/libtoolize - - name: Cache [ccache] - uses: pat-s/always-upload-cache@9a0d1c3e1a8260b05500f9b67a5be8f2a1299819 # v3.0.11 + - name: ccache + uses: hendrikmuhs/ccache-action@v1.2 if: runner.os == 'Linux' || runner.os == 'macOS' with: - path: ~/.ccache - key: ${{ runner.os }}-ccache${{ matrix.extra_hash }}-${{ matrix.python-version }}-${{ hashFiles('.github/workflows/ci.yml', 'tools/ci-run.sh') }} + max-size: 100M + create-symlink: true + key: ${{ runner.os }}-ccache${{ matrix.extra_hash }}-${{ matrix.python-version }}-${{ matrix.env.STATIC_DEPS }}-${{ env.LIBXML2_VERSION }}-${{ env.LIBXSLT_VERSION }} - name: Cache [libs] - uses: actions/cache@0c45773b623bea8c8e75f6c82b208c3cf94ea4f9 # v4.0.2 + uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3 if: matrix.env.STATIC_DEPS with: path: | libs/*.xz libs/*.gz libs/*.zip - key: libs-${{ runner.os }}-${{ env.LIBXML2_VERSION }}-${{ env.LIBXSLT_VERSION }} + key: libs-${{ runner.os }}-${{ matrix.env.LIBXML2_VERSION }}-${{ matrix.env.LIBXSLT_VERSION }} - name: Run CI continue-on-error: ${{ matrix.allowed_failure || false }} @@ -193,7 +220,7 @@ jobs: run: make html - name: Upload docs - uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3 # v4.3.1 + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 
# v4.6.2 if: matrix.extra_hash == '-docs' with: name: website_html @@ -201,48 +228,51 @@ jobs: if-no-files-found: ignore - name: Upload Coverage Report - uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3 # v4.3.1 + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 if: matrix.env.COVERAGE with: name: pycoverage_html path: coverage* if-no-files-found: ignore - - name: Upload Wheel - uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3 # v4.3.1 - if: matrix.env.STATIC_DEPS && !matrix.extra_hash - with: - name: wheels-${{ runner.os }}-${{ matrix.python-version }} - path: dist/*.whl - if-no-files-found: ignore - - collect-wheels: - needs: [ci] + benchmarks: runs-on: ubuntu-latest + env: + CFLAGS: -march=core2 -O3 -flto -fPIC -g -Wall -Wextra + CCACHE_SLOPPINESS: "pch_defines,time_macros" + CCACHE_COMPRESS: 1 + CCACHE_COMPRESSLEVEL: 5 + STATIC_DEPS: true + LIBXML2_VERSION: 2.14.3 + LIBXSLT_VERSION: 1.1.43 + steps: - - name: Collect wheels - uses: actions/download-artifact@c850b930e6ba138125429b7e5c93fc707a7f8427 # v4.1.4 + - name: Checkout repo + uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 with: - path: ~/downloads - merge-multiple: true - - - name: List downloaded artifacts - run: ls -la ~/downloads + fetch-depth: 0 + fetch-tags: true - - name: Upload Linux wheels - uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3 # v4.3.1 + - name: ccache + uses: hendrikmuhs/ccache-action@v1.2 + if: runner.os == 'Linux' || runner.os == 'macOS' with: - name: wheels-linux - path: ~/downloads/*linux*.whl + max-size: 150M + create-symlink: true + key: ${{ runner.os }}-benchmarks-${{ env.LIBXML2_VERSION }}-${{ env.LIBXSLT_VERSION }} - - name: Upload macOS wheels - uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3 # v4.3.1 + - name: Setup Python + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 with: - name: wheels-macosx - 
path: ~/downloads/*macosx*.whl + python-version: | + 3.12 + 3.14-dev - - name: Upload Windows wheels - uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3 # v4.3.1 - with: - name: wheels-windows - path: ~/downloads/*-win*/*.whl + - name: Run Benchmarks + run: | + # Run benchmarks in all Python versions. + for PYTHON in python3.14 python3.12 ; do + ${PYTHON} -m pip install setuptools "Cython>=3.1.2" + # Compare against arbitrary 6.0-pre baseline revision (compatible with Cython 3.1) and current master. + ${PYTHON} benchmark/run_benchmarks.py 0eb4f0029497957e58a9f15280b3529bdb18d117 origin/master HEAD + done diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index 84bdedb9e..cfd78d409 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -36,7 +36,7 @@ permissions: {} jobs: sdist: - runs-on: ubuntu-latest + runs-on: ubuntu-24.04 permissions: contents: write @@ -45,12 +45,12 @@ jobs: - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - name: Set up Python - uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 with: python-version: "3.x" - name: Install lib dependencies - run: sudo apt-get update -y -q && sudo apt-get install -y -q "libxml2=2.9.13*" "libxml2-dev=2.9.13*" libxslt1.1 libxslt1-dev + run: sudo apt-get update -y -q && sudo apt-get install -y -q "libxml2=2.9.14*" "libxml2-dev=2.9.14*" libxslt1.1 libxslt1-dev - name: Install Python dependencies run: python -m pip install -U pip setuptools && python -m pip install -U docutils pygments sphinx sphinx-rtd-theme -r requirements.txt @@ -60,13 +60,13 @@ jobs: env: { STATIC_DEPS: false; CFLAGS="-Og" } # it's run-once, so build more quickly - name: Upload sdist - uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3 # v4.3.1 + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 with: 
name: sdist path: dist/*.tar.gz - name: Upload website - uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3 # v4.3.1 + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 with: name: website path: doc/html @@ -76,24 +76,25 @@ # This enables the next step to run cibuildwheel in parallel. # From https://iscinumpy.dev/post/cibuildwheel-2-10-0/#only-210 name: Generate wheels matrix - runs-on: ubuntu-latest + runs-on: ubuntu-24.04 outputs: include: ${{ steps.set-matrix.outputs.include }} steps: - uses: actions/checkout@v4 - name: Install cibuildwheel # Nb. keep cibuildwheel version pin consistent with job below - run: pipx install cibuildwheel==2.15.0 + run: pipx install cibuildwheel==3.0.0 - id: set-matrix run: | MATRIX=$( { cibuildwheel --print-build-identifiers --platform linux \ - | jq -nRc '{"only": inputs, "os": "ubuntu-latest"}' \ + | jq -nRc '{"only": inputs, "os": "ubuntu-22.04"}' \ + | sed -e '/aarch64/s|ubuntu-22.04|ubuntu-22.04-arm|' \ && cibuildwheel --print-build-identifiers --platform macos \ | jq -nRc '{"only": inputs, "os": "macos-latest"}' \ && cibuildwheel --print-build-identifiers --platform windows \ - | jq -nRc '{"only": inputs, "os": "windows-2019"}' + | jq -nRc '{"only": inputs, "os": "windows-2022"}' } | jq -sc ) echo "include=$MATRIX" @@ -110,15 +111,15 @@ jobs: include: ${{ fromJson(needs.generate-wheels-matrix.outputs.include) }} env: - LIBXML2_VERSION: 2.12.6 - LIBXSLT_VERSION: 1.1.39 + LIBXML2_VERSION: 2.14.3 + LIBXSLT_VERSION: 1.1.43 steps: - name: Check out the repo uses: actions/checkout@v4 - name: Cache [libs] - uses: actions/cache@0c45773b623bea8c8e75f6c82b208c3cf94ea4f9 # v4.0.2 + uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3 with: path: | libs/*.xz @@ -133,20 +134,39 @@ platforms: all - name: Build wheels - uses: pypa/cibuildwheel@v2.17.0 + uses: pypa/cibuildwheel@v3.0.0 with: only: ${{ matrix.only }} - name: Build old Linux wheels if:
contains(matrix.only, '-manylinux_') && startsWith(matrix.only, 'cp36-') && (contains(matrix.only, 'i686') || contains(matrix.only, 'x86_64')) - uses: pypa/cibuildwheel@v2.17.0 + uses: pypa/cibuildwheel@v3.0.0 env: CIBW_MANYLINUX_i686_IMAGE: manylinux1 CIBW_MANYLINUX_X86_64_IMAGE: manylinux1 with: only: ${{ matrix.only }} - - uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3 # v4.3.1 + - name: Build faster Linux wheels + # also build wheels with the most recent manylinux images and gcc + if: runner.os == 'Linux' && !contains(matrix.only, 'i686') + uses: pypa/cibuildwheel@v3.0.0 + env: + CIBW_MANYLINUX_X86_64_IMAGE: manylinux_2_28 + CIBW_MANYLINUX_AARCH64_IMAGE: manylinux_2_28 + CIBW_MANYLINUX_PPC64LE_IMAGE: manylinux_2_28 + CIBW_MANYLINUX_S390X_IMAGE: manylinux_2_28 + CIBW_MANYLINUX_PYPY_X86_64_IMAGE: manylinux_2_28 + CIBW_MANYLINUX_PYPY_AARCH64_IMAGE: manylinux_2_28 + CIBW_MUSLLINUX_X86_64_IMAGE: musllinux_1_2 + CIBW_MUSLLINUX_AARCH64_IMAGE: musllinux_1_2 + CIBW_MUSLLINUX_PPC64LE_IMAGE: musllinux_1_2 + CIBW_MUSLLINUX_S390X_IMAGE: musllinux_1_2 + with: + only: ${{ matrix.only }} + + - name: Upload wheels + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 with: path: ./wheelhouse/*.whl name: lxml-wheel-${{ matrix.only }} @@ -161,7 +181,7 @@ jobs: steps: - name: Download artifacts - uses: actions/download-artifact@c850b930e6ba138125429b7e5c93fc707a7f8427 # v4.1.4 + uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0 with: path: ./release_upload merge-multiple: true @@ -169,7 +189,8 @@ jobs: - name: List downloaded artifacts run: ls -la ./release_upload - - uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3 # v4.3.1 + - name: Upload wheels + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 with: path: ./release_upload/*.whl name: all_wheels diff --git a/CHANGES.txt b/CHANGES.txt index 5cd454b87..028989960 100644 --- a/CHANGES.txt +++ 
b/CHANGES.txt @@ -2,15 +2,216 @@ lxml changelog ============== -5.1.2 (2024-??-??) +6.0.0 (2025-??-??) +================== + +Features added +-------------- + +* GH#463: ``lxml.html.diff`` is faster and provides structurally better diffs. + Original patch by Steven Fernandez. + +* GH#405: The factories ``Element`` and ``ElementTree`` can now be used in type hints. + +* GH#448: Parsing from ``memoryview`` and other buffers is supported to allow zero-copy parsing. + +* GH#437: ``lxml.html.builder`` was missing several HTML5 tag names. + Patch by Nick Tarleton. + +* GH#458: ``CDATA`` can now be written into the incremental ``xmlfile()`` writer. + Original patch by Lane Shaw. + +* GH#438: Wheels include the ``armv7l`` target. + +* A new parser option ``decompress=False`` was added that controls the automatic + input decompression when using libxml2 2.15.0 or later. Disabling this option + by default will effectively prevent decompression bombs when handling untrusted + input. Code that depends on automatic decompression must enable this option. + Note that libxml2 2.15.0 was not released yet, so this option currently has no + effect but can already be used. + +* The set of compile time / runtime supported libxml2 feature names is available as + ``etree.LIBXML_COMPILED_FEATURES`` and ``etree.LIBXML_FEATURES``. + This currently includes + ``catalog``, ``ftp``, ``html``, ``http``, ``iconv``, ``icu``, + ``lzma``, ``regexp``, ``schematron``, ``xmlschema``, ``xpath``, ``zlib``. + +Bugs fixed +---------- + +* GH#353: Predicates in ``.find*()`` could mishandle tag indices if a default namespace is provided. + Original patch by Luise K. + +* GH#272: The ``head`` and ``body`` properties of ``lxml.html`` elements failed if no such element + was found. They now return ``None`` instead. + Original patch by FVolral. + +* Tag names provided by code (API, not data) that are longer than ``INT_MAX`` + could be truncated or mishandled in other ways.
+ +* ``.text_content()`` on ``lxml.html`` elements accidentally returned a "smart string" + without additional information. It now returns a plain string. + +* LP#2109931: When building lxml with coverage reporting, it now disables the ``sys.monitoring`` + support due to the lack of support in https://github.com/nedbat/coveragepy/issues/1790 + +Other changes +------------- + +* Support for Python < 3.8 was removed. + +* Parsing directly from zlib (or lzma) compressed data is now considered an optional + feature in lxml. It may get removed from libxml2 at some point for security reasons + (compression bombs) and is therefore no longer guaranteed to be available in lxml. + + As of this release, zlib support is still normally available in the binary wheels + but may get disabled or removed in later (x.y.0) releases. To test the availability, + use ``"zlib" in etree.LIBXML_FEATURES``. + +* The ``Schematron`` class is deprecated and will become non-functional in a future lxml version. + The feature will soon be removed from libxml2 and stop being available. + +* Binary wheels use the library versions libxml2 2.14.3 and libxslt 1.1.43. + Note that this disables direct HTTP and FTP support for parsing from URLs. + Use Python URL request tools instead (which usually also support HTTPS). + To test the availability, use ``"http" in etree.LIBXML_FEATURES``. + +* Windows binary wheels use the library versions libxml2 2.11.9, libxslt 1.1.39 and libiconv 1.17. + They are now based on VS-2022. + +* Built using Cython 3.1.2. + +* The debug methods ``MemDebug.dump()`` and ``MemDebug.show()`` were removed completely. + libxml2 2.13.0 discarded this feature. + + +5.4.0 (2025-04-22) +================== + +Bugs fixed +---------- + +* LP#2107279: Binary wheels use libxml2 2.13.8 and libxslt 1.1.43 to resolve several CVEs. + (Binary wheels for Windows continue to use a patched libxml2 2.11.9 and libxslt 1.1.39.) + Issue found by Anatoly Katyushin. 
+ + +5.3.2 (2025-04-05) +================== + +This release resolves CVE-2025-24928 as described in +https://gitlab.gnome.org/GNOME/libxml2/-/issues/847 + +Bugs fixed +---------- + +* Binary wheels use libxml2 2.12.10 and libxslt 1.1.42. + +* Binary wheels for Windows use a patched libxml2 2.11.9 and libxslt 1.1.39. + + +5.3.1 (2025-02-09) +================== + +Bugs fixed +---------- + +* GH#440: Some tests were adapted for libxml2 2.14.0. + Patch by Nick Wellnhofer. + +* LP#2097175: ``DTD(external_id="…")`` erroneously required a byte string as ID value. + +* GH#450: ``iterparse()`` internally triggered the ``DeprecationWarning`` added in lxml 5.3.0 when parsing HTML. + +Other changes +------------- + +* GH#442: Binary wheels for macOS no longer use the linker flag ``-flat_namespace``. + + +5.3.0 (2024-08-10) +================== + +Features added +-------------- + +* GH#421: Nested ``CDATA`` sections are no longer rejected but split on output + to represent ``]]>`` correctly. + Patch by Gertjan Klein. + +Bugs fixed +---------- + +* LP#2060160: Attribute values serialised differently in ``xmlfile.element()`` and ``xmlfile.write()``. + +* LP#2058177: The ISO-Schematron implementation could fail on unknown prefixes. + Patch by David Lakin. + +Other changes +------------- + +* LP#2067707: The ``strip_cdata`` option in ``HTMLParser()`` turned out to be useless and is now deprecated. + +* Binary wheels use the library versions libxml2 2.12.9 and libxslt 1.1.42. + +* Windows binary wheels use the library versions libxml2 2.11.8 and libxslt 1.1.39. + +* Built with Cython 3.0.11. + + +5.2.2 (2024-05-12) +================== + +Bugs fixed +---------- + +* GH#417: The ``test_feed_parser`` test could fail if ``lxml_html_clean`` was not installed. + It is now skipped in that case. + +* LP#2059910: The minimum CPU architecture for the Linux x86 binary wheels was set back to + "core2", without SSE 4.2.
+ +* If libxml2 uses iconv, the compile time version is available as ``etree.ICONV_COMPILED_VERSION``. + + +5.2.1 (2024-04-02) ================== Bugs fixed ---------- +* LP#2059910: The minimum CPU architecture for the Linux x86 binary wheels was set back to + "core2", but with SSE 4.2 enabled. + * LP#2059977: ``Element.iterfind("//absolute_path")`` failed with a ``SyntaxError`` where it should have issued a warning. +* GH#416: The documentation build was using the non-standard ``which`` command. + Patch by Michał Górny. + + +5.2.0 (2024-03-30) +================== + +Other changes +------------- + +* LP#1958539: The ``lxml.html.clean`` implementation suffered from several (only if used) + security issues in the past and was now extracted into a separate library: + + https://github.com/fedora-python/lxml_html_clean + + Projects that use lxml without "lxml.html.clean" will not notice any difference, + except that they won't have potentially vulnerable code installed. + The module is available as an "extra" setuptools dependency "lxml[html_clean]", + so that projects that need "lxml.html.clean" will need to switch their requirements + from "lxml" to "lxml[html_clean]", or install the new library themselves. + +* The minimum CPU architecture for the Linux x86 binary wheels was upgraded to + "sandybridge" (launched 2011), and glibc 2.28 / gcc 12 (manylinux_2_28) wheels were added. + +* Built with Cython 3.0.10. + 5.1.1 (2024-03-28) ================== diff --git a/INSTALL.txt b/INSTALL.txt index 937e3be1a..a12dff8a6 100644 --- a/INSTALL.txt +++ b/INSTALL.txt @@ -32,7 +32,7 @@ Try something like :: - sudo port install py27-lxml + sudo port install py39-lxml To install a newer version or to install lxml on other systems, see below. @@ -41,6 +41,7 @@ see below. Requirements ------------ +You need Python 3.8+ for lxml 6.0 and later. You need Python 3.6+ for lxml 5.0 and later. lxml versions before 5.0 support Python 2.7 and 3.6+.
@@ -69,7 +70,7 @@ build dependencies of the provided lxml package, e.g. :: - sudo apt-get build-dep python3-lxml + sudo apt-get build-dep python3-lxml Installation @@ -133,8 +134,13 @@ both libraries automatically in their latest version, e.g. MacOS-X ....... -On MacOS-X, use the following to build the source distribution, -and make sure you have a working Internet connection, as this will +On MacOS-X, we provide binary wheels ("universal2" for Python 3.9+), +so just use:: + + sudo pip3 install lxml + +To build the source distribution, use the following and +make sure you have a working Internet connection, as this will download libxml2 and libxslt in order to build them:: STATIC_DEPS=true sudo pip install lxml diff --git a/LICENSE.txt b/LICENSE.txt index a76d0ed5a..0bdf03913 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -1,3 +1,5 @@ +BSD 3-Clause License + Copyright (c) 2004 Infrae. All rights reserved. Redistribution and use in source and binary forms, with or without @@ -6,7 +8,7 @@ met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - + 2.
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the diff --git a/Makefile b/Makefile index 8c66ad1a5..eba934cbb 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ PYTHON?=python3 -TESTFLAGS=-p -v +TESTFLAGS=-p -vv TESTOPTS= SETUPFLAGS= LXMLVERSION:=$(shell $(PYTHON) -c 'import re; print(re.findall(r"__version__\s*=\s*\"([^\"]+)\"", open("src/lxml/__init__.py").read())[0])' ) @@ -8,8 +8,8 @@ PYTHON_WITH_CYTHON?=$(shell $(PYTHON) -c 'import Cython.Build.Dependencies' >/d CYTHON_WITH_COVERAGE?=$(shell $(PYTHON) -c 'import Cython.Coverage; import sys; assert not hasattr(sys, "pypy_version_info")' >/dev/null 2>/dev/null && echo " --coverage" || true) PYTHON_BUILD_VERSION ?= * -MANYLINUX_LIBXML2_VERSION=2.12.6 -MANYLINUX_LIBXSLT_VERSION=1.1.39 +MANYLINUX_LIBXML2_VERSION=2.14.3 +MANYLINUX_LIBXSLT_VERSION=1.1.43 MANYLINUX_CFLAGS=-O3 -g1 -pipe -fPIC -flto MANYLINUX_LDFLAGS=-flto @@ -117,7 +117,7 @@ ftest_inplace: inplace $(PYTHON) test.py -f $(TESTFLAGS) $(TESTOPTS) apidoc: apidocclean inplace - @[ -x "`which sphinx-apidoc`" ] \ + @[ -x "`command -v sphinx-apidoc`" ] \ && (echo "Generating API docs ..." && \ PYTHONPATH=src:$(PYTHONPATH) sphinx-apidoc -e -P -T -o doc/api src/lxml \ "*includes" "*tests" "*pyclasslookup.py" "*usedoctest.py" "*html/_html5builder.py" \ @@ -125,7 +125,7 @@ apidoc: apidocclean inplace || (echo "not generating Sphinx autodoc API rst files") apihtml: apidoc inplace - @[ -x "`which sphinx-build`" ] \ + @[ -x "`command -v sphinx-build`" ] \ && (echo "Generating API docs ..." && \ make -C doc/api html) \ || (echo "not generating Sphinx autodoc API documentation") @@ -140,7 +140,7 @@ s5: apipdf: apidoc inplace rm -fr doc/api/_build - @[ -x "`which sphinx-build`" ] \ + @[ -x "`command -v sphinx-build`" ] \ && (echo "Generating API PDF docs ..." 
&& \ make -C doc/api latexpdf) \ || (echo "not generating Sphinx autodoc API PDF documentation") diff --git a/README.rst b/README.rst index 07dcaf941..244af569e 100644 --- a/README.rst +++ b/README.rst @@ -63,26 +63,27 @@ Crypto currencies do not fit into that ambition. .. _`doc/main.txt`: https://github.com/lxml/lxml/blob/master/doc/main.txt .. _`INSTALL.txt`: http://lxml.de/installation.html -`AppVeyor `_ and `GitHub Actions `_ -support the lxml project with their build and CI servers. -Jetbrains supports the lxml project by donating free licenses of their -`PyCharm IDE `_. -Another supporter of the lxml project is -`COLOGNE Webdesign `_. +`GitHub Actions `_ +supports the lxml project with their build and CI servers. Project income report --------------------- -lxml has `about 80 million downloads `_ +lxml has `well over 100 million downloads `_ per month on PyPI. -* Total project income in 2023: EUR 2776.56 (231,38 € / month) +* Total project income in 2024: EUR 2826.29 (235.52 € / month, 1.96 € / 1,000,000 downloads) + + - Tidelift: EUR 2777.34 + - Paypal: EUR 48.95 + +* Total project income in 2023: EUR 2776.56 (231.38 € / month, 2.89 € / 1,000,000 downloads) - Tidelift: EUR 2738.46 - Paypal: EUR 38.10 -* Total project income in 2022: EUR 2566.38 (213.87 € / month) +* Total project income in 2022: EUR 2566.38 (213.87 € / month, 3.56 € / 1,000,000 downloads) - Tidelift: EUR 2539.38 - Paypal: EUR 24.32 diff --git a/appveyor.yml b/appveyor.yml deleted file mode 100644 index 2a5c2bc43..000000000 --- a/appveyor.yml +++ /dev/null @@ -1,63 +0,0 @@ -version: 1.0.{build} -image: Visual Studio 2019 - -environment: - matrix: - - python: 312 - - python: 312-x64 - - python: 311 - - python: 311-x64 - - python: 310 - - python: 310-x64 - - python: 39 - - python: 39-x64 - - python: 27 - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2013 - - python: 27-x64 - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2013 - - python: 38 - - python: 38-x64 - - python: 37 - - python: 37-x64 - - 
python: 36 - - python: 36-x64 - - python: 35 - - python: 35-x64 - - - python: 312 - arch: arm64 - env: STATIC_DEPS=true - - python: 311 - arch: arm64 - env: STATIC_DEPS=true - - python: 310 - arch: arm64 - env: STATIC_DEPS=true - - python: 39 - arch: arm64 - env: STATIC_DEPS=true - - python: 38 - arch: arm64 - env: STATIC_DEPS=true - -install: - - SET PATH=C:\\Python%PYTHON%;c:\\Python%PYTHON%\\scripts;%PATH% - - ps: | - $env:PYTHON = "C:\\Python$($env:PYTHON)" - if (-not (Test-Path $env:PYTHON)) { - curl -o install_python.ps1 https://raw.githubusercontent.com/matthew-brett/multibuild/11a389d78892cf90addac8f69433d5e22bfa422a/install_python.ps1 - .\\install_python.ps1 - } - # remove the above when appveyor has proper Python 3.8 support - - python -m pip.__main__ install -U pip wheel setuptools - - pip install -r requirements.txt - -build: off -build_script: - - python -u setup.py bdist_wheel --static-deps - - ps: Get-ChildItem dist\*.whl | % { Push-AppveyorArtifact $_.FullName -FileName $_.Name } - - python -u setup.py build_ext --inplace --static-deps - -test: off -test_script: - - python -u test.py -vv -p diff --git a/benchmark/bench_etree.py b/benchmark/bench_etree.py index 8c71a2e41..4c1fadc6e 100644 --- a/benchmark/bench_etree.py +++ b/benchmark/bench_etree.py @@ -4,7 +4,8 @@ import benchbase from benchbase import (with_attributes, with_text, onlylib, - serialized, children, nochange) + serialized, children, nochange, + anytree, widetree, widesubtree) TEXT = "some ASCII text" UTEXT = u"some klingon: \uF8D2" @@ -14,26 +15,31 @@ ############################################################ class BenchMark(benchbase.TreeBenchMark): + @anytree @nochange def bench_iter_children(self, root): for child in root: pass + @anytree @nochange def bench_iter_children_reversed(self, root): for child in reversed(root): pass + @anytree @nochange def bench_first_child(self, root): for i in self.repeat1000: child = root[0] + @anytree @nochange def bench_last_child(self, root): for 
i in self.repeat1000: child = root[-1] + @widetree @nochange def bench_middle_child(self, root): pos = len(root) // 2 @@ -125,11 +131,13 @@ def bench_iterparse_bytesIO_clear(self, root_xml): for event, element in self.etree.iterparse(f): element.clear() + @anytree def bench_append_from_document(self, root1, root2): # == "1,2 2,3 1,3 3,1 3,2 2,1" # trees 1 and 2, or 2 and 3, or ... for el in root2: root1.append(el) + @anytree def bench_insert_from_document(self, root1, root2): pos = len(root1)//2 for el in root2: @@ -143,12 +151,14 @@ def bench_rotate_children(self, root): del root[0] root.append(el) + @widetree def bench_reorder(self, root): for i in range(1,len(root)//2): el = root[0] del root[0] root[-i:-i] = [ el ] + @widetree def bench_reorder_slice(self, root): for i in range(1,len(root)//2): els = root[0:1] @@ -158,31 +168,29 @@ def bench_reorder_slice(self, root): def bench_clear(self, root): root.clear() - @nochange - @children - def bench_has_children(self, children): - for child in children: - if child and child and child and child and child: - pass - + @widetree @nochange @children def bench_len(self, children): for child in children: map(len, repeat(child, 20)) + @widetree @children def bench_create_subelements(self, children): SubElement = self.etree.SubElement for child in children: SubElement(child, '{test}test') - def bench_append_elements(self, root): + @widetree + @children + def bench_append_elements(self, children): Element = self.etree.Element - for child in root: + for child in children: el = Element('{test}test') child.append(el) + @widetree @nochange @children def bench_makeelement(self, children): @@ -190,6 +198,7 @@ def bench_makeelement(self, children): for child in children: child.makeelement('{test}test', empty_attrib) + @widetree @nochange @children def bench_create_elements(self, children): @@ -197,6 +206,7 @@ def bench_create_elements(self, children): for child in children: Element('{test}test') + @widetree @children def 
bench_replace_children_element(self, children): Element = self.etree.Element @@ -204,25 +214,30 @@ def bench_replace_children_element(self, children): el = Element('{test}test') child[:] = [el] + @widetree @children def bench_replace_children(self, children): els = [ self.etree.Element("newchild") ] for child in children: child[:] = els + @widetree def bench_remove_children(self, root): for child in root: root.remove(child) + @widetree def bench_remove_children_reversed(self, root): for child in reversed(root): root.remove(child) + @widetree @children def bench_set_attributes(self, children): for child in children: child.set('a', 'bla') + @widetree @with_attributes(True) @children @nochange @@ -231,6 +246,7 @@ def bench_get_attributes(self, children): child.get('bla1') child.get('{attr}test1') + @widetree @children def bench_setget_attributes(self, children): for child in children: @@ -238,26 +254,31 @@ def bench_setget_attributes(self, children): for child in children: child.get('a') + @widetree @nochange def bench_root_getchildren(self, root): root.getchildren() + @widetree @nochange def bench_root_list_children(self, root): list(root) + @widesubtree @nochange @children def bench_getchildren(self, children): for child in children: child.getchildren() + @widesubtree @nochange @children def bench_get_children_slice(self, children): for child in children: child[:] + @widesubtree @nochange @children def bench_get_children_slice_2x(self, children): @@ -279,12 +300,14 @@ def bench_deepcopy(self, children): def bench_deepcopy_all(self, root): copy.deepcopy(root) + @widetree @nochange @children def bench_tag(self, children): for child in children: child.tag + @widetree @nochange @children def bench_tag_repeat(self, children): @@ -292,6 +315,7 @@ def bench_tag_repeat(self, children): for i in self.repeat100: child.tag + @widetree @nochange @with_text(utext=True, text=True, no_text=True) @children @@ -299,6 +323,7 @@ def bench_text(self, children): for child in children: 
child.text + @widetree @nochange @with_text(utext=True, text=True, no_text=True) @children @@ -307,30 +332,35 @@ def bench_text_repeat(self, children): for i in self.repeat500: child.text + @widetree @children def bench_set_text(self, children): text = TEXT for child in children: child.text = text + @widetree @children def bench_set_utext(self, children): text = UTEXT for child in children: child.text = text + @widetree @nochange @onlylib('lxe') def bench_index(self, root): for child in root: root.index(child) + @widetree @nochange @onlylib('lxe') def bench_index_slice(self, root): for child in root[5:100]: root.index(child, 5, 100) + @widetree @nochange @onlylib('lxe') def bench_index_slice_neg(self, root): diff --git a/benchmark/bench_objectify.py b/benchmark/bench_objectify.py index 9b7126743..ac134001c 100644 --- a/benchmark/bench_objectify.py +++ b/benchmark/bench_objectify.py @@ -17,7 +17,7 @@ def __init__(self, lib): self.objectify = objectify parser = etree.XMLParser(remove_blank_text=True) lookup = objectify.ObjectifyElementClassLookup() - parser.setElementClassLookup(lookup) + parser.set_element_class_lookup(lookup) super(BenchMark, self).__init__(etree, parser) @nochange diff --git a/benchmark/bench_xpath.py b/benchmark/bench_xpath.py index 59cdc78cd..9c04ca8ff 100644 --- a/benchmark/bench_xpath.py +++ b/benchmark/bench_xpath.py @@ -29,7 +29,7 @@ def bench_xpath_class_repeat(self, children): def bench_xpath_element(self, root): xpath = self.etree.XPathElementEvaluator(root) for child in root: - xpath.evaluate("./*[1]") + xpath("./*[1]") @nochange @onlylib('lxe') diff --git a/benchmark/bench_xslt.py b/benchmark/bench_xslt.py index abfdb7c58..3b7cd021a 100644 --- a/benchmark/bench_xslt.py +++ b/benchmark/bench_xslt.py @@ -1,39 +1,12 @@ -from itertools import * - import benchbase from benchbase import onlylib + ############################################################ # Benchmarks ############################################################ class 
XSLTBenchMark(benchbase.TreeBenchMark): - @onlylib('lxe') - def bench_xslt_extensions_old(self, root): - tree = self.etree.XML("""\ - - TEST - - - - - - - - -""") - def return_child(_, elements): - return elements[0][0] - - extensions = {('testns', 'child') : return_child} - - transform = self.etree.XSLT(tree, extensions) - for i in range(10): - transform(root) - @onlylib('lxe') def bench_xslt_document(self, root): transform = self.etree.XSLT(self.etree.XML("""\ @@ -52,5 +25,6 @@ def bench_xslt_document(self, root): """)) transform(root) + if __name__ == '__main__': benchbase.main(XSLTBenchMark) diff --git a/benchmark/benchbase.py b/benchmark/benchbase.py index ac3c95f82..584058b4d 100644 --- a/benchmark/benchbase.py +++ b/benchmark/benchbase.py @@ -1,20 +1,12 @@ import sys, re, string, copy, gc -from itertools import * +import itertools import time - -try: - izip -except NameError: - izip = zip # Py3 - -def exec_(code, glob): - if sys.version_info[0] >= 3: - exec(code, glob) - else: - exec("exec code in glob") +from contextlib import contextmanager +from functools import partial TREE_FACTOR = 1 # increase tree size with '-l / '-L' cmd option +DEFAULT_REPEAT = 9 _TEXT = "some ASCII text" * TREE_FACTOR _UTEXT = u"some klingon: \uF8D2" * TREE_FACTOR @@ -99,6 +91,22 @@ def nochange(function): function.NO_CHANGE = True return function +def anytree(function): + "Decorator for benchmarks that do not depend on the concrete tree" + function.ANY_TREE = True + return function + +def widetree(function): + "Decorator for benchmarks that use only tree 2" + function.TREES = "2" + return function + +def widesubtree(function): + "Decorator for benchmarks that use only tree 1" + function.TREES = "1" + return function + + ############################################################ # benchmark baseclass ############################################################ @@ -106,7 +114,7 @@ def nochange(function): class SkippedTest(Exception): pass -class TreeBenchMark(object): +class 
TreeBenchMark: atoz = string.ascii_lowercase repeat100 = range(100) repeat500 = range(500) @@ -198,7 +206,7 @@ def generate_elem(append, elem, level): } # create function object - exec_("\n".join(output), namespace) + exec("\n".join(output), namespace) return namespace["element_factory"] def _all_trees(self): @@ -250,7 +258,7 @@ def _setup_tree3(self, text, attributes): children = [root] for i in range(6 + TREE_FACTOR): children = [ SubElement(c, "{cdefg}a%05d" % (i%8), attributes) - for i,c in enumerate(chain(children, children, children)) ] + for i,c in enumerate(itertools.chain(children, children, children)) ] for child in children: child.text = text child.tail = text @@ -282,15 +290,27 @@ def benchmarks(self): for name in dir(self): if not name.startswith('bench_'): continue + method = getattr(self, name) + + serialized = getattr(method, 'STRING', False) + children = getattr(method, 'CHILDREN', False) + no_change = getattr(method, 'NO_CHANGE', False) + any_tree = getattr(method, 'ANY_TREE', False) + tree_sets = getattr(method, 'TREES', None) + if hasattr(method, 'LIBS') and self.lib_name not in method.LIBS: method_call = None else: method_call = method - if method.__doc__: + + if tree_sets: + tree_sets = tree_sets.split() + elif method.__doc__: tree_sets = method.__doc__.split() else: tree_sets = () + if tree_sets: tree_tuples = [list(map(int, tree_set.split(','))) for tree_set in tree_sets] @@ -302,11 +322,11 @@ def benchmarks(self): arg_count = method.__code__.co_argcount - 1 except AttributeError: arg_count = 1 - tree_tuples = self._permutations(all_trees, arg_count) - serialized = getattr(method, 'STRING', False) - children = getattr(method, 'CHILDREN', False) - no_change = getattr(method, 'NO_CHANGE', False) + if any_tree: + tree_tuples = [all_trees[-arg_count:]] + else: + tree_tuples = self._permutations(all_trees, arg_count) for tree_tuple in tree_tuples: for tn in sorted(getattr(method, 'TEXT', (0,))): @@ -372,49 +392,85 @@ def 
printSetupTimes(benchmark_suites): print(" T%d: %s" % (i+1, ' '.join("%6.4f" % t for t in tree_times))) print('') + +def autorange(bench_func, min_runtime=0.2, max_number=None, timer=time.perf_counter): + i = 1 + while True: + for j in 1, 2, 5: + number = i * j + if max_number is not None and number >= max_number: + return max_number + time_taken = bench_func(number) + if time_taken >= min_runtime: + return number + i *= 10 + + +@contextmanager +def nogc(): + gc.collect() + gc.disable() + try: + yield + finally: + gc.enable() + + def runBench(suite, method_name, method_call, tree_set, tn, an, - serial, children, no_change): + serial, children, no_change, timer=time.perf_counter, repeat=DEFAULT_REPEAT): if method_call is None: raise SkippedTest - current_time = time.time - call_repeat = range(10) - + rebuild_trees = not no_change and not serial tree_builders = [ suite.tree_builder(tree, tn, an, serial, children) for tree in tree_set ] - rebuild_trees = not no_change and not serial - - args = tuple([ build() for build in tree_builders ]) - method_call(*args) # run once to skip setup overhead + def new_trees(count=range(len(tree_builders)), trees=[None] * len(tree_builders)): + for i in count: + trees[i] = tree_builders[i]() + return tuple(trees) + + if rebuild_trees: + def time_benchmark(loops): + t_all_calls = 0.0 + for _ in range(loops): + run_benchmark = partial(method_call, *new_trees()) + t_one_call = timer() + run_benchmark() + t_one_call = timer() - t_one_call + t_all_calls += t_one_call + return t_all_calls + else: + def time_benchmark(loops, run_benchmark=partial(method_call, *new_trees())): + _loops = range(loops) + t_one_call = timer() + for _ in _loops: + run_benchmark() + t_all_calls = timer() - t_one_call + return t_all_calls + + time_benchmark(1) # run once for tree warm-up + + with nogc(): + # Adjust "min_runtime" to avoid long tree rebuild times for short benchmarks. 
+ inner_loops = autorange( + time_benchmark, + min_runtime=0.1 if rebuild_trees else 0.2, + max_number=200 if rebuild_trees else None, + ) times = [] - for i in range(3): + for _ in range(repeat): + with nogc(): + t_one_call = time_benchmark(inner_loops) / inner_loops + times.append(1000.0 * t_one_call) # msec gc.collect() - gc.disable() - t = -1 - for i in call_repeat: - if rebuild_trees: - args = [ build() for build in tree_builders ] - t_one_call = current_time() - method_call(*args) - t_one_call = current_time() - t_one_call - if t < 0: - t = t_one_call - else: - t = min(t, t_one_call) - times.append(1000.0 * t) - gc.enable() - if rebuild_trees: - args = () - args = () - gc.collect() return times -def runBenchmarks(benchmark_suites, benchmarks): - for bench_calls in izip(*benchmarks): - for lib, (bench, benchmark_setup) in enumerate(izip(benchmark_suites, bench_calls)): +def runBenchmarks(benchmark_suites, benchmarks, repeat=DEFAULT_REPEAT): + for bench_calls in zip(*benchmarks): + for lib, (bench, benchmark_setup) in enumerate(zip(benchmark_suites, bench_calls)): bench_name = benchmark_setup[0] tree_set_name = build_treeset_name(*benchmark_setup[-6:-1]) sys.stdout.write("%-3s: %-28s (%-10s) " % ( @@ -422,7 +478,7 @@ def runBenchmarks(benchmark_suites, benchmarks): sys.stdout.flush() try: - result = runBench(bench, *benchmark_setup) + result = runBench(bench, *benchmark_setup, repeat=repeat) except SkippedTest: print("skipped") except KeyboardInterrupt: @@ -433,12 +489,14 @@ def runBenchmarks(benchmark_suites, benchmarks): print("failed: %s: %s" % (exc_type.__name__, exc_value)) exc_type = exc_value = None else: - print("%9.4f msec/pass, best of (%s)" % ( - min(result), ' '.join("%9.4f" % t for t in result))) + result.sort() + t_min, t_median, t_max = result[0], result[len(result) // 2], result[-1] + print(f"{t_min:9.4f} msec/pass, best of ({t_min:9.4f}, {t_median:9.4f}, {t_max:9.4f})") if len(benchmark_suites) > 1: print('') # empty line between different 
benchmarks + ############################################################ # Main program ############################################################ @@ -487,22 +545,6 @@ def main(benchmark_class): etree.ElementDefaultClassLookup()) if len(sys.argv) > 1: - if '-a' in sys.argv or '-c' in sys.argv: - # 'all' or 'C-implementations' ? - try: - sys.argv.remove('-c') - except ValueError: - pass - try: - import cElementTree as cET - _etrees.append(cET) - except ImportError: - try: - import xml.etree.cElementTree as cET - _etrees.append(cET) - except ImportError: - pass - try: # 'all' ? sys.argv.remove('-a') @@ -510,14 +552,10 @@ def main(benchmark_class): pass else: try: - from elementtree import ElementTree as ET + from xml.etree import ElementTree as ET _etrees.append(ET) except ImportError: - try: - from xml.etree import ElementTree as ET - _etrees.append(ET) - except ImportError: - pass + pass if not _etrees: print("No library to test. Exiting.") @@ -527,8 +565,7 @@ def main(benchmark_class): print("Preparing test suites and trees ...") selected = set( sys.argv[1:] ) - benchmark_suites, benchmarks = \ - buildSuites(benchmark_class, _etrees, selected) + benchmark_suites, benchmarks = buildSuites(benchmark_class, _etrees, selected) print("Running benchmark on", ', '.join(b.lib_name for b in benchmark_suites)) @@ -537,9 +574,8 @@ def main(benchmark_class): printSetupTimes(benchmark_suites) if callgrind_zero: - cmd = open("callgrind.cmd", 'w') - cmd.write('+Instrumentation\n') - cmd.write('Zero\n') - cmd.close() + with open("callgrind.cmd", 'w') as cmd: + cmd.write('+Instrumentation\n') + cmd.write('Zero\n') - runBenchmarks(benchmark_suites, benchmarks) + runBenchmarks(benchmark_suites, benchmarks, repeat=DEFAULT_REPEAT) diff --git a/benchmark/run_benchmarks.py b/benchmark/run_benchmarks.py new file mode 100644 index 000000000..fe09c05c6 --- /dev/null +++ b/benchmark/run_benchmarks.py @@ -0,0 +1,354 @@ +import collections +import io +import logging +import os +import 
pathlib +import re +import shutil +import subprocess +import sys +import tempfile +import time +import zipfile + + +BENCHMARKS_DIR = pathlib.Path(__file__).parent + +BENCHMARK_FILES = sorted(BENCHMARKS_DIR.glob("bench_*.py")) + +ALL_BENCHMARKS = [bm.stem for bm in BENCHMARK_FILES] + +LIMITED_API_VERSION = max((3, 12), sys.version_info[:2]) + + +try: + from distutils import sysconfig + DISTUTILS_CFLAGS = sysconfig.get_config_var('CFLAGS') +except ImportError: + DISTUTILS_CFLAGS = '' + + +parse_timings = re.compile( + r"(?P\w+):\s*" + r"(?P\w+)\s+" + r"\((?P[^)]+)\)\s*" + r"(?P[0-9.]+)\s+" + r"(?P.*)" +).match + + +def run(command, cwd=None, pythonpath=None, c_macros=None): + env = None + if pythonpath: + env = os.environ.copy() + env['PYTHONPATH'] = pythonpath + if c_macros: + env = env or os.environ.copy() + env['CFLAGS'] = env.get('CFLAGS', '') + " " + ' '.join(f" -D{macro}" for macro in c_macros) + + try: + return subprocess.run(command, cwd=cwd, check=True, capture_output=True, env=env) + except subprocess.CalledProcessError as exc: + logging.error(f"Command failed: {' '.join(map(str, command))}\nOutput:\n{exc.stderr.decode()}") + raise + + +def copy_benchmarks(bm_dir: pathlib.Path, benchmarks=None): + bm_files = [] + shutil.copy(BENCHMARKS_DIR / 'benchbase.py', bm_dir / 'benchbase.py') + for bm_src_file in BENCHMARK_FILES: + if benchmarks and bm_src_file.stem not in benchmarks: + continue + bm_file = bm_dir / bm_src_file.name + for benchmark_file in BENCHMARKS_DIR.glob(bm_src_file.stem + ".*"): + shutil.copy(benchmark_file, bm_dir / benchmark_file.name) + bm_files.append(bm_file) + + return bm_files + + +def compile_lxml(lxml_dir: pathlib.Path, c_macros=None): + rev_hash = get_git_rev(rev_dir=lxml_dir) + logging.info(f"Compiling lxml gitrev {rev_hash}") + run( + [sys.executable, "setup.py", "build_ext", "-i", "-j6"], + cwd=lxml_dir, + c_macros=c_macros, + ) + + +def get_git_rev(revision=None, rev_dir=None): + command = ["git", "describe", "--long"] + if 
revision: + command.append(revision) + output = run(command, cwd=rev_dir) + _, rev_hash = output.stdout.decode().strip().rsplit('-', 1) + return rev_hash[1:] + + +def git_clone(rev_dir, revision): + rev_hash = get_git_rev(revision) + run(["git", "clone", "-n", "--no-single-branch", ".", str(rev_dir)]) + run(["git", "checkout", rev_hash], cwd=rev_dir) + + +def copy_profile(bm_dir, module_name, profiler): + timestamp = int(time.time() * 1000) + profile_input = bm_dir / "profile.out" + data_file_name = f"{profiler}_{module_name}_{timestamp:X}.data" + + if profiler == 'callgrind': + bm_dir_str = str(bm_dir) + os.sep + with open(profile_input) as data_file_in: + with open(data_file_name, mode='w') as data_file_out: + for line in data_file_in: + if bm_dir_str in line: + # Remove absolute file paths to link to local file copy below. + line = line.replace(bm_dir_str, "") + data_file_out.write(line) + else: + shutil.move(profile_input, data_file_name) + + for result_file_name in (f"{module_name}.c", f"{module_name}.html"): + result_file = bm_dir / result_file_name + if result_file.exists(): + shutil.move(result_file, result_file_name) + + for ext in bm_dir.glob(f"{module_name}.*so"): + shutil.move(str(ext), ext.name) + + +def run_benchmark(bm_dir, module_name, pythonpath=None, profiler=None): + logging.info(f"Running benchmark '{module_name}'.") + + command = [] + + if profiler: + if profiler == 'perf': + command = ["perf", "record", "--quiet", "-g", "--output=profile.out"] + elif profiler == 'callgrind': + command = [ + "valgrind", "--tool=callgrind", + "--dump-instr=yes", "--collect-jumps=yes", + "--callgrind-out-file=profile.out", + ] + + command += [sys.executable, f"{module_name}.py"] + + output = run(command, cwd=bm_dir, pythonpath=pythonpath) + + if profiler: + copy_profile(bm_dir, module_name, profiler) + + lines = filter(None, output.stdout.decode().splitlines()) + for line in lines: + if line == "Setup times for trees in seconds:": + break + + other_lines = [] + 
timings = [] + for line in lines: + match = parse_timings(line) + if match: + timings.append((match['benchmark'], match['params'].strip(), match['lib'], float(match['besttime']), match['timings'])) + else: + other_lines.append(line) + + return other_lines, timings + + +def run_benchmarks(bm_dir, benchmarks, pythonpath=None, profiler=None): + timings = {} + for benchmark in benchmarks: + timings[benchmark] = run_benchmark(bm_dir, benchmark, pythonpath=pythonpath, profiler=profiler) + return timings + + +def benchmark_revisions(benchmarks, revisions, profiler=None, limited_revisions=(), deps_zipfile=None): + python_version = "Python %d.%d.%d" % sys.version_info[:3] + logging.info(f"### Comparing revisions in {python_version}: {' '.join(revisions)}.") + logging.info(f"CFLAGS={os.environ.get('CFLAGS', DISTUTILS_CFLAGS)}") + + hashes = {} + timings = {} + for revision in revisions: + rev_hash = get_git_rev(revision) + if rev_hash in hashes: + logging.info(f"### Ignoring revision '{revision}': same as '{hashes[rev_hash]}'") + continue + hashes[rev_hash] = revision + + logging.info(f"### Preparing benchmark run for lxml '{revision}'.") + timings[revision] = benchmark_revision( + revision, benchmarks, profiler, deps_zipfile=deps_zipfile) + + if revision in limited_revisions: + logging.info( + f"### Preparing benchmark run for lxml '{revision}' (Limited API {LIMITED_API_VERSION[0]}.{LIMITED_API_VERSION[1]}).") + timings['L-' + revision] = benchmark_revision( + revision, benchmarks, profiler, + c_macros=["Py_LIMITED_API=0x%02x%02x0000" % LIMITED_API_VERSION], + deps_zipfile=deps_zipfile, + ) + + return timings + + +def cache_libs(lxml_dir, deps_zipfile): + for dir_path, _, filenames in (lxml_dir / "build" / "tmp").walk(): + for filename in filenames: + path = dir_path / filename + deps_zipfile.write(path, path.relative_to(lxml_dir)) + + +def benchmark_revision(revision, benchmarks, profiler=None, c_macros=None, deps_zipfile=None): + with tempfile.TemporaryDirectory() as 
base_dir_str: + base_dir = pathlib.Path(base_dir_str) + lxml_dir = base_dir / "lxml" / revision + bm_dir = base_dir / "benchmarks" / revision + + git_clone(lxml_dir, revision=revision) + + bm_dir.mkdir(parents=True) + bm_files = copy_benchmarks(bm_dir, benchmarks) + + deps_zip_is_empty = deps_zipfile and not deps_zipfile.namelist() + if deps_zipfile and not deps_zip_is_empty: + deps_zipfile.extractall(lxml_dir) + + compile_lxml(lxml_dir, c_macros=c_macros) + + if deps_zipfile and deps_zip_is_empty: + cache_libs(lxml_dir, deps_zipfile) + + logging.info(f"### Running benchmarks for {revision}: {' '.join(bm.stem for bm in bm_files)}") + return run_benchmarks(bm_dir, benchmarks, pythonpath=f"{bm_dir}:{lxml_dir / 'src'}", profiler=profiler) + + +def report_revision_timings(rev_timings): + units = {"nsec": 1e-9, "usec": 1e-6, "msec": 1e-3, "sec": 1.0} + scales = [(scale, unit) for unit, scale in reversed(units.items())] # biggest first + + def format_time(t): + pos_t = abs(t) + for scale, unit in scales: + if pos_t >= scale: + break + else: + raise RuntimeError(f"Timing is below nanoseconds: {t:f}") + return f"{t / scale :+.3f} {unit}" + + timings_by_benchmark = collections.defaultdict(list) + setup_times = [] + for revision_name, bm_timings in rev_timings.items(): + for benchmark_module, (output, timings) in bm_timings.items(): + setup_times.append((benchmark_module, revision_name, output)) + for benchmark_name, params, lib, best_time, result_text in timings: + timings_by_benchmark[(benchmark_module, benchmark_name, params)].append((lib, revision_name, best_time, result_text)) + + setup_times.sort() + for timings in timings_by_benchmark.values(): + timings.sort() + + for benchmark_module, revision_name, output in setup_times: + result = '\n'.join(output) + logging.info(f"Setup times for trees in seconds - {benchmark_module} / {revision_name}:\n{result}") + + differences = collections.defaultdict(list) + for (benchmark_module, benchmark_name, params), timings in 
timings_by_benchmark.items(): + logging.info(f"### Benchmark {benchmark_module} / {benchmark_name} ({params}):") + base_line = timings[0][2] + for lib, revision_name, bm_time, result_text in timings: + diff_str = "" + if base_line != bm_time: + pdiff = bm_time * 100 / base_line - 100 + differences[(lib, revision_name)].append((abs(pdiff), pdiff, bm_time - base_line, benchmark_module, benchmark_name, params)) + diff_str = f" {pdiff:+8.2f} %" + logging.info( + f" {lib:3} / {revision_name[:25]:25} = {bm_time:8.4f} {result_text}{diff_str}" + ) + + for (lib, revision_name), diffs in differences.items(): + diffs.sort(reverse=True) + diffs_by_sign = {True: [], False: []} + for diff in diffs: + diffs_by_sign[diff[1] < 0].append(diff) + + for is_win, diffs in diffs_by_sign.items(): + if not diffs or diffs[0][0] < 1.0: + continue + + logging.info(f"Largest {'gains' if is_win else 'losses'} for {revision_name}:") + cutoff = max(1.0, diffs[0][0] // 4) + for absdiff, pdiff, tdiff, benchmark_module, benchmark_name, params in diffs: + if absdiff < cutoff: + break + logging.info(f" {benchmark_module} / {benchmark_name:<25} ({params:>10}) {pdiff:+8.2f} % / {format_time(tdiff / 1000.0):>8}") + + +def parse_args(args): + from argparse import ArgumentParser, RawDescriptionHelpFormatter + parser = ArgumentParser( + description="Run benchmarks against different lxml tags/revisions.", + formatter_class=RawDescriptionHelpFormatter, + ) + parser.add_argument( + "-b", "--benchmarks", + dest="benchmarks", default=','.join(ALL_BENCHMARKS), + help="The list of benchmark selectors to run, simple substrings, separated by comma.", + ) + parser.add_argument( + "--with-limited", + dest="with_limited_api", action="append", default=[], + help="Also run the benchmarks for REVISION against the Limited C-API.", + ) + #parser.add_argument( + # "--with-elementtree", + # dest="with_elementtree", + # help="Include results for Python's xml.etree.ElementTree.", + #) + parser.add_argument( + "--perf", + 
dest="profiler", action="store_const", const="perf", default=None, + help="Run Linux 'perf record' on the benchmark process.", + ) + parser.add_argument( + "--callgrind", + dest="profiler", action="store_const", const="callgrind", default=None, + help="Run Valgrind's callgrind profiler on the benchmark process.", + ) + parser.add_argument( + "revisions", + nargs="*", default=[], + help="The git revisions to check out and benchmark.", + ) + + return parser.parse_known_args(args) + + +if __name__ == '__main__': + options, cythonize_args = parse_args(sys.argv[1:]) + + logging.basicConfig( + stream=sys.stdout, + level=logging.INFO, + format="%(asctime)s %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + ) + + benchmark_selectors = set(bm.strip() for bm in options.benchmarks.split(",")) + benchmarks = [bm for bm in ALL_BENCHMARKS if any(selector in bm for selector in benchmark_selectors)] + if benchmark_selectors and not benchmarks: + logging.error("No benchmarks selected!") + sys.exit(1) + + deps_zipfile = zipfile.ZipFile(io.BytesIO(), mode='w') + + revisions = list({rev: rev for rev in (options.revisions + options.with_limited_api)}) # deduplicate in order + timings = benchmark_revisions( + benchmarks, revisions, + profiler=options.profiler, + limited_revisions=options.with_limited_api, + deps_zipfile=deps_zipfile, + ) + report_revision_timings(timings) diff --git a/buildlibxml.py b/buildlibxml.py index 574d34e31..cc61d65b2 100644 --- a/buildlibxml.py +++ b/buildlibxml.py @@ -1,10 +1,12 @@ import json import os, re, sys, subprocess, platform import tarfile +import time from distutils import log from contextlib import closing, contextmanager from ftplib import FTP +import urllib.error from urllib.parse import urljoin, unquote, urlparse from urllib.request import urlretrieve, urlopen, Request @@ -52,9 +54,6 @@ def download_and_extract_windows_binaries(destdir): else: arch = "win32" - if sys.version_info < (3, 5): - arch = 'vs2008.' + arch - arch_part = '.' + arch + '.' 
filenames = [filename for filename in filenames if arch_part in filename] @@ -264,7 +263,7 @@ def py2_tarxz(filename): def download_libxml2(dest_dir, version=None): """Downloads libxml2, returning the filename where the library was downloaded""" #version_re = re.compile(r'LATEST_LIBXML2_IS_([0-9.]+[0-9](?:-[abrc0-9]+)?)') - version_re = re.compile(r'libxml2-([0-9.]+[0-9]).tar.xz') + version_re = re.compile(r'libxml2-([0-9.]+[0-9])[.]tar[.]xz') filename = 'libxml2-%s.tar.xz' if version == "2.9.12": @@ -281,7 +280,7 @@ def download_libxml2(dest_dir, version=None): def download_libxslt(dest_dir, version=None): """Downloads libxslt, returning the filename where the library was downloaded""" #version_re = re.compile(r'LATEST_LIBXSLT_IS_([0-9.]+[0-9](?:-[abrc0-9]+)?)') - version_re = re.compile(r'libxslt-([0-9.]+[0-9]).tar.xz') + version_re = re.compile(r'libxslt-([0-9.]+[0-9])[.]tar[.]xz') filename = 'libxslt-%s.tar.xz' from_location = http_find_latest_version_directory(LIBXSLT_LOCATION, version=version) return download_library(dest_dir, from_location, 'libxslt', @@ -290,7 +289,7 @@ def download_libxslt(dest_dir, version=None): def download_libiconv(dest_dir, version=None): """Downloads libiconv, returning the filename where the library was downloaded""" - version_re = re.compile(r'libiconv-([0-9.]+[0-9]).tar.gz') + version_re = re.compile(r'libiconv-([0-9.]+[0-9])[.]tar[.]gz') filename = 'libiconv-%s.tar.gz' return download_library(dest_dir, LIBICONV_LOCATION, 'libiconv', version_re, filename, version=version) @@ -298,7 +297,7 @@ def download_libiconv(dest_dir, version=None): def download_zlib(dest_dir, version): """Downloads zlib, returning the filename where the library was downloaded""" - version_re = re.compile(r'zlib-([0-9.]+[0-9]).tar.gz') + version_re = re.compile(r'zlib-([0-9.]+[0-9])[.]tar[.]gz') filename = 'zlib-%s.tar.gz' return download_library(dest_dir, ZLIB_LOCATION, 'zlib', version_re, filename, version=version) @@ -328,10 +327,10 @@ def 
download_library(dest_dir, location, name, version_re, filename, version=Non if version is None: try: if location.startswith('ftp://'): - fns = remote_listdir(location) + fns = list(remote_listdir(location)) else: - print(location) fns = http_listfiles(location, '(%s)' % filename.replace('%s', '(?:[0-9.]+[0-9])')) + print(f"Found {len(fns)} links at {location}") version = find_max_version(name, fns, version_re) except IOError: # network failure - maybe we have the files already? @@ -360,7 +359,15 @@ def download_library(dest_dir, location, name, version_re, filename, version=Non return dest_filename print('Downloading %s into %s from %s' % (name, dest_filename, full_url)) - urlretrieve(full_url, dest_filename) + try: + urlretrieve(full_url, dest_filename) + except urllib.error.URLError as exc: + # retry once + retry_after_seconds = 2 + print(f"Download failed: {exc}, retrying in {int(retry_after_seconds)} seconds…") + time.sleep(retry_after_seconds) + urlretrieve(full_url, dest_filename) + return dest_filename @@ -437,18 +444,24 @@ def build_libxml2xslt(download_dir, build_dir, libxslt_version=None, libiconv_version=None, zlib_version=None, - multicore=None): + multicore=None, + with_zlib=True): safe_mkdir(download_dir) safe_mkdir(build_dir) - zlib_dir = unpack_tarball(download_zlib(download_dir, zlib_version), build_dir) + + zlib_dir = None + if with_zlib: + zlib_dir = unpack_tarball(download_zlib(download_dir, zlib_version), build_dir) + libiconv_dir = unpack_tarball(download_libiconv(download_dir, libiconv_version), build_dir) libxml2_dir = unpack_tarball(download_libxml2(download_dir, libxml2_version), build_dir) libxslt_dir = unpack_tarball(download_libxslt(download_dir, libxslt_version), build_dir) + prefix = os.path.join(os.path.abspath(build_dir), 'libxml2') lib_dir = os.path.join(prefix, 'lib') safe_mkdir(prefix) - lib_names = ['libxml2', 'libexslt', 'libxslt', 'iconv', 'libz'] + lib_names = ['libxml2', 'libexslt', 'libxslt', 'iconv'] + (['libz'] if 
with_zlib else []) existing_libs = { lib: os.path.join(lib_dir, filename) for lib in lib_names @@ -479,12 +492,13 @@ def has_current_lib(name, build_dir, _build_all_following=[False]): ] # build zlib - zlib_configure_cmd = [ - './configure', - '--prefix=%s' % prefix, - ] - if not has_current_lib("libz", zlib_dir): - cmmi(zlib_configure_cmd, zlib_dir, multicore, **call_setup) + if with_zlib: + zlib_configure_cmd = [ + './configure', + '--prefix=%s' % prefix, + ] + if not has_current_lib("libz", zlib_dir): + cmmi(zlib_configure_cmd, zlib_dir, multicore, **call_setup) # build libiconv if not has_current_lib("iconv", libiconv_dir): @@ -494,7 +508,7 @@ def has_current_lib(name, build_dir, _build_all_following=[False]): libxml2_configure_cmd = configure_cmd + [ '--without-python', '--with-iconv=%s' % prefix, - '--with-zlib=%s' % prefix, + ('--with-zlib=%s' % prefix) if with_zlib else '--without-zlib', ] if not libxml2_version: diff --git a/doc/FAQ.txt b/doc/FAQ.txt index 6cfe92dbc..9236a6b93 100644 --- a/doc/FAQ.txt +++ b/doc/FAQ.txt @@ -1162,7 +1162,7 @@ safely expose their values to the evaluation engine. The defusedxml_ package comes with an example setup and a wrapper API for lxml that applies certain counter measures internally. -.. _defusedxml: https://bitbucket.org/tiran/defusedxml +.. _defusedxml: https://github.com/tiran/defusedxml How can I sort the attributes? 
diff --git a/doc/build.txt b/doc/build.txt index 256f65b13..7a2630ceb 100644 --- a/doc/build.txt +++ b/doc/build.txt @@ -135,6 +135,12 @@ files to the include path like:: where the file is in ``/usr/include/libxml2/libxml/xmlversion.h`` +For static builds, if you get an error saying "recompile with -fPIC", +do so by adding it to your `CFLAGS` environment variable: +``env CFLAGS="$CFLAGS -fPIC"``, such as:: + + env CFLAGS="$CFLAGS -fPIC" python3 setup.py build_ext -i --with-cython --static-deps + To use lxml.etree in-place, you can place lxml's ``src`` directory on your Python module search path (PYTHONPATH) and then import ``lxml.etree`` to play with it:: @@ -146,6 +152,12 @@ on your Python module search path (PYTHONPATH) and then import >>> from lxml import etree >>> +For non-static builds, you may have to set ``LD_LIBRARY_PATH`` to where the +shared object files for libxml2 and libxslt are, such as ``/usr/local/lib``. For +example:: + + PYTHONPATH=src LD_LIBRARY_PATH=/usr/local/lib python3 + To make sure everything gets recompiled cleanly after changes, you can run ``make clean`` or delete the file ``src/lxml/etree.c``. diff --git a/doc/lxmlhtml.txt b/doc/lxmlhtml.txt index 8f32da6c1..d07eacb7e 100644 --- a/doc/lxmlhtml.txt +++ b/doc/lxmlhtml.txt @@ -433,7 +433,7 @@ You can, for instance, do: ... name='John Smith', ... phone='555-555-3949', ... interest=set(['cats', 'llamas'])) - >>> print tostring(form) + >>> print(tostring(form))
@@ -479,193 +479,10 @@ Example: >>> page = parse('http://tinyurl.com').getroot() >>> page.forms[0].fields['url'] = 'http://lxml.de/' >>> result = parse(submit_form(page.forms[0])).getroot() + >>> [a.attrib['href'] for a in result.xpath("//a[@target='_blank']")] ['http://tinyurl.com/2xae8s', 'http://preview.tinyurl.com/2xae8s'] -Cleaning up HTML -================ - -The module ``lxml.html.clean`` provides a ``Cleaner`` class for cleaning up -HTML pages. It supports removing embedded or script content, special tags, -CSS style annotations and much more. - -Note: the HTML Cleaner in ``lxml.html.clean`` is **not** considered -appropriate **for security sensitive environments**. -See e.g. `bleach `_ or -`nh3 `_ for alternatives. - -Note: owing to the increased number of security vulnerabilities that have been -reported concerning the blocklist-based nature of lxml.html.clean, it has been -determined that this specific component of the project will be extracted -and transitioned into a separate project. This strategic decision is aimed -at enhancing the suitability of the lxml library for deployment -in security-sensitive environments, thereby addressing and mitigating potential -risks more effectively. - -Say, you have an overburdened web page from a hideous source which contains -lots of content that upsets browsers and tries to run unnecessary code on the -client side: - -.. sourcecode:: pycon - - >>> html = '''\ - ... - ... - ... - ... - ... - ... - ... - ... - ... a link - ... another link - ...

a paragraph

- ...
secret EVIL!
- ... of EVIL! - ... - ... - ... Password: - ... - ... annoying EVIL! - ... spam spam SPAM! - ... - ... - ... ''' - -To remove the all superfluous content from this unparsed document, use the -``clean_html`` function: - -.. sourcecode:: pycon - - >>> from lxml.html.clean import clean_html - >>> print clean_html(html) -
- - a link - another link -

a paragraph

-
secret EVIL!
- of EVIL! - - - Password: - annoying EVIL!spam spam SPAM! -
- -The ``Cleaner`` class supports several keyword arguments to control exactly -which content is removed: - -.. sourcecode:: pycon - - >>> from lxml.html.clean import Cleaner - - >>> cleaner = Cleaner(page_structure=False, links=False) - >>> print cleaner.clean_html(html) - - - - - - - a link - another link -

a paragraph

-
secret EVIL!
- of EVIL! - Password: - annoying EVIL! - spam spam SPAM! - - - - - >>> cleaner = Cleaner(style=True, links=True, add_nofollow=True, - ... page_structure=False, safe_attrs_only=False) - - >>> print cleaner.clean_html(html) - - - - - a link - another link -

a paragraph

-
secret EVIL!
- of EVIL! - Password: - annoying EVIL! - spam spam SPAM! - - - - -You can also whitelist some otherwise dangerous content with -``Cleaner(host_whitelist=['www.youtube.com'])``, which would allow -embedded media from YouTube, while still filtering out embedded media -from other sites. - -See the docstring of ``Cleaner`` for the details of what can be -cleaned. - - -autolink --------- - -In addition to cleaning up malicious HTML, ``lxml.html.clean`` -contains functions to do other things to your HTML. This includes -autolinking:: - - autolink(doc, ...) - - autolink_html(html, ...) - -This finds anything that looks like a link (e.g., -``http://example.com``) in the *text* of an HTML document, and -turns it into an anchor. It avoids making bad links. - -Links in the elements ``