diff --git a/.codecov.yml b/.codecov.yml
deleted file mode 100644
index f307895e..00000000
--- a/.codecov.yml
+++ /dev/null
@@ -1,8 +0,0 @@
-coverage:
- status:
- project:
- default:
- target: 88%
- threshold: null
- patch: false
- changes: false
diff --git a/.coveragerc b/.coveragerc
index 723bfd0c..8b591311 100644
--- a/.coveragerc
+++ b/.coveragerc
@@ -1,2 +1,28 @@
[run]
-source=charset_normalizer
+source =
+ charset_normalizer
+# Needed for Python 3.11 and lower
+disable_warnings = no-sysmon
+
+[paths]
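+# Map coverage recorded against installed copies of the package back onto src/charset_normalizer.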
+source =
+ src/charset_normalizer
+ */charset_normalizer
+ *\charset_normalizer
+
+[report]
+omit =
+ src/charset_normalizer/__main__.py
+
+exclude_lines =
+ except ModuleNotFoundError:
+ except ImportError:
+ pass
+ import
+ raise NotImplementedError
+ .* # Platform-specific.*
+ .*:.* # Python \d.*
+ .* # Abstract
+ .* # Defensive:
+ if (?:typing.)?TYPE_CHECKING:
+ ^\s*?\.\.\.\s*$
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
new file mode 100644
index 00000000..33e824d2
--- /dev/null
+++ b/.github/CODEOWNERS
@@ -0,0 +1,9 @@
+# Require lead maintainer approval for
+# all files related to deploying.
+
+.github/workflows/ @Ousret
+.github/CODEOWNERS @Ousret
+src/charset_normalizer/ @Ousret
+pyproject.toml @Ousret
+tests/ @Ousret
+data/ @Ousret
diff --git a/.github/workflows/cd.yml b/.github/workflows/cd.yml
index 288256e7..26c9d3c1 100644
--- a/.github/workflows/cd.yml
+++ b/.github/workflows/cd.yml
@@ -21,15 +21,15 @@ jobs:
needs:
- pre_flight_check
steps:
- - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0
+ - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Set up Python
- uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
+ uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
with:
- python-version: '3.11'
- - name: Update pip, setuptools, wheel, build and twine
+ python-version: '3'
+ - name: Update pip, install build
run: |
python -m pip install --upgrade pip
- python -m pip install setuptools wheel build
+ python -m pip install build
- name: Build Wheel
env:
CHARSET_NORMALIZER_USE_MYPYC: '0'
@@ -46,7 +46,7 @@ jobs:
needs: pre_flight_check
strategy:
matrix:
- os: [ ubuntu-latest, windows-latest, macos-13 ]
+ os: [ ubuntu-22.04, windows-latest, macos-13 ]
qemu: [ '' ]
include:
# Split ubuntu job for the sake of speed-up
@@ -58,7 +58,7 @@ jobs:
qemu: s390x
steps:
- name: Checkout
- uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0
+ uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
submodules: true
- name: Set up QEMU
@@ -76,20 +76,16 @@ jobs:
fi
shell: bash
- name: Setup Python
- uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
- - name: Update pip, wheel, setuptools, build, twine
- run: |
- python -m pip install -U pip wheel setuptools build twine
+ uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
- name: Build wheels
- uses: pypa/cibuildwheel@f1859528322d7b29d4493ee241a167807661dfb4 # v2.21.2
+ uses: pypa/cibuildwheel@ee63bf16da6cddfb925f542f2c7b59ad50e93969 # v2.22.0
env:
- CIBW_BUILD_FRONTEND: "pip; args: --no-build-isolation"
- CIBW_ARCHS_MACOS: x86_64 arm64 universal2
+ CIBW_BUILD_FRONTEND: build
+ CIBW_ARCHS_MACOS: universal2
CIBW_ENVIRONMENT: CHARSET_NORMALIZER_USE_MYPYC='1'
- CIBW_BEFORE_BUILD: pip install -r build-requirements.txt
CIBW_TEST_REQUIRES: pytest
CIBW_TEST_COMMAND: pytest -c {package} {package}/tests
- CIBW_SKIP: pp* cp36*
+ CIBW_SKIP: pp*
- name: Upload artifacts
uses: actions/upload-artifact@0b7f8abb1508181956e8e162db84b466c27e18ce
with:
@@ -105,7 +101,7 @@ jobs:
outputs:
hashes: ${{ steps.compute.outputs.hashes }}
steps:
- - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0
+ - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Download distributions
uses: actions/download-artifact@9bc31d5ccc31df68ecc42ccf4149144866c47d8a
with:
@@ -143,7 +139,7 @@ jobs:
name: pypi
url: https://pypi.org/project/charset-normalizer/
steps:
- - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0
+ - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Download distributions
uses: actions/download-artifact@9bc31d5ccc31df68ecc42ccf4149144866c47d8a
with:
@@ -153,7 +149,7 @@ jobs:
run: |
tree dist
- name: Publish package distributions to PyPI
- uses: pypa/gh-action-pypi-publish@f7600683efdcb7656dec5b29656edb7bc586e597 # release/v1
+ uses: pypa/gh-action-pypi-publish@67339c736fd9354cd4f8cb0b744f2b82a74b5c70 # release/v1
- name: Upload dists to GitHub Release
env:
GITHUB_TOKEN: "${{ secrets.GITHUB_TOKEN }}"
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index da285c55..c3f0dc59 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -15,32 +15,19 @@ jobs:
name: 🎨 Linters
runs-on: ubuntu-latest
steps:
- - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0
+ - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Set up Python
- uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
+ uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
with:
- python-version: '3.11'
- - name: Install dependencies
- run: |
- python -m pip install -U pip setuptools
- python -m pip install -r dev-requirements.txt
- python -m pip uninstall -y charset-normalizer
- - name: Type checking (Mypy)
- run: |
- mypy --strict charset_normalizer
- - name: Import sorting check (isort)
- run: |
- isort --check charset_normalizer
- - name: Code format (Black)
- run: |
- black --check --diff --target-version=py37 charset_normalizer
- - name: Style guide enforcement (Flake8)
- run: |
- flake8 charset_normalizer
+ python-version: '3'
+ - name: Install nox
+ run: python -m pip install nox
+ - name: Pre-commit checks
+ run: nox -s lint
tests:
name: ✅ Tests
- runs-on: ubuntu-latest
+ runs-on: ubuntu-22.04
strategy:
fail-fast: false
@@ -55,25 +42,22 @@ jobs:
- "3.13"
steps:
- - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0
+ - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Set up Python ${{ matrix.python-version }}
- uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
+ uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
with:
python-version: ${{ matrix.python-version }}
allow-prereleases: true
- name: Install dependencies
- run: |
- python -m pip install -U pip setuptools
- python -m pip install -r dev-requirements.txt
- python -m pip uninstall -y charset-normalizer
- - name: Install the package
- run: |
- python -m build --no-isolation
- python -m pip install ./dist/*.whl
+ run: python -m pip install nox
- name: Run tests
- run: |
- pytest
- - uses: codecov/codecov-action@4fe8c5f003fae66aa5ebb77cfd3e7bfbbda0b6b0 # v3.1.5
+ run: nox -s test-${{ matrix.python-version }}
+ - name: "Upload artifact"
+ uses: "actions/upload-artifact@0b7f8abb1508181956e8e162db84b466c27e18ce"
+ with:
+ name: coverage-data
+ path: ".coverage.*"
+ if-no-files-found: error
detection_coverage:
@@ -84,65 +68,49 @@ jobs:
runs-on: ubuntu-latest
steps:
- - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0
+ - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Set up Python
- uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
+ uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
with:
- python-version: '3.11'
+ python-version: '3'
- name: Install dependencies
- run: |
- python -m pip install -U pip setuptools
- python -m pip install -r dev-requirements.txt
- python -m pip uninstall -y charset-normalizer
- - name: Install the package
- run: |
- python -m build
- python -m pip install ./dist/*.whl
- - name: Clone the complete dataset
- run: |
- git clone https://github.com/Ousret/char-dataset.git
+ run: python -m pip install nox
- name: Coverage WITH preemptive
- run: |
- python ./bin/coverage.py --coverage 97 --with-preemptive
+ run: nox -s coverage -- --coverage 97 --with-preemptive
- name: Coverage WITHOUT preemptive
- run: |
- python ./bin/coverage.py --coverage 95
-
-# integration_test:
-#
-# needs:
-# - tests
-#
-# name: 🔗 Integration Tests
-# runs-on: ubuntu-latest
-#
-# steps:
-# - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0
-# - name: Set up Python
-# uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
-# with:
-# python-version: '3.11'
-# - name: Install dependencies
-# run: |
-# pip install -U pip setuptools
-# pip install -r dev-requirements.txt
-# - name: Remove Chardet & Charset-Normalizer
-# run: |
-# pip uninstall -y chardet
-# pip uninstall -y charset-normalizer
-# - name: Install the package
-# run: |
-# python -m build
-# pip install ./dist/*.whl
-# - name: Clone the complete dataset
-# run: |
-# git clone https://github.com/Ousret/char-dataset.git
-# - name: Start the Flask server
-# run: |
-# python ./bin/serve.py &
-# - name: Integration Tests with Requests
-# run: |
-# python ./bin/integration.py
+ run: nox -s coverage -- --coverage 95
+ - name: "Upload artifact"
+ uses: "actions/upload-artifact@0b7f8abb1508181956e8e162db84b466c27e18ce"
+ with:
+ name: coverage-data
+ path: ".coverage.*"
+ if-no-files-found: error
+
+ integration_test:
+
+ needs:
+ - tests
+
+ name: 🔗 Integration Tests
+ runs-on: ubuntu-latest
+
+ strategy:
+ fail-fast: false
+ matrix:
+ downstream_project:
+ - niquests
+ - requests
+
+ steps:
+ - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+ - name: Set up Python
+ uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
+ with:
+ python-version: '3'
+ - name: Install dependencies
+ run: pip install nox
+ - name: Integration Tests with Requests
+ run: nox -s downstream_${{ matrix.downstream_project }}
chardet_bc:
@@ -150,26 +118,15 @@ jobs:
runs-on: ubuntu-latest
steps:
- - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0
+ - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Set up Python
- uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
+ uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
with:
- python-version: '3.11'
+ python-version: '3'
- name: Install dependencies
- run: |
- python -m pip install -U pip setuptools
- python -m pip install -r dev-requirements.txt
- python -m pip uninstall -y charset-normalizer
- - name: Install the package
- run: |
- python -m build
- python -m pip install ./dist/*.whl
- - name: Clone the complete dataset
- run: |
- git clone https://github.com/Ousret/char-dataset.git
+ run: pip install nox
- name: BC Coverage
- run: |
- python ./bin/bc.py --coverage 80
+ run: nox -s backward_compatibility -- --coverage 80
mypyc_test:
@@ -193,7 +150,7 @@ jobs:
os: [ ubuntu-latest, macos-latest, windows-latest ]
include:
- python-version: "3.7"
- os: ubuntu-latest
+ os: ubuntu-22.04
- python-version: "3.7"
os: macos-13
- python-version: "3.7"
@@ -201,61 +158,72 @@ jobs:
env:
PYTHONIOENCODING: utf8 # only needed for Windows (console IO output encoding)
steps:
- - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0
+ - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Set up Python ${{ matrix.python-version }}
- uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
+ uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
with:
python-version: ${{ matrix.python-version }}
allow-prereleases: true
- - name: Install dependencies
- run: |
- python -m pip install -U pip setuptools
- python -m pip install -r dev-requirements.txt
- python -m pip uninstall -y charset-normalizer
- - name: Install the package
- env:
- CHARSET_NORMALIZER_USE_MYPYC: '1'
- run: |
- python -m pip install .
- - name: Clone the complete dataset
- run: |
- git clone https://github.com/Ousret/char-dataset.git
- - name: Coverage WITH preemptive
- run: |
- python ./bin/coverage.py --coverage 97 --with-preemptive
- - name: Performance (Normal)
+ - name: Install nox
+ run: pip install nox
+ - name: Run tests with mypyc enabled
+ run: nox -s test_mypyc-${{ matrix.python-version }}
+ - name: "Upload artifact"
+ uses: "actions/upload-artifact@0b7f8abb1508181956e8e162db84b466c27e18ce"
+ with:
+ name: coverage-data
+ path: ".coverage.*"
+ if-no-files-found: error
+
+ coverage:
+ if: always()
+ runs-on: "ubuntu-latest"
+ needs:
+ - tests
+ - mypyc_test
+ - detection_coverage
+ steps:
+ - name: "Checkout repository"
+ uses: "actions/checkout@d632683dd7b4114ad314bca15554477dd762a938"
+
+ - name: "Setup Python"
+ uses: "actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3"
+ with:
+ python-version: "3.x"
+
+ - name: "Install coverage"
+ run: "python -m pip install --upgrade coverage"
+
+ - name: "Download artifact"
+ uses: actions/download-artifact@9bc31d5ccc31df68ecc42ccf4149144866c47d8a
+ with:
+ name: coverage-data
+
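+      # Merges the parallel-mode .coverage.* files uploaded by the test jobs (see test_impl in noxfile.py).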
+ - name: "Combine & check coverage"
run: |
- python ./bin/performance.py
+ python -m coverage combine
+ python -m coverage html --skip-covered --skip-empty
+ python -m coverage report --ignore-errors --show-missing --fail-under=92
+
+ - name: "Upload report"
+ uses: actions/upload-artifact@0b7f8abb1508181956e8e162db84b466c27e18ce
+ with:
+ name: coverage-report
+ path: htmlcov
performance:
- name: ⚡ Performance Test (no MypyC)
+ name: ⚡ Performance Test
runs-on: ubuntu-latest
- needs:
- - mypyc_test
- - chardet_bc
+ needs: coverage
steps:
- - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0
+ - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Set up Python
- uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
+ uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
with:
- python-version: '3.11'
+ python-version: '3'
- name: Install dependencies
- run: |
- python -m pip install -U pip setuptools
- python -m pip install -r dev-requirements.txt
- python -m pip uninstall -y charset-normalizer
- - name: Install the package
- run: |
- python -m build
- python -m pip install ./dist/*.whl
- - name: Clone the complete dataset
- run: |
- git clone https://github.com/Ousret/char-dataset.git
- - name: Performance (Normal)
- run: |
- python ./bin/performance.py
- - name: Performance (Medium)
- run: |
- python ./bin/performance.py --size-increase 2
+ run: pip install nox
+ - name: Performance Measurement
+ run: nox -s performance
diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
index 65dfcd73..3f31cc31 100644
--- a/.github/workflows/codeql.yml
+++ b/.github/workflows/codeql.yml
@@ -38,20 +38,20 @@ jobs:
steps:
- name: Checkout repository
- uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0
+ uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
# Initializes the CodeQL tools for scanning.
- name: Initialize CodeQL
- uses: github/codeql-action/init@c36620d31ac7c881962c3d9dd939c40ec9434f2b # v3.26.12
+ uses: github/codeql-action/init@48ab28a6f5dbc2a99bf1e0131198dd8f1df78169 # v3.28.0
with:
languages: ${{ matrix.language }}
-
+
# Autobuild attempts to build any compiled languages (C/C++, C#, Go, or Java).
# If this step fails, then you should remove it and run the build manually (see below)
- name: Autobuild
- uses: github/codeql-action/autobuild@c36620d31ac7c881962c3d9dd939c40ec9434f2b # v3.26.12
+ uses: github/codeql-action/autobuild@48ab28a6f5dbc2a99bf1e0131198dd8f1df78169 # v3.28.0
- name: Perform CodeQL Analysis
- uses: github/codeql-action/analyze@c36620d31ac7c881962c3d9dd939c40ec9434f2b # v3.26.12
+ uses: github/codeql-action/analyze@48ab28a6f5dbc2a99bf1e0131198dd8f1df78169 # v3.28.0
with:
category: "/language:${{matrix.language}}"
diff --git a/.github/workflows/scorecards.yml b/.github/workflows/scorecards.yml
index 923d8932..bc246305 100644
--- a/.github/workflows/scorecards.yml
+++ b/.github/workflows/scorecards.yml
@@ -31,7 +31,7 @@ jobs:
steps:
- name: "Checkout code"
- uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0
+ uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
persist-credentials: false
@@ -66,6 +66,6 @@ jobs:
# Upload the results to GitHub's code scanning dashboard.
- name: "Upload to code-scanning"
- uses: github/codeql-action/upload-sarif@c36620d31ac7c881962c3d9dd939c40ec9434f2b # v3.26.12
+ uses: github/codeql-action/upload-sarif@48ab28a6f5dbc2a99bf1e0131198dd8f1df78169 # v3.28.0
with:
sarif_file: results.sarif
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 00000000..0b051524
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,30 @@
+exclude: 'docs/|data/|tests/'
+
+repos:
+ - repo: https://github.com/pre-commit/pre-commit-hooks
+ rev: v5.0.0
+ hooks:
+ - id: check-yaml
+ - id: debug-statements
+ - id: end-of-file-fixer
+ - id: trailing-whitespace
+ - repo: https://github.com/asottile/pyupgrade
+ rev: v3.19.1
+ hooks:
+ - id: pyupgrade
+ args: [ --py37-plus, --keep-runtime-typing ]
+ - repo: https://github.com/astral-sh/ruff-pre-commit
+ # Ruff version.
+ rev: v0.8.4
+ hooks:
+ # Run the linter.
+ - id: ruff
+ args: [ --fix ]
+ # Run the formatter.
+ - id: ruff-format
+ - repo: https://github.com/pre-commit/mirrors-mypy
+ rev: v1.14.0
+ hooks:
+ - id: mypy
+ args: [ --check-untyped-defs ]
+ exclude: 'tests/|noxfile.py|setup.py|bin/'
diff --git a/.readthedocs.yaml b/.readthedocs.yaml
index 87d8ed3e..b783a32f 100644
--- a/.readthedocs.yaml
+++ b/.readthedocs.yaml
@@ -1,9 +1,9 @@
version: 2
build:
- os: ubuntu-20.04
+ os: ubuntu-22.04
tools:
- python: "3.9"
+ python: "3.10"
# Build documentation in the docs/ directory with Sphinx
sphinx:
diff --git a/CHANGELOG.md b/CHANGELOG.md
index d7cd7e1a..2895a843 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,27 @@
All notable changes to charset-normalizer will be documented in this file. This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
+## [3.4.1](https://github.com/Ousret/charset_normalizer/compare/3.4.0...3.4.1) (2024-12-24)
+
+### Changed
+- Project metadata is now stored in `pyproject.toml` instead of `setup.cfg`, still using setuptools as the build backend.
+- Enforce delayed annotation loading for simpler and more consistent typing across the project.
+- Optional mypyc compilation upgraded to version 1.14 for Python >= 3.8
+
+### Added
+- pre-commit configuration.
+- noxfile.
+
+### Removed
+- `build-requirements.txt`, made obsolete by the native `pyproject.toml` build configuration.
+- `bin/integration.py` and `bin/serve.py` in favor of the downstream integration tests (see the noxfile).
+- `setup.cfg` in favor of `pyproject.toml` metadata configuration.
+- Unused `utils.range_scan` function.
+
+### Fixed
+- Converting content to Unicode bytes may insert `utf_8` instead of preferred `utf-8`. (#572)
+- Deprecation warning "'count' is passed as positional argument" when converting to Unicode bytes on Python 3.13+
+
## [3.4.0](https://github.com/Ousret/charset_normalizer/compare/3.3.2...3.4.0) (2024-10-08)
### Added
@@ -181,7 +202,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
## [2.0.12](https://github.com/Ousret/charset_normalizer/compare/2.0.11...2.0.12) (2022-02-12)
### Fixed
-- ASCII miss-detection on rare cases (PR #170)
+- ASCII miss-detection on rare cases (PR #170)
## [2.0.11](https://github.com/Ousret/charset_normalizer/compare/2.0.10...2.0.11) (2022-01-30)
@@ -213,7 +234,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
- MD improvement on trailing data and long foreign (non-pure latin) data (PR #124)
- Efficiency improvements in cd/alphabet_languages from [@adbar](https://github.com/adbar) (PR #122)
- call sum() without an intermediary list following PEP 289 recommendations from [@adbar](https://github.com/adbar) (PR #129)
-- Code style as refactored by Sourcery-AI (PR #131)
+- Code style as refactored by Sourcery-AI (PR #131)
- Minor adjustment on the MD around european words (PR #133)
- Remove and replace SRTs from assets / tests (PR #139)
- Initialize the library logger with a `NullHandler` by default from [@nmaynes](https://github.com/nmaynes) (PR #135)
@@ -286,7 +307,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
## [2.0.2](https://github.com/Ousret/charset_normalizer/compare/2.0.1...2.0.2) (2021-07-15)
### Fixed
-- Empty/Too small JSON payload miss-detection fixed. Report from [@tseaver](https://github.com/tseaver) (PR #59)
+- Empty/Too small JSON payload miss-detection fixed. Report from [@tseaver](https://github.com/tseaver) (PR #59)
### Changed
- Don't inject unicodedata2 into sys.modules from [@akx](https://github.com/akx) (PR #57)
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index abee674b..40b19f0e 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -1,12 +1,12 @@
# Contribution Guidelines
-If you’re reading this, you’re probably interested in contributing to Charset Normalizer.
-Thank you very much! Open source projects live-and-die based on the support they receive from others,
+If you’re reading this, you’re probably interested in contributing to Charset Normalizer.
+Thank you very much! Open source projects live-and-die based on the support they receive from others,
and the fact that you’re even considering contributing to this project is very generous of you.
## Questions
-The GitHub issue tracker is for *bug reports* and *feature requests*.
+The GitHub issue tracker is for *bug reports* and *feature requests*.
Questions are allowed only when no answer are provided in docs.
## Good Bug Reports
@@ -67,6 +67,10 @@ the backward-compatibility.
## How to run tests locally?
It is essential that you run, prior to any submissions the mandatory checks.
-Run the script `./bin/run_checks.sh` to verify that your modification are not breaking anything.
-Also, make sure to run the `./bin/run_autofix.sh` to comply with the style format and import sorting.
+```shell
+pip install nox
+nox -s test
+nox -s lint
+nox -s coverage
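+# optionally target one interpreter, mirroring the CI invocation, e.g.:
+nox -s test-3.12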
+```
diff --git a/LICENSE b/LICENSE
index ad82355b..9725772c 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,6 +1,6 @@
MIT License
-Copyright (c) 2019 TAHRI Ahmed R.
+Copyright (c) 2025 TAHRI Ahmed R.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
@@ -18,4 +18,4 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
\ No newline at end of file
+SOFTWARE.
diff --git a/MANIFEST.in b/MANIFEST.in
index 3792f5bb..8da2cd04 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,4 +1,4 @@
-include LICENSE README.md CHANGELOG.md charset_normalizer/py.typed dev-requirements.txt
+include LICENSE README.md CHANGELOG.md src/charset_normalizer/py.typed dev-requirements.txt SECURITY.md noxfile.py
recursive-include data *.md
recursive-include data *.txt
recursive-include docs *
diff --git a/README.md b/README.md
index 13e6e14f..ee5b2e7e 100644
--- a/README.md
+++ b/README.md
@@ -15,7 +15,7 @@
Featured Packages
-
+
@@ -55,8 +55,7 @@ This project offers you an alternative to **Universal Charset Encoding Detector*

-*\*\* : They are clearly using specific code for a specific encoding even if covering most of used one*
-Did you got there because of the logs? See [https://charset-normalizer.readthedocs.io/en/latest/user/miscellaneous.html](https://charset-normalizer.readthedocs.io/en/latest/user/miscellaneous.html)
+*\*\* : They clearly use dedicated code for specific encodings, even though it covers most of the encodings in use*
## ⚡ Performance
@@ -64,21 +63,23 @@ This package offer better performance than its counterpart Chardet. Here are som
| Package | Accuracy | Mean per file (ms) | File per sec (est) |
|-----------------------------------------------|:--------:|:------------------:|:------------------:|
-| [chardet](https://github.com/chardet/chardet) | 86 % | 200 ms | 5 file/sec |
+| [chardet](https://github.com/chardet/chardet) | 86 % | 63 ms | 16 file/sec |
| charset-normalizer | **98 %** | **10 ms** | 100 file/sec |
| Package | 99th percentile | 95th percentile | 50th percentile |
|-----------------------------------------------|:---------------:|:---------------:|:---------------:|
-| [chardet](https://github.com/chardet/chardet) | 1200 ms | 287 ms | 23 ms |
+| [chardet](https://github.com/chardet/chardet) | 265 ms | 71 ms | 7 ms |
| charset-normalizer | 100 ms | 50 ms | 5 ms |
+_updated as of December 2024 using CPython 3.12_
+
Chardet's performance on larger file (1MB+) are very poor. Expect huge difference on large payload.
> Stats are generated using 400+ files using default parameters. More details on used files, see GHA workflows.
> And yes, these results might change at any time. The dataset can be updated to include more files.
> The actual delays heavily depends on your CPU capabilities. The factors should remain the same.
> Keep in mind that the stats are generous and that Chardet accuracy vs our is measured using Chardet initial capability
-> (eg. Supported Encoding) Challenge-them if you want.
+> (e.g. supported encodings). Challenge them if you want.
## ✨ Installation
@@ -195,11 +196,11 @@ reliable alternative using a completely different method. Also! I never back dow
I **don't care** about the **originating charset** encoding, because **two different tables** can
produce **two identical rendered string.**
-What I want is to get readable text, the best I can.
+What I want is to get readable text, the best I can.
In a way, **I'm brute forcing text decoding.** How cool is that ? 😎
-Don't confuse package **ftfy** with charset-normalizer or chardet. ftfy goal is to repair unicode string whereas charset-normalizer to convert raw file in unknown encoding to unicode.
+Don't confuse package **ftfy** with charset-normalizer or chardet. ftfy's goal is to repair broken Unicode strings, whereas charset-normalizer converts raw files of unknown encoding to Unicode.
## 🍰 How
@@ -211,7 +212,7 @@ Don't confuse package **ftfy** with charset-normalizer or chardet. ftfy goal is
**Wait a minute**, what is noise/mess and coherence according to **YOU ?**
*Noise :* I opened hundred of text files, **written by humans**, with the wrong encoding table. **I observed**, then
-**I established** some ground rules about **what is obvious** when **it seems like** a mess.
+**I established** some ground rules about **what is obvious** when **it seems like** a mess (i.e. defining noise in rendered text).
I know that my interpretation of what is noise is probably incomplete, feel free to contribute in order to
improve or rewrite it.
@@ -255,3 +256,5 @@ from the experts who know it best, while seamlessly integrating with existing
tools.
[1]: https://tidelift.com/subscription/pkg/pypi-charset-normalizer?utm_source=pypi-charset-normalizer&utm_medium=readme
+
+[![OpenSSF Best Practices](https://www.bestpractices.dev/projects/7297/badge)](https://www.bestpractices.dev/projects/7297)
diff --git a/UPGRADE.md b/UPGRADE.md
deleted file mode 100644
index 4b8f7bb1..00000000
--- a/UPGRADE.md
+++ /dev/null
@@ -1,31 +0,0 @@
-Guide to upgrade your code from v1 to v2
-----------------------------------------
-
- * If you are using the legacy `detect` function, that is it. You have nothing to do.
-
-## Detection
-
-### Before
-
-```python
-from charset_normalizer import CharsetNormalizerMatches
-
-results = CharsetNormalizerMatches.from_bytes(
- '我没有埋怨,磋砣的只是一些时间。'.encode('utf_32')
-)
-```
-
-### After
-
-```python
-from charset_normalizer import from_bytes
-
-results = from_bytes(
- '我没有埋怨,磋砣的只是一些时间。'.encode('utf_32')
-)
-```
-
-Methods that once were staticmethods of the class `CharsetNormalizerMatches` are now basic functions.
-`from_fp`, `from_bytes`, `from_fp` and `` are concerned.
-
-Staticmethods scheduled to be removed in version 3.0
diff --git a/bin/bc.py b/bin/bc.py
index df289433..cac23682 100644
--- a/bin/bc.py
+++ b/bin/bc.py
@@ -1,13 +1,13 @@
-#!/bin/python
+from __future__ import annotations
+
+import argparse
from glob import glob
from os.path import isdir
from sys import argv
-from typing import List
-import argparse
-from charset_normalizer import detect as tbt_detect
from chardet import detect as chardet_detect
+from charset_normalizer import detect as tbt_detect
from charset_normalizer.utils import iana_name
@@ -16,28 +16,35 @@ def calc_equivalence(content: bytes, cp_a: str, cp_b: str):
str_a = content.decode(cp_a)
str_b = content.decode(cp_b)
except UnicodeDecodeError:
- return 0.
+ return 0.0
character_count = len(str_a)
- diff_character_count = sum(
- chr_a != chr_b for chr_a, chr_b in zip(str_a, str_b)
- )
+ diff_character_count = sum(chr_a != chr_b for chr_a, chr_b in zip(str_a, str_b))
- return 1. - (diff_character_count / character_count)
+ return 1.0 - (diff_character_count / character_count)
-def cli_bc(arguments: List[str]):
+def cli_bc(arguments: list[str]):
parser = argparse.ArgumentParser(
description="BC script checker for Charset-Normalizer with Chardet"
)
- parser.add_argument('-c', '--coverage', action="store", default=85, type=int, dest='coverage',
- help="Define the minimum acceptable coverage to succeed")
+ parser.add_argument(
+ "-c",
+ "--coverage",
+ action="store",
+ default=85,
+ type=int,
+ dest="coverage",
+ help="Define the minimum acceptable coverage to succeed",
+ )
args = parser.parse_args(arguments)
if not isdir("./char-dataset"):
- print("This script require https://github.com/Ousret/char-dataset to be cloned on package root directory")
+ print(
+            "This script requires https://github.com/Ousret/char-dataset to be cloned at the package root directory"
+ )
exit(1)
success_count = 0
@@ -50,44 +57,52 @@ def cli_bc(arguments: List[str]):
content = fp.read()
chardet_result = chardet_detect(content)
- chardet_encoding = chardet_result['encoding']
+ chardet_encoding = chardet_result["encoding"]
charset_normalizer_result = tbt_detect(content)
- charset_normalizer_encoding = charset_normalizer_result['encoding']
+ charset_normalizer_encoding = charset_normalizer_result["encoding"]
if [chardet_encoding, charset_normalizer_encoding].count(None) == 1:
- print("⚡⚡ '{}' (BC-Break) New('{}') vs Legacy('{}')".format(tbt_path, charset_normalizer_encoding, chardet_encoding))
+ print(
+ f"⚡⚡ '{tbt_path}' (BC-Break) New('{charset_normalizer_encoding}') vs Legacy('{chardet_encoding}')"
+ )
continue
if charset_normalizer_encoding == chardet_encoding:
success_count += 1
- print("✅✅ '{}' (BC)".format(tbt_path))
+ print(f"✅✅ '{tbt_path}' (BC)")
continue
- if (chardet_encoding is None and charset_normalizer_encoding is None) or (iana_name(chardet_encoding, False) == iana_name(charset_normalizer_encoding, False)):
+ if (chardet_encoding is None and charset_normalizer_encoding is None) or (
+ iana_name(chardet_encoding, False)
+ == iana_name(charset_normalizer_encoding, False)
+ ):
success_count += 1
- print("✅✅ '{}' (BC)".format(tbt_path))
+ print(f"✅✅ '{tbt_path}' (BC)")
continue
- calc_eq = calc_equivalence(content, chardet_encoding, charset_normalizer_encoding)
+ calc_eq = calc_equivalence(
+ content, chardet_encoding, charset_normalizer_encoding
+ )
if calc_eq >= 0.98:
success_count += 1
- print("️✅ ️'{}' (got '{}' but eq {} WITH {} %)".format(tbt_path, charset_normalizer_encoding, chardet_encoding, round(calc_eq * 100., 3)))
+ print(
+ f"️✅ ️'{tbt_path}' (got '{charset_normalizer_encoding}' but "
+ f"eq {chardet_encoding} WITH {round(calc_eq * 100., 3)} %)"
+ )
continue
- print("⚡⚡ '{}' (BC-Break) New('{}') vs Legacy('{}')".format(tbt_path, charset_normalizer_encoding, chardet_encoding))
+ print(
+ f"⚡⚡ '{tbt_path}' (BC-Break) New('{charset_normalizer_encoding}') vs Legacy('{chardet_encoding}')"
+ )
- success_ratio = round(success_count / total_count, 2) * 100.
+ success_ratio = round(success_count / total_count, 2) * 100.0
- print("Total EST BC = {} % ({} / {} files)".format(success_ratio, success_count, total_count))
+ print(f"Total EST BC = {success_ratio} % ({success_count} / {total_count} files)")
return 0 if success_ratio >= args.coverage else 1
if __name__ == "__main__":
- exit(
- cli_bc(
- argv[1:]
- )
- )
+ exit(cli_bc(argv[1:]))
diff --git a/bin/coverage.py b/bin/coverage.py
index e5f07bd5..a84bb73c 100644
--- a/bin/coverage.py
+++ b/bin/coverage.py
@@ -1,43 +1,54 @@
-#!/bin/python
+from __future__ import annotations
+
+import argparse
from glob import glob
+from os import sep
from os.path import isdir
from sys import argv
-from typing import List
-import argparse
-from charset_normalizer import from_path, __version__
+from charset_normalizer import __version__, from_path
from charset_normalizer.utils import iana_name
-from os import sep
-
def calc_equivalence(content: bytes, cp_a: str, cp_b: str):
str_a = content.decode(cp_a)
str_b = content.decode(cp_b)
character_count = len(str_a)
- diff_character_count = sum(
- chr_a != chr_b for chr_a, chr_b in zip(str_a, str_b)
- )
-
+ diff_character_count = sum(chr_a != chr_b for chr_a, chr_b in zip(str_a, str_b))
- return 1. - (diff_character_count / character_count)
+ return 1.0 - (diff_character_count / character_count)
-def cli_coverage(arguments: List[str]):
+def cli_coverage(arguments: list[str]):
parser = argparse.ArgumentParser(
description="Embedded detection success coverage script checker for Charset-Normalizer"
)
- parser.add_argument('-p', '--with-preemptive', action="store_true", default=False, dest='preemptive',
- help='Enable the preemptive scan behaviour during coverage check')
- parser.add_argument('-c', '--coverage', action="store", default=90, type=int, dest='coverage',
- help="Define the minimum acceptable coverage to succeed")
+ parser.add_argument(
+ "-p",
+ "--with-preemptive",
+ action="store_true",
+ default=False,
+ dest="preemptive",
+ help="Enable the preemptive scan behaviour during coverage check",
+ )
+ parser.add_argument(
+ "-c",
+ "--coverage",
+ action="store",
+ default=90,
+ type=int,
+ dest="coverage",
+ help="Define the minimum acceptable coverage to succeed",
+ )
args = parser.parse_args(arguments)
if not isdir("./char-dataset"):
- print("This script require https://github.com/Ousret/char-dataset to be cloned on package root directory")
+ print(
+            "This script requires https://github.com/Ousret/char-dataset to be cloned at the package root directory"
+ )
exit(1)
print(f"> using charset-normalizer {__version__}")
@@ -46,28 +57,27 @@ def cli_coverage(arguments: List[str]):
total_count = 0
for tbt_path in sorted(glob("./char-dataset/**/*.*")):
-
expected_encoding = tbt_path.split(sep)[-2]
total_count += 1
- results = from_path(
- tbt_path,
- preemptive_behaviour=args.preemptive
- )
+ results = from_path(tbt_path, preemptive_behaviour=args.preemptive)
if expected_encoding == "None" and len(results) == 0:
- print("✅✅ '{}'".format(tbt_path))
+ print(f"✅✅ '{tbt_path}'")
success_count += 1
continue
if len(results) == 0:
- print("⚡⚡ '{}' (nothing)".format(tbt_path))
+ print(f"⚡⚡ '{tbt_path}' (nothing)")
continue
result = results.best()
- if expected_encoding in result.could_be_from_charset or iana_name(expected_encoding) in result.could_be_from_charset:
- print("✅✅ '{}'".format(tbt_path))
+ if (
+ expected_encoding in result.could_be_from_charset
+ or iana_name(expected_encoding) in result.could_be_from_charset
+ ):
+ print(f"✅✅ '{tbt_path}'")
success_count += 1
continue
@@ -75,21 +85,21 @@ def cli_coverage(arguments: List[str]):
if calc_eq >= 0.98:
success_count += 1
- print("️✅ ️'{}' (got '{}' but equivalence {} %)".format(tbt_path, result.encoding, round(calc_eq * 100., 3)))
+ print(
+ f"️✅ ️'{tbt_path}' (got '{result.encoding}' but equivalence {round(calc_eq * 100., 3)} %)"
+ )
continue
- print("⚡ '{}' (got '{}')".format(tbt_path, result.encoding))
+ print(f"⚡ '{tbt_path}' (got '{result.encoding}')")
- success_ratio = round(success_count / total_count, 2) * 100.
+ success_ratio = round(success_count / total_count, 2) * 100.0
- print("Total EST coverage = {} % ({} / {} files)".format(success_ratio, success_count, total_count))
+ print(
+ f"Total EST coverage = {success_ratio} % ({success_count} / {total_count} files)"
+ )
return 0 if success_ratio >= args.coverage else 1
if __name__ == "__main__":
- exit(
- cli_coverage(
- argv[1:]
- )
- )
+ exit(cli_coverage(argv[1:]))
diff --git a/bin/integration.py b/bin/integration.py
deleted file mode 100644
index b186313a..00000000
--- a/bin/integration.py
+++ /dev/null
@@ -1,54 +0,0 @@
-from requests import get, __version__
-from typing import List
-from charset_normalizer import detect, __version__ as __version_cn__
-
-if __name__ == "__main__":
-
- print(f"requests {__version__}")
- print(f"charset_normalizer {__version_cn__}")
-
- files: List[str] = get("http://127.0.0.1:8080/").json()
-
- print("## Testing with actual files")
-
- for file in files:
- r = get(
- "http://127.0.0.1:8080/" + file
- )
-
- if r.ok is False:
- print(f"Unable to retrieve '{file}' | HTTP/{r.status_code}")
- exit(1)
-
- expected_encoding = detect(r.content)["encoding"]
-
- if expected_encoding != r.apparent_encoding:
- print(f"Integration test failed | File '{file}' | Expected '{expected_encoding}' got '{r.apparent_encoding}'")
- exit(1)
-
- print(f"✅✅ '{file}' OK")
-
- print("## Testing with edge cases")
-
- # Should NOT crash
- get("http://127.0.0.1:8080/edge/empty/json").json()
-
- print("✅✅ Empty JSON OK")
-
- if get("http://127.0.0.1:8080/edge/empty/plain").apparent_encoding != "utf-8":
- print("Empty payload SHOULD not return apparent_encoding != UTF-8")
- exit(1)
-
- print("✅✅ Empty Plain Text OK")
-
- r = get("http://127.0.0.1:8080/edge/gb18030/json")
-
- if r.apparent_encoding != "GB18030":
- print("JSON Basic Detection FAILURE (/edge/gb18030/json)")
- exit(1)
-
- r.json()
-
- print("✅✅ GB18030 JSON Encoded OK")
-
- print("Integration tests passed!")
diff --git a/bin/performance.py b/bin/performance.py
index ff715fd1..3a55c188 100644
--- a/bin/performance.py
+++ b/bin/performance.py
@@ -1,15 +1,16 @@
-#!/bin/python
-from glob import glob
-from time import perf_counter_ns
+from __future__ import annotations
+
import argparse
-from sys import argv
+from glob import glob
+from math import ceil
from os.path import isdir
+from statistics import mean, stdev
+from sys import argv
+from time import perf_counter_ns
-from charset_normalizer import detect
from chardet import detect as chardet_detect
-from statistics import mean, stdev
-from math import ceil
+from charset_normalizer import detect
def calc_percentile(data, percentile):
@@ -66,7 +67,8 @@ def performance_compare(arguments):
charset_normalizer_time = charset_normalizer_time or 0.000005
cn_faster = (chardet_time / charset_normalizer_time) * 100 - 100
print(
- f"{idx+1:>3}/{total_files} {tbt_path:<82} C:{chardet_time:.5f} CN:{charset_normalizer_time:.5f} {cn_faster:.1f} %"
+ f"{idx + 1:>3}/{total_files} {tbt_path:<82} C:{chardet_time:.5f} "
+ f"CN:{charset_normalizer_time:.5f} {cn_faster:.1f} %"
)
# Print the top 10 rows with the slowest execution time
@@ -78,7 +80,7 @@ def performance_compare(arguments):
)
for idx, time in sorted_results[:10]:
tbt_path = file_list[idx]
- print(f"{idx+1:>3}/{total_files} {tbt_path:<82} CN:{time:.5f}")
+ print(f"{idx + 1:>3}/{total_files} {tbt_path:<82} CN:{time:.5f}")
# Print charset normalizer statistics
min_time = min(charset_normalizer_results)
diff --git a/bin/run_autofix.sh b/bin/run_autofix.sh
deleted file mode 100755
index f8a63812..00000000
--- a/bin/run_autofix.sh
+++ /dev/null
@@ -1,12 +0,0 @@
-#!/bin/sh -e
-
-export PREFIX=""
-if [ -d 'venv' ] ; then
- export PREFIX="venv/bin/"
-fi
-
-set -x
-
-${PREFIX}pip install -r ./dev-requirements.txt
-${PREFIX}black --target-version=py37 charset_normalizer
-${PREFIX}isort charset_normalizer
diff --git a/bin/run_checks.sh b/bin/run_checks.sh
deleted file mode 100755
index 951611ff..00000000
--- a/bin/run_checks.sh
+++ /dev/null
@@ -1,15 +0,0 @@
-#!/bin/sh -e
-
-export PREFIX=""
-if [ -d 'venv' ] ; then
- export PREFIX="venv/bin/"
-fi
-
-set -x
-
-${PREFIX}pip install -r ./dev-requirements.txt
-${PREFIX}pytest
-${PREFIX}black --check --diff --target-version=py37 charset_normalizer
-${PREFIX}flake8 charset_normalizer
-${PREFIX}mypy charset_normalizer
-${PREFIX}isort --check --diff charset_normalizer
diff --git a/bin/serve.py b/bin/serve.py
deleted file mode 100644
index 0b055ef6..00000000
--- a/bin/serve.py
+++ /dev/null
@@ -1,37 +0,0 @@
-from flask import Flask, jsonify, send_from_directory
-from glob import glob
-
-app = Flask(__name__)
-
-
-@app.route('/raw/')
-def read_file(path):
- return send_from_directory('../char-dataset', path, as_attachment=True), 200, {"Content-Type": "text/plain"}
-
-
-@app.route("/")
-def read_targets():
- return jsonify(
- [
- el.replace("./char-dataset", "/raw").replace("\\", "/") for el in sorted(glob("./char-dataset/**/*"))
- ]
- )
-
-
-@app.route("/edge/empty/plain")
-def read_empty_response_plain():
- return b"", 200, {"Content-Type": "text/plain"}
-
-
-@app.route("/edge/empty/json")
-def read_empty_response_json():
- return b"{}", 200, {"Content-Type": "application/json"}
-
-
-@app.route("/edge/gb18030/json")
-def read_gb18030_response_json():
- return '{"abc": "我没有埋怨,磋砣的只是一些时间。。今觀俗士之論也,以族舉德,以位命賢,茲可謂得論之一體矣,而未獲至論之淑真也。"}'.encode("gb18030"), 200, {"Content-Type": "application/json"}
-
-
-if __name__ == "__main__":
- app.run(host="127.0.0.1", port=8080)
diff --git a/build-requirements.txt b/build-requirements.txt
deleted file mode 100644
index ce978b23..00000000
--- a/build-requirements.txt
+++ /dev/null
@@ -1,9 +0,0 @@
-# in the meantime we migrate to pyproject.toml
-# this represent the minimum requirement to build (for the optional speedup)
---find-links https://github.com/mypyc/mypy_mypyc-wheels/releases/expanded_assets/v1.12.0+dev.b2deaaecf1a11e13bc962558992b5f2d5701f295
-mypy==1.11.2; python_version >= '3.8' and python_version < '3.13'
-mypy==1.12.0; python_version >= '3.13'
-mypy==1.4.1; python_version < '3.8'
-build>=0.10.0,<2
-wheel==0.42.0
-setuptools>=68,<76
diff --git a/charset_normalizer/version.py b/charset_normalizer/version.py
deleted file mode 100644
index 699990ee..00000000
--- a/charset_normalizer/version.py
+++ /dev/null
@@ -1,6 +0,0 @@
-"""
-Expose version
-"""
-
-__version__ = "3.4.0"
-VERSION = __version__.split(".")
diff --git a/dev-requirements.txt b/dev-requirements.txt
index 12c2ebfd..19f1c2bb 100644
--- a/dev-requirements.txt
+++ b/dev-requirements.txt
@@ -1,15 +1,2 @@
---find-links https://github.com/mypyc/mypy_mypyc-wheels/releases/expanded_assets/v1.12.0+dev.b2deaaecf1a11e13bc962558992b5f2d5701f295
-flake8==5.0.4
-chardet==5.1.0
-isort==5.11.4
-codecov==2.1.13
-pytest-cov==4.1.0
-build>=0.10.0,<2
-wheel==0.42.0
-black==23.3.0
-mypy==1.11.2; python_version >= '3.8' and python_version < '3.13'
-mypy==1.12.0; python_version >= '3.13'
-mypy==1.4.1; python_version < '3.8'
-Flask==2.2.3
-pytest>=7.4.4,<=8.3.3
-requests==2.31.0
+coverage>=7.2.7,<7.7
+pytest>=7.4.4,<9
diff --git a/docs/index.rst b/docs/index.rst
index 19ca08a9..da8a9b6f 100755
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -20,7 +20,7 @@ It aims to be as generic as possible.
It is released under MIT license, see LICENSE for more
details. Be aware that no warranty of any kind is provided with this package.
-Copyright (C) 2023 Ahmed TAHRI
+Copyright (C) 2025 Ahmed TAHRI
Introduction
============
diff --git a/noxfile.py b/noxfile.py
new file mode 100644
index 00000000..b8f0c8c6
--- /dev/null
+++ b/noxfile.py
@@ -0,0 +1,232 @@
+from __future__ import annotations
+
+import os
+import shutil
+
+import nox
+
+
+def test_impl(
+ session: nox.Session,
+ use_mypyc: bool = False,
+):
+ # Install deps and the package itself.
+ session.install("-U", "pip", "setuptools", silent=False)
+ session.install("-r", "dev-requirements.txt", silent=False)
+
+ session.install(
+ ".",
+ silent=False,
+ env={"CHARSET_NORMALIZER_USE_MYPYC": "1" if use_mypyc else "0"},
+ )
+
+ # Show the pip version.
+ session.run("pip", "--version")
+    # Print the Python version.
+ session.run("python", "--version")
+ # Show charset-normalizer cli info
+ session.run("normalizer", "--version")
+
+ # Inspired from https://hynek.me/articles/ditch-codecov-python/
+ # We use parallel mode and then combine in a later CI step
+ session.run(
+ "python",
+ "-m",
+ "coverage",
+ "run",
+ "--parallel-mode",
+ "-m",
+ "pytest",
+ "-v",
+ "-ra",
+ f"--color={'yes' if 'GITHUB_ACTIONS' in os.environ else 'auto'}",
+ "--tb=native",
+ "--durations=10",
+ "--strict-config",
+ "--strict-markers",
+ *(session.posargs or ("tests/",)),
+ env={
+ "PYTHONWARNINGS": "always::DeprecationWarning",
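+            # COVERAGE_CORE=sysmon selects coverage.py's low-overhead sys.monitoring core on Python 3.12+;
+            # older interpreters fall back and emit the "no-sysmon" warning silenced in .coveragerc.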
+ "COVERAGE_CORE": "sysmon",
+ },
+ )
+
+
+@nox.session(python=["3.7", "3.8", "3.9", "3.10", "3.11", "3.12", "3.13", "pypy"])
+def test(session: nox.Session) -> None:
+ test_impl(session)
+
+
+@nox.session(python=["3.7", "3.8", "3.9", "3.10", "3.11", "3.12", "3.13"])
+def test_mypyc(session: nox.Session) -> None:
+ test_impl(session, True)
+
+
+def git_clone(session: nox.Session, git_url: str) -> None:
+    """Clone the target repository, or if it already exists,
+    reset its state and pull the latest changes.
+ """
+ expected_directory = git_url.split("/")[-1]
+
+ if expected_directory.endswith(".git"):
+ expected_directory = expected_directory[:-4]
+
+ if not os.path.isdir(expected_directory):
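+        # A shallow clone is enough; only the latest snapshot is needed.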
+ session.run("git", "clone", "--depth", "1", git_url, external=True)
+ else:
+ session.run(
+ "git", "-C", expected_directory, "reset", "--hard", "HEAD", external=True
+ )
+ session.run("git", "-C", expected_directory, "pull", external=True)
+
+
+@nox.session()
+def backward_compatibility(session: nox.Session) -> None:
+ git_clone(session, "https://github.com/ousret/char-dataset")
+
+ # Install deps and the package itself.
+ session.install("-U", "pip", "setuptools", silent=False)
+ session.install("-r", "dev-requirements.txt", silent=False)
+
+ session.install(".", silent=False)
+ session.install("chardet")
+
+ session.run(
+ "python",
+ "bin/bc.py",
+ *(session.posargs or ("--coverage=85",)),
+ )
+
+
+@nox.session()
+def coverage(session: nox.Session) -> None:
+ git_clone(session, "https://github.com/ousret/char-dataset")
+
+ # Install deps and the package itself.
+ session.install("-U", "pip", "setuptools", silent=False)
+ session.install("-r", "dev-requirements.txt", silent=False)
+
+ session.install(".", silent=False)
+
+ # Show the pip version.
+ session.run("pip", "--version")
+ # Print the Python version and bytesize.
+    # Print the Python version.
+ # Show charset-normalizer cli info
+ session.run("normalizer", "--version")
+
+ session.run(
+ "python",
+ "-m",
+ "coverage",
+ "run",
+ "--parallel-mode",
+ "bin/coverage.py",
+ *(session.posargs or ("--coverage=90", "--with-preemptive")),
+ )
+
+
+@nox.session()
+def performance(session: nox.Session) -> None:
+ git_clone(session, "https://github.com/ousret/char-dataset")
+
+ # Install deps and the package itself.
+ session.install("-U", "pip", "setuptools", silent=False)
+ session.install("-r", "dev-requirements.txt", silent=False)
+
+ session.install("chardet")
+ session.install(".", silent=False, env={"CHARSET_NORMALIZER_USE_MYPYC": "1"})
+
+ session.run(
+ "python",
+ "bin/performance.py",
+ *(session.posargs or ()),
+ )
+
+
+@nox.session()
+def downstream_niquests(session: nox.Session) -> None:
+ root = os.getcwd()
+ tmp_dir = session.create_tmp()
+
+ session.cd(tmp_dir)
+ git_clone(session, "https://github.com/jawah/niquests")
+ session.chdir("niquests")
+
+ session.run("git", "rev-parse", "HEAD", external=True)
+ session.install(".[socks]", silent=False)
+ session.install("-r", "requirements-dev.txt", silent=False)
+
+ session.cd(root)
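+    # Install the local charset_normalizer checkout on top so the downstream suite runs against this branch.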
+ session.install(".", silent=False)
+ session.cd(f"{tmp_dir}/niquests")
+
+ session.run(
+ "python",
+ "-c",
+ "import charset_normalizer; print(charset_normalizer.__version__)",
+ )
+ session.run(
+ "python",
+ "-m",
+ "pytest",
+ "-v",
+ f"--color={'yes' if 'GITHUB_ACTIONS' in os.environ else 'auto'}",
+ *(session.posargs or ("tests/",)),
+ env={"NIQUESTS_STRICT_OCSP": "1"},
+ )
+
+
+@nox.session()
+def downstream_requests(session: nox.Session) -> None:
+ root = os.getcwd()
+ tmp_dir = session.create_tmp()
+
+ session.cd(tmp_dir)
+ git_clone(session, "https://github.com/psf/requests")
+ session.chdir("requests")
+
+ session.run("git", "rev-parse", "HEAD", external=True)
+ session.install(".[socks]", silent=False)
+ session.install("-r", "requirements-dev.txt", silent=False)
+
+ session.cd(root)
+ session.install(".", silent=False)
+ session.cd(f"{tmp_dir}/requests")
+
+ session.run(
+ "python",
+ "-c",
+ "import charset_normalizer; print(charset_normalizer.__version__)",
+ )
+ session.run(
+ "python",
+ "-m",
+ "pytest",
+ "-v",
+ f"--color={'yes' if 'GITHUB_ACTIONS' in os.environ else 'auto'}",
+ *(session.posargs or ("tests/",)),
+ )
+
+
+@nox.session()
+def format(session: nox.Session) -> None:
+ """Run code formatters."""
+ lint(session)
+
+
+@nox.session
+def lint(session: nox.Session) -> None:
+ session.install("pre-commit")
+ session.run("pre-commit", "run", "--all-files")
+
+
+@nox.session
+def docs(session: nox.Session) -> None:
+ session.install("-r", "docs/requirements.txt")
+ session.install(".")
+
+ session.chdir("docs")
+ if os.path.exists("_build"):
+ shutil.rmtree("_build")
+ session.run("sphinx-build", "-b", "html", "-W", ".", "_build/html")
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 00000000..bbb82274
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,85 @@
+[build-system]
+requires = ["setuptools", "setuptools-scm", "mypy>=1.4.1,<=1.14.0"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "charset-normalizer"
+description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet."
+license = {text = "MIT"}
+keywords = ["encoding", "charset", "charset-detector", "detector", "normalization", "unicode", "chardet", "detect"]
+authors = [
+ {name = "Ahmed R. TAHRI", email="tahri.ahmed@proton.me"},
+]
+maintainers = [
+ {name = "Ahmed R. TAHRI", email="tahri.ahmed@proton.me"},
+]
+classifiers = [
+ "Development Status :: 5 - Production/Stable",
+ "Intended Audience :: Developers",
+ "License :: OSI Approved :: MIT License",
+ "Operating System :: OS Independent",
+ "Programming Language :: Python",
+ "Programming Language :: Python :: 3",
+ "Programming Language :: Python :: 3.7",
+ "Programming Language :: Python :: 3.8",
+ "Programming Language :: Python :: 3.9",
+ "Programming Language :: Python :: 3.10",
+ "Programming Language :: Python :: 3.11",
+ "Programming Language :: Python :: 3.12",
+ "Programming Language :: Python :: 3.13",
+ "Programming Language :: Python :: 3 :: Only",
+ "Programming Language :: Python :: Implementation :: CPython",
+ "Programming Language :: Python :: Implementation :: PyPy",
+ "Topic :: Text Processing :: Linguistic",
+ "Topic :: Utilities",
+ "Typing :: Typed",
+]
+requires-python = ">=3.7"
+dynamic = ["version", "readme"]
+
+[project.optional-dependencies]
+unicode_backport = []
+
+[tool.setuptools]
+package-dir = {"" = "src"}
+packages = ["charset_normalizer", "charset_normalizer.cli", ]
+
+[tool.setuptools.dynamic]
+version = {attr = "charset_normalizer.__version__"}
+readme = {file = ["README.md", "CHANGELOG.md", "LICENSE"], content-type = "text/markdown"}
+
+[project.scripts]
+normalizer = "charset_normalizer:cli.cli_detect"
+
+[project.urls]
+"Changelog" = "https://github.com/jawah/charset_normalizer/blob/master/CHANGELOG.md"
+"Documentation" = "https://charset-normalizer.readthedocs.io/"
+"Code" = "https://github.com/jawah/charset_normalizer"
+"Issue tracker" = "https://github.com/jawah/charset_normalizer/issues"
+
+[tool.pytest.ini_options]
+log_level = "DEBUG"
+filterwarnings = [
+ "error",
+]
+
+[tool.isort]
+profile = "black"
+add_imports = "from __future__ import annotations"
+
+[tool.mypy]
+check_untyped_defs = true
+disallow_any_generics = true
+disallow_incomplete_defs = true
+disallow_subclassing_any = true
+disallow_untyped_calls = true
+disallow_untyped_decorators = true
+disallow_untyped_defs = true
+no_implicit_optional = true
+no_implicit_reexport = true
+show_error_codes = true
+strict_equality = true
+warn_redundant_casts = true
+warn_return_any = true
+warn_unused_configs = true
+warn_unused_ignores = false
diff --git a/setup.cfg b/setup.cfg
deleted file mode 100644
index 3eb71fa8..00000000
--- a/setup.cfg
+++ /dev/null
@@ -1,71 +0,0 @@
-[metadata]
-name = charset-normalizer
-description = The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet.
-long_description = file: README.md, CHANGELOG.md, LICENSE
-long_description_content_type = text/markdown
-keywords = encoding, charset, charset-detector, detector, normalization, unicode, chardet, detect
-url = https://github.com/Ousret/charset_normalizer
-license = MIT
-author_email = tahri.ahmed@proton.me
-author = Ahmed TAHRI
-project_urls =
- Bug Reports = https://github.com/Ousret/charset_normalizer/issues
- Documentation = https://charset-normalizer.readthedocs.io/en/latest
-classifiers =
- Development Status :: 5 - Production/Stable
- License :: OSI Approved :: MIT License
- Intended Audience :: Developers
- Topic :: Software Development :: Libraries :: Python Modules
- Operating System :: OS Independent
- Programming Language :: Python
- Programming Language :: Python :: 3
- Programming Language :: Python :: 3.7
- Programming Language :: Python :: 3.8
- Programming Language :: Python :: 3.9
- Programming Language :: Python :: 3.10
- Programming Language :: Python :: 3.11
- Programming Language :: Python :: 3.12
- Programming Language :: Python :: 3.13
- Programming Language :: Python :: Implementation :: PyPy
- Topic :: Text Processing :: Linguistic
- Topic :: Utilities
- Typing :: Typed
-
-[options.packages.find]
-exclude =
- tests
- *.tests
- *.tests.*
- tests.*
- docs*
- data*
-
-[options.extras_require]
-unicode_backport =
-
-[options.entry_points]
-console_scripts =
- normalizer = charset_normalizer.cli:cli_detect
-
-[options]
-packages = find:
-include_package_data = True
-python_requires = >=3.7.0
-
-[options.package_data]
-charset_normalizer = py.typed
-
-[tool:pytest]
-addopts = --cov=charset_normalizer --cov-report=term-missing -rxXs
-
-[flake8]
-ignore = W503, E203, B305
-max-line-length = 120
-
-[mypy]
-disallow_untyped_defs = True
-ignore_missing_imports = True
-
-[tool:isort]
-profile = black
-combine_as_imports = True
diff --git a/setup.py b/setup.py
index 0ccc7e91..da2a69ff 100644
--- a/setup.py
+++ b/setup.py
@@ -1,38 +1,30 @@
#!/usr/bin/env python
-# -*- coding: utf-8 -*-
+from __future__ import annotations
import os
import sys
-from re import search
from setuptools import setup
-
-def get_version():
- with open('charset_normalizer/version.py') as version_file:
- return search(r"""__version__\s+=\s+(['"])(?P.+?)\1""",
- version_file.read()).group('version')
-
-
USE_MYPYC = False
if len(sys.argv) > 1 and sys.argv[1] == "--use-mypyc":
sys.argv.pop(1)
USE_MYPYC = True
-if os.getenv("CHARSET_NORMALIZER_USE_MYPYC", None) == "1":
+elif os.getenv("CHARSET_NORMALIZER_USE_MYPYC", None) == "1":
USE_MYPYC = True
if USE_MYPYC:
from mypyc.build import mypycify
- MYPYC_MODULES = mypycify([
- "charset_normalizer/md.py",
- ], debug_level="0")
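+    # Compile the md module with mypyc; opt_level "3" requests the highest standard C optimization level.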
+ MYPYC_MODULES = mypycify(
+ [
+ "src/charset_normalizer/md.py",
+ ],
+ debug_level="0",
+ opt_level="3",
+ )
else:
MYPYC_MODULES = None
-setup(
- name="charset-normalizer",
- version=get_version(),
- ext_modules=MYPYC_MODULES
-)
+setup(name="charset-normalizer", ext_modules=MYPYC_MODULES)
diff --git a/charset_normalizer/__init__.py b/src/charset_normalizer/__init__.py
similarity index 97%
rename from charset_normalizer/__init__.py
rename to src/charset_normalizer/__init__.py
index 55991fc3..0d3a3799 100644
--- a/charset_normalizer/__init__.py
+++ b/src/charset_normalizer/__init__.py
@@ -1,4 +1,3 @@
-# -*- coding: utf-8 -*-
"""
Charset-Normalizer
~~~~~~~~~~~~~~
@@ -19,6 +18,9 @@
:copyright: (c) 2021 by Ahmed TAHRI
:license: MIT, see LICENSE for more details.
"""
+
+from __future__ import annotations
+
import logging
from .api import from_bytes, from_fp, from_path, is_binary
diff --git a/charset_normalizer/__main__.py b/src/charset_normalizer/__main__.py
similarity index 66%
rename from charset_normalizer/__main__.py
rename to src/charset_normalizer/__main__.py
index beae2ef7..e0e76f7b 100644
--- a/charset_normalizer/__main__.py
+++ b/src/charset_normalizer/__main__.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
from .cli import cli_detect
if __name__ == "__main__":
diff --git a/charset_normalizer/api.py b/src/charset_normalizer/api.py
similarity index 93%
rename from charset_normalizer/api.py
rename to src/charset_normalizer/api.py
index e3f2283b..2c8c0618 100644
--- a/charset_normalizer/api.py
+++ b/src/charset_normalizer/api.py
@@ -1,6 +1,8 @@
+from __future__ import annotations
+
import logging
from os import PathLike
-from typing import BinaryIO, List, Optional, Set, Union
+from typing import BinaryIO
from .cd import (
coherence_ratio,
@@ -21,8 +23,6 @@
should_strip_sig_or_bom,
)
-# Will most likely be controversial
-# logging.addLevelName(TRACE, "TRACE")
logger = logging.getLogger("charset_normalizer")
explain_handler = logging.StreamHandler()
explain_handler.setFormatter(
@@ -31,12 +31,12 @@
def from_bytes(
- sequences: Union[bytes, bytearray],
+ sequences: bytes | bytearray,
steps: int = 5,
chunk_size: int = 512,
threshold: float = 0.2,
- cp_isolation: Optional[List[str]] = None,
- cp_exclusion: Optional[List[str]] = None,
+ cp_isolation: list[str] | None = None,
+ cp_exclusion: list[str] | None = None,
preemptive_behaviour: bool = True,
explain: bool = False,
language_threshold: float = 0.1,
@@ -62,7 +62,7 @@ def from_bytes(
if not isinstance(sequences, (bytearray, bytes)):
raise TypeError(
- "Expected object of type bytes or bytearray, got: {0}".format(
+ "Expected object of type bytes or bytearray, got: {}".format(
type(sequences)
)
)
@@ -76,7 +76,7 @@ def from_bytes(
if length == 0:
logger.debug("Encoding detection on empty bytes, assuming utf_8 intention.")
- if explain:
+ if explain: # Defensive: ensure exit path clean handler
logger.removeHandler(explain_handler)
logger.setLevel(previous_logger_level or logging.WARNING)
return CharsetMatches([CharsetMatch(sequences, "utf_8", 0.0, False, [], "")])
@@ -135,9 +135,9 @@ def from_bytes(
),
)
- prioritized_encodings: List[str] = []
+ prioritized_encodings: list[str] = []
- specified_encoding: Optional[str] = (
+ specified_encoding: str | None = (
any_specified_encoding(sequences) if preemptive_behaviour else None
)
@@ -149,13 +149,13 @@ def from_bytes(
specified_encoding,
)
- tested: Set[str] = set()
- tested_but_hard_failure: List[str] = []
- tested_but_soft_failure: List[str] = []
+ tested: set[str] = set()
+ tested_but_hard_failure: list[str] = []
+ tested_but_soft_failure: list[str] = []
- fallback_ascii: Optional[CharsetMatch] = None
- fallback_u8: Optional[CharsetMatch] = None
- fallback_specified: Optional[CharsetMatch] = None
+ fallback_ascii: CharsetMatch | None = None
+ fallback_u8: CharsetMatch | None = None
+ fallback_specified: CharsetMatch | None = None
results: CharsetMatches = CharsetMatches()
@@ -189,7 +189,7 @@ def from_bytes(
tested.add(encoding_iana)
- decoded_payload: Optional[str] = None
+ decoded_payload: str | None = None
bom_or_sig_available: bool = sig_encoding == encoding_iana
strip_sig_or_bom: bool = bom_or_sig_available and should_strip_sig_or_bom(
encoding_iana
@@ -292,7 +292,7 @@ def from_bytes(
early_stop_count: int = 0
lazy_str_hard_failure = False
- md_chunks: List[str] = []
+ md_chunks: list[str] = []
md_ratios = []
try:
@@ -397,7 +397,7 @@ def from_bytes(
)
if not is_multi_byte_decoder:
- target_languages: List[str] = encoding_languages(encoding_iana)
+ target_languages: list[str] = encoding_languages(encoding_iana)
else:
target_languages = mb_encoding_languages(encoding_iana)
@@ -462,7 +462,7 @@ def from_bytes(
"Encoding detection: %s is most likely the one.",
current_match.encoding,
)
- if explain:
+ if explain: # Defensive: ensure exit path clean handler
logger.removeHandler(explain_handler)
logger.setLevel(previous_logger_level)
return CharsetMatches([current_match])
@@ -480,7 +480,7 @@ def from_bytes(
"Encoding detection: %s is most likely the one.",
probable_result.encoding,
)
- if explain:
+ if explain: # Defensive: ensure exit path clean handler
logger.removeHandler(explain_handler)
logger.setLevel(previous_logger_level)
@@ -492,7 +492,7 @@ def from_bytes(
"the beginning of the sequence.",
encoding_iana,
)
- if explain:
+ if explain: # Defensive: ensure exit path clean handler
logger.removeHandler(explain_handler)
logger.setLevel(previous_logger_level)
return CharsetMatches([results[encoding_iana]])
@@ -546,8 +546,8 @@ def from_fp(
steps: int = 5,
chunk_size: int = 512,
threshold: float = 0.20,
- cp_isolation: Optional[List[str]] = None,
- cp_exclusion: Optional[List[str]] = None,
+ cp_isolation: list[str] | None = None,
+ cp_exclusion: list[str] | None = None,
preemptive_behaviour: bool = True,
explain: bool = False,
language_threshold: float = 0.1,
@@ -572,12 +572,12 @@ def from_fp(
def from_path(
- path: Union[str, bytes, PathLike], # type: ignore[type-arg]
+ path: str | bytes | PathLike, # type: ignore[type-arg]
steps: int = 5,
chunk_size: int = 512,
threshold: float = 0.20,
- cp_isolation: Optional[List[str]] = None,
- cp_exclusion: Optional[List[str]] = None,
+ cp_isolation: list[str] | None = None,
+ cp_exclusion: list[str] | None = None,
preemptive_behaviour: bool = True,
explain: bool = False,
language_threshold: float = 0.1,
@@ -603,12 +603,12 @@ def from_path(
def is_binary(
- fp_or_path_or_payload: Union[PathLike, str, BinaryIO, bytes], # type: ignore[type-arg]
+ fp_or_path_or_payload: PathLike | str | BinaryIO | bytes, # type: ignore[type-arg]
steps: int = 5,
chunk_size: int = 512,
threshold: float = 0.20,
- cp_isolation: Optional[List[str]] = None,
- cp_exclusion: Optional[List[str]] = None,
+ cp_isolation: list[str] | None = None,
+ cp_exclusion: list[str] | None = None,
preemptive_behaviour: bool = True,
explain: bool = False,
language_threshold: float = 0.1,
diff --git a/charset_normalizer/cd.py b/src/charset_normalizer/cd.py
similarity index 86%
rename from charset_normalizer/cd.py
rename to src/charset_normalizer/cd.py
index 4ea6760c..71a3ed51 100644
--- a/charset_normalizer/cd.py
+++ b/src/charset_normalizer/cd.py
@@ -1,8 +1,10 @@
+from __future__ import annotations
+
import importlib
from codecs import IncrementalDecoder
from collections import Counter
from functools import lru_cache
-from typing import Counter as TypeCounter, Dict, List, Optional, Tuple
+from typing import Counter as TypeCounter
from .constant import (
FREQUENCIES,
@@ -22,26 +24,24 @@
)
-def encoding_unicode_range(iana_name: str) -> List[str]:
+def encoding_unicode_range(iana_name: str) -> list[str]:
"""
Return associated unicode ranges in a single byte code page.
"""
if is_multi_byte_encoding(iana_name):
- raise IOError("Function not supported on multi-byte code page")
+ raise OSError("Function not supported on multi-byte code page")
- decoder = importlib.import_module(
- "encodings.{}".format(iana_name)
- ).IncrementalDecoder
+ decoder = importlib.import_module(f"encodings.{iana_name}").IncrementalDecoder
p: IncrementalDecoder = decoder(errors="ignore")
- seen_ranges: Dict[str, int] = {}
+ seen_ranges: dict[str, int] = {}
character_count: int = 0
for i in range(0x40, 0xFF):
chunk: str = p.decode(bytes([i]))
if chunk:
- character_range: Optional[str] = unicode_range(chunk)
+ character_range: str | None = unicode_range(chunk)
if character_range is None:
continue
@@ -61,11 +61,11 @@ def encoding_unicode_range(iana_name: str) -> List[str]:
)
-def unicode_range_languages(primary_range: str) -> List[str]:
+def unicode_range_languages(primary_range: str) -> list[str]:
"""
Return inferred languages used with a unicode range.
"""
- languages: List[str] = []
+ languages: list[str] = []
for language, characters in FREQUENCIES.items():
for character in characters:
@@ -77,13 +77,13 @@ def unicode_range_languages(primary_range: str) -> List[str]:
@lru_cache()
-def encoding_languages(iana_name: str) -> List[str]:
+def encoding_languages(iana_name: str) -> list[str]:
"""
Single-byte encoding language association. Some code page are heavily linked to particular language(s).
This function does the correspondence.
"""
- unicode_ranges: List[str] = encoding_unicode_range(iana_name)
- primary_range: Optional[str] = None
+ unicode_ranges: list[str] = encoding_unicode_range(iana_name)
+ primary_range: str | None = None
for specified_range in unicode_ranges:
if "Latin" not in specified_range:
@@ -97,7 +97,7 @@ def encoding_languages(iana_name: str) -> List[str]:
@lru_cache()
-def mb_encoding_languages(iana_name: str) -> List[str]:
+def mb_encoding_languages(iana_name: str) -> list[str]:
"""
Multi-byte encoding language association. Some code page are heavily linked to particular language(s).
This function does the correspondence.
@@ -118,7 +118,7 @@ def mb_encoding_languages(iana_name: str) -> List[str]:
@lru_cache(maxsize=LANGUAGE_SUPPORTED_COUNT)
-def get_target_features(language: str) -> Tuple[bool, bool]:
+def get_target_features(language: str) -> tuple[bool, bool]:
"""
Determine main aspects from a supported language if it contains accents and if is pure Latin.
"""
@@ -135,12 +135,12 @@ def get_target_features(language: str) -> Tuple[bool, bool]:
def alphabet_languages(
- characters: List[str], ignore_non_latin: bool = False
-) -> List[str]:
+ characters: list[str], ignore_non_latin: bool = False
+) -> list[str]:
"""
Return associated languages associated to given characters.
"""
- languages: List[Tuple[str, float]] = []
+ languages: list[tuple[str, float]] = []
source_have_accents = any(is_accentuated(character) for character in characters)
@@ -170,7 +170,7 @@ def alphabet_languages(
def characters_popularity_compare(
- language: str, ordered_characters: List[str]
+ language: str, ordered_characters: list[str]
) -> float:
"""
Determine if a ordered characters list (by occurrence from most appearance to rarest) match a particular language.
@@ -178,7 +178,7 @@ def characters_popularity_compare(
Beware that is function is not strict on the match in order to ease the detection. (Meaning close match is 1.)
"""
if language not in FREQUENCIES:
- raise ValueError("{} not available".format(language))
+ raise ValueError(f"{language} not available")
character_approved_count: int = 0
FREQUENCIES_language_set = set(FREQUENCIES[language])
@@ -214,14 +214,14 @@ def characters_popularity_compare(
character_approved_count += 1
continue
- characters_before_source: List[str] = FREQUENCIES[language][
+ characters_before_source: list[str] = FREQUENCIES[language][
0:character_rank_in_language
]
- characters_after_source: List[str] = FREQUENCIES[language][
+ characters_after_source: list[str] = FREQUENCIES[language][
character_rank_in_language:
]
- characters_before: List[str] = ordered_characters[0:character_rank]
- characters_after: List[str] = ordered_characters[character_rank:]
+ characters_before: list[str] = ordered_characters[0:character_rank]
+ characters_after: list[str] = ordered_characters[character_rank:]
before_match_count: int = len(
set(characters_before) & set(characters_before_source)
@@ -249,24 +249,24 @@ def characters_popularity_compare(
return character_approved_count / len(ordered_characters)
-def alpha_unicode_split(decoded_sequence: str) -> List[str]:
+def alpha_unicode_split(decoded_sequence: str) -> list[str]:
"""
Given a decoded text sequence, return a list of str. Unicode range / alphabet separation.
Ex. a text containing English/Latin with a bit a Hebrew will return two items in the resulting list;
One containing the latin letters and the other hebrew.
"""
- layers: Dict[str, str] = {}
+ layers: dict[str, str] = {}
for character in decoded_sequence:
if character.isalpha() is False:
continue
- character_range: Optional[str] = unicode_range(character)
+ character_range: str | None = unicode_range(character)
if character_range is None:
continue
- layer_target_range: Optional[str] = None
+ layer_target_range: str | None = None
for discovered_range in layers:
if (
@@ -288,12 +288,12 @@ def alpha_unicode_split(decoded_sequence: str) -> List[str]:
return list(layers.values())
-def merge_coherence_ratios(results: List[CoherenceMatches]) -> CoherenceMatches:
+def merge_coherence_ratios(results: list[CoherenceMatches]) -> CoherenceMatches:
"""
This function merge results previously given by the function coherence_ratio.
The return type is the same as coherence_ratio.
"""
- per_language_ratios: Dict[str, List[float]] = {}
+ per_language_ratios: dict[str, list[float]] = {}
for result in results:
for sub_result in result:
language, ratio = sub_result
@@ -321,7 +321,7 @@ def filter_alt_coherence_matches(results: CoherenceMatches) -> CoherenceMatches:
We shall NOT return "English—" in CoherenceMatches because it is an alternative
of "English". This function only keeps the best match and remove the em-dash in it.
"""
- index_results: Dict[str, List[float]] = dict()
+ index_results: dict[str, list[float]] = dict()
for result in results:
language, ratio = result
@@ -345,14 +345,14 @@ def filter_alt_coherence_matches(results: CoherenceMatches) -> CoherenceMatches:
@lru_cache(maxsize=2048)
def coherence_ratio(
- decoded_sequence: str, threshold: float = 0.1, lg_inclusion: Optional[str] = None
+ decoded_sequence: str, threshold: float = 0.1, lg_inclusion: str | None = None
) -> CoherenceMatches:
"""
Detect ANY language that can be identified in given sequence. The sequence will be analysed by layers.
A layer = Character extraction by alphabets/ranges.
"""
- results: List[Tuple[str, float]] = []
+ results: list[tuple[str, float]] = []
ignore_non_latin: bool = False
sufficient_match_count: int = 0
@@ -371,7 +371,7 @@ def coherence_ratio(
if character_count <= TOO_SMALL_SEQUENCE:
continue
- popular_character_ordered: List[str] = [c for c, o in most_common]
+ popular_character_ordered: list[str] = [c for c, o in most_common]
for language in lg_inclusion_list or alphabet_languages(
popular_character_ordered, ignore_non_latin
diff --git a/charset_normalizer/cli/__init__.py b/src/charset_normalizer/cli/__init__.py
similarity index 73%
rename from charset_normalizer/cli/__init__.py
rename to src/charset_normalizer/cli/__init__.py
index d95fedfe..543a5a4d 100644
--- a/charset_normalizer/cli/__init__.py
+++ b/src/charset_normalizer/cli/__init__.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
from .__main__ import cli_detect, query_yes_no
__all__ = (
diff --git a/charset_normalizer/cli/__main__.py b/src/charset_normalizer/cli/__main__.py
similarity index 97%
rename from charset_normalizer/cli/__main__.py
rename to src/charset_normalizer/cli/__main__.py
index e7edd0fc..64a290f2 100644
--- a/charset_normalizer/cli/__main__.py
+++ b/src/charset_normalizer/cli/__main__.py
@@ -1,9 +1,10 @@
+from __future__ import annotations
+
import argparse
import sys
from json import dumps
from os.path import abspath, basename, dirname, join, realpath
from platform import python_version
-from typing import List, Optional
from unicodedata import unidata_version
import charset_normalizer.md as md_module
@@ -45,7 +46,7 @@ def query_yes_no(question: str, default: str = "yes") -> bool:
sys.stdout.write("Please respond with 'yes' or 'no' " "(or 'y' or 'n').\n")
-def cli_detect(argv: Optional[List[str]] = None) -> int:
+def cli_detect(argv: list[str] | None = None) -> int:
"""
CLI assistant using ARGV and ArgumentParser
:param argv:
@@ -124,7 +125,7 @@ def cli_detect(argv: Optional[List[str]] = None) -> int:
default=0.2,
type=float,
dest="threshold",
- help="Define a custom maximum amount of chaos allowed in decoded content. 0. <= chaos <= 1.",
+ help="Define a custom maximum amount of noise allowed in decoded content. 0. <= noise <= 1.",
)
parser.add_argument(
"--version",
@@ -259,7 +260,7 @@ def cli_detect(argv: Optional[List[str]] = None) -> int:
dir_path = dirname(realpath(my_file.name))
file_name = basename(realpath(my_file.name))
- o_: List[str] = file_name.split(".")
+ o_: list[str] = file_name.split(".")
if args.replace is False:
o_.insert(-1, best_guess.encoding)
@@ -284,7 +285,7 @@ def cli_detect(argv: Optional[List[str]] = None) -> int:
with open(x_[0].unicode_path, "wb") as fp:
fp.write(best_guess.output())
- except IOError as e:
+ except OSError as e:
print(str(e), file=sys.stderr)
if my_file.closed is False:
my_file.close()
diff --git a/charset_normalizer/constant.py b/src/charset_normalizer/constant.py
similarity index 98%
rename from charset_normalizer/constant.py
rename to src/charset_normalizer/constant.py
index f8f2a811..1fb9508d 100644
--- a/charset_normalizer/constant.py
+++ b/src/charset_normalizer/constant.py
@@ -1,11 +1,12 @@
-# -*- coding: utf-8 -*-
+from __future__ import annotations
+
from codecs import BOM_UTF8, BOM_UTF16_BE, BOM_UTF16_LE, BOM_UTF32_BE, BOM_UTF32_LE
from encodings.aliases import aliases
-from re import IGNORECASE, compile as re_compile
-from typing import Dict, List, Set, Union
+from re import IGNORECASE
+from re import compile as re_compile
# Contain for each eligible encoding a list of/item bytes SIG/BOM
-ENCODING_MARKS: Dict[str, Union[bytes, List[bytes]]] = {
+ENCODING_MARKS: dict[str, bytes | list[bytes]] = {
"utf_8": BOM_UTF8,
"utf_7": [
b"\x2b\x2f\x76\x38",
@@ -25,7 +26,7 @@
UTF8_MAXIMAL_ALLOCATION: int = 1_112_064
# Up-to-date Unicode ucd/15.0.0
-UNICODE_RANGES_COMBINED: Dict[str, range] = {
+UNICODE_RANGES_COMBINED: dict[str, range] = {
"Control character": range(32),
"Basic Latin": range(32, 128),
"Latin-1 Supplement": range(128, 256),
@@ -357,7 +358,7 @@
}
-UNICODE_SECONDARY_RANGE_KEYWORD: List[str] = [
+UNICODE_SECONDARY_RANGE_KEYWORD: list[str] = [
"Supplement",
"Extended",
"Extensions",
@@ -392,7 +393,7 @@
"koi8_u",
]
-IANA_SUPPORTED: List[str] = sorted(
+IANA_SUPPORTED: list[str] = sorted(
filter(
lambda x: x.endswith("_codec") is False
and x not in {"rot_13", "tactis", "mbcs"},
@@ -403,7 +404,7 @@
IANA_SUPPORTED_COUNT: int = len(IANA_SUPPORTED)
# pre-computed code page that are similar using the function cp_similarity.
-IANA_SUPPORTED_SIMILAR: Dict[str, List[str]] = {
+IANA_SUPPORTED_SIMILAR: dict[str, list[str]] = {
"cp037": ["cp1026", "cp1140", "cp273", "cp500"],
"cp1026": ["cp037", "cp1140", "cp273", "cp500"],
"cp1125": ["cp866"],
@@ -492,7 +493,7 @@
}
-CHARDET_CORRESPONDENCE: Dict[str, str] = {
+CHARDET_CORRESPONDENCE: dict[str, str] = {
"iso2022_kr": "ISO-2022-KR",
"iso2022_jp": "ISO-2022-JP",
"euc_kr": "EUC-KR",
@@ -528,7 +529,7 @@
}
-COMMON_SAFE_ASCII_CHARACTERS: Set[str] = {
+COMMON_SAFE_ASCII_CHARACTERS: set[str] = {
"<",
">",
"=",
@@ -549,8 +550,8 @@
}
-KO_NAMES: Set[str] = {"johab", "cp949", "euc_kr"}
-ZH_NAMES: Set[str] = {"big5", "cp950", "big5hkscs", "hz"}
+KO_NAMES: set[str] = {"johab", "cp949", "euc_kr"}
+ZH_NAMES: set[str] = {"big5", "cp950", "big5hkscs", "hz"}
# Logging LEVEL below DEBUG
TRACE: int = 5
@@ -558,7 +559,7 @@
# Language label that contain the em dash "—"
# character are to be considered alternative seq to origin
-FREQUENCIES: Dict[str, List[str]] = {
+FREQUENCIES: dict[str, list[str]] = {
"English": [
"e",
"a",
diff --git a/charset_normalizer/legacy.py b/src/charset_normalizer/legacy.py
similarity index 90%
rename from charset_normalizer/legacy.py
rename to src/charset_normalizer/legacy.py
index 3f6d4907..a2f53451 100644
--- a/charset_normalizer/legacy.py
+++ b/src/charset_normalizer/legacy.py
@@ -1,6 +1,6 @@
from __future__ import annotations
-from typing import TYPE_CHECKING, Any, Optional
+from typing import TYPE_CHECKING, Any
from warnings import warn
from .api import from_bytes
@@ -11,9 +11,9 @@
from typing_extensions import TypedDict
class ResultDict(TypedDict):
- encoding: Optional[str]
+ encoding: str | None
language: str
- confidence: Optional[float]
+ confidence: float | None
def detect(
@@ -37,8 +37,9 @@ def detect(
if not isinstance(byte_str, (bytearray, bytes)):
raise TypeError( # pragma: nocover
- "Expected object of type bytes or bytearray, got: "
- "{0}".format(type(byte_str))
+ "Expected object of type bytes or bytearray, got: " "{}".format(
+ type(byte_str)
+ )
)
if isinstance(byte_str, bytearray):
diff --git a/charset_normalizer/md.py b/src/charset_normalizer/md.py
similarity index 94%
rename from charset_normalizer/md.py
rename to src/charset_normalizer/md.py
index d834db0e..9ed59a86 100644
--- a/charset_normalizer/md.py
+++ b/src/charset_normalizer/md.py
@@ -1,6 +1,7 @@
+from __future__ import annotations
+
from functools import lru_cache
from logging import getLogger
-from typing import List, Optional
from .constant import (
COMMON_SAFE_ASCII_CHARACTERS,
@@ -68,7 +69,7 @@ def __init__(self) -> None:
self._symbol_count: int = 0
self._character_count: int = 0
- self._last_printable_char: Optional[str] = None
+ self._last_printable_char: str | None = None
self._frenzy_symbol_in_word: bool = False
def eligible(self, character: str) -> bool:
@@ -92,7 +93,7 @@ def feed(self, character: str) -> None:
self._last_printable_char = character
- def reset(self) -> None: # pragma: no cover
+ def reset(self) -> None: # Abstract
self._punctuation_count = 0
self._character_count = 0
self._symbol_count = 0
@@ -123,7 +124,7 @@ def feed(self, character: str) -> None:
if is_accentuated(character):
self._accentuated_count += 1
- def reset(self) -> None: # pragma: no cover
+ def reset(self) -> None: # Abstract
self._character_count = 0
self._accentuated_count = 0
@@ -149,7 +150,7 @@ def feed(self, character: str) -> None:
self._unprintable_count += 1
self._character_count += 1
- def reset(self) -> None: # pragma: no cover
+ def reset(self) -> None: # Abstract
self._unprintable_count = 0
@property
@@ -165,7 +166,7 @@ def __init__(self) -> None:
self._successive_count: int = 0
self._character_count: int = 0
- self._last_latin_character: Optional[str] = None
+ self._last_latin_character: str | None = None
def eligible(self, character: str) -> bool:
return character.isalpha() and is_latin(character)
@@ -184,7 +185,7 @@ def feed(self, character: str) -> None:
self._successive_count += 1
self._last_latin_character = character
- def reset(self) -> None: # pragma: no cover
+ def reset(self) -> None: # Abstract
self._successive_count = 0
self._character_count = 0
self._last_latin_character = None
@@ -201,7 +202,7 @@ class SuspiciousRange(MessDetectorPlugin):
def __init__(self) -> None:
self._suspicious_successive_range_count: int = 0
self._character_count: int = 0
- self._last_printable_seen: Optional[str] = None
+ self._last_printable_seen: str | None = None
def eligible(self, character: str) -> bool:
return character.isprintable()
@@ -221,15 +222,15 @@ def feed(self, character: str) -> None:
self._last_printable_seen = character
return
- unicode_range_a: Optional[str] = unicode_range(self._last_printable_seen)
- unicode_range_b: Optional[str] = unicode_range(character)
+ unicode_range_a: str | None = unicode_range(self._last_printable_seen)
+ unicode_range_b: str | None = unicode_range(character)
if is_suspiciously_successive_range(unicode_range_a, unicode_range_b):
self._suspicious_successive_range_count += 1
self._last_printable_seen = character
- def reset(self) -> None: # pragma: no cover
+ def reset(self) -> None: # Abstract
self._character_count = 0
self._suspicious_successive_range_count = 0
self._last_printable_seen = None
@@ -346,7 +347,7 @@ def feed(self, character: str) -> None:
self._is_current_word_bad = True
self._buffer += character
- def reset(self) -> None: # pragma: no cover
+ def reset(self) -> None: # Abstract
self._buffer = ""
self._is_current_word_bad = False
self._foreign_long_watch = False
@@ -384,7 +385,7 @@ def feed(self, character: str) -> None:
if is_cjk(character):
self._cjk_character_count += 1
- def reset(self) -> None: # pragma: no cover
+ def reset(self) -> None: # Abstract
self._wrong_stop_count = 0
self._cjk_character_count = 0
@@ -406,7 +407,7 @@ def __init__(self) -> None:
self._character_count: int = 0
- self._last_alpha_seen: Optional[str] = None
+ self._last_alpha_seen: str | None = None
self._current_ascii_only: bool = True
def eligible(self, character: str) -> bool:
@@ -454,7 +455,7 @@ def feed(self, character: str) -> None:
self._character_count_since_last_sep += 1
self._last_alpha_seen = character
- def reset(self) -> None: # pragma: no cover
+ def reset(self) -> None: # Abstract
self._character_count = 0
self._character_count_since_last_sep = 0
self._successive_upper_lower_count = 0
@@ -476,7 +477,7 @@ def __init__(self) -> None:
self._character_count: int = 0
self._isolated_form_count: int = 0
- def reset(self) -> None: # pragma: no cover
+ def reset(self) -> None: # Abstract
self._character_count = 0
self._isolated_form_count = 0
@@ -501,7 +502,7 @@ def ratio(self) -> float:
@lru_cache(maxsize=1024)
def is_suspiciously_successive_range(
- unicode_range_a: Optional[str], unicode_range_b: Optional[str]
+ unicode_range_a: str | None, unicode_range_b: str | None
) -> bool:
"""
Determine if two Unicode range seen next to each other can be considered as suspicious.
@@ -525,9 +526,10 @@ def is_suspiciously_successive_range(
):
return False
- keywords_range_a, keywords_range_b = unicode_range_a.split(
- " "
- ), unicode_range_b.split(" ")
+ keywords_range_a, keywords_range_b = (
+ unicode_range_a.split(" "),
+ unicode_range_b.split(" "),
+ )
for el in keywords_range_a:
if el in UNICODE_SECONDARY_RANGE_KEYWORD:
@@ -580,7 +582,7 @@ def mess_ratio(
Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier.
"""
- detectors: List[MessDetectorPlugin] = [
+ detectors: list[MessDetectorPlugin] = [
md_class() for md_class in MessDetectorPlugin.__subclasses__()
]
@@ -622,7 +624,7 @@ def mess_ratio(
logger.log(TRACE, f"Starting with: {decoded_sequence[:16]}")
logger.log(TRACE, f"Ending with: {decoded_sequence[-16::]}")
- for dt in detectors: # pragma: nocover
+ for dt in detectors:
logger.log(TRACE, f"{dt.__class__}: {dt.ratio}")
return round(mean_mess_ratio, 3)
diff --git a/charset_normalizer/models.py b/src/charset_normalizer/models.py
similarity index 83%
rename from charset_normalizer/models.py
rename to src/charset_normalizer/models.py
index 6f6b86b3..1042758f 100644
--- a/charset_normalizer/models.py
+++ b/src/charset_normalizer/models.py
@@ -1,8 +1,10 @@
+from __future__ import annotations
+
from encodings.aliases import aliases
from hashlib import sha256
from json import dumps
from re import sub
-from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
+from typing import Any, Iterator, List, Tuple
from .constant import RE_POSSIBLE_ENCODING_INDICATION, TOO_BIG_SEQUENCE
from .utils import iana_name, is_multi_byte_encoding, unicode_range
@@ -15,9 +17,9 @@ def __init__(
guessed_encoding: str,
mean_mess_ratio: float,
has_sig_or_bom: bool,
- languages: "CoherenceMatches",
- decoded_payload: Optional[str] = None,
- preemptive_declaration: Optional[str] = None,
+ languages: CoherenceMatches,
+ decoded_payload: str | None = None,
+ preemptive_declaration: str | None = None,
):
self._payload: bytes = payload
@@ -25,17 +27,17 @@ def __init__(
self._mean_mess_ratio: float = mean_mess_ratio
self._languages: CoherenceMatches = languages
self._has_sig_or_bom: bool = has_sig_or_bom
- self._unicode_ranges: Optional[List[str]] = None
+ self._unicode_ranges: list[str] | None = None
- self._leaves: List[CharsetMatch] = []
+ self._leaves: list[CharsetMatch] = []
self._mean_coherence_ratio: float = 0.0
- self._output_payload: Optional[bytes] = None
- self._output_encoding: Optional[str] = None
+ self._output_payload: bytes | None = None
+ self._output_encoding: str | None = None
- self._string: Optional[str] = decoded_payload
+ self._string: str | None = decoded_payload
- self._preemptive_declaration: Optional[str] = preemptive_declaration
+ self._preemptive_declaration: str | None = preemptive_declaration
def __eq__(self, other: object) -> bool:
if not isinstance(other, CharsetMatch):
@@ -77,9 +79,9 @@ def __str__(self) -> str:
return self._string
def __repr__(self) -> str:
- return "".format(self.encoding, self.fingerprint)
+ return f""
- def add_submatch(self, other: "CharsetMatch") -> None:
+ def add_submatch(self, other: CharsetMatch) -> None:
if not isinstance(other, CharsetMatch) or other == self:
raise ValueError(
"Unable to add instance <{}> as a submatch of a CharsetMatch".format(
@@ -95,11 +97,11 @@ def encoding(self) -> str:
return self._encoding
@property
- def encoding_aliases(self) -> List[str]:
+ def encoding_aliases(self) -> list[str]:
"""
Encoding name are known by many name, using this could help when searching for IBM855 when it's listed as CP855.
"""
- also_known_as: List[str] = []
+ also_known_as: list[str] = []
for u, p in aliases.items():
if self.encoding == u:
also_known_as.append(p)
@@ -116,7 +118,7 @@ def byte_order_mark(self) -> bool:
return self._has_sig_or_bom
@property
- def languages(self) -> List[str]:
+ def languages(self) -> list[str]:
"""
Return the complete list of possible languages found in decoded sequence.
Usually not really useful. Returned list may be empty even if 'language' property return something != 'Unknown'.
@@ -177,7 +179,7 @@ def raw(self) -> bytes:
return self._payload
@property
- def submatch(self) -> List["CharsetMatch"]:
+ def submatch(self) -> list[CharsetMatch]:
return self._leaves
@property
@@ -185,19 +187,17 @@ def has_submatch(self) -> bool:
return len(self._leaves) > 0
@property
- def alphabets(self) -> List[str]:
+ def alphabets(self) -> list[str]:
if self._unicode_ranges is not None:
return self._unicode_ranges
# list detected ranges
- detected_ranges: List[Optional[str]] = [
- unicode_range(char) for char in str(self)
- ]
+ detected_ranges: list[str | None] = [unicode_range(char) for char in str(self)]
# filter and sort
self._unicode_ranges = sorted(list({r for r in detected_ranges if r}))
return self._unicode_ranges
@property
- def could_be_from_charset(self) -> List[str]:
+ def could_be_from_charset(self) -> list[str]:
"""
The complete list of encoding that output the exact SAME str result and therefore could be the originating
encoding.
@@ -221,10 +221,11 @@ def output(self, encoding: str = "utf_8") -> bytes:
patched_header = sub(
RE_POSSIBLE_ENCODING_INDICATION,
lambda m: m.string[m.span()[0] : m.span()[1]].replace(
- m.groups()[0], iana_name(self._output_encoding) # type: ignore[arg-type]
+ m.groups()[0],
+ iana_name(self._output_encoding).replace("_", "-"), # type: ignore[arg-type]
),
decoded_string[:8192],
- 1,
+ count=1,
)
decoded_string = patched_header + decoded_string[8192:]
@@ -247,13 +248,13 @@ class CharsetMatches:
Act like a list(iterable) but does not implements all related methods.
"""
- def __init__(self, results: Optional[List[CharsetMatch]] = None):
- self._results: List[CharsetMatch] = sorted(results) if results else []
+ def __init__(self, results: list[CharsetMatch] | None = None):
+ self._results: list[CharsetMatch] = sorted(results) if results else []
def __iter__(self) -> Iterator[CharsetMatch]:
yield from self._results
- def __getitem__(self, item: Union[int, str]) -> CharsetMatch:
+ def __getitem__(self, item: int | str) -> CharsetMatch:
"""
Retrieve a single item either by its position or encoding name (alias may be used here).
Raise KeyError upon invalid index or encoding not present in results.
@@ -293,7 +294,7 @@ def append(self, item: CharsetMatch) -> None:
self._results.append(item)
self._results = sorted(self._results)
- def best(self) -> Optional["CharsetMatch"]:
+ def best(self) -> CharsetMatch | None:
"""
Simply return the first match. Strict equivalent to matches[0].
"""
@@ -301,7 +302,7 @@ def best(self) -> Optional["CharsetMatch"]:
return None
return self._results[0]
- def first(self) -> Optional["CharsetMatch"]:
+ def first(self) -> CharsetMatch | None:
"""
Redundant method, call the method best(). Kept for BC reasons.
"""
@@ -316,31 +317,31 @@ class CliDetectionResult:
def __init__(
self,
path: str,
- encoding: Optional[str],
- encoding_aliases: List[str],
- alternative_encodings: List[str],
+ encoding: str | None,
+ encoding_aliases: list[str],
+ alternative_encodings: list[str],
language: str,
- alphabets: List[str],
+ alphabets: list[str],
has_sig_or_bom: bool,
chaos: float,
coherence: float,
- unicode_path: Optional[str],
+ unicode_path: str | None,
is_preferred: bool,
):
self.path: str = path
- self.unicode_path: Optional[str] = unicode_path
- self.encoding: Optional[str] = encoding
- self.encoding_aliases: List[str] = encoding_aliases
- self.alternative_encodings: List[str] = alternative_encodings
+ self.unicode_path: str | None = unicode_path
+ self.encoding: str | None = encoding
+ self.encoding_aliases: list[str] = encoding_aliases
+ self.alternative_encodings: list[str] = alternative_encodings
self.language: str = language
- self.alphabets: List[str] = alphabets
+ self.alphabets: list[str] = alphabets
self.has_sig_or_bom: bool = has_sig_or_bom
self.chaos: float = chaos
self.coherence: float = coherence
self.is_preferred: bool = is_preferred
@property
- def __dict__(self) -> Dict[str, Any]: # type: ignore
+ def __dict__(self) -> dict[str, Any]: # type: ignore
return {
"path": self.path,
"encoding": self.encoding,
diff --git a/charset_normalizer/py.typed b/src/charset_normalizer/py.typed
similarity index 100%
rename from charset_normalizer/py.typed
rename to src/charset_normalizer/py.typed
diff --git a/charset_normalizer/utils.py b/src/charset_normalizer/utils.py
similarity index 85%
rename from charset_normalizer/utils.py
rename to src/charset_normalizer/utils.py
index e5cbbf4c..0175e0a9 100644
--- a/charset_normalizer/utils.py
+++ b/src/charset_normalizer/utils.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
import importlib
import logging
import unicodedata
@@ -5,9 +7,11 @@
from encodings.aliases import aliases
from functools import lru_cache
from re import findall
-from typing import Generator, List, Optional, Set, Tuple, Union
+from typing import Generator
-from _multibytecodec import MultibyteIncrementalDecoder
+from _multibytecodec import ( # type: ignore[import-not-found,import]
+ MultibyteIncrementalDecoder,
+)
from .constant import (
ENCODING_MARKS,
@@ -23,7 +27,7 @@
def is_accentuated(character: str) -> bool:
try:
description: str = unicodedata.name(character)
- except ValueError:
+ except ValueError: # Defensive: unicode database outdated?
return False
return (
"WITH GRAVE" in description
@@ -43,13 +47,13 @@ def remove_accent(character: str) -> str:
if not decomposed:
return character
- codes: List[str] = decomposed.split(" ")
+ codes: list[str] = decomposed.split(" ")
return chr(int(codes[0], 16))
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
-def unicode_range(character: str) -> Optional[str]:
+def unicode_range(character: str) -> str | None:
"""
Retrieve the Unicode range official name from a single character.
"""
@@ -66,7 +70,7 @@ def unicode_range(character: str) -> Optional[str]:
def is_latin(character: str) -> bool:
try:
description: str = unicodedata.name(character)
- except ValueError:
+ except ValueError: # Defensive: unicode database outdated?
return False
return "LATIN" in description
@@ -78,7 +82,7 @@ def is_punctuation(character: str) -> bool:
if "P" in character_category:
return True
- character_range: Optional[str] = unicode_range(character)
+ character_range: str | None = unicode_range(character)
if character_range is None:
return False
@@ -93,7 +97,7 @@ def is_symbol(character: str) -> bool:
if "S" in character_category or "N" in character_category:
return True
- character_range: Optional[str] = unicode_range(character)
+ character_range: str | None = unicode_range(character)
if character_range is None:
return False
@@ -103,7 +107,7 @@ def is_symbol(character: str) -> bool:
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_emoticon(character: str) -> bool:
- character_range: Optional[str] = unicode_range(character)
+ character_range: str | None = unicode_range(character)
if character_range is None:
return False
@@ -130,7 +134,7 @@ def is_case_variable(character: str) -> bool:
def is_cjk(character: str) -> bool:
try:
character_name = unicodedata.name(character)
- except ValueError:
+ except ValueError: # Defensive: unicode database outdated?
return False
return "CJK" in character_name
@@ -140,7 +144,7 @@ def is_cjk(character: str) -> bool:
def is_hiragana(character: str) -> bool:
try:
character_name = unicodedata.name(character)
- except ValueError:
+ except ValueError: # Defensive: unicode database outdated?
return False
return "HIRAGANA" in character_name
@@ -150,7 +154,7 @@ def is_hiragana(character: str) -> bool:
def is_katakana(character: str) -> bool:
try:
character_name = unicodedata.name(character)
- except ValueError:
+ except ValueError: # Defensive: unicode database outdated?
return False
return "KATAKANA" in character_name
@@ -160,7 +164,7 @@ def is_katakana(character: str) -> bool:
def is_hangul(character: str) -> bool:
try:
character_name = unicodedata.name(character)
- except ValueError:
+ except ValueError: # Defensive: unicode database outdated?
return False
return "HANGUL" in character_name
@@ -170,7 +174,7 @@ def is_hangul(character: str) -> bool:
def is_thai(character: str) -> bool:
try:
character_name = unicodedata.name(character)
- except ValueError:
+ except ValueError: # Defensive: unicode database outdated?
return False
return "THAI" in character_name
@@ -180,7 +184,7 @@ def is_thai(character: str) -> bool:
def is_arabic(character: str) -> bool:
try:
character_name = unicodedata.name(character)
- except ValueError:
+ except ValueError: # Defensive: unicode database outdated?
return False
return "ARABIC" in character_name
@@ -190,7 +194,7 @@ def is_arabic(character: str) -> bool:
def is_arabic_isolated_form(character: str) -> bool:
try:
character_name = unicodedata.name(character)
- except ValueError:
+ except ValueError: # Defensive: unicode database outdated?
return False
return "ARABIC" in character_name and "ISOLATED FORM" in character_name
@@ -206,13 +210,13 @@ def is_unprintable(character: str) -> bool:
return (
character.isspace() is False # includes \n \t \r \v
and character.isprintable() is False
- and character != "\x1A" # Why? Its the ASCII substitute character.
+ and character != "\x1a" # Why? Its the ASCII substitute character.
and character != "\ufeff" # bug discovered in Python,
# Zero Width No-Break Space located in Arabic Presentation Forms-B, Unicode 1.1 not acknowledged as space.
)
-def any_specified_encoding(sequence: bytes, search_zone: int = 8192) -> Optional[str]:
+def any_specified_encoding(sequence: bytes, search_zone: int = 8192) -> str | None:
"""
Extract using ASCII-only decoder any specified encoding in the first n-bytes.
"""
@@ -221,7 +225,7 @@ def any_specified_encoding(sequence: bytes, search_zone: int = 8192) -> Optional
seq_len: int = len(sequence)
- results: List[str] = findall(
+ results: list[str] = findall(
RE_POSSIBLE_ENCODING_INDICATION,
sequence[: min(seq_len, search_zone)].decode("ascii", errors="ignore"),
)
@@ -260,18 +264,18 @@ def is_multi_byte_encoding(name: str) -> bool:
"utf_32_be",
"utf_7",
} or issubclass(
- importlib.import_module("encodings.{}".format(name)).IncrementalDecoder,
+ importlib.import_module(f"encodings.{name}").IncrementalDecoder,
MultibyteIncrementalDecoder,
)
-def identify_sig_or_bom(sequence: bytes) -> Tuple[Optional[str], bytes]:
+def identify_sig_or_bom(sequence: bytes) -> tuple[str | None, bytes]:
"""
Identify and extract SIG/BOM in given sequence.
"""
for iana_encoding in ENCODING_MARKS:
- marks: Union[bytes, List[bytes]] = ENCODING_MARKS[iana_encoding]
+ marks: bytes | list[bytes] = ENCODING_MARKS[iana_encoding]
if isinstance(marks, bytes):
marks = [marks]
@@ -288,6 +292,7 @@ def should_strip_sig_or_bom(iana_encoding: str) -> bool:
def iana_name(cp_name: str, strict: bool = True) -> str:
+ """Returns the Python normalized encoding name (Not the IANA official name)."""
cp_name = cp_name.lower().replace("-", "_")
encoding_alias: str
@@ -298,35 +303,17 @@ def iana_name(cp_name: str, strict: bool = True) -> str:
return encoding_iana
if strict:
- raise ValueError("Unable to retrieve IANA for '{}'".format(cp_name))
+ raise ValueError(f"Unable to retrieve IANA for '{cp_name}'")
return cp_name
-def range_scan(decoded_sequence: str) -> List[str]:
- ranges: Set[str] = set()
-
- for character in decoded_sequence:
- character_range: Optional[str] = unicode_range(character)
-
- if character_range is None:
- continue
-
- ranges.add(character_range)
-
- return list(ranges)
-
-
def cp_similarity(iana_name_a: str, iana_name_b: str) -> float:
if is_multi_byte_encoding(iana_name_a) or is_multi_byte_encoding(iana_name_b):
return 0.0
- decoder_a = importlib.import_module(
- "encodings.{}".format(iana_name_a)
- ).IncrementalDecoder
- decoder_b = importlib.import_module(
- "encodings.{}".format(iana_name_b)
- ).IncrementalDecoder
+ decoder_a = importlib.import_module(f"encodings.{iana_name_a}").IncrementalDecoder
+ decoder_b = importlib.import_module(f"encodings.{iana_name_b}").IncrementalDecoder
id_a: IncrementalDecoder = decoder_a(errors="ignore")
id_b: IncrementalDecoder = decoder_b(errors="ignore")
@@ -374,7 +361,7 @@ def cut_sequence_chunks(
strip_sig_or_bom: bool,
sig_payload: bytes,
is_multi_byte_decoder: bool,
- decoded_payload: Optional[str] = None,
+ decoded_payload: str | None = None,
) -> Generator[str, None, None]:
if decoded_payload and is_multi_byte_decoder is False:
for i in offsets:
diff --git a/src/charset_normalizer/version.py b/src/charset_normalizer/version.py
new file mode 100644
index 00000000..f85e8929
--- /dev/null
+++ b/src/charset_normalizer/version.py
@@ -0,0 +1,8 @@
+"""
+Expose version
+"""
+
+from __future__ import annotations
+
+__version__ = "3.4.1"
+VERSION = __version__.split(".")
diff --git a/tests/__init__.py b/tests/__init__.py
index 8b137891..e69de29b 100644
--- a/tests/__init__.py
+++ b/tests/__init__.py
@@ -1 +0,0 @@
-
diff --git a/tests/test_base_detection.py b/tests/test_base_detection.py
index e5d774d3..e4fb5fd3 100644
--- a/tests/test_base_detection.py
+++ b/tests/test_base_detection.py
@@ -1,40 +1,52 @@
-from charset_normalizer.api import from_bytes
-from charset_normalizer.models import CharsetMatches
+from __future__ import annotations
import pytest
+from charset_normalizer.api import from_bytes
+from charset_normalizer.models import CharsetMatches
+
def test_empty():
- best_guess = from_bytes(b'').best()
+ best_guess = from_bytes(b"").best()
assert best_guess is not None, "Empty bytes payload SHOULD NOT return None"
- assert best_guess.encoding == "utf_8", "Empty bytes payload SHOULD be guessed as UTF-8 (arbitrary)"
+ assert (
+ best_guess.encoding == "utf_8"
+ ), "Empty bytes payload SHOULD be guessed as UTF-8 (arbitrary)"
assert len(best_guess.alphabets) == 0, ""
def test_bool_matches():
- guesses_not_empty = from_bytes(b'')
+ guesses_not_empty = from_bytes(b"")
guesses_empty = CharsetMatches([])
- assert bool(guesses_not_empty) is True, "Bool behaviour of CharsetMatches altered, should be True"
- assert bool(guesses_empty) is False, "Bool behaviour of CharsetMatches altered, should be False"
+ assert (
+ bool(guesses_not_empty) is True
+ ), "Bool behaviour of CharsetMatches altered, should be True"
+ assert (
+ bool(guesses_empty) is False
+ ), "Bool behaviour of CharsetMatches altered, should be False"
@pytest.mark.parametrize(
"payload, expected_encoding",
[
- (b'\xfe\xff', 'utf_16'),
- ('\uFEFF'.encode('gb18030'), 'gb18030'),
- (b'\xef\xbb\xbf', 'utf_8'),
- ("".encode('utf_32'), "utf_32")
- ]
+ (b"\xfe\xff", "utf_16"),
+ ("\uFEFF".encode("gb18030"), "gb18030"),
+ (b"\xef\xbb\xbf", "utf_8"),
+ ("".encode("utf_32"), "utf_32"),
+ ],
)
def test_empty_but_with_bom_or_sig(payload, expected_encoding):
best_guess = from_bytes(payload).best()
assert best_guess is not None, "Empty detection but with SIG/BOM has failed!"
- assert best_guess.encoding == expected_encoding, "Empty detection but with SIG/BOM is wrongly detected!"
- assert best_guess.raw == payload, "The RAW property should contain the original payload given for detection."
+ assert (
+ best_guess.encoding == expected_encoding
+ ), "Empty detection but with SIG/BOM is wrongly detected!"
+ assert (
+ best_guess.raw == payload
+ ), "The RAW property should contain the original payload given for detection."
assert best_guess.byte_order_mark is True, "The BOM/SIG property should return True"
assert str(best_guess) == "", "The cast to str SHOULD be empty"
@@ -42,16 +54,27 @@ def test_empty_but_with_bom_or_sig(payload, expected_encoding):
@pytest.mark.parametrize(
"payload, expected_encoding",
[
- ((u'\uFEFF' + '我没有埋怨,磋砣的只是一些时间。').encode('gb18030'), "gb18030",),
- ('我没有埋怨,磋砣的只是一些时间。'.encode('utf_32'), "utf_32",),
- ('我没有埋怨,磋砣的只是一些时间。'.encode('utf_8_sig'), "utf_8",),
- ]
+ (
+ ("\uFEFF" + "我没有埋怨,磋砣的只是一些时间。").encode("gb18030"),
+ "gb18030",
+ ),
+ (
+ "我没有埋怨,磋砣的只是一些时间。".encode("utf_32"),
+ "utf_32",
+ ),
+ (
+ "我没有埋怨,磋砣的只是一些时间。".encode("utf_8_sig"),
+ "utf_8",
+ ),
+ ],
)
def test_content_with_bom_or_sig(payload, expected_encoding):
best_guess = from_bytes(payload).best()
assert best_guess is not None, "Detection but with SIG/BOM has failed!"
- assert best_guess.encoding == expected_encoding, "Detection but with SIG/BOM is wrongly detected!"
+ assert (
+ best_guess.encoding == expected_encoding
+ ), "Detection but with SIG/BOM is wrongly detected!"
assert best_guess.byte_order_mark is True, "The BOM/SIG property should return True"
@@ -63,42 +86,49 @@ def test_content_with_bom_or_sig(payload, expected_encoding):
b'{"token": "g4UsPJdfzNkGW2jwmKDGDilKGKYtpF2X.mx3MaTWL1tL7CNn5U7DeCcodKX7S3lwwJPKNjBT8etY"}',
b"81f4ab054b39cb0e12701e734077d84264308f5fc79494fc5f159fa2ebc07b73c8cc0e98e009664a20986706f90146e8eefcb929ce1f74a8eab21369fdc70198",
b"{}",
- ]
+ ],
)
def test_obviously_ascii_content(payload):
best_guess = from_bytes(payload).best()
assert best_guess is not None, "Dead-simple ASCII detection has failed!"
- assert best_guess.encoding == "ascii", "Dead-simple ASCII detection is wrongly detected!"
+ assert (
+ best_guess.encoding == "ascii"
+ ), "Dead-simple ASCII detection is wrongly detected!"
@pytest.mark.parametrize(
"payload",
[
- '\u020d\x1b'.encode('utf-8'),
- 'h\xe9llo world!\n'.encode('utf_8'),
- '我没有埋怨,磋砣的只是一些时间。'.encode('utf_8'),
- 'Bсеки човек има право на образование. Oбразованието трябва да бъде безплатно, поне що се отнася до началното и основното образование.'.encode('utf_8'),
- 'Bсеки човек има право на образование.'.encode('utf_8'),
- "(° ͜ʖ °), creepy face, smiley 😀".encode("utf_8"),
- """["Financiën", "La France"]""".encode("utf_8"),
- "Qu'est ce que une étoile?".encode("utf_8"),
- """Financiën""".encode("utf_8"),
- "😀".encode("utf_8")
- ]
+ "\u020d\x1b".encode(),
+ "h\xe9llo world!\n".encode(),
+ "我没有埋怨,磋砣的只是一些时间。".encode(),
+ "Bсеки човек има право на образование. Oбразованието трябва да бъде безплатно, поне що се отнася до началното и основното образование.".encode(),
+ "Bсеки човек има право на образование.".encode(),
+ "(° ͜ʖ °), creepy face, smiley 😀".encode(),
+ """["Financiën", "La France"]""".encode(),
+ "Qu'est ce que une étoile?".encode(),
+ """Financiën""".encode(),
+ "😀".encode(),
+ ],
)
def test_obviously_utf8_content(payload):
best_guess = from_bytes(payload).best()
assert best_guess is not None, "Dead-simple UTF-8 detection has failed!"
- assert best_guess.encoding == "utf_8", "Dead-simple UTF-8 detection is wrongly detected!"
+ assert (
+ best_guess.encoding == "utf_8"
+ ), "Dead-simple UTF-8 detection is wrongly detected!"
def test_mb_cutting_chk():
# This payload should be wrongfully split and the autofix should ran automatically
# on chunks extraction.
- payload = b"\xbf\xaa\xbb\xe7\xc0\xfb \xbf\xb9\xbc\xf6 " \
- b" \xbf\xac\xb1\xb8\xc0\xda\xb5\xe9\xc0\xba \xba\xb9\xc0\xbd\xbc\xad\xb3\xaa " * 128
+ payload = (
+ b"\xbf\xaa\xbb\xe7\xc0\xfb \xbf\xb9\xbc\xf6 "
+ b" \xbf\xac\xb1\xb8\xc0\xda\xb5\xe9\xc0\xba \xba\xb9\xc0\xbd\xbc\xad\xb3\xaa "
+ * 128
+ )
guesses = from_bytes(payload, cp_isolation=["cp949"])
best_guess = guesses.best()
@@ -108,9 +138,7 @@ def test_mb_cutting_chk():
def test_alphabets_property():
- best_guess = from_bytes(
- "😀 Hello World! How affairs are going? 😀".encode("utf_8")
- ).best()
+ best_guess = from_bytes("😀 Hello World! How affairs are going? 😀".encode()).best()
assert "Basic Latin" in best_guess.alphabets
assert "Emoticons range(Emoji)" in best_guess.alphabets
@@ -119,16 +147,14 @@ def test_alphabets_property():
def test_doc_example_short_cp1251():
best_guess = from_bytes(
- 'Bсеки човек има право на образование.'.encode('cp1251')
+ "Bсеки човек има право на образование.".encode("cp1251")
).best()
assert best_guess.encoding == "cp1251"
def test_direct_cmp_charset_match():
- best_guess = from_bytes(
- "😀 Hello World! How affairs are going? 😀".encode("utf_8")
- ).best()
+ best_guess = from_bytes("😀 Hello World! How affairs are going? 😀".encode()).best()
assert best_guess == "utf_8"
assert best_guess == "utf-8"
diff --git a/tests/test_cli.py b/tests/test_cli.py
index b73fb613..5f2777c4 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -1,70 +1,46 @@
+from __future__ import annotations
+
import unittest
-from charset_normalizer.cli import cli_detect, query_yes_no
-from unittest.mock import patch
+from os import pardir, path, remove
from os.path import exists
-from os import remove, path, pardir
+from unittest.mock import patch
+
+from charset_normalizer.cli import cli_detect, query_yes_no
-DIR_PATH = path.join(
- path.dirname(path.realpath(__file__)),
- pardir
-)
+DIR_PATH = path.join(path.dirname(path.realpath(__file__)), pardir)
class TestCommandLineInterface(unittest.TestCase):
-
- @patch('builtins.input', lambda *args: 'y')
+ @patch("builtins.input", lambda *args: "y")
def test_simple_yes_input(self):
- self.assertTrue(
- query_yes_no('Are u willing to chill a little bit ?')
- )
+ self.assertTrue(query_yes_no("Are u willing to chill a little bit ?"))
- @patch('builtins.input', lambda *args: 'N')
+ @patch("builtins.input", lambda *args: "N")
def test_simple_no_input(self):
- self.assertFalse(
- query_yes_no('Are u willing to chill a little bit ?')
- )
+ self.assertFalse(query_yes_no("Are u willing to chill a little bit ?"))
def test_single_file(self):
-
- self.assertEqual(
- 0,
- cli_detect(
- [DIR_PATH + '/data/sample-arabic-1.txt']
- )
- )
+ self.assertEqual(0, cli_detect([DIR_PATH + "/data/sample-arabic-1.txt"]))
def test_version_output_success(self):
with self.assertRaises(SystemExit):
- cli_detect(
- ['--version']
- )
+ cli_detect(["--version"])
def test_single_file_normalize(self):
self.assertEqual(
- 0,
- cli_detect(
- [
- DIR_PATH + '/data/sample-arabic-1.txt',
- '--normalize'
- ]
- )
+ 0, cli_detect([DIR_PATH + "/data/sample-arabic-1.txt", "--normalize"])
)
- self.assertTrue(
- exists(DIR_PATH + '/data/sample-arabic-1.cp1256.txt')
- )
+ self.assertTrue(exists(DIR_PATH + "/data/sample-arabic-1.cp1256.txt"))
try:
- remove(DIR_PATH + '/data/sample-arabic-1.cp1256.txt')
+ remove(DIR_PATH + "/data/sample-arabic-1.cp1256.txt")
except:
pass
def test_single_verbose_file(self):
self.assertEqual(
- 0,
- cli_detect(
- [DIR_PATH + '/data/sample-arabic-1.txt', '--verbose']
- )
+ 0, cli_detect([DIR_PATH + "/data/sample-arabic-1.txt", "--verbose"])
)
def test_multiple_file(self):
@@ -72,11 +48,11 @@ def test_multiple_file(self):
0,
cli_detect(
[
- DIR_PATH + '/data/sample-arabic-1.txt',
- DIR_PATH + '/data/sample-french.txt',
- DIR_PATH + '/data/sample-chinese.txt'
+ DIR_PATH + "/data/sample-arabic-1.txt",
+ DIR_PATH + "/data/sample-french.txt",
+ DIR_PATH + "/data/sample-chinese.txt",
]
- )
+ ),
)
def test_with_alternative(self):
@@ -84,12 +60,12 @@ def test_with_alternative(self):
0,
cli_detect(
[
- '-a',
- DIR_PATH + '/data/sample-arabic-1.txt',
- DIR_PATH + '/data/sample-french.txt',
- DIR_PATH + '/data/sample-chinese.txt'
+ "-a",
+ DIR_PATH + "/data/sample-arabic-1.txt",
+ DIR_PATH + "/data/sample-french.txt",
+ DIR_PATH + "/data/sample-chinese.txt",
]
- )
+ ),
)
def test_with_minimal_output(self):
@@ -97,12 +73,12 @@ def test_with_minimal_output(self):
0,
cli_detect(
[
- '-m',
- DIR_PATH + '/data/sample-arabic-1.txt',
- DIR_PATH + '/data/sample-french.txt',
- DIR_PATH + '/data/sample-chinese.txt'
+ "-m",
+ DIR_PATH + "/data/sample-arabic-1.txt",
+ DIR_PATH + "/data/sample-french.txt",
+ DIR_PATH + "/data/sample-chinese.txt",
]
- )
+ ),
)
def test_with_minimal_and_alt(self):
@@ -110,47 +86,31 @@ def test_with_minimal_and_alt(self):
0,
cli_detect(
[
- '-m',
- '-a',
- DIR_PATH + '/data/sample-arabic-1.txt',
- DIR_PATH + '/data/sample-french.txt',
- DIR_PATH + '/data/sample-chinese.txt'
+ "-m",
+ "-a",
+ DIR_PATH + "/data/sample-arabic-1.txt",
+ DIR_PATH + "/data/sample-french.txt",
+ DIR_PATH + "/data/sample-chinese.txt",
]
- )
+ ),
)
def test_non_existent_file(self):
-
with self.assertRaises(SystemExit) as cm:
- cli_detect(
- [DIR_PATH + '/data/not_found_data.txt']
- )
+ cli_detect([DIR_PATH + "/data/not_found_data.txt"])
self.assertEqual(cm.exception.code, 2)
def test_replace_without_normalize(self):
-
self.assertEqual(
- cli_detect(
- [
- DIR_PATH + '/data/sample-arabic-1.txt',
- '--replace'
- ]
- ),
- 1
+ cli_detect([DIR_PATH + "/data/sample-arabic-1.txt", "--replace"]), 1
)
def test_force_replace_without_replace(self):
self.assertEqual(
- cli_detect(
- [
- DIR_PATH + '/data/sample-arabic-1.txt',
- '--force'
- ]
- ),
- 1
+ cli_detect([DIR_PATH + "/data/sample-arabic-1.txt", "--force"]), 1
)
-if __name__ == '__main__':
+if __name__ == "__main__":
unittest.main()
diff --git a/tests/test_coherence_detection.py b/tests/test_coherence_detection.py
index 7e399132..e5952d6c 100644
--- a/tests/test_coherence_detection.py
+++ b/tests/test_coherence_detection.py
@@ -1,5 +1,14 @@
+from __future__ import annotations
+
import pytest
-from charset_normalizer.cd import encoding_languages, mb_encoding_languages, is_multi_byte_encoding, get_target_features, filter_alt_coherence_matches
+
+from charset_normalizer.cd import (
+ encoding_languages,
+ filter_alt_coherence_matches,
+ get_target_features,
+ is_multi_byte_encoding,
+ mb_encoding_languages,
+)
@pytest.mark.parametrize(
@@ -13,14 +22,20 @@
("johab", ["Korean"]),
("shift_jis", ["Japanese"]),
("mac_greek", ["Greek"]),
- ("iso2022_jp", ["Japanese"])
- ]
+ ("iso2022_jp", ["Japanese"]),
+ ],
)
def test_infer_language_from_cp(iana_encoding, expected_languages):
- languages = mb_encoding_languages(iana_encoding) if is_multi_byte_encoding(iana_encoding) else encoding_languages(iana_encoding)
+ languages = (
+ mb_encoding_languages(iana_encoding)
+ if is_multi_byte_encoding(iana_encoding)
+ else encoding_languages(iana_encoding)
+ )
for expected_language in expected_languages:
- assert expected_language in languages, "Wrongly detected language for given code page"
+ assert (
+ expected_language in languages
+ ), "Wrongly detected language for given code page"
@pytest.mark.parametrize(
@@ -31,8 +46,8 @@ def test_infer_language_from_cp(iana_encoding, expected_languages):
("Hebrew", False, False),
("Arabic", False, False),
("Vietnamese", True, True),
- ("Turkish", True, True)
- ]
+ ("Turkish", True, True),
+ ],
)
def test_target_features(language, expected_have_accents, expected_pure_latin):
target_have_accents, target_pure_latin = get_target_features(language)
@@ -44,11 +59,48 @@ def test_target_features(language, expected_have_accents, expected_pure_latin):
@pytest.mark.parametrize(
"matches, expected_return",
[
- ([("English", 0.88,), ("English—", 0.99)], [("English", 0.99)]),
- ([("English", 0.88,), ("English—", 0.99), ("English——", 0.999)], [("English", 0.999)]),
- ([("English", 0.88,), ("English—", 0.77)], [("English", 0.88)]),
- ([("English", 0.88,), ("Italian", 0.77)], [("English", 0.88), ("Italian", 0.77)]),
- ]
+ (
+ [
+ (
+ "English",
+ 0.88,
+ ),
+ ("English—", 0.99),
+ ],
+ [("English", 0.99)],
+ ),
+ (
+ [
+ (
+ "English",
+ 0.88,
+ ),
+ ("English—", 0.99),
+ ("English——", 0.999),
+ ],
+ [("English", 0.999)],
+ ),
+ (
+ [
+ (
+ "English",
+ 0.88,
+ ),
+ ("English—", 0.77),
+ ],
+ [("English", 0.88)],
+ ),
+ (
+ [
+ (
+ "English",
+ 0.88,
+ ),
+ ("Italian", 0.77),
+ ],
+ [("English", 0.88), ("Italian", 0.77)],
+ ),
+ ],
)
def test_filter_alt_coherence_matches(matches, expected_return):
results = filter_alt_coherence_matches(matches)
diff --git a/tests/test_detect_legacy.py b/tests/test_detect_legacy.py
index ec45aa77..bd2b0351 100644
--- a/tests/test_detect_legacy.py
+++ b/tests/test_detect_legacy.py
@@ -1,75 +1,43 @@
+from __future__ import annotations
+
import unittest
+
from charset_normalizer.legacy import detect
class TestDetectLegacy(unittest.TestCase):
-
def test_detect_dict_keys(self):
+ r = detect(("\uFEFF" + "我没有埋怨,磋砣的只是一些时间。").encode("gb18030"))
- r = detect(
- (u'\uFEFF' + '我没有埋怨,磋砣的只是一些时间。').encode('gb18030')
- )
+ with self.subTest("encoding key present"):
+ self.assertIn("encoding", r.keys())
- with self.subTest('encoding key present'):
- self.assertIn(
- 'encoding',
- r.keys()
- )
+ with self.subTest("language key present"):
+ self.assertIn("language", r.keys())
- with self.subTest('language key present'):
- self.assertIn(
- 'language',
- r.keys()
- )
-
- with self.subTest('confidence key present'):
- self.assertIn(
- 'confidence',
- r.keys()
- )
+ with self.subTest("confidence key present"):
+ self.assertIn("confidence", r.keys())
def test_detect_dict_value_type(self):
+ r = detect("我没有埋怨,磋砣的只是一些时间。".encode())
- r = detect(
- '我没有埋怨,磋砣的只是一些时间。'.encode('utf_8')
- )
-
- with self.subTest('encoding instance of str'):
- self.assertIsInstance(
- r['encoding'],
- str
- )
+ with self.subTest("encoding instance of str"):
+ self.assertIsInstance(r["encoding"], str)
- with self.subTest('language instance of str'):
- self.assertIsInstance(
- r['language'],
- str
- )
+ with self.subTest("language instance of str"):
+ self.assertIsInstance(r["language"], str)
- with self.subTest('confidence instance of float'):
- self.assertIsInstance(
- r['confidence'],
- float
- )
+ with self.subTest("confidence instance of float"):
+ self.assertIsInstance(r["confidence"], float)
def test_detect_dict_value(self):
- r = detect(
- '我没有埋怨,磋砣的只是一些时间。'.encode('utf_32')
- )
+ r = detect("我没有埋怨,磋砣的只是一些时间。".encode("utf_32"))
- with self.subTest('encoding is equal to utf_32'):
- self.assertEqual(
- r['encoding'],
- 'UTF-32'
- )
+ with self.subTest("encoding is equal to utf_32"):
+ self.assertEqual(r["encoding"], "UTF-32")
def test_utf8_sig_not_striped(self):
- r = detect(
- "Hello World".encode('utf-8-sig')
- )
+ r = detect("Hello World".encode("utf-8-sig"))
with self.subTest("Verify that UTF-8-SIG is returned when using legacy detect"):
- self.assertEqual(
- r['encoding'],
- "UTF-8-SIG"
- )
+ self.assertEqual(r["encoding"], "UTF-8-SIG")
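Aside: the chardet-compatible shim exercised by TestDetectLegacy can be called directly. The sketch below assumes nothing beyond what the assertions above check: a dict exposing encoding, language and confidence keys, with the BOM reported as UTF-8-SIG.

# Sketch of the legacy detect() helper, relying only on behaviour asserted above.
from charset_normalizer.legacy import detect

result = detect("Hello World".encode("utf-8-sig"))
print(result["encoding"])                     # the test expects "UTF-8-SIG"
print(result["language"], result["confidence"])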
diff --git a/tests/test_edge_case.py b/tests/test_edge_case.py
index 6caa1c48..5b763ba2 100644
--- a/tests/test_edge_case.py
+++ b/tests/test_edge_case.py
@@ -1,14 +1,25 @@
-from charset_normalizer import from_bytes
-import pytest
+from __future__ import annotations
+
import platform
-@pytest.mark.xfail(platform.python_version_tuple()[0] == "3" and platform.python_version_tuple()[1] == "7", reason="Unicode database is too old for this case (Python 3.7)")
+import pytest
+
+from charset_normalizer import from_bytes
+
+
+@pytest.mark.xfail(
+ platform.python_version_tuple()[0] == "3"
+ and platform.python_version_tuple()[1] == "7",
+ reason="Unicode database is too old for this case (Python 3.7)",
+)
def test_unicode_edge_case():
- payload = b'\xef\xbb\xbf\xf0\x9f\xa9\xb3'
+ payload = b"\xef\xbb\xbf\xf0\x9f\xa9\xb3"
best_guess = from_bytes(payload).best()
- assert best_guess is not None, "Payload should have given something, detection failure"
+ assert (
+ best_guess is not None
+ ), "Payload should have given something, detection failure"
assert best_guess.encoding == "utf_8", "UTF-8 payload wrongly detected"
@@ -18,7 +29,9 @@ def test_issue_gh520():
best_guess = from_bytes(payload).best()
- assert best_guess is not None, "Payload should have given something, detection failure"
+ assert (
+ best_guess is not None
+ ), "Payload should have given something, detection failure"
assert "Basic Latin" in best_guess.alphabets
@@ -28,15 +41,19 @@ def test_issue_gh509():
best_guess = from_bytes(payload).best()
- assert best_guess is not None, "Payload should have given something, detection failure"
+ assert (
+ best_guess is not None
+ ), "Payload should have given something, detection failure"
assert "ascii" == best_guess.encoding
def test_issue_gh498():
"""This case was mistaken for utf-16-le, this should never happen again."""
- payload = b'\x84\xae\xaa\xe3\xac\xa5\xad\xe2 Microsoft Word.docx'
+ payload = b"\x84\xae\xaa\xe3\xac\xa5\xad\xe2 Microsoft Word.docx"
best_guess = from_bytes(payload).best()
- assert best_guess is not None, "Payload should have given something, detection failure"
+ assert (
+ best_guess is not None
+ ), "Payload should have given something, detection failure"
assert "Cyrillic" in best_guess.alphabets
diff --git a/tests/test_full_detection.py b/tests/test_full_detection.py
index adff8801..ff91e125 100644
--- a/tests/test_full_detection.py
+++ b/tests/test_full_detection.py
@@ -1,43 +1,50 @@
-from charset_normalizer.api import from_path
+from __future__ import annotations
+
+from os import pardir, path
+
import pytest
-from os import path, pardir
-DIR_PATH = path.join(
- path.dirname(path.realpath(__file__)),
- pardir
-)
+from charset_normalizer.api import from_path
+
+DIR_PATH = path.join(path.dirname(path.realpath(__file__)), pardir)
@pytest.mark.parametrize(
"input_data_file, expected_charset, expected_language",
[
- ('sample-arabic-1.txt', 'cp1256', 'Arabic'),
- ('sample-french-1.txt', 'cp1252', 'French'),
- ('sample-arabic.txt', 'utf_8', 'Arabic'),
- ('sample-russian-3.txt', 'utf_8', 'Russian'),
- ('sample-french.txt', 'utf_8', 'French'),
- ('sample-chinese.txt', 'big5', 'Chinese'),
- ('sample-greek.txt', 'cp1253', 'Greek'),
- ('sample-greek-2.txt', 'cp1253', 'Greek'),
- ('sample-hebrew-2.txt', 'cp1255', 'Hebrew'),
- ('sample-hebrew-3.txt', 'cp1255', 'Hebrew'),
- ('sample-bulgarian.txt', 'utf_8', 'Bulgarian'),
- ('sample-english.bom.txt', 'utf_8', 'English'),
- ('sample-spanish.txt', 'utf_8', 'Spanish'),
- ('sample-korean.txt', 'cp949', 'Korean'),
- ('sample-turkish.txt', 'cp1254', 'Turkish'),
- ('sample-russian-2.txt', 'utf_8', 'Russian'),
- ('sample-russian.txt', 'mac_cyrillic', 'Russian'),
- ('sample-polish.txt', 'utf_8', 'Polish'),
- ]
+ ("sample-arabic-1.txt", "cp1256", "Arabic"),
+ ("sample-french-1.txt", "cp1252", "French"),
+ ("sample-arabic.txt", "utf_8", "Arabic"),
+ ("sample-russian-3.txt", "utf_8", "Russian"),
+ ("sample-french.txt", "utf_8", "French"),
+ ("sample-chinese.txt", "big5", "Chinese"),
+ ("sample-greek.txt", "cp1253", "Greek"),
+ ("sample-greek-2.txt", "cp1253", "Greek"),
+ ("sample-hebrew-2.txt", "cp1255", "Hebrew"),
+ ("sample-hebrew-3.txt", "cp1255", "Hebrew"),
+ ("sample-bulgarian.txt", "utf_8", "Bulgarian"),
+ ("sample-english.bom.txt", "utf_8", "English"),
+ ("sample-spanish.txt", "utf_8", "Spanish"),
+ ("sample-korean.txt", "cp949", "Korean"),
+ ("sample-turkish.txt", "cp1254", "Turkish"),
+ ("sample-russian-2.txt", "utf_8", "Russian"),
+ ("sample-russian.txt", "mac_cyrillic", "Russian"),
+ ("sample-polish.txt", "utf_8", "Polish"),
+ ],
)
def test_elementary_detection(
input_data_file: str,
expected_charset: str,
expected_language: str,
):
- best_guess = from_path(DIR_PATH + "/data/{}".format(input_data_file)).best()
+ best_guess = from_path(DIR_PATH + f"/data/{input_data_file}").best()
- assert best_guess is not None, "Elementary detection has failed upon '{}'".format(input_data_file)
- assert best_guess.encoding == expected_charset, "Elementary charset detection has failed upon '{}'".format(input_data_file)
- assert best_guess.language == expected_language, "Elementary language detection has failed upon '{}'".format(input_data_file)
+ assert (
+ best_guess is not None
+ ), f"Elementary detection has failed upon '{input_data_file}'"
+ assert (
+ best_guess.encoding == expected_charset
+ ), f"Elementary charset detection has failed upon '{input_data_file}'"
+ assert (
+ best_guess.language == expected_language
+ ), f"Elementary language detection has failed upon '{input_data_file}'"
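Aside: the table above doubles as a quick reference for the from_path entry point. A hedged sketch of a single lookup, assuming it is run from the repository root so the data/ samples resolve:

# Sketch of a single from_path detection, mirroring the first row above.
# The relative path is an assumption (run from the repository root).
from charset_normalizer.api import from_path

best_guess = from_path("data/sample-arabic-1.txt").best()
if best_guess is not None:
    print(best_guess.encoding, best_guess.language)  # expected: cp1256 Arabic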
diff --git a/tests/test_isbinary.py b/tests/test_isbinary.py
index b134a8ac..841474f1 100644
--- a/tests/test_isbinary.py
+++ b/tests/test_isbinary.py
@@ -1,28 +1,29 @@
-import pytest
+from __future__ import annotations
+
import typing
-from io import BytesIO
from base64 import b64decode
+from io import BytesIO
+from os import pardir, path
+
+import pytest
+
from charset_normalizer import is_binary
-from os import path, pardir
-DIR_PATH = path.join(
- path.dirname(path.realpath(__file__)),
- pardir
-)
+DIR_PATH = path.join(path.dirname(path.realpath(__file__)), pardir)
@pytest.mark.parametrize(
"raw, expected",
[
- (b'\x00\x5f\x2f\xff'*50, True),
+ (b"\x00\x5f\x2f\xff" * 50, True),
(b64decode("R0lGODlhAQABAAAAACw="), True),
(BytesIO(b64decode("R0lGODlhAQABAAAAACw=")), True),
- ('sample-polish.txt', False),
- ('sample-arabic.txt', False)
- ]
+ ("sample-polish.txt", False),
+ ("sample-arabic.txt", False),
+ ],
)
-def test_isbinary(raw: typing.Union[bytes, typing.BinaryIO, str], expected: bool) -> None:
+def test_isbinary(raw: bytes | typing.BinaryIO | str, expected: bool) -> None:
if isinstance(raw, str):
- raw = DIR_PATH + "/data/{}".format(raw)
+ raw = DIR_PATH + f"/data/{raw}"
assert is_binary(raw) is expected
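Aside: as the parametrisation shows, is_binary accepts raw bytes, a binary file-like object, or a path string. A minimal sketch of the first two forms (the path form is omitted because it needs the repository's data samples):

# Sketch of the input forms accepted by is_binary, mirroring the cases above.
from base64 import b64decode
from io import BytesIO

from charset_normalizer import is_binary

assert is_binary(b"\x00\x5f\x2f\xff" * 50) is True                    # raw bytes
assert is_binary(BytesIO(b64decode("R0lGODlhAQABAAAAACw="))) is True  # file-like object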
diff --git a/tests/test_large_payload.py b/tests/test_large_payload.py
index 04526d38..7fc28fac 100644
--- a/tests/test_large_payload.py
+++ b/tests/test_large_payload.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
import pytest
from charset_normalizer import from_bytes
@@ -5,29 +7,43 @@
def test_large_payload_u8_sig_basic_entry():
- payload = ('0' * TOO_BIG_SEQUENCE).encode("utf_8_sig")
+ payload = ("0" * TOO_BIG_SEQUENCE).encode("utf_8_sig")
best_guess = from_bytes(payload).best()
assert best_guess is not None, "Large U8 payload case detection completely failed"
- assert best_guess.encoding == "utf_8", "Large U8 payload case detection wrongly detected!"
+ assert (
+ best_guess.encoding == "utf_8"
+ ), "Large U8 payload case detection wrongly detected!"
assert best_guess.bom is True, "SIG/BOM property should be True"
- assert len(best_guess.raw) == len(payload), "Large payload should remain untouched when accessed through .raw"
- assert best_guess._string is not None, "str should be decoded before direct access (sig available)"
+ assert len(best_guess.raw) == len(
+ payload
+ ), "Large payload should remain untouched when accessed through .raw"
+ assert (
+ best_guess._string is not None
+ ), "str should be decoded before direct access (sig available)"
def test_large_payload_ascii_basic_entry():
- payload = ('0' * TOO_BIG_SEQUENCE).encode("utf_8")
+ payload = ("0" * TOO_BIG_SEQUENCE).encode("utf_8")
best_guess = from_bytes(payload).best()
- assert best_guess is not None, "Large ASCII payload case detection completely failed"
- assert best_guess.encoding == "ascii", "Large ASCII payload case detection wrongly detected!"
+ assert (
+ best_guess is not None
+ ), "Large ASCII payload case detection completely failed"
+ assert (
+ best_guess.encoding == "ascii"
+ ), "Large ASCII payload case detection wrongly detected!"
assert best_guess.bom is False, "SIG/BOM property should be False"
- assert len(best_guess.raw) == len(payload), "Large payload should remain untouched when accessed through .raw"
+ assert len(best_guess.raw) == len(
+ payload
+ ), "Large payload should remain untouched when accessed through .raw"
assert best_guess._string is None, "str should not be decoded until direct access"
def test_misleading_large_sequence():
- content = (("hello simple ascii " * TOO_BIG_SEQUENCE) + ('我没有埋怨,磋砣的只是一些时间。 磋砣的只是一些时间。')) .encode('utf_8')
+ content = (
+ ("hello simple ascii " * TOO_BIG_SEQUENCE) + ("我没有埋怨,磋砣的只是一些时间。 磋砣的只是一些时间。")
+ ).encode("utf_8")
guesses = from_bytes(content)
@@ -35,5 +51,5 @@ def test_misleading_large_sequence():
match = guesses.best()
assert match is not None
assert match._string is not None, "str should be cached as only match"
- assert match.encoding == 'utf_8'
+ assert match.encoding == "utf_8"
assert str(match) is not None
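Aside: the assertions above describe the contract for payloads at or beyond TOO_BIG_SEQUENCE: .raw always hands back the untouched input, and the decoded string is cached only when a decode was already required (BOM/SIG present) or once str() is first called. A sketch of that behaviour, assuming TOO_BIG_SEQUENCE is importable from charset_normalizer.constant:

# Sketch of the lazy-decode behaviour asserted above. TOO_BIG_SEQUENCE is
# assumed to come from charset_normalizer.constant.
from charset_normalizer import from_bytes
from charset_normalizer.constant import TOO_BIG_SEQUENCE

payload = ("0" * TOO_BIG_SEQUENCE).encode("utf_8")
match = from_bytes(payload).best()

assert match is not None and match.encoding == "ascii"
assert len(match.raw) == len(payload)  # raw input is kept untouched
assert match._string is None           # decode deferred (no BOM/SIG in the payload)
str(match)                             # first str() access performs the decode
assert match._string is not None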
diff --git a/tests/test_logging.py b/tests/test_logging.py
index f44820e2..ad2413e2 100644
--- a/tests/test_logging.py
+++ b/tests/test_logging.py
@@ -1,9 +1,12 @@
-import pytest
+from __future__ import annotations
+
import logging
-from charset_normalizer.utils import set_logging_handler
-from charset_normalizer.api import from_bytes, explain_handler
+import pytest
+
+from charset_normalizer.api import explain_handler, from_bytes
from charset_normalizer.constant import TRACE
+from charset_normalizer.utils import set_logging_handler
class TestLogBehaviorClass:
@@ -14,34 +17,32 @@ def setup_method(self):
self.logger.level = logging.WARNING
def test_explain_true_behavior(self, caplog):
- test_sequence = b'This is a test sequence of bytes that should be sufficient'
+ test_sequence = b"This is a test sequence of bytes that should be sufficient"
from_bytes(test_sequence, steps=1, chunk_size=50, explain=True)
assert explain_handler not in self.logger.handlers
for record in caplog.records:
assert record.levelname in ["Level 5", "DEBUG"]
def test_explain_false_handler_set_behavior(self, caplog):
- test_sequence = b'This is a test sequence of bytes that should be sufficient'
+ test_sequence = b"This is a test sequence of bytes that should be sufficient"
set_logging_handler(level=TRACE, format_string="%(message)s")
from_bytes(test_sequence, steps=1, chunk_size=50, explain=False)
- assert any(isinstance(hdl, logging.StreamHandler) for hdl in self.logger.handlers)
+ assert any(
+ isinstance(hdl, logging.StreamHandler) for hdl in self.logger.handlers
+ )
for record in caplog.records:
assert record.levelname in ["Level 5", "DEBUG"]
assert "Encoding detection: ascii is most likely the one." in caplog.text
def test_set_stream_handler(self, caplog):
- set_logging_handler(
- "charset_normalizer", level=logging.DEBUG
- )
+ set_logging_handler("charset_normalizer", level=logging.DEBUG)
self.logger.debug("log content should log with default format")
for record in caplog.records:
assert record.levelname in ["Level 5", "DEBUG"]
assert "log content should log with default format" in caplog.text
def test_set_stream_handler_format(self, caplog):
- set_logging_handler(
- "charset_normalizer", format_string="%(message)s"
- )
+ set_logging_handler("charset_normalizer", format_string="%(message)s")
self.logger.info("log content should only be this message")
assert caplog.record_tuples == [
(
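Aside: the logging tests pin two behaviours: explain=True temporarily attaches the package's explain_handler for a single call, while set_logging_handler installs a persistent StreamHandler on the charset_normalizer logger. A short sketch of both, using only the call signatures visible in this diff (the format string is an illustrative choice):

# Sketch: two ways to get verbose detection output, as exercised above.
from charset_normalizer import from_bytes
from charset_normalizer.constant import TRACE
from charset_normalizer.utils import set_logging_handler

# One-off detailed run; the explain handler is detached again afterwards.
from_bytes(b"This is a test sequence of bytes that should be sufficient", explain=True)

# Persistent handler on the "charset_normalizer" logger, down to TRACE (level 5).
set_logging_handler("charset_normalizer", level=TRACE, format_string="%(levelname)s | %(message)s")
from_bytes(b"hello world")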
diff --git a/tests/test_mess_detection.py b/tests/test_mess_detection.py
index d70fee45..4089f825 100644
--- a/tests/test_mess_detection.py
+++ b/tests/test_mess_detection.py
@@ -1,27 +1,48 @@
+from __future__ import annotations
+
import pytest
+
from charset_normalizer.md import mess_ratio
@pytest.mark.parametrize(
"content, min_expected_ratio, max_expected_ratio",
[
- ('典肇乎庚辰年十二月廿一,及己丑年二月十九,收各方語言二百五十,合逾七百萬目;二十大卷佔八成,單英文卷亦過二百萬。悉文乃天下有志共筆而成;有意助之,幾網路、隨纂作,大典茁焉。', 0., 0.),
- ('العقلية , التنويم المغناطيسي و / أو الاقتراح', 0., 0.),
- ("RadoZ تـــعــــديــل الـــتــــوقــيــــت مـــن قــبــل", 0., 0.),
+ (
+ "典肇乎庚辰年十二月廿一,及己丑年二月十九,收各方語言二百五十,合逾七百萬目;二十大卷佔八成,單英文卷亦過二百萬。悉文乃天下有志共筆而成;有意助之,幾網路、隨纂作,大典茁焉。",
+ 0.0,
+ 0.0,
+ ),
+ ("العقلية , التنويم المغناطيسي و / أو الاقتراح", 0.0, 0.0),
+ ("RadoZ تـــعــــديــل الـــتــــوقــيــــت مـــن قــبــل", 0.0, 0.0),
("Cehennemin Sava■þ²s²'da kim?", 0.1, 0.5),
- ("´Á¥½³ø§i -- ±i®Ìºû, ³¯·Ø©v", 0.5, 1.),
- ("ïstanbul, T■rkiye'nin en kalabal»k, iktisadi ve k■lt■rel aÓ»dan en —nemli", 0.1, 0.5),
- ("Parce que Óa, c'est la vÕritable histoire de la rencontre avec votre Tante Robin.", 0.01, 0.5),
- ("""ØĢØŠØģاØĶŲ ŲŲ ØĢŲ Ø§ŲŲØ§Øģ ŲŲŲ
-Ų
-ا ØģŲŲبШ쨧ØĶŲŲŲØ ØŊØđŲØ§ ŲØģŲ
-Øđ ØđŲ (ŲØąŲØŊŲ) ŲØ§ŲØŪا؊Ų
-""", 0.8, 3.0),
+ ("´Á¥½³ø§i -- ±i®Ìºû, ³¯·Ø©v", 0.5, 1.0),
+ (
+ "ïstanbul, T■rkiye'nin en kalabal»k, iktisadi ve k■lt■rel aÓ»dan en —nemli",
+ 0.1,
+ 0.5,
+ ),
+ (
+ "Parce que Óa, c'est la vÕritable histoire de la rencontre avec votre Tante Robin.",
+ 0.01,
+ 0.5,
+ ),
+ (
+ """ØĢØŠØģاØĶŲ ŲŲ ØĢŲ Ø§ŲŲØ§Øģ ŲŲŲ
+Ų
+ا ØģŲŲبШ쨧ØĶŲŲŲØ ØŊØđŲØ§ ŲØģŲ
+Øđ ØđŲ (ŲØąŲØŊŲ) ŲØ§ŲØŪا؊Ų
+""",
+ 0.8,
+ 3.0,
+ ),
("""ÇáÚŞáíÉ , ÇáÊäæíã ÇáãÛäÇØíÓí æ / Ãæ ÇáÇŞÊÑÇÍ""", 0.8, 2.5),
- ("""hishamkoc@yahoo.com ุชุฑุฌู
-ููุฉ ููุดูููุงู
-ุงููููููููุงูRadoZ ุชูููุนููููุฏูููู ุงููููุชูููููููููููููุช ู
-ูููู ูููุจููู""", 0.5, 2.0)
-
- ]
+ (
+ """hishamkoc@yahoo.com ุชุฑุฌู
+ููุฉ ููุดูููุงู
+ุงููููููููุงูRadoZ ุชูููุนููููุฏูููู ุงููููุชูููููููููููููุช ู
+ูููู ูููุจููู""",
+ 0.5,
+ 2.0,
+ ),
+ ],
)
def test_mess_detection(content, min_expected_ratio, max_expected_ratio):
- calculated_mess_ratio = mess_ratio(
- content,
- maximum_threshold=1.
- )
+ calculated_mess_ratio = mess_ratio(content, maximum_threshold=1.0)
- assert min_expected_ratio <= calculated_mess_ratio <= max_expected_ratio, "The mess detection ratio calculated for given content is not well adjusted!"
+ assert (
+ min_expected_ratio <= calculated_mess_ratio <= max_expected_ratio
+ ), "The mess detection ratio calculated for given content is not well adjusted!"
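Aside: the parametrisation bounds mess_ratio for each sample rather than pinning exact values. A minimal sketch of one call, reusing a sample string from the table above:

# Sketch of a single mess_ratio call, mirroring one parametrised case above.
from charset_normalizer.md import mess_ratio

ratio = mess_ratio("´Á¥½³ø§i -- ±i®Ìºû, ³¯·Ø©v", maximum_threshold=1.0)
assert 0.5 <= ratio <= 1.0  # the bounds the test expects for this sample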
diff --git a/tests/test_preemptive_detection.py b/tests/test_preemptive_detection.py
index 411bf45f..e56c4a16 100644
--- a/tests/test_preemptive_detection.py
+++ b/tests/test_preemptive_detection.py
@@ -1,7 +1,9 @@
+from __future__ import annotations
+
import pytest
-from charset_normalizer.utils import any_specified_encoding
from charset_normalizer import CharsetMatch
+from charset_normalizer.utils import any_specified_encoding
@pytest.mark.parametrize(
@@ -10,52 +12,76 @@
(b'', "euc_jp"),
(b'', "utf_8"),
(b'', None),
- (b'# coding: utf-8', "utf_8"),
- (b'', 'utf_8'),
- (b'', 'ascii'),
- (b'', 'johab'),
- (b'', 'cp037'),
- (b'', "cp1252"),
+ (b"# coding: utf-8", "utf_8"),
+ (b'', "utf_8"),
+ (b'', "ascii"),
+ (b'', "johab"),
+ (b'', "cp037"),
+ (b"", "cp1252"),
(b'', "cp1256"),
- ]
+ ],
)
def test_detect_most_common_body_encoding(payload, expected_encoding):
- specified_encoding = any_specified_encoding(
- payload
- )
+ specified_encoding = any_specified_encoding(payload)
- assert specified_encoding == expected_encoding, "Unable to determine properly encoding from given body"
+ assert (
+ specified_encoding == expected_encoding
+ ), "Unable to determine properly encoding from given body"
@pytest.mark.parametrize(
"payload, expected_outcome",
[
- (b'', b''),
- (b'', b''),
- (b'', b''),
- (b'# coding: utf-8', b'# coding: utf-8'),
- (b'', b''),
- (b'', b''),
- (b'', b''),
- (b'', b''),
- (b'', b''),
- ]
+ (
+ b'',
+ b'',
+ ),
+ (
+ b'',
+ b'',
+ ),
+ (
+ b'',
+ b'',
+ ),
+ (b"# coding: utf-8", b"# coding: utf-8"),
+ (
+ b'',
+ b'',
+ ),
+ (
+ b'',
+ b'',
+ ),
+ (
+ b'',
+ b'',
+ ),
+ (
+ b"",
+ b"",
+ ),
+ (
+ b'',
+ b'',
+ ),
+ ],
)
def test_preemptive_mark_replacement(payload, expected_outcome):
"""
When generating (to Unicode converted) bytes, we want to change any potential declarative charset
to utf-8. This test verifies that.
"""
- specified_encoding = any_specified_encoding(
- payload
- )
+ specified_encoding = any_specified_encoding(payload)
- detected_encoding = specified_encoding if specified_encoding is not None else "utf-8"
+ detected_encoding = (
+ specified_encoding if specified_encoding is not None else "utf-8"
+ )
m = CharsetMatch(
payload,
detected_encoding,
- 0.,
+ 0.0,
False,
[],
preemptive_declaration=specified_encoding,
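Aside: the docstring of test_preemptive_mark_replacement states the intent: when the matched bytes are re-emitted as Unicode-converted output, any charset declared inside the payload should be rewritten to utf-8. The sketch below follows the same construction as the test; the XML payload and the output() call are illustrative assumptions, since the parametrised payloads themselves are not reproduced here.

# Sketch of the preemptive-declaration flow driven by the test above.
# The payload is an illustrative assumption, not one of the test's cases.
from charset_normalizer import CharsetMatch
from charset_normalizer.utils import any_specified_encoding

payload = b'<?xml version="1.0" encoding="EUC-JP"?>'

specified_encoding = any_specified_encoding(payload)  # declared charset, e.g. "euc_jp"
detected_encoding = specified_encoding if specified_encoding is not None else "utf-8"

match = CharsetMatch(
    payload,
    detected_encoding,
    0.0,
    False,
    [],
    preemptive_declaration=specified_encoding,
)

# Per the docstring, the declaration is expected to come back as utf-8
# in the Unicode-converted output.
print(match.output())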
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 5c603b3c..a0cc088e 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -1,6 +1,10 @@
+from __future__ import annotations
+
import logging
+
import pytest
-from charset_normalizer.utils import is_accentuated, cp_similarity, set_logging_handler
+
+from charset_normalizer.utils import cp_similarity, is_accentuated, set_logging_handler
@pytest.mark.parametrize(