Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 46778e7

Browse files
committed
regex-based POC
Uses ua-parser/uap-rust#3 Fixes #166
1 parent 022ab80 commit 46778e7

File tree

9 files changed

+169
-67
lines changed

9 files changed

+169
-67
lines changed

.github/workflows/ci.yml

Lines changed: 18 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -2,15 +2,8 @@ name: CI
22

33
on:
44
push:
5-
branches: [ '*' ]
65
pull_request:
7-
branches: [ '*' ]
86
workflow_dispatch:
9-
schedule:
10-
# cron is kinda random, assumes 22:00 UTC is a low ebb, eastern
11-
# countries are very early morning, and US are mid-day to
12-
# mid-afternoon
13-
- cron: '0 22 * * 2'
147

158
jobs:
169
checks:
@@ -79,7 +72,6 @@ jobs:
7972
test:
8073
runs-on: ubuntu-latest
8174
needs: compile
82-
continue-on-error: ${{ matrix.python-version == '3.13' || matrix.python-version == 'pypy-3.11' }}
8375
strategy:
8476
fail-fast: false
8577
matrix:
@@ -88,19 +80,14 @@ jobs:
8880
- sdist
8981
- source
9082
python-version:
91-
- "3.8"
9283
- "3.9"
9384
- "3.10"
9485
- "3.11"
9586
- "3.12"
9687
- "3.13"
97-
- "pypy-3.8"
98-
- "pypy-3.9"
9988
- "pypy-3.10"
10089
# - "pypy-3.11"
101-
# don't enable graal because it's slower than even pypy and
102-
# fails because oracle/graalpython#385
103-
# - "graalpy-23"
90+
- "graalpy-24"
10491
include:
10592
- source: sdist
10693
artifact: dist/*.tar.gz
@@ -116,34 +103,30 @@ jobs:
116103
with:
117104
python-version: ${{ matrix.python-version }}
118105
allow-prereleases: true
119-
- name: Install test dependencies
120-
run: |
121-
python -mpip install --upgrade pip
122-
# cyaml is outright broken on pypy
123-
if ! ${{ startsWith(matrix.python-version, 'pypy-') }}; then
124-
# if binary wheels are not available for the current
125-
# package install libyaml-dev so we can install pyyaml
126-
# from source
127-
if ! pip download --only-binary pyyaml -rrequirements_dev.txt > /dev/null 2>&1; then
128-
sudo apt install libyaml-dev
129-
fi
106+
- run: python -mpip install --upgrade pip
107+
- run: |
108+
# if binary wheels are not available for the current
109+
# package install libyaml-dev so we can install pyyaml
110+
# from source
111+
if ! pip download --only-binary :all: pyyaml > /dev/null 2>&1; then
112+
sudo apt install libyaml-dev
130113
fi
131-
python -mpip install pytest pyyaml
132-
133-
# re2 is basically impossible to install from source so don't
134-
# bother, and suppress installation failure so the test does
135-
# not fail (re2 tests will just be skipped for versions /
136-
# implementations for which google does not provide a binary
137-
# wheel)
138-
python -mpip install --only-binary :all: google-re2 || true
114+
- run: python -mpip install pytest pyyaml
115+
# install rs accelerator if available, ignore if not
116+
- run: python -mpip install ua-parser-rs || true
117+
# re2 is basically impossible to install from source so don't
118+
# bother, and suppress installation failure so the test does
119+
# not fail (re2 tests will just be skipped for versions /
120+
# implementations for which google does not provide a binary
121+
# wheel)
122+
- run: 'python -mpip install --only-binary :all: google-re2 || true'
139123
- name: download ${{ matrix.source }} artifact
140124
if: matrix.artifact
141125
uses: actions/download-artifact@v4
142126
with:
143127
name: ${{ matrix.source }}
144128
path: dist/
145129
- name: install package in environment
146-
run: |
147-
pip install ${{ matrix.artifact || '.' }}
130+
run: pip install ${{ matrix.artifact || '.' }}
148131
- name: run tests
149132
run: pytest -v -Werror -Wignore::ImportWarning --doctest-glob="*.rst" -ra

doc/conf.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,11 @@
1919
rst_epilog = """
2020
.. |pyyaml| replace:: ``PyYaml``
2121
.. |re2| replace:: ``google-re2``
22+
.. |regex| replace:: ``regex``
2223
2324
.. _pyyaml: https://pyyaml.org
2425
.. _re2: https://pypi.org/project/google-re2
26+
.. _regex: https://pypi.org/project/ua-parser-rs
2527
"""
2628

2729
# -- General configuration ---------------------------------------------------

doc/installation.rst

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,14 @@ Installation
55
Python Version
66
==============
77

8-
ua-parser currently supports Python 3.8 and newer, as well as recent
9-
versions of PyPy supporting the same standards.
8+
ua-parser currently supports CPython 3.9 and newer, recent PyPy
9+
(supporting 3.10), and Graal 24.
1010

11-
.. note:: While PyPy is supported, it is not *fast*, and google-re2 is
12-
not supported on it.
11+
.. note::
12+
13+
While pypy and graal are supported, they are rather slow when using
14+
pure python mode and ``[re2]`` is not supported, so using the
15+
``[regex]`` feature is very strongly recommended.
1316

1417
Installation
1518
============
@@ -21,13 +24,14 @@ Installation
2124
Optional Dependencies
2225
=====================
2326

24-
ua-parser currently has two optional dependencies, |re2|_ and
25-
|pyyaml|_. These dependencies will be detected and used automatically
27+
ua-parser currently has three optional dependencies, |regex|_, |re2|_ and
28+
|pyyaml|_. These dependencies will be detected and used automatically
2629
if installed, but can also be installed via and alongside ua-parser:
2730

2831
.. code-block:: sh
2932
33+
$ pip install 'ua-parser[regex]'
3034
$ pip install 'ua-parser[re2]'
3135
$ pip install 'ua-parser[yaml]'
32-
$ pip install 'ua-parser[re2,yaml]'
36+
$ pip install 'ua-parser[regex,yaml]'
3337

pyproject.toml

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,8 @@ name = "ua-parser"
77
description = "Python port of Browserscope's user agent parser"
88
version = "1.0.0a1"
99
readme = "README.rst"
10-
requires-python = ">=3.8"
10+
requires-python = ">=3.9"
1111
dependencies = []
12-
optional-dependencies = { yaml = ["PyYaml"], re2 = ["google-re2"] }
1312

1413
license = {text = "Apache 2.0"}
1514
urls = {repository = "https://github.com/ua-parser/uap-python"}
@@ -35,14 +34,20 @@ classifiers = [
3534
"Topic :: Internet :: WWW/HTTP",
3635
"Topic :: Software Development :: Libraries :: Python Modules",
3736
"Programming Language :: Python",
38-
"Programming Language :: Python :: 3.8",
3937
"Programming Language :: Python :: 3.9",
4038
"Programming Language :: Python :: 3.10",
4139
"Programming Language :: Python :: 3.11",
40+
"Programming Language :: Python :: 3.12",
4241
"Programming Language :: Python :: Implementation :: CPython",
43-
"Programming Language :: Python :: Implementation :: PyPy"
42+
"Programming Language :: Python :: Implementation :: PyPy",
43+
"Programming Language :: Python :: Implementation :: GraalPy",
4444
]
4545

46+
[project.optional-dependencies]
47+
yaml = ["PyYaml"]
48+
re2 = ["google-re2"]
49+
regex = ["ua-parser-rs"]
50+
4651
[tool.setuptools.packages.find]
4752
where = ["src"]
4853

@@ -63,7 +68,7 @@ known-first-party = ["ua_parser"]
6368
combine-as-imports = true
6469

6570
[tool.mypy]
66-
python_version = "3.8"
71+
python_version = "3.9"
6772
files = "src,tests"
6873

6974
# can't use strict because it's only global

setup.py

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -67,16 +67,20 @@ def run(self) -> None:
6767
dest_lazy = outdir / "_lazy.py"
6868
dest_legacy = outdir / "_regexes.py"
6969

70-
with dest.open("wb") as eager, dest_lazy.open("wb") as lazy, dest_legacy.open(
71-
"wb"
72-
) as legacy:
70+
with (
71+
dest.open("wb") as eager,
72+
dest_lazy.open("wb") as lazy,
73+
dest_legacy.open("wb") as legacy,
74+
):
7375
eager = EagerWriter(eager)
7476
lazy = LazyWriter(lazy)
7577
legacy = LegacyWriter(legacy)
7678

7779
for section in ["user_agent_parsers", "os_parsers", "device_parsers"]:
78-
with eager.section(section), lazy.section(section), legacy.section(
79-
section
80+
with (
81+
eager.section(section),
82+
lazy.section(section),
83+
legacy.section(section),
8084
):
8185
extract = EXTRACTORS[section]
8286
for p in regexes[section]:

src/ua_parser/__main__.py

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -39,11 +39,13 @@
3939
from .caching import Cache, Local
4040
from .loaders import load_builtins, load_yaml
4141
from .re2 import Resolver as Re2Resolver
42+
from .regex import Resolver as RegexResolver
4243
from .user_agent_parser import Parse
4344

4445
CACHEABLE = {
4546
"basic": True,
4647
"re2": True,
48+
"regex": True,
4749
"legacy": False,
4850
}
4951

@@ -178,6 +180,8 @@ def get_parser(
178180
r = BasicResolver(rules)
179181
elif parser == "re2":
180182
r = Re2Resolver(rules)
183+
elif parser == "regex":
184+
r = RegexResolver(rules)
181185
else:
182186
sys.exit(f"unknown parser {parser!r}")
183187

@@ -327,6 +331,7 @@ def run_threaded(args: argparse.Namespace) -> None:
327331
("locking-lru", CachingResolver(basic, caching.Lru(CACHESIZE))),
328332
("local-lru", CachingResolver(basic, Local(lambda: caching.Lru(CACHESIZE)))),
329333
("re2", Re2Resolver(load_builtins())),
334+
("regex", RegexResolver(load_builtins())),
330335
]
331336
for name, resolver in resolvers:
332337
print(f"{name:11}: ", end="", flush=True)
@@ -436,14 +441,14 @@ def __call__(
436441
bench.add_argument(
437442
"--bases",
438443
nargs="+",
439-
choices=["basic", "re2", "legacy"],
440-
default=["basic", "re2", "legacy"],
444+
choices=["basic", "re2", "regex", "legacy"],
445+
default=["basic", "re2", "regex", "legacy"],
441446
help="""Base resolvers to benchmark. `basic` is a linear search
442447
through the regexes file, `re2` is a prefiltered regex set
443-
implemented in C++, `legacy` is the legacy API (essentially a
444-
basic resolver with a clearing cache of fixed 200 entries, but
445-
less layered so usually slightly faster than an equivalent
446-
basic-based resolver).""",
448+
implemented in C++, `regex` is a prefiltered regex set implemented
449+
in Rust, `legacy` is the legacy API (essentially a basic resolver
450+
with a clearing cache of fixed 200 entries, but less layered so
451+
usually slightly faster than an equivalent basic-based resolver).""",
447452
)
448453
bench.add_argument(
449454
"--caches",

src/ua_parser/regex.py

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
__all__ = ["Resolver"]
2+
3+
from operator import attrgetter
4+
5+
import ua_parser_rs # type: ignore
6+
7+
from .core import (
8+
Device,
9+
Domain,
10+
Matchers,
11+
OS,
12+
PartialResult,
13+
UserAgent,
14+
)
15+
16+
17+
class Resolver:
18+
ua: ua_parser_rs.UserAgentExtractor
19+
os: ua_parser_rs.OSExtractor
20+
de: ua_parser_rs.DeviceExtractor
21+
22+
def __init__(self, matchers: Matchers) -> None:
23+
ua, os, de = matchers
24+
self.ua = ua_parser_rs.UserAgentExtractor(
25+
map(
26+
attrgetter("regex", "family", "major", "minor", "patch", "patch_minor"),
27+
ua,
28+
)
29+
)
30+
self.os = ua_parser_rs.OSExtractor(
31+
map(
32+
attrgetter("regex", "family", "major", "minor", "patch", "patch_minor"),
33+
os,
34+
)
35+
)
36+
self.de = ua_parser_rs.DeviceExtractor(
37+
map(
38+
attrgetter("regex", "regex_flag", "family", "brand", "model"),
39+
de,
40+
)
41+
)
42+
43+
def __call__(self, ua: str, domains: Domain, /) -> PartialResult:
44+
user_agent = os = device = None
45+
if Domain.USER_AGENT in domains:
46+
if m := self.ua.extract(ua):
47+
user_agent = UserAgent(
48+
m.family,
49+
m.major,
50+
m.minor,
51+
m.patch,
52+
m.patch_minor,
53+
)
54+
if Domain.OS in domains:
55+
if m := self.os.extract(ua):
56+
os = OS(
57+
m.family,
58+
m.major,
59+
m.minor,
60+
m.patch,
61+
m.patch_minor,
62+
)
63+
if Domain.DEVICE in domains:
64+
if m := self.de.extract(ua):
65+
device = Device(
66+
m.family,
67+
m.brand,
68+
m.model,
69+
)
70+
return PartialResult(
71+
domains=domains,
72+
string=ua,
73+
user_agent=user_agent,
74+
os=os,
75+
device=device,
76+
)

tests/test_core.py

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,19 @@
5353
else:
5454
PARSERS.append(pytest.param(Parser(re2.Resolver(load_builtins())), id="re2"))
5555

56+
try:
57+
from ua_parser import regex
58+
except ImportError:
59+
PARSERS.append(
60+
pytest.param(
61+
None,
62+
id="regex",
63+
marks=pytest.mark.skip(reason="regex parser not available"),
64+
)
65+
)
66+
else:
67+
PARSERS.append(pytest.param(Parser(regex.Resolver(load_builtins())), id="regex"))
68+
5669
UA_FIELDS = {f.name for f in dataclasses.fields(UserAgent)}
5770

5871

@@ -64,7 +77,7 @@
6477
CORE_DIR / "test_resources" / "firefox_user_agent_strings.yaml",
6578
CORE_DIR / "test_resources" / "pgts_browser_list.yaml",
6679
],
67-
ids=attrgetter("name"),
80+
ids=attrgetter("stem"),
6881
)
6982
def test_ua(parser, test_file):
7083
with test_file.open("rb") as f:
@@ -90,7 +103,7 @@ def test_ua(parser, test_file):
90103
CORE_DIR / "tests" / "test_os.yaml",
91104
CORE_DIR / "test_resources" / "additional_os_tests.yaml",
92105
],
93-
ids=attrgetter("name"),
106+
ids=attrgetter("stem"),
94107
)
95108
def test_os(parser, test_file):
96109
with test_file.open("rb") as f:
@@ -111,7 +124,7 @@ def test_os(parser, test_file):
111124
[
112125
CORE_DIR / "tests" / "test_device.yaml",
113126
],
114-
ids=attrgetter("name"),
127+
ids=attrgetter("stem"),
115128
)
116129
def test_devices(parser, test_file):
117130
with test_file.open("rb") as f:

0 commit comments

Comments
 (0)