Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Commit df7ffe9

Browse files
committed
Add an re2-based parser
Requires splitting out some of the testenvs, as re2 is not available for pypy at all, and not yet for 3.12. Only uses re2.Set which turns out to be not great, at least according to `pytest --durations` on 3.11: - re2 is sometimes faster for UA tests - `pgts_browser_list.yaml` goes from 2.5s to 1.5s - `firefox_user_agent_strings.yaml` goes from 0.05 to 0.04 (not really significant) - though `test_ua.yaml` goes from 0.18 to 0.65 - re2 is *way* slower for devices tests - `test_device.yaml` goes from 2.5 to 8s Obviously tests might not be representative at all, implementing a proper benchmark on a real-life test-set (#163) would likely provide better information. It's possible that `FilteredRE2` would offer better performance, *but* it requires additional memory and more importantly it requires a fast literal string matcher e.g. a fast implementation of Aho-Corasick, or possibly Hyperscan's Teddy (via [python-hyperscan][5]?). [According to burntsushi Commentz-Walter is not great in practice][1], at least as you increase the number of patterns, so that one looks like a dead end. Either way this would likely be an *additional* dependency to make it usable, although there seems to be [a well-maintained Python version with impressive performance (for pure Python)][2], [a native module][3], and [a wrapper for burntsushi's Rust implementation][4] which claims even better performance than the native module. Linked to (but probably can't be argued to fix) #149. [1]: https://news.ycombinator.com/item?id=26913349 [2]: https://github.com/abusix/ahocorapy [3]: https://github.com/WojciechMula/pyahocorasick/ [4]: https://github.com/G-Research/ahocorasick_rs/ [5]: https://python-hyperscan.readthedocs.io
1 parent dbcee8c commit df7ffe9

File tree

4 files changed

+91
-1
lines changed

4 files changed

+91
-1
lines changed

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ version = "1.0.0a1"
99
readme = "README.rst"
1010
requires-python = ">=3.8"
1111
dependencies = []
12-
optional-dependencies = { yaml = ["PyYaml"] }
12+
optional-dependencies = { yaml = ["PyYaml"], re2 = ["google-re2"] }
1313

1414
license = {text = "Apache 2.0"}
1515
urls = {repository = "https://github.com/ua-parser/uap-python"}

src/ua_parser/re2.py

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
from __future__ import annotations
2+
3+
import io
4+
import os
5+
import re
6+
from typing import List, Tuple, Union
7+
8+
import re2 # type: ignore
9+
10+
from .core import (
11+
Parser as BaseParser,
12+
PartialParseResult,
13+
Device,
14+
Domain,
15+
OS,
16+
UserAgent,
17+
Matchers,
18+
UserAgentMatcher,
19+
OSMatcher,
20+
DeviceMatcher,
21+
)
22+
23+
24+
# Shared options for every re2.Set built by this module.
RE_OPTS = re2.Options()
# Memory budget: 32 MiB. As of uap-core 0.18 the devices set needs at
# least 28MB, which is more than the default of 8.
RE_OPTS.max_mem = 32 << 20
# Error logging might write directly to stdout, which is not great for
# a library, so it is turned off.
RE_OPTS.log_errors = False
30+
31+
32+
class Parser(BaseParser):
    """Parser which pre-filters the matchers through ``re2.Set``.

    For each domain (user agent, OS, device) the patterns of all the
    matchers are compiled into a single ``re2.Set``. Parsing a string
    asks the set which patterns match, then runs only the matcher with
    the lowest matching index in order to extract the actual data.
    """

    ua: re2.Set
    user_agent_parsers: List[UserAgentMatcher]
    os: re2.Set
    os_parsers: List[OSMatcher]
    devices: re2.Set
    device_parsers: List[DeviceMatcher]

    def __init__(
        self,
        matchers: Matchers,
    ) -> None:
        """Store the matchers and compile one ``re2.Set`` per domain.

        :param matchers: triple of (user agent, OS, device) matcher
                         lists, each matcher exposing its compiled
                         pattern as ``regex``
        """
        self.user_agent_parsers, self.os_parsers, self.device_parsers = matchers

        self.ua = self._compile_set(self.user_agent_parsers)
        self.os = self._compile_set(self.os_parsers)
        self.devices = self._compile_set(self.device_parsers)

    @staticmethod
    def _compile_set(matchers) -> re2.Set:
        """Build and compile a search set from ``matchers``' patterns.

        Patterns are added in list order, so the position of a pattern
        in the set lines up with the position of its matcher in
        ``matchers``.
        """
        regex_set = re2.Set.SearchSet(RE_OPTS)
        for matcher in matchers:
            pattern = matcher.regex.pattern
            # Prepend the i global flag if IGNORECASE is set. Assumes
            # no pattern uses global flags, but since they're not
            # supported in JS that seems safe. Applied to every domain
            # so the set never disagrees with the matcher's own regex
            # about case sensitivity.
            if matcher.regex.flags & re.IGNORECASE:
                pattern = "(?i)" + pattern
            regex_set.Add(pattern)
        regex_set.Compile()
        return regex_set

    def __call__(self, ua: str, domains: Domain, /) -> PartialParseResult:
        """Parse ``ua`` for each of the requested ``domains``.

        :param ua: the user agent string to parse
        :param domains: the domains to resolve; domains which are not
                        requested are left as ``None`` in the result
        :returns: a partial result carrying ``domains``, the original
                  string, and whatever each selected matcher returned
        """
        # Distinct local names: `os` is also a module imported by this
        # file, and `ua` is the input string.
        ua_result = os_result = device_result = None

        # The values returned by `Match` are used as indices into the
        # matcher lists; the lowest one is the first pattern (in
        # definition order) which matched.
        if Domain.USER_AGENT in domains:
            if matches := self.ua.Match(ua):
                ua_result = self.user_agent_parsers[min(matches)](ua)
        if Domain.OS in domains:
            if matches := self.os.Match(ua):
                os_result = self.os_parsers[min(matches)](ua)
        if Domain.DEVICE in domains:
            if matches := self.devices.Match(ua):
                device_result = self.device_parsers[min(matches)](ua)

        return PartialParseResult(
            domains=domains,
            string=ua,
            user_agent=ua_result,
            os=os_result,
            device=device_result,
        )

tests/test_core.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
"""Tests UAP-Python using the UAP-core test suite
22
"""
3+
import contextlib
34
import dataclasses
45
import logging
56
import pathlib
@@ -36,7 +37,10 @@
3637
PARSERS = [
3738
pytest.param(BasicParser(load_builtins()), id="basic"),
3839
]
40+
with contextlib.suppress(ImportError):
41+
from ua_parser import re2
3942

43+
PARSERS.append(pytest.param(re2.Parser(load_builtins()), id="re2"))
4044

4145
UA_FIELDS = {f.name for f in dataclasses.fields(UserAgent)}
4246

tox.ini

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,15 @@ wheel_build_env = .pkg
1919
deps =
2020
pytest
2121
pyyaml
22+
google-re2
2223
commands =
2324
pytest -Werror --doctest-glob="*.rst" {posargs}
2425

26+
[testenv:pypy3.{8,9,10},py312]
27+
deps =
28+
pytest
29+
pyyaml
30+
2531
[testenv:flake8]
2632
package = skip
2733
deps = flake8

0 commit comments

Comments
 (0)