Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Commit 7ff511e

Browse files
committed
Add an re2-based parser
Requires splitting out some of the testenvs, as re2 is not available for pypy at all, and not yet for 3.12.

Uses `re2.Filter` which, unlike the C++ `FilteredRE2`, bundles the prefiltering step by using an `re2.Set` internally. This is likely less efficient than providing one's own prefilter (e.g. aho-corasick), but avoids having to do that.

At first glance, according to pytest's `--durations 0`, this is quite successful (unlike using `re2.Set` directly, which was more of a mixed bag):

```
2.54s call tests/test_core.py::test_devices[test_device.yaml-basic]
2.51s call tests/test_core.py::test_ua[pgts_browser_list.yaml-basic]
2.48s call tests/test_legacy.py::TestParse::testPGTSStrings
2.43s call tests/test_legacy.py::TestParse::testStringsDevice
0.95s call tests/test_core.py::test_devices[test_device.yaml-re2]
0.55s call tests/test_core.py::test_ua[pgts_browser_list.yaml-re2]
0.18s call tests/test_core.py::test_ua[test_ua.yaml-basic]
0.16s call tests/test_legacy.py::TestParse::testBrowserscopeStrings
0.10s call tests/test_core.py::test_ua[test_ua.yaml-re2]
```

While the "basic" parser for the new API is slightly slower than the legacy API (browserscope does use test_ua.yaml, so that matches), the re2 parser is significantly faster than both:

- 60% faster on test_device.yaml (~2.5s -> 1s)
- 80% faster on pgts (2.5s -> 0.5s)
- 40% faster on browserscope (0.16 -> 0.1)

This is very encouraging, although the memory consumption has not been checked (yet).

Fixes #149, kind-of
1 parent 7f90746 commit 7ff511e

File tree

4 files changed

+86
-1
lines changed

4 files changed

+86
-1
lines changed

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ version = "1.0.0a1"
99
readme = "README.rst"
1010
requires-python = ">=3.8"
1111
dependencies = []
12-
optional-dependencies = { yaml = ["PyYaml"] }
12+
optional-dependencies = { yaml = ["PyYaml"], re2 = ["google-re2"] }
1313

1414
license = {text = "Apache 2.0"}
1515
urls = {repository = "https://github.com/ua-parser/uap-python"}

src/ua_parser/re2.py

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
from __future__ import annotations
2+
3+
import io
4+
import os
5+
import re
6+
from typing import List, Tuple, Union
7+
8+
import re2 # type: ignore
9+
10+
from .core import (
11+
Parser as AbstractParser,
12+
PartialParseResult,
13+
Device,
14+
Domain,
15+
OS,
16+
UserAgent,
17+
Matchers,
18+
UserAgentMatcher,
19+
OSMatcher,
20+
DeviceMatcher,
21+
)
22+
23+
24+
class Parser(AbstractParser):
    """User-agent parser which prefilters candidate regexes with ``re2.Filter``.

    For each domain (user agent, OS, device) the patterns of all matchers
    are loaded into one ``re2.Filter``. At parse time the filter reports
    the indices of the patterns matching the input, and the matcher with
    the lowest index is then applied to the string to build the result.

    A filter attribute is ``None`` when the corresponding matcher list is
    empty; that domain then always resolves to ``None``.
    """

    ua: re2.Filter | None
    user_agent_parsers: List[UserAgentMatcher]
    os: re2.Filter | None
    os_parsers: List[OSMatcher]
    devices: re2.Filter | None
    device_parsers: List[DeviceMatcher]

    def __init__(
        self,
        matchers: Matchers,
    ) -> None:
        """Build one compiled filter per domain from ``matchers``.

        ``matchers`` unpacks into the user-agent, OS and device matcher
        lists, in that order.
        """
        self.user_agent_parsers, self.os_parsers, self.device_parsers = matchers

        self.ua = self._build_filter(self.user_agent_parsers)
        self.os = self._build_filter(self.os_parsers)
        self.devices = self._build_filter(self.device_parsers)

    @staticmethod
    def _build_filter(matchers):
        """Return a compiled ``re2.Filter`` for ``matchers``, or ``None`` if empty."""
        # NOTE(review): a filter with no patterns has nothing to match, and
        # the underlying FilteredRE2 is understood to report an error when
        # compiled before any Add() -- confirm against the re2 version in
        # use. Skipping it keeps empty matcher lists harmless.
        if not matchers:
            return None

        compiled = re2.Filter()
        for matcher in matchers:
            pattern = matcher.regex.pattern
            # Prepend the i global flag if IGNORECASE is set. Assumes
            # no pattern uses global flags, but since they're not
            # supported in JS that seems safe.
            if matcher.regex.flags & re.IGNORECASE:
                pattern = "(?i)" + pattern
            compiled.Add(pattern)
        compiled.Compile()
        return compiled

    @staticmethod
    def _first_match(compiled, matchers, ua):
        """Apply the lowest-indexed matching matcher to ``ua``, if any."""
        if compiled is None:
            return None
        matches = compiled.Match(ua)
        if not matches:
            return None
        # Set/Filter does not return the match in index order
        # (position order?) so to fit UAP semantics we need to
        # extract the first matching regex (lowest index).
        return matchers[min(matches)](ua)

    def __call__(self, ua: str, domains: Domain, /) -> PartialParseResult:
        """Parse ``ua`` for the requested ``domains``.

        Domains which were not requested, or for which no pattern
        matches, are left as ``None`` in the returned result.
        """
        # Locals avoid the name ``os`` so the imported ``os`` module is
        # not shadowed inside this method.
        user_agent = os_result = device = None
        if Domain.USER_AGENT in domains:
            user_agent = self._first_match(self.ua, self.user_agent_parsers, ua)
        if Domain.OS in domains:
            os_result = self._first_match(self.os, self.os_parsers, ua)
        if Domain.DEVICE in domains:
            device = self._first_match(self.devices, self.device_parsers, ua)
        return PartialParseResult(
            domains=domains,
            string=ua,
            user_agent=user_agent,
            os=os_result,
            device=device,
        )

tests/test_core.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
"""Tests UAP-Python using the UAP-core test suite
22
"""
3+
import contextlib
34
import dataclasses
45
import logging
56
import pathlib
@@ -36,7 +37,10 @@
3637
PARSERS = [
3738
pytest.param(BasicParser(load_builtins()), id="basic"),
3839
]
40+
with contextlib.suppress(ImportError):
41+
from ua_parser import re2
3942

43+
PARSERS.append(pytest.param(re2.Parser(load_builtins()), id="re2"))
4044

4145
UA_FIELDS = {f.name for f in dataclasses.fields(UserAgent)}
4246

tox.ini

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,15 @@ wheel_build_env = .pkg
1919
deps =
2020
pytest
2121
pyyaml
22+
google-re2
2223
commands =
2324
pytest -Werror --doctest-glob="*.rst" {posargs}
2425

26+
[testenv:pypy3.{8,9,10},py312]
27+
deps =
28+
pytest
29+
pyyaml
30+
2531
[testenv:flake8]
2632
package = skip
2733
deps = flake8

0 commit comments

Comments
 (0)