Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit bdc33fd

Browse files
committed
Add support for lazy matchers
Support is added for lazy builtin matchers (with a separately compiled file), as well as for loading JSON or YAML files using lazy matchers.

Lazy matchers are very much a tradeoff: they improve import speed but slow down run speed, possibly dramatically. Use them by default for the re2 parser, but not for the basic parser. Experimentally, on Python 3.11:

- importing the package itself takes ~36ms
- importing the lazy matchers takes ~36ms (including the package, so ~0)
- importing the eager matchers takes ~97ms

The eager matchers have a significant import overhead. *However*, when running the bench on the sample file, the lazy matchers cause a runtime increase of 700~800ms on the basic parser bench, as that ends up instantiating *every* regex (likely due to match failures). Relatively this is not huge (~2.5%), but the tradeoff doesn't seem great, especially since the parser itself is initialized lazily.

The re2 parser does much better, only losing 20~30ms (~1%). This is likely because it only needs to compile a fraction of the regexes (156 out of 1162 as of regexes.yaml version 0.18), and possibly because it gets to avoid some of the most expensive ones to compile.

Fixes #171, fixes #173
1 parent 04d0b7d commit bdc33fd

File tree

9 files changed

+450
-138
lines changed

9 files changed

+450
-138
lines changed

setup.py

Lines changed: 139 additions & 100 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
#!/usr/bin/env python
22
# flake8: noqa
33
import io
4-
from contextlib import suppress
4+
from contextlib import suppress, contextmanager
55
from os import fspath
66
from pathlib import Path
77
from typing import Optional, List, Dict
@@ -52,21 +52,6 @@ def run(self) -> None:
5252
f"Unable to find regexes.yaml, should be at {yaml_src!r}"
5353
)
5454

55-
def write_matcher(f, typ: str, fields: List[Optional[object]]):
56-
f.write(f" {typ}(".encode())
57-
while len(fields) > 1 and fields[-1] is None:
58-
fields = fields[:-1]
59-
f.write(", ".join(map(repr, fields)).encode())
60-
f.write(b"),\n")
61-
62-
def write_params(fields):
63-
# strip trailing None values
64-
while len(fields) > 1 and fields[-1] is None:
65-
fields.pop()
66-
67-
for field in fields:
68-
fp.write((f" {field!r},\n").encode())
69-
7055
with yaml_src.open("rb") as f:
7156
regexes = yaml.safe_load(f)
7257

@@ -79,96 +64,150 @@ def write_params(fields):
7964
outdir.mkdir(parents=True, exist_ok=True)
8065

8166
dest = outdir / "_matchers.py"
67+
dest_lazy = outdir / "_lazy.py"
8268
dest_legacy = outdir / "_regexes.py"
8369

84-
with dest.open("wb") as f, dest_legacy.open("wb") as fp:
85-
# fmt: off
86-
f.write(b"""\
70+
with dest.open("wb") as eager, dest_lazy.open("wb") as lazy, dest_legacy.open(
71+
"wb"
72+
) as legacy:
73+
eager = EagerWriter(eager)
74+
lazy = LazyWriter(lazy)
75+
legacy = LegacyWriter(legacy)
76+
77+
for section in ["user_agent_parsers", "os_parsers", "device_parsers"]:
78+
with eager.section(section), lazy.section(section), legacy.section(
79+
section
80+
):
81+
extract = EXTRACTORS[section]
82+
for p in regexes[section]:
83+
el = trim(extract(p))
84+
eager.item(el)
85+
lazy.item(el)
86+
legacy.item(el)
87+
eager.end()
88+
lazy.end()
89+
legacy.end()
90+
91+
92+
def trim(l):
    """Strip trailing ``None`` entries from ``l`` in place and return it.

    At least one element is always kept, so a list made only of ``None``
    values collapses to ``[None]``; an empty list is returned unchanged.
    The very same list object that was passed in is returned.
    """
    while len(l) > 1 and l[-1] is None:
        l.pop()
    return l
96+
97+
98+
# Per-section extractors: each maps one parsed regexes.yaml entry (a dict) to
# the list of values written out for that entry, in the order given here.
# "regex" is looked up with [] and so must be present (KeyError otherwise);
# every other key is read with .get() and yields None when absent.
EXTRACTORS = {
    "user_agent_parsers": lambda p: [
        p["regex"],
        p.get("family_replacement"),
        p.get("v1_replacement"),
        p.get("v2_replacement"),
    ],
    "os_parsers": lambda p: [
        p["regex"],
        p.get("os_replacement"),
        p.get("os_v1_replacement"),
        p.get("os_v2_replacement"),
        p.get("os_v3_replacement"),
        p.get("os_v4_replacement"),
    ],
    "device_parsers": lambda p: [
        p["regex"],
        p.get("regex_flag"),
        p.get("device_replacement"),
        p.get("brand_replacement"),
        p.get("model_replacement"),
    ],
}
121+
122+
123+
class Writer:
    """Base class for the writers that emit the generated modules.

    Subclasses provide the class attributes this class reads:

    - ``prefix``: bytes written right after the autogeneration notice;
    - ``sections``: mapping of section id to the bytes opening that section;
    - ``items``: mapping of section id to the bytes opening one entry;
    - ``suffix``: bytes written by :meth:`end`;
    - ``section_end`` (optional): bytes written when a section closes,
      empty by default.
    """

    section_end = b""

    def __init__(self, fp):
        """Wrap the binary file object ``fp`` and write notice and prefix."""
        self.fp = fp
        self.fp.write(
            b"""\
########################################################
# NOTICE: this file is autogenerated from regexes.yaml #
########################################################
"""
        )
        self.fp.write(self.prefix)
        self._section = None

    @contextmanager
    def section(self, id):
        """Context manager delimiting the entries of section ``id``.

        Writes ``sections[id]`` on entry and ``section_end`` on normal exit.
        Raises ``KeyError`` if ``id`` is not a known section.
        """
        # Look the opener up first so an unknown id leaves the state untouched.
        opening = self.sections[id]
        self._section = id
        self.fp.write(opening)
        try:
            yield
        finally:
            # Always forget the section, so a stray item() after the block
            # fails on the lookup instead of writing under a stale section.
            self._section = None
        # Only reached when the body did not raise.
        self.fp.write(self.section_end)

    def item(self, elements):
        """Write one entry of the current section from ``elements``.

        Each element is rendered with ``repr`` and the results are joined
        with ``", "``. Raises ``KeyError`` when called outside a section.
        """
        # e.g. ``DeviceMatcher(re, flag, repl1),``
        self.fp.write(self.items[self._section])
        self.fp.write(", ".join(map(repr, elements)).encode())
        self.fp.write(b"),\n")

    def end(self):
        """Write the closing ``suffix``."""
        self.fp.write(self.suffix)
153+
154+
155+
class LegacyWriter(Writer):
    """Writer for the legacy module (``_regexes.py``).

    Emits three separate module-level lists, ``USER_AGENT_PARSERS``,
    ``OS_PARSERS`` and ``DEVICE_PARSERS``, whose entries are calls to the
    parser classes imported from ``.user_agent_parser``.
    """

    prefix = b"""\
__all__ = [
"USER_AGENT_PARSERS",
"DEVICE_PARSERS",
"OS_PARSERS",
]

from .user_agent_parser import UserAgentParser, DeviceParser, OSParser

"""
    # Each section is its own assignment; the later ones start with blank
    # lines to separate them from the "]" closing the previous list.
    sections = {
        "user_agent_parsers": b"USER_AGENT_PARSERS = [\n",
        "os_parsers": b"\n\nOS_PARSERS = [\n",
        "device_parsers": b"\n\nDEVICE_PARSERS = [\n",
    }
    section_end = b"]"
    items = {
        "user_agent_parsers": b" UserAgentParser(",
        "os_parsers": b" OSParser(",
        "device_parsers": b" DeviceParser(",
    }
    # Final newline after the "]" of the last list.
    suffix = b"\n"
178+
179+
180+
class EagerWriter(Writer):
    """Writer for the eager matchers module (``_matchers.py``).

    Emits a single ``MATCHERS`` tuple of three lists (user agent, OS and
    device matchers) built from the matcher classes imported from ``.core``.
    """

    prefix = b"""\
__all__ = ["MATCHERS"]

from typing import Tuple, List
from .core import UserAgentMatcher, OSMatcher, DeviceMatcher

MATCHERS: Tuple[List[UserAgentMatcher], List[OSMatcher], List[DeviceMatcher]] = ([
"""
    # The prefix already opens the first list, so the first section writes
    # nothing; each later section closes the previous list and opens the next.
    sections = {
        "user_agent_parsers": b"",
        "os_parsers": b"], [\n",
        "device_parsers": b"], [\n",
    }
    items = {
        "user_agent_parsers": b" UserAgentMatcher(",
        "os_parsers": b" OSMatcher(",
        "device_parsers": b" DeviceMatcher(",
    }
    # Closes the last list and the enclosing tuple.
    suffix = b"])\n"
200+
201+
202+
class LazyWriter(EagerWriter):
203+
prefix = b"""\
204+
__all__ = ["MATCHERS"]
205+
206+
from typing import Tuple, List
207+
from .lazy import UserAgentMatcher, OSMatcher, DeviceMatcher
90208
91-
from .core import Matchers, UserAgentMatcher, OSMatcher, DeviceMatcher
92-
93-
MATCHERS: Matchers = ([
94-
""")
95-
fp.write(b"# -*- coding: utf-8 -*-\n")
96-
fp.write(b"########################################################\n")
97-
fp.write(b"# NOTICE: This file is autogenerated from regexes.yaml #\n")
98-
fp.write(b"########################################################\n")
99-
fp.write(b"\n")
100-
fp.write(b"from .user_agent_parser import (\n")
101-
fp.write(b" UserAgentParser, DeviceParser, OSParser,\n")
102-
fp.write(b")\n")
103-
fp.write(b"\n")
104-
fp.write(b"__all__ = ('USER_AGENT_PARSERS', 'DEVICE_PARSERS', 'OS_PARSERS')\n")
105-
fp.write(b"\n")
106-
fp.write(b"USER_AGENT_PARSERS = [\n")
107-
for device_parser in regexes["user_agent_parsers"]:
108-
write_matcher(f, "UserAgentMatcher", [
109-
device_parser["regex"],
110-
device_parser.get("family_replacement"),
111-
device_parser.get("v1_replacement"),
112-
device_parser.get("v2_replacement"),
113-
])
114-
115-
fp.write(b" UserAgentParser(\n")
116-
write_params([
117-
device_parser["regex"],
118-
device_parser.get("family_replacement"),
119-
device_parser.get("v1_replacement"),
120-
device_parser.get("v2_replacement"),
121-
])
122-
fp.write(b" ),\n")
123-
f.write(b" ], [\n")
124-
fp.write(b"]\n\n")
125-
126-
fp.write(b"OS_PARSERS = [\n")
127-
for device_parser in regexes["os_parsers"]:
128-
write_matcher(f, "OSMatcher", [
129-
device_parser["regex"],
130-
device_parser.get("os_replacement"),
131-
device_parser.get("os_v1_replacement"),
132-
device_parser.get("os_v2_replacement"),
133-
device_parser.get("os_v3_replacement"),
134-
device_parser.get("os_v4_replacement"),
135-
])
136-
137-
fp.write(b" OSParser(\n")
138-
write_params([
139-
device_parser["regex"],
140-
device_parser.get("os_replacement"),
141-
device_parser.get("os_v1_replacement"),
142-
device_parser.get("os_v2_replacement"),
143-
device_parser.get("os_v3_replacement"),
144-
device_parser.get("os_v4_replacement"),
145-
])
146-
fp.write(b" ),\n")
147-
f.write(b" ], [\n")
148-
fp.write(b"]\n\n")
149-
150-
fp.write(b"DEVICE_PARSERS = [\n")
151-
for device_parser in regexes["device_parsers"]:
152-
write_matcher(f, "DeviceMatcher", [
153-
device_parser["regex"],
154-
device_parser.get("regex_flag"),
155-
device_parser.get("device_replacement"),
156-
device_parser.get("brand_replacement"),
157-
device_parser.get("model_replacement"),
158-
])
159-
160-
fp.write(b" DeviceParser(\n")
161-
write_params([
162-
device_parser["regex"],
163-
device_parser.get("regex_flag"),
164-
device_parser.get("device_replacement"),
165-
device_parser.get("brand_replacement"),
166-
device_parser.get("model_replacement"),
167-
])
168-
fp.write(b" ),\n")
169-
f.write(b"])\n")
170-
fp.write(b"]\n")
171-
# fmt: on
209+
MATCHERS: Tuple[List[UserAgentMatcher], List[OSMatcher], List[DeviceMatcher]] = ([
210+
"""
172211

173212

174213
setup(

src/ua_parser/__init__.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
"UserAgent",
3737
"UserAgentMatcher",
3838
"load_builtins",
39+
"load_lazy_builtins",
3940
"load_data",
4041
"load_yaml",
4142
"parse",
@@ -65,7 +66,7 @@
6566
)
6667
from .basic import Parser as BasicParser
6768
from .caching import CachingParser, Clearing, LRU, Locking
68-
from .loaders import load_builtins, load_data, load_yaml
69+
from .loaders import load_builtins, load_lazy_builtins, load_data, load_yaml
6970

7071
Re2Parser: Optional[Callable[[Matchers], Parser]] = None
7172
with contextlib.suppress(ImportError):
@@ -79,7 +80,7 @@ def __getattr__(name: str) -> Parser:
7980
global parser
8081
if name == "parser":
8182
if Re2Parser is not None:
82-
parser = Re2Parser(load_builtins())
83+
parser = Re2Parser(load_lazy_builtins())
8384
else:
8485
parser = CachingParser(
8586
BasicParser(load_builtins()),

src/ua_parser/_lazy.pyi

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
__all__ = ["MATCHERS"]
2+
3+
from typing import Tuple, List
4+
from .lazy import UserAgentMatcher, OSMatcher, DeviceMatcher
5+
6+
MATCHERS: Tuple[
7+
List[UserAgentMatcher],
8+
List[OSMatcher],
9+
List[DeviceMatcher],
10+
]

src/ua_parser/_matchers.pyi

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,10 @@
1-
from .core import Matchers
1+
__all__ = ["MATCHERS"]
22

3-
MATCHERS: Matchers
3+
from typing import Tuple, List
4+
from .core import UserAgentMatcher, OSMatcher, DeviceMatcher
5+
6+
MATCHERS: Tuple[
7+
List[UserAgentMatcher],
8+
List[OSMatcher],
9+
List[DeviceMatcher],
10+
]

src/ua_parser/basic.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
Device,
88
DeviceMatcher,
99
Domain,
10+
Matcher,
1011
Matchers,
1112
OS,
1213
OSMatcher,
@@ -23,9 +24,9 @@ class Parser(AbstractParser):
2324
when one matches.
2425
"""
2526

26-
user_agent_matchers: List[UserAgentMatcher]
27-
os_matchers: List[OSMatcher]
28-
device_matchers: List[DeviceMatcher]
27+
user_agent_matchers: List[Matcher[UserAgent]]
28+
os_matchers: List[Matcher[OS]]
29+
device_matchers: List[Matcher[Device]]
2930

3031
def __init__(
3132
self,

0 commit comments

Comments
 (0)