From c2539f86f387f2c32da232ff4c9807ea7c048740 Mon Sep 17 00:00:00 2001 From: Chris Natter Date: Thu, 29 Mar 2018 20:08:56 -0400 Subject: [PATCH] Partial refactor to support regular expression backreferences for replacements. uap-core has started to use these for UAParser and OSParser instances, and only DeviceParser currently supports them. The spec (https://github.com/ua-parser/uap-core/blob/master/docs/specification.md) indicates that all matches from $1 to $9 should be honored for every replacement. It also allows for a v3 replacement for UserAgent (which is used for at least one useragent at this point in time), so I added that as well. Also refactored the class hierarchy so that it's sort of DRY, and moved MultiReplace to a module-level function because it doesn't use self. --- setup.py | 1 + ua_parser/user_agent_parser.py | 147 ++++++++++++++++----------------- 2 files changed, 72 insertions(+), 76 deletions(-) diff --git a/setup.py b/setup.py index b9d825e..ad0adfc 100644 --- a/setup.py +++ b/setup.py @@ -105,6 +105,7 @@ def force_bytes(text): fp.write(force_bytes(' %r,\n' % device_parser.get('family_replacement'))) fp.write(force_bytes(' %r,\n' % device_parser.get('v1_replacement'))) fp.write(force_bytes(' %r,\n' % device_parser.get('v2_replacement'))) + fp.write(force_bytes(' %r,\n' % device_parser.get('v3_replacement'))) fp.write(b' ),\n') fp.write(b']\n') fp.write(b'\n') diff --git a/ua_parser/user_agent_parser.py b/ua_parser/user_agent_parser.py index 68824cf..6339c07 100644 --- a/ua_parser/user_agent_parser.py +++ b/ua_parser/user_agent_parser.py @@ -22,8 +22,45 @@ __author__ = 'Lindsey Simon ' -class UserAgentParser(object): - def __init__(self, pattern, family_replacement=None, v1_replacement=None, v2_replacement=None): +def MultiReplace(string, match): + def _repl(m): + index = int(m.group(1)) - 1 + group = match.groups() + if index < len(group): + return group[index] + return '' + + _string = re.sub(r'\$(\d)', _repl, string) + _string = 
re.sub(r'^\s+|\s+$', '', _string) + if _string == '': + return None + return _string + + +class Parser(object): + def __init__(self, pattern, regex_flag=None): + self.pattern = pattern + if regex_flag == 'i': + self.user_agent_re = re.compile(self.pattern, re.IGNORECASE) + else: + self.user_agent_re = re.compile(self.pattern) + + def MatchSpans(self, user_agent_string): + match_spans = [] + match = self.user_agent_re.search(user_agent_string) + if match: + match_spans = [match.span(group_index) + for group_index in range(1, match.lastindex + 1)] + return match_spans + + def Parse(self, user_agent_string): + raise NotImplementedError + + +class UserAgentParser(Parser): + def __init__(self, pattern, family_replacement=None, + v1_replacement=None, v2_replacement=None, + v3_replacement=None): """Initialize UserAgentParser. Args: @@ -31,54 +68,46 @@ def __init__(self, pattern, family_replacement=None, v1_replacement=None, v2_rep family_replacement: a string to override the matched family (optional) v1_replacement: a string to override the matched v1 (optional) v2_replacement: a string to override the matched v2 (optional) + v3_replacement: a string to override the matched v3 (optional) """ - self.pattern = pattern - self.user_agent_re = re.compile(self.pattern) + super(UserAgentParser, self).__init__(pattern=pattern) self.family_replacement = family_replacement self.v1_replacement = v1_replacement self.v2_replacement = v2_replacement - - def MatchSpans(self, user_agent_string): - match_spans = [] - match = self.user_agent_re.search(user_agent_string) - if match: - match_spans = [match.span(group_index) - for group_index in range(1, match.lastindex + 1)] - return match_spans + self.v3_replacement = v3_replacement def Parse(self, user_agent_string): family, v1, v2, v3 = None, None, None, None match = self.user_agent_re.search(user_agent_string) if match: if self.family_replacement: - if re.search(r'\$1', self.family_replacement): - family = re.sub(r'\$1', match.group(1), 
self.family_replacement) - else: - family = self.family_replacement + family = MultiReplace(self.family_replacement, match) else: family = match.group(1) if self.v1_replacement: - v1 = self.v1_replacement + v1 = MultiReplace(self.v1_replacement, match) elif match.lastindex and match.lastindex >= 2: v1 = match.group(2) if self.v2_replacement: - v2 = self.v2_replacement + v2 = MultiReplace(self.v2_replacement, match) elif match.lastindex and match.lastindex >= 3: v2 = match.group(3) - if match.lastindex and match.lastindex >= 4: + if self.v3_replacement: + v3 = MultiReplace(self.v3_replacement, match) + elif match.lastindex and match.lastindex >= 4: v3 = match.group(4) return family, v1, v2, v3 -class OSParser(object): +class OSParser(Parser): def __init__(self, pattern, os_replacement=None, os_v1_replacement=None, os_v2_replacement=None, os_v3_replacement=None, os_v4_replacement=None): - """Initialize UserAgentParser. + """Initialize OSParser. Args: pattern: a regular expression string @@ -88,115 +117,76 @@ def __init__(self, pattern, os_replacement=None, os_v3_replacement: a string to override the matched v3 (optional) os_v4_replacement: a string to override the matched v4 (optional) """ - self.pattern = pattern - self.user_agent_re = re.compile(self.pattern) + super(OSParser, self).__init__(pattern=pattern) self.os_replacement = os_replacement self.os_v1_replacement = os_v1_replacement self.os_v2_replacement = os_v2_replacement self.os_v3_replacement = os_v3_replacement self.os_v4_replacement = os_v4_replacement - def MatchSpans(self, user_agent_string): - match_spans = [] - match = self.user_agent_re.search(user_agent_string) - if match: - match_spans = [match.span(group_index) - for group_index in range(1, match.lastindex + 1)] - return match_spans - def Parse(self, user_agent_string): os, os_v1, os_v2, os_v3, os_v4 = None, None, None, None, None match = self.user_agent_re.search(user_agent_string) if match: if self.os_replacement: - if re.search(r'\$1', 
self.os_replacement): - os = re.sub(r'\$1', match.group(1), self.os_replacement) - else: - os = self.os_replacement + os = MultiReplace(self.os_replacement, match) elif match.lastindex: os = match.group(1) if self.os_v1_replacement: - if re.search(r'\$1', self.os_v1_replacement): - os_v1 = re.sub(r'\$1', match.group(1), self.os_v1_replacement) - else: - os_v1 = self.os_v1_replacement + os_v1 = MultiReplace(self.os_v1_replacement, match) elif match.lastindex and match.lastindex >= 2: os_v1 = match.group(2) if self.os_v2_replacement: - os_v2 = self.os_v2_replacement + os_v2 = MultiReplace(self.os_v2_replacement, match) elif match.lastindex and match.lastindex >= 3: os_v2 = match.group(3) if self.os_v3_replacement: - os_v3 = self.os_v3_replacement + os_v3 = MultiReplace(self.os_v3_replacement, match) elif match.lastindex and match.lastindex >= 4: os_v3 = match.group(4) if self.os_v4_replacement: - os_v4 = self.os_v4_replacement + os_v4 = MultiReplace(self.os_v4_replacement, match) elif match.lastindex and match.lastindex >= 5: os_v4 = match.group(5) return os, os_v1, os_v2, os_v3, os_v4 -class DeviceParser(object): +class DeviceParser(Parser): def __init__(self, pattern, regex_flag=None, device_replacement=None, brand_replacement=None, model_replacement=None): - """Initialize UserAgentParser. + """Initialize DeviceParser. 
Args: pattern: a regular expression string device_replacement: a string to override the matched device (optional) """ - self.pattern = pattern - if regex_flag == 'i': - self.user_agent_re = re.compile(self.pattern, re.IGNORECASE) - else: - self.user_agent_re = re.compile(self.pattern) + super(DeviceParser, self).__init__(pattern=pattern, + regex_flag=regex_flag) self.device_replacement = device_replacement self.brand_replacement = brand_replacement self.model_replacement = model_replacement - def MatchSpans(self, user_agent_string): - match_spans = [] - match = self.user_agent_re.search(user_agent_string) - if match: - match_spans = [match.span(group_index) - for group_index in range(1, match.lastindex + 1)] - return match_spans - - def MultiReplace(self, string, match): - def _repl(m): - index = int(m.group(1)) - 1 - group = match.groups() - if index < len(group): - return group[index] - return '' - - _string = re.sub(r'\$(\d)', _repl, string) - _string = re.sub(r'^\s+|\s+$', '', _string) - if _string == '': - return None - return _string - def Parse(self, user_agent_string): device, brand, model = None, None, None match = self.user_agent_re.search(user_agent_string) if match: if self.device_replacement: - device = self.MultiReplace(self.device_replacement, match) + device = MultiReplace(self.device_replacement, match) else: device = match.group(1) if self.brand_replacement: - brand = self.MultiReplace(self.brand_replacement, match) + brand = MultiReplace(self.brand_replacement, match) + match_length = len(match.groups()) if self.model_replacement: - model = self.MultiReplace(self.model_replacement, match) - elif len(match.groups()) > 0: + model = MultiReplace(self.model_replacement, match) + elif match_length > 0: model = match.group(1) return device, brand, model @@ -480,10 +470,15 @@ def GetFilters(user_agent_string, js_user_agent_string=None, if 'v2_replacement' in _ua_parser: _v2_replacement = _ua_parser['v2_replacement'] + _v3_replacement = None + if 
'v3_replacement' in _ua_parser: + _v3_replacement = _ua_parser['v3_replacement'] + USER_AGENT_PARSERS.append(UserAgentParser(_regex, _family_replacement, _v1_replacement, - _v2_replacement)) + _v2_replacement, + _v3_replacement)) OS_PARSERS = [] for _os_parser in regexes['os_parsers']: