|
21 | 21 | from lib.core.settings import DIFF_TOLERANCE |
22 | 22 | from lib.core.settings import HTML_TITLE_REGEX |
23 | 23 | from lib.core.settings import MIN_RATIO |
| 24 | +from lib.core.settings import MAX_DIFFLIB_SEQUENCE_LENGTH |
24 | 25 | from lib.core.settings import MAX_RATIO |
25 | 26 | from lib.core.settings import REFLECTED_VALUE_MARKER |
26 | 27 | from lib.core.settings import LOWER_RATIO_BOUND |
@@ -54,8 +55,6 @@ def _comparison(page, headers, code, getRatioValue, pageLength): |
54 | 55 | if page is None and pageLength is None: |
55 | 56 | return None |
56 | 57 |
|
57 | | - count = 0 |
58 | | - |
59 | 58 | seqMatcher = threadData.seqMatcher |
60 | 59 | seqMatcher.set_seq1(kb.pageTemplate) |
61 | 60 |
|
@@ -110,59 +109,37 @@ def _comparison(page, headers, code, getRatioValue, pageLength): |
110 | 109 | elif isinstance(seqMatcher.a, unicode) and isinstance(page, str): |
111 | 110 | seqMatcher.a = seqMatcher.a.encode(kb.pageEncoding or DEFAULT_PAGE_ENCODING, 'ignore') |
112 | 111 |
|
113 | | - seq1, seq2 = None, None |
114 | | - |
115 | | - if conf.titles: |
116 | | - seq1 = extractRegexResult(HTML_TITLE_REGEX, seqMatcher.a) |
117 | | - seq2 = extractRegexResult(HTML_TITLE_REGEX, page) |
| 112 | + if seqMatcher.a and page and seqMatcher.a == page: |
| 113 | + ratio = 1 |
| 114 | + elif kb.skipSeqMatcher or seqMatcher.a and page and any(len(_) > MAX_DIFFLIB_SEQUENCE_LENGTH for _ in (seqMatcher.a, page)): |
| 115 | + ratio = 1.0 * len(seqMatcher.a) / len(page) |
| 116 | + if ratio > 1: |
| 117 | + ratio = 1. / ratio |
118 | 118 | else: |
119 | | - seq1 = getFilteredPageContent(seqMatcher.a, True) if conf.textOnly else seqMatcher.a |
120 | | - seq2 = getFilteredPageContent(page, True) if conf.textOnly else page |
| 119 | + seq1, seq2 = None, None |
121 | 120 |
|
122 | | - if seq1 is None or seq2 is None: |
123 | | - return None |
| 121 | + if conf.titles: |
| 122 | + seq1 = extractRegexResult(HTML_TITLE_REGEX, seqMatcher.a) |
| 123 | + seq2 = extractRegexResult(HTML_TITLE_REGEX, page) |
| 124 | + else: |
| 125 | + seq1 = getFilteredPageContent(seqMatcher.a, True) if conf.textOnly else seqMatcher.a |
| 126 | + seq2 = getFilteredPageContent(page, True) if conf.textOnly else page |
124 | 127 |
|
125 | | - seq1 = seq1.replace(REFLECTED_VALUE_MARKER, "") |
126 | | - seq2 = seq2.replace(REFLECTED_VALUE_MARKER, "") |
| 128 | + if seq1 is None or seq2 is None: |
| 129 | + return None |
127 | 130 |
|
128 | | - while count < min(len(seq1), len(seq2)): |
129 | | - if seq1[count] == seq2[count]: |
130 | | - count += 1 |
131 | | - else: |
132 | | - break |
133 | | - |
134 | | - if count: |
135 | | - try: |
136 | | - _seq1 = seq1[count:] |
137 | | - _seq2 = seq2[count:] |
138 | | - except MemoryError: |
139 | | - pass |
140 | | - else: |
141 | | - seq1 = _seq1 |
142 | | - seq2 = _seq2 |
143 | | - |
144 | | - while True: |
145 | | - try: |
146 | | - seqMatcher.set_seq1(seq1) |
147 | | - except MemoryError: |
148 | | - seq1 = seq1[:len(seq1) / 1024] |
149 | | - else: |
150 | | - break |
| 131 | + seq1 = seq1.replace(REFLECTED_VALUE_MARKER, "") |
| 132 | + seq2 = seq2.replace(REFLECTED_VALUE_MARKER, "") |
151 | 133 |
|
152 | | - while True: |
153 | | - try: |
154 | | - seqMatcher.set_seq2(seq2) |
155 | | - except MemoryError: |
156 | | - seq2 = seq2[:len(seq2) / 1024] |
157 | | - else: |
158 | | - break |
| 134 | + seqMatcher.set_seq1(seq1) |
| 135 | + seqMatcher.set_seq2(seq2) |
159 | 136 |
|
160 | | - ratio = round(seqMatcher.quick_ratio(), 3) |
| 137 | + ratio = round(seqMatcher.quick_ratio(), 3) |
161 | 138 |
|
162 | 139 | # If the URL is stable, the match ratio has not been set yet, and the |
163 | 140 | # current injected value changes the URL's page content |
164 | 141 | if kb.matchRatio is None: |
165 | | - if (count or ratio >= LOWER_RATIO_BOUND) and ratio <= UPPER_RATIO_BOUND: |
| 142 | + if ratio >= LOWER_RATIO_BOUND and ratio <= UPPER_RATIO_BOUND: |
166 | 143 | kb.matchRatio = ratio |
167 | 144 | logger.debug("setting match ratio for current parameter to %.3f" % kb.matchRatio) |
168 | 145 |
|
|
0 commit comments