Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 66dc464

Browse files
Issue #24426: Fast searching optimization in regular expressions now works
for patterns that start with capturing groups. Fast searching optimization can no longer be disabled at compile time.
1 parent 6ee588f commit 66dc464

4 files changed

Lines changed: 99 additions & 78 deletions

File tree

Lib/sre_compile.py

Lines changed: 68 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -409,57 +409,39 @@ def _generate_overlap_table(prefix):
409409
table[i] = idx + 1
410410
return table
411411

412-
def _compile_info(code, pattern, flags):
413-
# internal: compile an info block. in the current version,
414-
# this contains min/max pattern width, and an optional literal
415-
# prefix or a character map
416-
lo, hi = pattern.getwidth()
417-
if hi > MAXCODE:
418-
hi = MAXCODE
419-
if lo == 0:
420-
code.extend([INFO, 4, 0, lo, hi])
421-
return
422-
# look for a literal prefix
412+
def _get_literal_prefix(pattern):
413+
# look for literal prefix
423414
prefix = []
424415
prefixappend = prefix.append
425-
prefix_skip = 0
416+
prefix_skip = None
417+
got_all = True
418+
for op, av in pattern.data:
419+
if op is LITERAL:
420+
prefixappend(av)
421+
elif op is SUBPATTERN:
422+
prefix1, prefix_skip1, got_all = _get_literal_prefix(av[1])
423+
if prefix_skip is None:
424+
if av[0] is not None:
425+
prefix_skip = len(prefix)
426+
elif prefix_skip1 is not None:
427+
prefix_skip = len(prefix) + prefix_skip1
428+
prefix.extend(prefix1)
429+
if not got_all:
430+
break
431+
else:
432+
got_all = False
433+
break
434+
return prefix, prefix_skip, got_all
435+
436+
def _get_charset_prefix(pattern):
426437
charset = [] # not used
427438
charsetappend = charset.append
428-
if not (flags & SRE_FLAG_IGNORECASE):
429-
# look for literal prefix
430-
for op, av in pattern.data:
439+
if pattern.data:
440+
op, av = pattern.data[0]
441+
if op is SUBPATTERN and av[1]:
442+
op, av = av[1][0]
431443
if op is LITERAL:
432-
if len(prefix) == prefix_skip:
433-
prefix_skip = prefix_skip + 1
434-
prefixappend(av)
435-
elif op is SUBPATTERN and len(av[1]) == 1:
436-
op, av = av[1][0]
437-
if op is LITERAL:
438-
prefixappend(av)
439-
else:
440-
break
441-
else:
442-
break
443-
# if no prefix, look for charset prefix
444-
if not prefix and pattern.data:
445-
op, av = pattern.data[0]
446-
if op is SUBPATTERN and av[1]:
447-
op, av = av[1][0]
448-
if op is LITERAL:
449-
charsetappend((op, av))
450-
elif op is BRANCH:
451-
c = []
452-
cappend = c.append
453-
for p in av[1]:
454-
if not p:
455-
break
456-
op, av = p[0]
457-
if op is LITERAL:
458-
cappend((op, av))
459-
else:
460-
break
461-
else:
462-
charset = c
444+
charsetappend((op, av))
463445
elif op is BRANCH:
464446
c = []
465447
cappend = c.append
@@ -473,8 +455,43 @@ def _compile_info(code, pattern, flags):
473455
break
474456
else:
475457
charset = c
476-
elif op is IN:
477-
charset = av
458+
elif op is BRANCH:
459+
c = []
460+
cappend = c.append
461+
for p in av[1]:
462+
if not p:
463+
break
464+
op, av = p[0]
465+
if op is LITERAL:
466+
cappend((op, av))
467+
else:
468+
break
469+
else:
470+
charset = c
471+
elif op is IN:
472+
charset = av
473+
return charset
474+
475+
def _compile_info(code, pattern, flags):
476+
# internal: compile an info block. in the current version,
477+
# this contains min/max pattern width, and an optional literal
478+
# prefix or a character map
479+
lo, hi = pattern.getwidth()
480+
if hi > MAXCODE:
481+
hi = MAXCODE
482+
if lo == 0:
483+
code.extend([INFO, 4, 0, lo, hi])
484+
return
485+
# look for a literal prefix
486+
prefix = []
487+
prefix_skip = 0
488+
charset = [] # not used
489+
if not (flags & SRE_FLAG_IGNORECASE):
490+
# look for literal prefix
491+
prefix, prefix_skip, got_all = _get_literal_prefix(pattern)
492+
# if no prefix, look for charset prefix
493+
if not prefix:
494+
charset = _get_charset_prefix(pattern)
478495
## if prefix:
479496
## print("*** PREFIX", prefix, prefix_skip)
480497
## if charset:
@@ -487,7 +504,7 @@ def _compile_info(code, pattern, flags):
487504
mask = 0
488505
if prefix:
489506
mask = SRE_INFO_PREFIX
490-
if len(prefix) == prefix_skip == len(pattern.data):
507+
if prefix_skip is None and got_all:
491508
mask = mask | SRE_INFO_LITERAL
492509
elif charset:
493510
mask = mask | SRE_INFO_CHARSET
@@ -502,6 +519,8 @@ def _compile_info(code, pattern, flags):
502519
# add literal prefix
503520
if prefix:
504521
emit(len(prefix)) # length
522+
if prefix_skip is None:
523+
prefix_skip = len(prefix)
505524
emit(prefix_skip) # skip
506525
code.extend(prefix)
507526
# generate overlap table

Misc/NEWS

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,10 @@ Core and Builtins
1313
Library
1414
-------
1515

16+
- Issue #24426: Fast searching optimization in regular expressions now works
17+
for patterns that start with capturing groups. Fast searching optimization
18+
can no longer be disabled at compile time.
19+
1620
Documentation
1721
-------------
1822

Modules/_sre.c

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -62,9 +62,6 @@ static char copyright[] =
6262
/* -------------------------------------------------------------------- */
6363
/* optional features */
6464

65-
/* enables fast searching */
66-
#define USE_FAST_SEARCH
67-
6865
/* enables copy/deepcopy handling (work in progress) */
6966
#undef USE_BUILTIN_COPY
7067

Modules/sre_lib.h

Lines changed: 27 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1248,7 +1248,32 @@ SRE(search)(SRE_STATE* state, SRE_CODE* pattern)
12481248
prefix, prefix_len, prefix_skip));
12491249
TRACE(("charset = %p\n", charset));
12501250

1251-
#if defined(USE_FAST_SEARCH)
1251+
if (prefix_len == 1) {
1252+
/* pattern starts with a literal character */
1253+
SRE_CHAR c = (SRE_CHAR) prefix[0];
1254+
#if SIZEOF_SRE_CHAR < 4
1255+
if ((SRE_CODE) c != prefix[0])
1256+
return 0; /* literal can't match: doesn't fit in char width */
1257+
#endif
1258+
end = (SRE_CHAR *)state->end;
1259+
while (ptr < end) {
1260+
while (*ptr != c) {
1261+
if (++ptr >= end)
1262+
return 0;
1263+
}
1264+
TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr));
1265+
state->start = ptr;
1266+
state->ptr = ptr + prefix_skip;
1267+
if (flags & SRE_INFO_LITERAL)
1268+
return 1; /* we got all of it */
1269+
status = SRE(match)(state, pattern + 2*prefix_skip, 0);
1270+
if (status != 0)
1271+
return status;
1272+
++ptr;
1273+
}
1274+
return 0;
1275+
}
1276+
12521277
if (prefix_len > 1) {
12531278
/* pattern starts with a known prefix. use the overlap
12541279
table to skip forward as fast as we possibly can */
@@ -1297,32 +1322,8 @@ SRE(search)(SRE_STATE* state, SRE_CODE* pattern)
12971322
}
12981323
return 0;
12991324
}
1300-
#endif
13011325

1302-
if (pattern[0] == SRE_OP_LITERAL) {
1303-
/* pattern starts with a literal character. this is used
1304-
for short prefixes, and if fast search is disabled */
1305-
SRE_CHAR c = (SRE_CHAR) pattern[1];
1306-
#if SIZEOF_SRE_CHAR < 4
1307-
if ((SRE_CODE) c != pattern[1])
1308-
return 0; /* literal can't match: doesn't fit in char width */
1309-
#endif
1310-
end = (SRE_CHAR *)state->end;
1311-
while (ptr < end) {
1312-
while (*ptr != c) {
1313-
if (++ptr >= end)
1314-
return 0;
1315-
}
1316-
TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr));
1317-
state->start = ptr;
1318-
state->ptr = ++ptr;
1319-
if (flags & SRE_INFO_LITERAL)
1320-
return 1; /* we got all of it */
1321-
status = SRE(match)(state, pattern + 2, 0);
1322-
if (status != 0)
1323-
break;
1324-
}
1325-
} else if (charset) {
1326+
if (charset) {
13261327
/* pattern starts with a character from a known set */
13271328
end = (SRE_CHAR *)state->end;
13281329
for (;;) {

0 commit comments

Comments
 (0)