Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit be9a4e5

Browse files
Issue #433028: Added support of modifier spans in regular expressions.
1 parent ee73a65 commit be9a4e5

7 files changed

Lines changed: 180 additions & 66 deletions

File tree

Doc/library/re.rst

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -237,6 +237,16 @@ The special characters are:
237237
*cannot* be retrieved after performing a match or referenced later in the
238238
pattern.
239239

240+
``(?imsx-imsx:...)``
241+
(Zero or more letters from the set ``'i'``, ``'m'``, ``'s'``, ``'x'``,
242+
optionally followed by ``'-'`` followed by one or more letters from the
243+
same set.) The letters set or remove the corresponding flags:
244+
:const:`re.I` (ignore case), :const:`re.M` (multi-line), :const:`re.S`
245+
(dot matches all), and :const:`re.X` (verbose), for the part of the
246+
expression. (The flags are described in :ref:`contents-of-module-re`.)
247+
248+
.. versionadded:: 3.6
249+
240250
``(?P<name>...)``
241251
Similar to regular parentheses, but the substring matched by the group is
242252
accessible via the symbolic group name *name*. Group names must be valid

Doc/whatsnew/3.6.rst

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -645,6 +645,15 @@ Protocol version 4 already supports this case. (Contributed by Serhiy
645645
Storchaka in :issue:`24164`.)
646646

647647

648+
re
649+
--
650+
651+
Added support of modifier spans in regular expressions. Examples:
652+
``'(?i:p)ython'`` matches ``'python'`` and ``'Python'``, but not ``'PYTHON'``;
653+
``'(?i)g(?-i:v)r'`` matches ``'GvR'`` and ``'gvr'``, but not ``'GVR'``.
654+
(Contributed by Serhiy Storchaka in :issue:`433028`.)
655+
656+
648657
readline
649658
--------
650659

Lib/re.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -352,7 +352,7 @@ def __init__(self, lexicon, flags=0):
352352
for phrase, action in lexicon:
353353
gid = s.opengroup()
354354
p.append(sre_parse.SubPattern(s, [
355-
(SUBPATTERN, (gid, sre_parse.parse(phrase, flags))),
355+
(SUBPATTERN, (gid, 0, 0, sre_parse.parse(phrase, flags))),
356356
]))
357357
s.closegroup(gid, p[-1])
358358
p = sre_parse.SubPattern(s, [(BRANCH, (None, p))])

Lib/sre_compile.py

Lines changed: 38 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,8 @@ def _compile(code, pattern, flags):
7171
ASSERT_CODES = _ASSERT_CODES
7272
if (flags & SRE_FLAG_IGNORECASE and
7373
not (flags & SRE_FLAG_LOCALE) and
74-
flags & SRE_FLAG_UNICODE):
74+
flags & SRE_FLAG_UNICODE and
75+
not (flags & SRE_FLAG_ASCII)):
7576
fixes = _ignorecase_fixes
7677
else:
7778
fixes = None
@@ -137,14 +138,15 @@ def fixup(literal, flags=flags):
137138
else:
138139
emit(MIN_UNTIL)
139140
elif op is SUBPATTERN:
140-
if av[0]:
141+
group, add_flags, del_flags, p = av
142+
if group:
141143
emit(MARK)
142-
emit((av[0]-1)*2)
143-
# _compile_info(code, av[1], flags)
144-
_compile(code, av[1], flags)
145-
if av[0]:
144+
emit((group-1)*2)
145+
# _compile_info(code, p, (flags | add_flags) & ~del_flags)
146+
_compile(code, p, (flags | add_flags) & ~del_flags)
147+
if group:
146148
emit(MARK)
147-
emit((av[0]-1)*2+1)
149+
emit((group-1)*2+1)
148150
elif op in SUCCESS_CODES:
149151
emit(op)
150152
elif op in ASSERT_CODES:
@@ -172,7 +174,7 @@ def fixup(literal, flags=flags):
172174
av = AT_MULTILINE.get(av, av)
173175
if flags & SRE_FLAG_LOCALE:
174176
av = AT_LOCALE.get(av, av)
175-
elif flags & SRE_FLAG_UNICODE:
177+
elif (flags & SRE_FLAG_UNICODE) and not (flags & SRE_FLAG_ASCII):
176178
av = AT_UNICODE.get(av, av)
177179
emit(av)
178180
elif op is BRANCH:
@@ -193,7 +195,7 @@ def fixup(literal, flags=flags):
193195
emit(op)
194196
if flags & SRE_FLAG_LOCALE:
195197
av = CH_LOCALE[av]
196-
elif flags & SRE_FLAG_UNICODE:
198+
elif (flags & SRE_FLAG_UNICODE) and not (flags & SRE_FLAG_ASCII):
197199
av = CH_UNICODE[av]
198200
emit(av)
199201
elif op is GROUPREF:
@@ -237,7 +239,7 @@ def _compile_charset(charset, flags, code, fixup=None, fixes=None):
237239
elif op is CATEGORY:
238240
if flags & SRE_FLAG_LOCALE:
239241
emit(CH_LOCALE[av])
240-
elif flags & SRE_FLAG_UNICODE:
242+
elif (flags & SRE_FLAG_UNICODE) and not (flags & SRE_FLAG_ASCII):
241243
emit(CH_UNICODE[av])
242244
else:
243245
emit(av)
@@ -414,47 +416,52 @@ def _get_literal_prefix(pattern):
414416
prefix = []
415417
prefixappend = prefix.append
416418
prefix_skip = None
417-
got_all = True
418419
for op, av in pattern.data:
419420
if op is LITERAL:
420421
prefixappend(av)
421422
elif op is SUBPATTERN:
422-
prefix1, prefix_skip1, got_all = _get_literal_prefix(av[1])
423+
group, add_flags, del_flags, p = av
424+
if add_flags & SRE_FLAG_IGNORECASE:
425+
break
426+
prefix1, prefix_skip1, got_all = _get_literal_prefix(p)
423427
if prefix_skip is None:
424-
if av[0] is not None:
428+
if group is not None:
425429
prefix_skip = len(prefix)
426430
elif prefix_skip1 is not None:
427431
prefix_skip = len(prefix) + prefix_skip1
428432
prefix.extend(prefix1)
429433
if not got_all:
430434
break
431435
else:
432-
got_all = False
433436
break
434-
return prefix, prefix_skip, got_all
437+
else:
438+
return prefix, prefix_skip, True
439+
return prefix, prefix_skip, False
435440

436441
def _get_charset_prefix(pattern):
437442
charset = [] # not used
438443
charsetappend = charset.append
439444
if pattern.data:
440445
op, av = pattern.data[0]
441-
if op is SUBPATTERN and av[1]:
442-
op, av = av[1][0]
443-
if op is LITERAL:
444-
charsetappend((op, av))
445-
elif op is BRANCH:
446-
c = []
447-
cappend = c.append
448-
for p in av[1]:
449-
if not p:
450-
break
451-
op, av = p[0]
452-
if op is LITERAL:
453-
cappend((op, av))
446+
if op is SUBPATTERN:
447+
group, add_flags, del_flags, p = av
448+
if p and not (add_flags & SRE_FLAG_IGNORECASE):
449+
op, av = p[0]
450+
if op is LITERAL:
451+
charsetappend((op, av))
452+
elif op is BRANCH:
453+
c = []
454+
cappend = c.append
455+
for p in av[1]:
456+
if not p:
457+
break
458+
op, av = p[0]
459+
if op is LITERAL:
460+
cappend((op, av))
461+
else:
462+
break
454463
else:
455-
break
456-
else:
457-
charset = c
464+
charset = c
458465
elif op is BRANCH:
459466
c = []
460467
cappend = c.append

Lib/sre_parse.py

Lines changed: 84 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,12 @@
6565
"u": SRE_FLAG_UNICODE,
6666
}
6767

68+
GLOBAL_FLAGS = (SRE_FLAG_ASCII | SRE_FLAG_LOCALE | SRE_FLAG_UNICODE |
69+
SRE_FLAG_DEBUG | SRE_FLAG_TEMPLATE)
70+
71+
class Verbose(Exception):
72+
pass
73+
6874
class Pattern:
6975
# master pattern object. keeps track of global attributes
7076
def __init__(self):
@@ -184,7 +190,7 @@ def getwidth(self):
184190
lo = lo + i
185191
hi = hi + j
186192
elif op is SUBPATTERN:
187-
i, j = av[1].getwidth()
193+
i, j = av[-1].getwidth()
188194
lo = lo + i
189195
hi = hi + j
190196
elif op in _REPEATCODES:
@@ -395,15 +401,15 @@ def _escape(source, escape, state):
395401
pass
396402
raise source.error("bad escape %s" % escape, len(escape))
397403

398-
def _parse_sub(source, state, nested=True):
404+
def _parse_sub(source, state, verbose, nested=True):
399405
# parse an alternation: a|b|c
400406

401407
items = []
402408
itemsappend = items.append
403409
sourcematch = source.match
404410
start = source.tell()
405411
while True:
406-
itemsappend(_parse(source, state))
412+
itemsappend(_parse(source, state, verbose))
407413
if not sourcematch("|"):
408414
break
409415

@@ -445,10 +451,10 @@ def _parse_sub(source, state, nested=True):
445451
subpattern.append((BRANCH, (None, items)))
446452
return subpattern
447453

448-
def _parse_sub_cond(source, state, condgroup):
449-
item_yes = _parse(source, state)
454+
def _parse_sub_cond(source, state, condgroup, verbose):
455+
item_yes = _parse(source, state, verbose)
450456
if source.match("|"):
451-
item_no = _parse(source, state)
457+
item_no = _parse(source, state, verbose)
452458
if source.next == "|":
453459
raise source.error("conditional backref with more than two branches")
454460
else:
@@ -457,7 +463,7 @@ def _parse_sub_cond(source, state, condgroup):
457463
subpattern.append((GROUPREF_EXISTS, (condgroup, item_yes, item_no)))
458464
return subpattern
459465

460-
def _parse(source, state):
466+
def _parse(source, state, verbose):
461467
# parse a simple pattern
462468
subpattern = SubPattern(state)
463469

@@ -467,7 +473,6 @@ def _parse(source, state):
467473
sourcematch = source.match
468474
_len = len
469475
_ord = ord
470-
verbose = state.flags & SRE_FLAG_VERBOSE
471476

472477
while True:
473478

@@ -621,6 +626,8 @@ def _parse(source, state):
621626
group = True
622627
name = None
623628
condgroup = None
629+
add_flags = 0
630+
del_flags = 0
624631
if sourcematch("?"):
625632
# options
626633
char = sourceget()
@@ -682,7 +689,7 @@ def _parse(source, state):
682689
lookbehindgroups = state.lookbehindgroups
683690
if lookbehindgroups is None:
684691
state.lookbehindgroups = state.groups
685-
p = _parse_sub(source, state)
692+
p = _parse_sub(source, state, verbose)
686693
if dir < 0:
687694
if lookbehindgroups is None:
688695
state.lookbehindgroups = None
@@ -718,19 +725,13 @@ def _parse(source, state):
718725
raise source.error("invalid group reference",
719726
len(condname) + 1)
720727
state.checklookbehindgroup(condgroup, source)
721-
elif char in FLAGS:
728+
elif char in FLAGS or char == "-":
722729
# flags
723-
while True:
724-
state.flags |= FLAGS[char]
725-
char = sourceget()
726-
if char is None:
727-
raise source.error("missing )")
728-
if char == ")":
729-
break
730-
if char not in FLAGS:
731-
raise source.error("unknown flag", len(char))
732-
verbose = state.flags & SRE_FLAG_VERBOSE
733-
continue
730+
flags = _parse_flags(source, state, char)
731+
if flags is None: # global flags
732+
continue
733+
add_flags, del_flags = flags
734+
group = None
734735
else:
735736
raise source.error("unknown extension ?" + char,
736737
len(char) + 1)
@@ -742,15 +743,17 @@ def _parse(source, state):
742743
except error as err:
743744
raise source.error(err.msg, len(name) + 1) from None
744745
if condgroup:
745-
p = _parse_sub_cond(source, state, condgroup)
746+
p = _parse_sub_cond(source, state, condgroup, verbose)
746747
else:
747-
p = _parse_sub(source, state)
748+
sub_verbose = ((verbose or (add_flags & SRE_FLAG_VERBOSE)) and
749+
not (del_flags & SRE_FLAG_VERBOSE))
750+
p = _parse_sub(source, state, sub_verbose)
748751
if not source.match(")"):
749752
raise source.error("missing ), unterminated subpattern",
750753
source.tell() - start)
751754
if group is not None:
752755
state.closegroup(group, p)
753-
subpatternappend((SUBPATTERN, (group, p)))
756+
subpatternappend((SUBPATTERN, (group, add_flags, del_flags, p)))
754757

755758
elif this == "^":
756759
subpatternappend((AT, AT_BEGINNING))
@@ -763,6 +766,53 @@ def _parse(source, state):
763766

764767
return subpattern
765768

769+
def _parse_flags(source, state, char):
770+
sourceget = source.get
771+
add_flags = 0
772+
del_flags = 0
773+
if char != "-":
774+
while True:
775+
add_flags |= FLAGS[char]
776+
char = sourceget()
777+
if char is None:
778+
raise source.error("missing -, : or )")
779+
if char in ")-:":
780+
break
781+
if char not in FLAGS:
782+
msg = "unknown flag" if char.isalpha() else "missing -, : or )"
783+
raise source.error(msg, len(char))
784+
if char == ")":
785+
if ((add_flags & SRE_FLAG_VERBOSE) and
786+
not (state.flags & SRE_FLAG_VERBOSE)):
787+
raise Verbose
788+
state.flags |= add_flags
789+
return None
790+
if add_flags & GLOBAL_FLAGS:
791+
raise source.error("bad inline flags: cannot turn on global flag", 1)
792+
if char == "-":
793+
char = sourceget()
794+
if char is None:
795+
raise source.error("missing flag")
796+
if char not in FLAGS:
797+
msg = "unknown flag" if char.isalpha() else "missing flag"
798+
raise source.error(msg, len(char))
799+
while True:
800+
del_flags |= FLAGS[char]
801+
char = sourceget()
802+
if char is None:
803+
raise source.error("missing :")
804+
if char == ":":
805+
break
806+
if char not in FLAGS:
807+
msg = "unknown flag" if char.isalpha() else "missing :"
808+
raise source.error(msg, len(char))
809+
assert char == ":"
810+
if del_flags & GLOBAL_FLAGS:
811+
raise source.error("bad inline flags: cannot turn off global flag", 1)
812+
if add_flags & del_flags:
813+
raise source.error("bad inline flags: flag turned on and off", 1)
814+
return add_flags, del_flags
815+
766816
def fix_flags(src, flags):
767817
# Check and fix flags according to the type of pattern (str or bytes)
768818
if isinstance(src, str):
@@ -789,18 +839,22 @@ def parse(str, flags=0, pattern=None):
789839
pattern.flags = flags
790840
pattern.str = str
791841

792-
p = _parse_sub(source, pattern, 0)
842+
try:
843+
p = _parse_sub(source, pattern, flags & SRE_FLAG_VERBOSE, False)
844+
except Verbose:
845+
# the VERBOSE flag was switched on inside the pattern. to be
846+
# on the safe side, we'll parse the whole thing again...
847+
pattern = Pattern()
848+
pattern.flags = flags | SRE_FLAG_VERBOSE
849+
pattern.str = str
850+
p = _parse_sub(source, pattern, True, False)
851+
793852
p.pattern.flags = fix_flags(str, p.pattern.flags)
794853

795854
if source.next is not None:
796855
assert source.next == ")"
797856
raise source.error("unbalanced parenthesis")
798857

799-
if not (flags & SRE_FLAG_VERBOSE) and p.pattern.flags & SRE_FLAG_VERBOSE:
800-
# the VERBOSE flag was switched on inside the pattern. to be
801-
# on the safe side, we'll parse the whole thing again...
802-
return parse(str, p.pattern.flags)
803-
804858
if flags & SRE_FLAG_DEBUG:
805859
p.dump()
806860

0 commit comments

Comments
 (0)