Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 7cafe4d

Browse files
author
Fredrik Lundh
committed
- actually enabled charset anchors in the engine (still not used by the code generator)
- changed max repeat value in engine (to match earlier array fix)
- added experimental "which part matched?" mechanism to sre; see http://hem.passagen.se/eff/2000_07_01_bot-archive.htm#416954 or python-dev for details.
1 parent b19948b commit 7cafe4d

7 files changed

Lines changed: 95 additions & 20 deletions

File tree

Lib/sre.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -155,3 +155,34 @@ def _pickle(p):
155155
return _compile, (p.pattern, p.flags)
156156

157157
copy_reg.pickle(type(_compile("")), _pickle, _compile)
158+
159+
# --------------------------------------------------------------------
160+
# experimental stuff (see python-dev discussions for details)
161+
162+
class Scanner:
163+
def __init__(self, lexicon):
164+
self.lexicon = lexicon
165+
p = []
166+
for phrase, action in lexicon:
167+
p.append("(?:%s)(?P#%d)" % (phrase, len(p)))
168+
self.scanner = sre.compile("|".join(p))
169+
def scan(self, string):
170+
result = []
171+
append = result.append
172+
match = self.scanner.match
173+
i = 0
174+
while 1:
175+
m = match(string, i)
176+
if not m:
177+
break
178+
j = m.end()
179+
if i == j:
180+
break
181+
action = self.lexicon[m.index][1]
182+
if callable(action):
183+
self.match = match
184+
action = action(self, m.group())
185+
if action is not None:
186+
append(action)
187+
i = j
188+
return result, string[i:]

Lib/sre_compile.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -208,7 +208,7 @@ def fixup(literal, flags=flags):
208208
else:
209209
emit(OPCODES[op])
210210
emit(av-1)
211-
elif op is MARK:
211+
elif op in (MARK, INDEX):
212212
emit(OPCODES[op])
213213
emit(av)
214214
else:

Lib/sre_constants.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ class error(Exception):
3333
GROUP_IGNORE = "group_ignore"
3434
IN = "in"
3535
IN_IGNORE = "in_ignore"
36+
INDEX = "index"
3637
INFO = "info"
3738
JUMP = "jump"
3839
LITERAL = "literal"
@@ -90,6 +91,7 @@ class error(Exception):
9091
CATEGORY,
9192
CHARSET,
9293
GROUP, GROUP_IGNORE,
94+
INDEX,
9395
IN, IN_IGNORE,
9496
INFO,
9597
JUMP,

Lib/sre_parse.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -451,6 +451,23 @@ def _parse(source, state):
451451
if gid is None:
452452
raise error, "unknown group name"
453453
subpattern.append((GROUP, gid))
454+
elif source.match("#"):
455+
index = ""
456+
while 1:
457+
char = source.get()
458+
if char is None:
459+
raise error, "unterminated index"
460+
if char == ")":
461+
break
462+
index = index + char
463+
try:
464+
index = int(index)
465+
if index < 0 or index > MAXREPEAT:
466+
raise ValueError
467+
except ValueError:
468+
raise error, "illegal index"
469+
subpattern.append((INDEX, index))
470+
continue
454471
else:
455472
char = source.get()
456473
if char is None:

Modules/_sre.c

Lines changed: 26 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
* 00-06-29 fl fixed split, added more scanner features (0.9.2)
2222
* 00-06-30 fl added fast search optimization (0.9.3)
2323
* 00-06-30 fl added assert (lookahead) primitives, etc (0.9.4)
24+
* 00-07-02 fl added charset optimizations, etc (0.9.5)
2425
*
2526
* Copyright (c) 1997-2000 by Secret Labs AB. All rights reserved.
2627
*
@@ -31,7 +32,7 @@
3132

3233
#ifndef SRE_RECURSIVE
3334

34-
char copyright[] = " SRE 0.9.4 Copyright (c) 1997-2000 by Secret Labs AB ";
35+
char copyright[] = " SRE 0.9.5 Copyright (c) 1997-2000 by Secret Labs AB ";
3536

3637
#include "Python.h"
3738

@@ -587,6 +588,14 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
587588
pattern++;
588589
break;
589590

591+
case SRE_OP_INDEX:
592+
/* set index */
593+
/* args: <index> */
594+
TRACE(("%8d: set index %d\n", PTR(ptr), pattern[0]));
595+
state->index = pattern[0];
596+
pattern++;
597+
break;
598+
590599
case SRE_OP_JUMP:
591600
case SRE_OP_INFO:
592601
/* jump forward */
@@ -810,7 +819,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
810819
/* match maximum number of items, pushing alternate end
811820
points to the stack */
812821

813-
while (pattern[2] == 32767 || count < (int) pattern[2]) {
822+
while (pattern[2] == 65535 || count < (int) pattern[2]) {
814823
state->stackbase = stack;
815824
i = SRE_MATCH(state, pattern + 3);
816825
state->stackbase = stackbase; /* rewind */
@@ -980,10 +989,12 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
980989
}
981990

982991
if (flags & SRE_INFO_PREFIX) {
992+
/* pattern starts with a known prefix */
983993
prefix_len = pattern[5];
984994
prefix = pattern + 6;
985995
overlap = prefix + prefix_len - 1;
986996
} else if (flags & SRE_INFO_CHARSET)
997+
/* pattern starts with a character from a known set */
987998
charset = pattern + 5;
988999

9891000
pattern += 1 + pattern[1];
@@ -1042,7 +1053,6 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
10421053
if (status != 0)
10431054
break;
10441055
}
1045-
#if 0
10461056
} else if (charset) {
10471057
/* pattern starts with a character from a known set */
10481058
for (;;) {
@@ -1057,7 +1067,6 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
10571067
if (status != 0)
10581068
break;
10591069
}
1060-
#endif
10611070
} else
10621071
/* general case */
10631072
while (ptr <= end) {
@@ -1204,6 +1213,8 @@ state_init(SRE_STATE* state, PatternObject* pattern, PyObject* args)
12041213
for (i = 0; i < SRE_MARK_SIZE; i++)
12051214
state->mark[i] = NULL;
12061215

1216+
state->index = -1;
1217+
12071218
state->stack = NULL;
12081219
state->stackbase = 0;
12091220
state->stacksize = 0;
@@ -1286,6 +1297,8 @@ pattern_new_match(PatternObject* pattern, SRE_STATE* state,
12861297
} else
12871298
match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
12881299

1300+
match->index = state->index;
1301+
12891302
return (PyObject*) match;
12901303

12911304
} else if (status < 0) {
@@ -1887,6 +1900,15 @@ match_getattr(MatchObject* self, char* name)
18871900
if (!strcmp(name, "endpos"))
18881901
return Py_BuildValue("i", 0); /* FIXME */
18891902

1903+
if (!strcmp(name, "index")) {
1904+
/* experimental */
1905+
if (self->index < 0) {
1906+
Py_INCREF(Py_None);
1907+
return Py_None;
1908+
} else
1909+
return Py_BuildValue("i", self->index);
1910+
}
1911+
18901912
PyErr_SetString(PyExc_AttributeError, name);
18911913
return NULL;
18921914
}

Modules/sre.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ typedef struct {
3333
PyObject_HEAD
3434
PyObject* string; /* link to the target string */
3535
PatternObject* pattern; /* link to the regex (pattern) object */
36+
int index; /* last index marker seen by the engine (-1 if none) */
3637
int groups; /* number of groups (start/end marks) */
3738
int mark[2];
3839
} MatchObject;
@@ -57,6 +58,7 @@ typedef struct {
5758
/* character size */
5859
int charsize;
5960
/* registers */
61+
int index;
6062
int lastmark;
6163
void* mark[SRE_MARK_SIZE];
6264
/* backtracking stack */

Modules/sre_constants.h

Lines changed: 16 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -23,21 +23,22 @@
2323
#define SRE_OP_CHARSET 9
2424
#define SRE_OP_GROUP 10
2525
#define SRE_OP_GROUP_IGNORE 11
26-
#define SRE_OP_IN 12
27-
#define SRE_OP_IN_IGNORE 13
28-
#define SRE_OP_INFO 14
29-
#define SRE_OP_JUMP 15
30-
#define SRE_OP_LITERAL 16
31-
#define SRE_OP_LITERAL_IGNORE 17
32-
#define SRE_OP_MARK 18
33-
#define SRE_OP_MAX_REPEAT 19
34-
#define SRE_OP_MAX_REPEAT_ONE 20
35-
#define SRE_OP_MIN_REPEAT 21
36-
#define SRE_OP_NOT_LITERAL 22
37-
#define SRE_OP_NOT_LITERAL_IGNORE 23
38-
#define SRE_OP_NEGATE 24
39-
#define SRE_OP_RANGE 25
40-
#define SRE_OP_REPEAT 26
26+
#define SRE_OP_INDEX 12
27+
#define SRE_OP_IN 13
28+
#define SRE_OP_IN_IGNORE 14
29+
#define SRE_OP_INFO 15
30+
#define SRE_OP_JUMP 16
31+
#define SRE_OP_LITERAL 17
32+
#define SRE_OP_LITERAL_IGNORE 18
33+
#define SRE_OP_MARK 19
34+
#define SRE_OP_MAX_REPEAT 20
35+
#define SRE_OP_MAX_REPEAT_ONE 21
36+
#define SRE_OP_MIN_REPEAT 22
37+
#define SRE_OP_NOT_LITERAL 23
38+
#define SRE_OP_NOT_LITERAL_IGNORE 24
39+
#define SRE_OP_NEGATE 25
40+
#define SRE_OP_RANGE 26
41+
#define SRE_OP_REPEAT 27
4142
#define SRE_AT_BEGINNING 0
4243
#define SRE_AT_BEGINNING_LINE 1
4344
#define SRE_AT_BOUNDARY 2

0 commit comments

Comments
 (0)