|
| 1 | +/************************************************* |
| 2 | +* Perl-Compatible Regular Expressions * |
| 3 | +*************************************************/ |
| 4 | + |
| 5 | + |
| 6 | +#define PCRE_VERSION "0.95 23-Sep-1997" |
| 7 | + |
| 8 | + |
| 9 | +/* This is a library of functions to support regular expressions whose syntax |
| 10 | +and semantics are as close as possible to those of the Perl 5 language. See |
| 11 | +the file Tech.Notes for some information on the internals. |
| 12 | +
|
| 13 | +Written by: Philip Hazel <[email protected]> |
| 14 | +
|
| 15 | + Copyright (c) 1997 University of Cambridge |
| 16 | +
|
| 17 | +----------------------------------------------------------------------------- |
| 18 | +Permission is granted to anyone to use this software for any purpose on any |
| 19 | +computer system, and to redistribute it freely, subject to the following |
| 20 | +restrictions: |
| 21 | +
|
| 22 | +1. This software is distributed in the hope that it will be useful, |
| 23 | + but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 24 | + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
| 25 | +
|
| 26 | +2. The origin of this software must not be misrepresented, either by |
| 27 | + explicit claim or by omission. |
| 28 | +
|
| 29 | +3. Altered versions must be plainly marked as such, and must not be |
| 30 | + misrepresented as being the original software. |
| 31 | +----------------------------------------------------------------------------- |
| 32 | +*/ |
| 33 | + |
| 34 | +/* This header contains definitions that are shared between the different |
| 35 | +modules, but which are not relevant to the outside. */ |
| 36 | + |
| 37 | +/* Standard C headers plus the external interface definition */ |
| 38 | + |
| 39 | +#include <ctype.h> |
| 40 | +#include <limits.h> |
| 41 | +#include <stdio.h> |
| 42 | +#include <stdlib.h> |
| 43 | +#include <string.h> |
| 44 | +#include "pcre.h" |
| 45 | + |
| 46 | +/* Private option flags are allocated from the most significant end of the |
| 47 | +options byte. The public options defined in pcre.h are allocated from the least |
| 48 | +significant end. Make sure the two sets never overlap! */ |
| 49 | + |
| 50 | +#define PCRE_FIRSTSET 0x80 /* first_char is set */ |
| 51 | +#define PCRE_STARTLINE 0x40 /* start after \n for multiline */ |
| 52 | + |
| 53 | +/* Single-bit flags for the options field of the "extra" block produced by pcre_study(). */ |
| 54 | + |
| 55 | +#define PCRE_STUDY_CASELESS 0x01 /* study was caseless */ |
| 56 | +#define PCRE_STUDY_MAPPED 0x20 /* a map of starting chars exists */ |
| 57 | + |
| 58 | +/* Masks for identifying the public options. All are permitted at compile time |
| 59 | +(PCRE_DOTALL only in a FOR_PYTHON build); only some at run or study time. */ |
| 60 | + |
| 61 | +#ifdef FOR_PYTHON |
| 62 | +#define PUBLIC_OPTIONS \ |
| 63 | + (PCRE_CASELESS|PCRE_EXTENDED|PCRE_ANCHORED|PCRE_MULTILINE|PCRE_DOTALL) |
| 64 | +#else /* not FOR_PYTHON */ |
| 65 | +#define PUBLIC_OPTIONS \ |
| 66 | + (PCRE_CASELESS|PCRE_EXTENDED|PCRE_ANCHORED|PCRE_MULTILINE) |
| 67 | +#endif /* FOR_PYTHON */ |
| 68 | +#define PUBLIC_EXEC_OPTIONS (PCRE_CASELESS|PCRE_ANCHORED|PCRE_MULTILINE) /* permitted at run time */ |
| 69 | +#define PUBLIC_STUDY_OPTIONS (PCRE_CASELESS) /* permitted at study time */ |
| 70 | + |
| 71 | +/* Magic number to provide a small check against being handed junk. */ |
| 72 | + |
| 73 | +#define MAGIC_NUMBER 0x50435245 /* ASCII codes of 'P', 'C', 'R', 'E' */ |
| 74 | + |
| 75 | +/* Miscellaneous definitions: a Boolean type and its two truth values */ |
| 76 | + |
| 77 | +typedef int BOOL; |
| 78 | + |
| 79 | +#define FALSE 0 |
| 80 | +#define TRUE 1 |
| 81 | + |
| 82 | +/* Single-bit flags for character classes - see also the class_ops table, which is not in this header. */ |
| 83 | + |
| 84 | +#define CLASS_DIGITS 0x01 |
| 85 | +#define CLASS_NOT_DIGITS 0x02 |
| 86 | +#define CLASS_WHITESPACE 0x04 |
| 87 | +#define CLASS_NOT_WHITESPACE 0x08 |
| 88 | +#define CLASS_WORD 0x10 |
| 89 | +#define CLASS_NOT_WORD 0x20 |
| 90 | + |
| 91 | +/* These are escaped items that aren't just an encoding of a particular data |
| 92 | +value such as \n. They must have non-zero values, as check_escape() returns |
| 93 | +their negation. Also, ESC_A to ESC_Z must appear in the same order as the |
| 94 | +opcodes OP_SOD to OP_EOD defined below. The final one must be ESC_REF, as |
| 95 | +subsequent values are used for \1, \2, \3, etc. There is a test in the code for |
| 96 | +an escape greater than ESC_b and less than ESC_Z to detect the types that may |
| 97 | +be repeated. If any new escapes that don't consume a character are inserted in |
| 98 | +that range, that code will have to change. */ |
| 99 | + |
| 100 | +enum { ESC_A = 1, ESC_B, ESC_b, ESC_D, ESC_d, ESC_S, ESC_s, ESC_W, ESC_w, |
| 101 | + ESC_Z, ESC_REF }; |
| 102 | + |
| 103 | +/* Opcode table: OP_BRA must be last, as all values >= it are used for brackets |
| 104 | +that extract substrings. Starting from 1 (i.e. after OP_END), the values up to |
| 105 | +OP_EOD must correspond in order to the list of escapes immediately above. */ |
| 106 | + |
| 107 | +enum { |
| 108 | + OP_END, /* End of pattern */ |
| 109 | + |
| 110 | + /* Values corresponding to backslashed metacharacters */ |
| 111 | + |
| 112 | + OP_SOD, /* Start of data: \A */ |
| 113 | + OP_NOT_WORD_BOUNDARY, /* \B */ |
| 114 | + OP_WORD_BOUNDARY, /* \b */ |
| 115 | + OP_NOT_DIGIT, /* \D */ |
| 116 | + OP_DIGIT, /* \d */ |
| 117 | + OP_NOT_WHITESPACE, /* \S */ |
| 118 | + OP_WHITESPACE, /* \s */ |
| 119 | + OP_NOT_WORDCHAR, /* \W */ |
| 120 | + OP_WORDCHAR, /* \w */ |
| 121 | + OP_EOD, /* End of data: \Z. This must always be the last |
| 122 | + of the backslashed meta values. */ |
| 123 | + |
| 124 | + OP_CIRC, /* Start of line - varies with multiline switch */ |
| 125 | + OP_DOLL, /* End of line - varies with multiline switch */ |
| 126 | + OP_ANY, /* Match any character */ |
| 127 | + OP_CHARS, /* Match string of characters */ |
| 128 | + |
| 129 | + OP_STAR, /* The maximizing and minimizing versions of */ |
| 130 | + OP_MINSTAR, /* all these opcodes must come in pairs, with */ |
| 131 | + OP_PLUS, /* the minimizing one second. */ |
| 132 | + OP_MINPLUS, /* This first set applies to single characters */ |
| 133 | + OP_QUERY, |
| 134 | + OP_MINQUERY, |
| 135 | + OP_UPTO, /* From 0 to n matches. */ |
| 136 | + OP_MINUPTO, |
| 137 | + OP_EXACT, /* Exactly n matches. */ |
| 138 | + |
| 139 | + OP_TYPESTAR, /* The maximizing and minimizing versions of */ |
| 140 | + OP_TYPEMINSTAR, /* all these opcodes must come in pairs, with */ |
| 141 | + OP_TYPEPLUS, /* the minimizing one second. These codes must */ |
| 142 | + OP_TYPEMINPLUS, /* be in exactly the same order as those above. */ |
| 143 | + OP_TYPEQUERY, /* This set applies to character types such as \d */ |
| 144 | + OP_TYPEMINQUERY, |
| 145 | + OP_TYPEUPTO, |
| 146 | + OP_TYPEMINUPTO, |
| 147 | + OP_TYPEEXACT, |
| 148 | + |
| 149 | + OP_CRSTAR, /* The maximizing and minimizing versions of */ |
| 150 | + OP_CRMINSTAR, /* all these opcodes must come in pairs, with */ |
| 151 | + OP_CRPLUS, /* the minimizing one second. These codes must */ |
| 152 | + OP_CRMINPLUS, /* be in exactly the same order as those above. */ |
| 153 | + OP_CRQUERY, /* These are for character classes and back refs */ |
| 154 | + OP_CRMINQUERY, |
| 155 | + OP_CRRANGE, /* These are different to the two sets above. */ |
| 156 | + OP_CRMINRANGE, |
| 157 | + |
| 158 | + OP_CLASS, /* Match a character class */ |
| 159 | + OP_NEGCLASS, /* Don't match a character class */ |
| 160 | + OP_REF, /* Match a back reference */ |
| 161 | + |
| 162 | + OP_ALT, /* Start of alternation */ |
| 163 | + OP_KET, /* End of group that doesn't have an unbounded repeat */ |
| 164 | + OP_KETRMAX, /* These two must remain together and in this */ |
| 165 | + OP_KETRMIN, /* order. They are for groups that repeat for ever. */ |
| 166 | + |
| 167 | + OP_ASSERT, |
| 168 | + OP_ASSERT_NOT, |
| 169 | + |
| 170 | + OP_BRAZERO, /* These two must remain together and in this */ |
| 171 | + OP_BRAMINZERO, /* order. */ |
| 172 | + |
| 173 | + OP_BRA /* This and greater values are used for brackets that |
| 174 | + extract substrings. */ |
| 175 | +}; |
| 176 | + |
| 177 | +/* The highest extraction number. Opcodes are single bytes, so this is limited by |
| 178 | +the number of values left after OP_BRA, i.e. 255 - OP_BRA. We set it somewhat lower. */ |
| 179 | + |
| 180 | +#define EXTRACT_MAX 99 |
| 181 | + |
| 182 | +/* All character handling must be done as unsigned characters. Otherwise there |
| 183 | +are problems with top-bit-set characters, which can be negative as plain char, |
| 184 | +in functions such as isspace(). However, we leave the interface to the outside |
| 185 | +world as char *, because that should make things easier for callers. "uschar" |
| 186 | +is a short name for unsigned char to save lots of typing; "uchar" was tried, |
| 187 | +but it causes problems on Digital Unix, where it is defined in sys/types. */ |
| 188 | + |
| 189 | +typedef unsigned char uschar; |
| 190 | + |
| 191 | +/* The real format of the start of the pcre block; the actual code vector |
| 192 | +runs on as long as necessary after the end. */ |
| 193 | + |
| 194 | +typedef struct real_pcre { |
| 195 | + unsigned int magic_number; /* presumably MAGIC_NUMBER in a valid block - confirm */ |
| 196 | + unsigned char options; /* option bits; presumably public options plus PCRE_FIRSTSET/PCRE_STARTLINE */ |
| 197 | + unsigned char top_bracket; /* NOTE(review): presumably the highest extracting bracket number - confirm */ |
| 198 | + unsigned char first_char; /* presumably meaningful only when PCRE_FIRSTSET is set - confirm */ |
| 199 | + unsigned char code[1]; /* first byte of the code vector, which continues past the struct */ |
| 200 | +} real_pcre; |
| 201 | + |
| 202 | +/* The real format of the extra block returned by pcre_study(). */ |
| 203 | + |
| 204 | +typedef struct real_pcre_extra { |
| 205 | + unsigned char options; /* PCRE_STUDY_xxx flags */ |
| 206 | + unsigned char start_bits[32]; /* presumably the starting-char bit map flagged by PCRE_STUDY_MAPPED - confirm */ |
| 207 | +} real_pcre_extra; |
| 208 | + |
| 209 | +/* Global tables from pcre-chartables.c (only declared here) */ |
| 210 | + |
| 211 | +extern uschar pcre_lcc[]; /* NOTE(review): presumably a lower-casing table - confirm */ |
| 212 | +extern uschar pcre_ucc[]; /* NOTE(review): presumably an upper-casing table - confirm */ |
| 213 | +extern uschar pcre_ctypes[]; /* entries are built from the ctype_xxx bits below */ |
| 214 | + |
| 215 | +/* Bit definitions for entries in pcre_ctypes[]. */ |
| 216 | + |
| 217 | +#define ctype_space 0x01 |
| 218 | +#define ctype_digit 0x02 |
| 219 | +#define ctype_xdigit 0x04 |
| 220 | +#define ctype_word 0x08 /* alphanumeric or '_' */ |
| 221 | +#ifdef FOR_PYTHON |
| 222 | +#define ctype_odigit 0x10 /* Octal digits */ |
| 223 | +#endif /* FOR_PYTHON */ |
| 224 | +#define ctype_meta 0x80 /* regexp meta char or zero (end pattern) */ |
| 225 | + |
| 226 | +/* End of pcre-internal.h */ |
0 commit comments