@@ -175,13 +175,14 @@ \subsection{Regular Expression Syntax \label{re-syntax}}
175175\regexp {(?P<\var {name}>...)} is the only exception to this rule.
176176Following are the currently supported extensions.
177177
178- \item [\code {(?iLmsx)}] (One or more letters from the set \character {i},
179- \character {L}, \character {m}, \character {s}, \character {x}.) The group matches
180- the empty string; the letters set the corresponding flags
181- (\constant {re.I}, \constant {re.L}, \constant {re.M}, \constant {re.S},
182- \constant {re.X}) for the entire regular expression. This is useful if
183- you wish to include the flags as part of the regular expression, instead
184- of passing a \var {flag} argument to the \function {compile()} function.
178+ \item [\code {(?iLmsux)}] (One or more letters from the set \character {i},
179+ \character {L}, \character {m}, \character {s}, \character {u},
180+ \character {x}.) The group matches the empty string; the letters set
181+ the corresponding flags (\constant {re.I}, \constant {re.L},
182+ \constant {re.M}, \constant {re.S}, \constant {re.U}, \constant {re.X})
183+ for the entire regular expression. This is useful if you wish to
184+ include the flags as part of the regular expression, instead of
185+ passing a \var {flag} argument to the \function {compile()} function.
185186
186187\item [\code {(?:...)}] A non-grouping version of regular parentheses.
187188Matches whatever regular expression is inside the parentheses, but the
@@ -227,7 +228,6 @@ \subsection{Regular Expression Syntax \label{re-syntax}}
227228
228229\begin {list }{}{\leftmargin 0.7in \labelwidth 0.65in}
229230
230- %
231231\item [\code {\e \var {number}}] Matches the contents of the group of the
232232same number. Groups are numbered starting from 1. For example,
233233\regexp {(.+) \e 1} matches \code {'the the'} or \code {'55 55'}, but not
@@ -238,45 +238,50 @@ \subsection{Regular Expression Syntax \label{re-syntax}}
238238as a group match, but as the character with octal value \var {number}.
239239Inside the \character {[} and \character {]} of a character class, all numeric
240240escapes are treated as characters.
241- %
241+
242242\item [\code {\e A}] Matches only at the start of the string.
243- %
243+
244244\item [\code {\e b}] Matches the empty string, but only at the
245245beginning or end of a word. A word is defined as a sequence of
246246alphanumeric characters, so the end of a word is indicated by
247247whitespace or a non-alphanumeric character. Inside a character range,
248248\regexp {\e b} represents the backspace character, for compatibility with
249249Python's string literals.
250- %
250+
251251\item [\code {\e B}] Matches the empty string, but only when it is
252252\emph {not } at the beginning or end of a word.
253- %
253+
254254\item [\code {\e d}]Matches any decimal digit; this is
255255equivalent to the set \regexp {[0-9]}.
256- %
256+
257257\item [\code {\e D}]Matches any non-digit character; this is
258258equivalent to the set \regexp {[{\^ }0-9]}.
259- %
259+
260260\item [\code {\e s}]Matches any whitespace character; this is
261261equivalent to the set \regexp {[ \e t\e n\e r\e f\e v]}.
262- %
262+
263263\item [\code {\e S}]Matches any non-whitespace character; this is
264264equivalent to the set \regexp {[\^\ \e t\e n\e r\e f\e v]}.
265- %
266- \item [\code {\e w}]When the \constant {LOCALE} flag is not specified,
265+
266+ \item [\code {\e w}]When the \constant {LOCALE} and \constant {UNICODE}
267+ flags are not specified,
267268matches any alphanumeric character; this is equivalent to the set
268269\regexp {[a-zA-Z0-9_]}. With \constant {LOCALE}, it will match the set
269- \regexp {[0-9_]} plus whatever characters are defined as letters for the
270- current locale.
271- %
272- \item [\code {\e W}]When the \constant {LOCALE} flag is not specified,
273- matches any non-alphanumeric character; this is equivalent to the set
274- \regexp {[{\^ }a-zA-Z0-9_]}. With \constant {LOCALE}, it will match any
275- character not in the set \regexp {[0-9_]}, and not defined as a letter
276- for the current locale.
270+ \regexp {[0-9_]} plus whatever characters are defined as letters for
271+ the current locale. If \constant {UNICODE} is set, this will match the
272+ characters \regexp {[0-9_]} plus whatever is classified as alphanumeric
273+ in the Unicode character properties database.
274+
275+ \item [\code {\e W}]When the \constant {LOCALE} and \constant {UNICODE}
276+ flags are not specified, matches any non-alphanumeric character; this
277+ is equivalent to the set \regexp {[{\^ }a-zA-Z0-9_]}. With
278+ \constant {LOCALE}, it will match any character not in the set
279+ \regexp {[0-9_]}, and not defined as a letter for the current locale.
280+ If \constant {UNICODE} is set, this will match anything other than
281+ \regexp {[0-9_]} and characters marked as alphanumeric in the Unicode
282+ character properties database.
277283
278284\item [\code {\e Z}]Matches only at the end of the string.
279- %
280285
281286\item [\code {\e \e }] Matches a literal backslash.
282287
@@ -354,8 +359,8 @@ \subsection{Module Contents}
354359
355360\begin {datadesc }{L}
356361\dataline {LOCALE}
357- Make \regexp {\e w}, \regexp {\e W}, \regexp {\e b},
358- \regexp {\e B}, dependent on the current locale.
362+ Make \regexp {\e w}, \regexp {\e W}, \regexp {\e b}, and
363+ \regexp {\e B} dependent on the current locale.
359364\end {datadesc }
360365
361366\begin {datadesc }{M}
@@ -372,9 +377,16 @@ \subsection{Module Contents}
372377
373378\begin {datadesc }{S}
374379\dataline {DOTALL}
375- Make the \character {.} special character match any character at all, including a
376- newline; without this flag, \character {.} will match anything \emph {except }
377- a newline.
380+ Make the \character {.} special character match any character at all,
381+ including a newline; without this flag, \character {.} will match
382+ anything \emph {except } a newline.
383+ \end {datadesc }
384+
385+ \begin {datadesc }{U}
386+ \dataline {UNICODE}
387+ Make \regexp {\e w}, \regexp {\e W}, \regexp {\e b}, and
388+ \regexp {\e B} dependent on the Unicode character properties database.
389+ \versionadded {2.0}
378390\end {datadesc }
379391
380392\begin {datadesc }{X}
0 commit comments