Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 1e16a81

Browse files
committed
Teach regular expression operators to honor collations.
This involves getting the character classification and case-folding functions in the regex library to use the collations infrastructure. Most of this work had been done already in connection with the upper/lower and LIKE logic, so it was a simple matter of transposition. While at it, split out these functions into a separate source file regc_pg_locale.c, so that they can be correctly labeled with the Postgres project's license rather than the Scriptics license. These functions are 100% Postgres-written code whereas what remains in regc_locale.c is still mostly not ours, so lumping them both under the same copyright notice was getting more and more misleading.
1 parent 210f95f commit 1e16a81

File tree

12 files changed

+819
-192
lines changed

12 files changed

+819
-192
lines changed

doc/src/sgml/charset.sgml

Lines changed: 22 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -221,17 +221,21 @@ initdb --locale=sv_SE
221221

222222
<listitem>
223223
<para>
224-
The ability to use indexes with <literal>LIKE</> clauses
225-
<indexterm><primary>LIKE</><secondary>and locales</></indexterm>
224+
The <function>upper</>, <function>lower</>, and <function>initcap</>
225+
functions
226+
<indexterm><primary>upper</><secondary>and locales</></indexterm>
227+
<indexterm><primary>lower</><secondary>and locales</></indexterm>
226228
</para>
227229
</listitem>
228230

229231
<listitem>
230232
<para>
231-
The <function>upper</>, <function>lower</>, and <function>initcap</>
232-
functions
233-
<indexterm><primary>upper</><secondary>and locales</></indexterm>
234-
<indexterm><primary>lower</><secondary>and locales</></indexterm>
233+
Pattern matching operators (<literal>LIKE</>, <literal>SIMILAR TO</>,
234+
and POSIX-style regular expressions); locales affect both case
235+
insensitive matching and the classification of characters by
236+
character-class regular expressions
237+
<indexterm><primary>LIKE</><secondary>and locales</></indexterm>
238+
<indexterm><primary>regular expressions</><secondary>and locales</></indexterm>
235239
</para>
236240
</listitem>
237241

@@ -241,6 +245,12 @@ initdb --locale=sv_SE
241245
<indexterm><primary>to_char</><secondary>and locales</></indexterm>
242246
</para>
243247
</listitem>
248+
249+
<listitem>
250+
<para>
251+
The ability to use indexes with <literal>LIKE</> clauses
252+
</para>
253+
</listitem>
244254
</itemizedlist>
245255
</para>
246256

@@ -319,8 +329,8 @@ initdb --locale=sv_SE
319329
<indexterm zone="collation"><primary>collation</></>
320330

321331
<para>
322-
The collation feature allows specifying the sort order and certain
323-
other locale aspects of data per-column, or even per-operation.
332+
The collation feature allows specifying the sort order and character
333+
classification behavior of data per-column, or even per-operation.
324334
This alleviates the restriction that the
325335
<symbol>LC_COLLATE</symbol> and <symbol>LC_CTYPE</symbol> settings
326336
of a database cannot be changed after its creation.
@@ -351,8 +361,8 @@ initdb --locale=sv_SE
351361
</para>
352362

353363
<para>
354-
When the database system has to perform an ordering or a
355-
comparison, it uses the collation of the input expression. This
364+
When the database system has to perform an ordering or a character
365+
classification, it uses the collation of the input expression. This
356366
happens, for example, with <literal>ORDER BY</literal> clauses
357367
and function or operator calls such as <literal>&lt;</literal>.
358368
The collation to apply for an <literal>ORDER BY</literal> clause
@@ -361,7 +371,8 @@ initdb --locale=sv_SE
361371
below. In addition to comparison operators, collations are taken into
362372
account by functions that convert between lower and upper case
363373
letters, such as <function>lower</>, <function>upper</>, and
364-
<function>initcap</>.
374+
<function>initcap</>; by pattern matching operators; and by
375+
<function>to_char</> and related functions.
365376
</para>
366377

367378
<para>

src/backend/libpq/hba.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
#include <arpa/inet.h>
2626
#include <unistd.h>
2727

28+
#include "catalog/pg_collation.h"
2829
#include "libpq/ip.h"
2930
#include "libpq/libpq.h"
3031
#include "regex/regex.h"
@@ -1781,7 +1782,7 @@ parse_ident_usermap(List *line, int line_number, const char *usermap_name,
17811782
* XXX: Major room for optimization: regexps could be compiled when
17821783
* the file is loaded and then re-used in every connection.
17831784
*/
1784-
r = pg_regcomp(&re, wstr, wlen, REG_ADVANCED);
1785+
r = pg_regcomp(&re, wstr, wlen, REG_ADVANCED, C_COLLATION_OID);
17851786
if (r)
17861787
{
17871788
char errstr[100];

src/backend/regex/Makefile

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ OBJS = regcomp.o regerror.o regexec.o regfree.o
1717
include $(top_srcdir)/src/backend/common.mk
1818

1919
# mark inclusion dependencies between .c files explicitly
20-
regcomp.o: regcomp.c regc_lex.c regc_color.c regc_nfa.c regc_cvec.c regc_locale.c
20+
regcomp.o: regcomp.c regc_lex.c regc_color.c regc_nfa.c regc_cvec.c \
21+
regc_locale.c regc_pg_locale.c
2122

2223
regexec.o: regexec.c rege_dfa.c

src/backend/regex/regc_locale.c

Lines changed: 0 additions & 165 deletions
Original file line numberDiff line numberDiff line change
@@ -350,171 +350,6 @@ static const struct cname
350350
};
351351

352352

353-
/*
354-
* ctype functions adapted to work on pg_wchar (a/k/a chr)
355-
*
356-
* When working in UTF8 encoding, we use the <wctype.h> functions if
357-
* available. This assumes that every platform uses Unicode codepoints
358-
* directly as the wchar_t representation of Unicode. On some platforms
359-
* wchar_t is only 16 bits wide, so we have to punt for codepoints > 0xFFFF.
360-
*
361-
* In all other encodings, we use the <ctype.h> functions for pg_wchar
362-
* values up to 255, and punt for values above that. This is only 100%
363-
* correct in single-byte encodings such as LATINn. However, non-Unicode
364-
* multibyte encodings are mostly Far Eastern character sets for which the
365-
* properties being tested here aren't relevant for higher code values anyway.
366-
*
367-
* NB: the coding here assumes pg_wchar is an unsigned type.
368-
*/
369-
370-
static int
371-
pg_wc_isdigit(pg_wchar c)
372-
{
373-
#ifdef USE_WIDE_UPPER_LOWER
374-
if (GetDatabaseEncoding() == PG_UTF8)
375-
{
376-
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
377-
return iswdigit((wint_t) c);
378-
}
379-
#endif
380-
return (c <= (pg_wchar) UCHAR_MAX && isdigit((unsigned char) c));
381-
}
382-
383-
static int
384-
pg_wc_isalpha(pg_wchar c)
385-
{
386-
#ifdef USE_WIDE_UPPER_LOWER
387-
if (GetDatabaseEncoding() == PG_UTF8)
388-
{
389-
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
390-
return iswalpha((wint_t) c);
391-
}
392-
#endif
393-
return (c <= (pg_wchar) UCHAR_MAX && isalpha((unsigned char) c));
394-
}
395-
396-
static int
397-
pg_wc_isalnum(pg_wchar c)
398-
{
399-
#ifdef USE_WIDE_UPPER_LOWER
400-
if (GetDatabaseEncoding() == PG_UTF8)
401-
{
402-
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
403-
return iswalnum((wint_t) c);
404-
}
405-
#endif
406-
return (c <= (pg_wchar) UCHAR_MAX && isalnum((unsigned char) c));
407-
}
408-
409-
static int
410-
pg_wc_isupper(pg_wchar c)
411-
{
412-
#ifdef USE_WIDE_UPPER_LOWER
413-
if (GetDatabaseEncoding() == PG_UTF8)
414-
{
415-
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
416-
return iswupper((wint_t) c);
417-
}
418-
#endif
419-
return (c <= (pg_wchar) UCHAR_MAX && isupper((unsigned char) c));
420-
}
421-
422-
static int
423-
pg_wc_islower(pg_wchar c)
424-
{
425-
#ifdef USE_WIDE_UPPER_LOWER
426-
if (GetDatabaseEncoding() == PG_UTF8)
427-
{
428-
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
429-
return iswlower((wint_t) c);
430-
}
431-
#endif
432-
return (c <= (pg_wchar) UCHAR_MAX && islower((unsigned char) c));
433-
}
434-
435-
static int
436-
pg_wc_isgraph(pg_wchar c)
437-
{
438-
#ifdef USE_WIDE_UPPER_LOWER
439-
if (GetDatabaseEncoding() == PG_UTF8)
440-
{
441-
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
442-
return iswgraph((wint_t) c);
443-
}
444-
#endif
445-
return (c <= (pg_wchar) UCHAR_MAX && isgraph((unsigned char) c));
446-
}
447-
448-
static int
449-
pg_wc_isprint(pg_wchar c)
450-
{
451-
#ifdef USE_WIDE_UPPER_LOWER
452-
if (GetDatabaseEncoding() == PG_UTF8)
453-
{
454-
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
455-
return iswprint((wint_t) c);
456-
}
457-
#endif
458-
return (c <= (pg_wchar) UCHAR_MAX && isprint((unsigned char) c));
459-
}
460-
461-
static int
462-
pg_wc_ispunct(pg_wchar c)
463-
{
464-
#ifdef USE_WIDE_UPPER_LOWER
465-
if (GetDatabaseEncoding() == PG_UTF8)
466-
{
467-
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
468-
return iswpunct((wint_t) c);
469-
}
470-
#endif
471-
return (c <= (pg_wchar) UCHAR_MAX && ispunct((unsigned char) c));
472-
}
473-
474-
static int
475-
pg_wc_isspace(pg_wchar c)
476-
{
477-
#ifdef USE_WIDE_UPPER_LOWER
478-
if (GetDatabaseEncoding() == PG_UTF8)
479-
{
480-
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
481-
return iswspace((wint_t) c);
482-
}
483-
#endif
484-
return (c <= (pg_wchar) UCHAR_MAX && isspace((unsigned char) c));
485-
}
486-
487-
static pg_wchar
488-
pg_wc_toupper(pg_wchar c)
489-
{
490-
#ifdef USE_WIDE_UPPER_LOWER
491-
if (GetDatabaseEncoding() == PG_UTF8)
492-
{
493-
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
494-
return towupper((wint_t) c);
495-
}
496-
#endif
497-
if (c <= (pg_wchar) UCHAR_MAX)
498-
return toupper((unsigned char) c);
499-
return c;
500-
}
501-
502-
static pg_wchar
503-
pg_wc_tolower(pg_wchar c)
504-
{
505-
#ifdef USE_WIDE_UPPER_LOWER
506-
if (GetDatabaseEncoding() == PG_UTF8)
507-
{
508-
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
509-
return towlower((wint_t) c);
510-
}
511-
#endif
512-
if (c <= (pg_wchar) UCHAR_MAX)
513-
return tolower((unsigned char) c);
514-
return c;
515-
}
516-
517-
518353
/*
519354
* element - map collating-element name to celt
520355
*/

0 commit comments

Comments
 (0)