diff --git a/.gitignore b/.gitignore index 8a9a6c9..1167d7c 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ *.o *.so results +/log/ diff --git a/Makefile b/Makefile index 9ddd3fc..bbc36cb 100644 --- a/Makefile +++ b/Makefile @@ -8,6 +8,9 @@ DATA = pg_tsparser--1.0.sql PGFILEDESC = "pg_tsparser - parser for text search" REGRESS = pg_tsparser +# We need a UTF8 database +ENCODING = UTF8 +NO_LOCALE = 1 ifdef USE_PGXS PG_CONFIG = pg_config diff --git a/expected/pg_tsparser.out b/expected/pg_tsparser.out index 537d2df..23dbb8c 100644 --- a/expected/pg_tsparser.out +++ b/expected/pg_tsparser.out @@ -236,3 +236,39 @@ SELECT to_tsvector('english_ts', 'test2.com'); 'com':3 'test2':2 'test2.com':1 (1 row) +-- Test non-ASCII symbols +-- must have a UTF8 database +SELECT getdatabaseencoding(); + getdatabaseencoding +--------------------- + UTF8 +(1 row) + +SET client_encoding TO 'UTF8'; +SELECT * from ts_parse('tsparser', 'аб_вгд 12_абв 12-абв абв.рф абв2.рф'); + tokid | token +-------+-------- + 17 | аб_вгд + 10 | аб + 12 | _ + 10 | вгд + 12 | + 15 | 12_абв + 9 | 12 + 12 | _ + 10 | абв + 12 | + 15 | 12-абв + 9 | 12 + 12 | - + 10 | абв + 12 | + 2 | абв + 12 | . + 2 | рф + 12 | + 3 | абв2 + 12 | . + 2 | рф +(22 rows) + diff --git a/meson.build b/meson.build new file mode 100644 index 0000000..9dc5c8f --- /dev/null +++ b/meson.build @@ -0,0 +1,39 @@ +# Copyright (c) 2025, Postgres Professional + +# Does not support the PGXS infrastructure at this time. Please, compile as part +# of the contrib source tree. 
+ +pg_tsparser_sources = files( + 'tsparser.c' +) + +if host_system == 'windows' + pg_tsparser_sources += rc_lib_gen.process(win32ver_rc, extra_args: [ + '--NAME', 'pg_tsparser', + '--FILEDESC', 'pg_tsparser - modifies the default text parsing strategy.',]) +endif + +pg_tsparser = shared_module('pg_tsparser', + pg_tsparser_sources, + kwargs: contrib_mod_args, +) +contrib_targets += pg_tsparser + +install_data( + 'pg_tsparser.control', + 'pg_tsparser--1.0.sql', + kwargs: contrib_data_args, +) + +tests += { + 'name': 'pg_tsparser', + 'sd': meson.current_source_dir(), + 'bd': meson.current_build_dir(), + 'regress': { + 'sql': [ + 'pg_tsparser', + ], + # We need a UTF8 database, same as ENCODING/NO_LOCALE in the Makefile + 'regress_args': ['--encoding=UTF8', '--no-locale'], + }, +} diff --git a/sql/pg_tsparser.sql b/sql/pg_tsparser.sql index 7cd9b1b..6f27d8f 100644 --- a/sql/pg_tsparser.sql +++ b/sql/pg_tsparser.sql @@ -26,3 +26,11 @@ SELECT to_tsvector('english_ts', '12_abc'); SELECT to_tsvector('english_ts', '12-abc'); SELECT to_tsvector('english_ts', 'test.com'); SELECT to_tsvector('english_ts', 'test2.com'); + +-- Test non-ASCII symbols + +-- must have a UTF8 database +SELECT getdatabaseencoding(); +SET client_encoding TO 'UTF8'; + +SELECT * from ts_parse('tsparser', 'аб_вгд 12_абв 12-абв абв.рф абв2.рф'); diff --git a/tsparser.c b/tsparser.c index 3cdafc5..e821dce 100644 --- a/tsparser.c +++ b/tsparser.c @@ -249,11 +249,9 @@ typedef struct TParser /* string and position information */ char *str; /* multibyte string */ int lenstr; /* length of mbstring */ -#ifdef USE_WIDE_UPPER_LOWER wchar_t *wstr; /* wide character string */ pg_wchar *pgwstr; /* wide character string for C-locale */ bool usewide; -#endif /* State of parse */ int charmaxlen; @@ -271,6 +269,10 @@ typedef struct TParser int type; } TParser; +#if PG_VERSION_NUM < 120000 +#define pg_strtoint32(value) pg_atoi((value), sizeof(int32), 0) +#endif + /* forward decls here */ static bool TParserGet(TParser *prs); @@ -302,18 +304,19 @@ TParserInit(char *str, int len) prs->str = str; prs->lenstr = len; -#ifdef USE_WIDE_UPPER_LOWER - 
/* * Use wide char code only when max encoding length > 1. */ if (prs->charmaxlen > 1) { - Oid collation = DEFAULT_COLLATION_OID; /* TODO */ pg_locale_t mylocale = 0; /* TODO */ prs->usewide = true; - if (lc_ctype_is_c(collation)) +#if PG_VERSION_NUM >= 150000 || (defined(PGPRO_STD) && PG_VERSION_NUM >= 120000) + if (database_ctype_is_c) +#else + if (lc_ctype_is_c(DEFAULT_COLLATION_OID)) +#endif { /* * char2wchar doesn't work for C-locale and sizeof(pg_wchar) could @@ -331,7 +334,6 @@ TParserInit(char *str, int len) } else prs->usewide = false; -#endif prs->state = newTParserPosition(NULL); prs->state->state = TPS_Base; @@ -368,15 +370,12 @@ TParserCopyInit(const TParser *orig) prs->charmaxlen = orig->charmaxlen; prs->str = orig->str + orig->state->posbyte; prs->lenstr = orig->lenstr - orig->state->posbyte; - -#ifdef USE_WIDE_UPPER_LOWER prs->usewide = orig->usewide; if (orig->pgwstr) prs->pgwstr = orig->pgwstr + orig->state->poschar; if (orig->wstr) prs->wstr = orig->wstr + orig->state->poschar; -#endif prs->state = newTParserPosition(NULL); prs->state->state = TPS_Base; @@ -401,12 +400,10 @@ TParserClose(TParser *prs) prs->state = ptr; } -#ifdef USE_WIDE_UPPER_LOWER if (prs->wstr) pfree(prs->wstr); if (prs->pgwstr) pfree(prs->pgwstr); -#endif #ifdef WPARSER_TRACE fprintf(stderr, "closing parser\n"); @@ -445,96 +442,45 @@ TParserCopyClose(TParser *prs) * - if locale is C then we use pgwstr instead of wstr. 
*/ -#ifdef USE_WIDE_UPPER_LOWER - -#define p_iswhat(type) \ +#define p_iswhat(type, nonascii) \ + \ static int \ -p_is##type(TParser *prs) { \ - Assert( prs->state ); \ - if ( prs->usewide ) \ +p_is##type(TParser *prs) \ +{ \ + Assert(prs->state); \ + if (prs->usewide) \ { \ - if ( prs->pgwstr ) \ + if (prs->pgwstr) \ { \ unsigned int c = *(prs->pgwstr + prs->state->poschar); \ - if ( c > 0x7f ) \ - return 0; \ - return is##type( c ); \ + if (c > 0x7f) \ + return nonascii; \ + return is##type(c); \ } \ - return isw##type( *( prs->wstr + prs->state->poschar ) ); \ + return isw##type(*(prs->wstr + prs->state->poschar)); \ } \ - \ - return is##type( *(unsigned char*)( prs->str + prs->state->posbyte ) ); \ -} \ + return is##type(*(unsigned char *) (prs->str + prs->state->posbyte)); \ +} \ \ static int \ -p_isnot##type(TParser *prs) { \ +p_isnot##type(TParser *prs) \ +{ \ return !p_is##type(prs); \ } -static int -p_isalnum(TParser *prs) -{ - Assert(prs->state); - - if (prs->usewide) - { - if (prs->pgwstr) - { - unsigned int c = *(prs->pgwstr + prs->state->poschar); - - /* - * any non-ascii symbol with multibyte encoding with C-locale is - * an alpha character - */ - if (c > 0x7f) - return 1; - - return isalnum(c); - } - - return iswalnum(*(prs->wstr + prs->state->poschar)); - } - - return isalnum(*(unsigned char *) (prs->str + prs->state->posbyte)); -} -static int -p_isnotalnum(TParser *prs) -{ - return !p_isalnum(prs); -} - -static int -p_isalpha(TParser *prs) -{ - Assert(prs->state); - - if (prs->usewide) - { - if (prs->pgwstr) - { - unsigned int c = *(prs->pgwstr + prs->state->poschar); - - /* - * any non-ascii symbol with multibyte encoding with C-locale is - * an alpha character - */ - if (c > 0x7f) - return 1; - - return isalpha(c); - } - - return iswalpha(*(prs->wstr + prs->state->poschar)); - } - - return isalpha(*(unsigned char *) (prs->str + prs->state->posbyte)); -} - -static int -p_isnotalpha(TParser *prs) -{ - return !p_isalpha(prs); -} +/* + * In C locale 
with a multibyte encoding, any non-ASCII symbol is considered + * an alpha character, but not a member of other char classes. + */ +p_iswhat(alnum, 1) +p_iswhat(alpha, 1) +p_iswhat(digit, 0) +p_iswhat(lower, 0) +p_iswhat(print, 0) +p_iswhat(punct, 0) +p_iswhat(space, 0) +p_iswhat(upper, 0) +p_iswhat(xdigit, 0) /* p_iseq should be used only for ascii symbols */ @@ -544,39 +490,6 @@ p_iseq(TParser *prs, char c) Assert(prs->state); return ((prs->state->charlen == 1 && *(prs->str + prs->state->posbyte) == c)) ? 1 : 0; } -#else /* USE_WIDE_UPPER_LOWER */ - -#define p_iswhat(type) \ -static int \ -p_is##type(TParser *prs) { \ - Assert( prs->state ); \ - return is##type( (unsigned char)*( prs->str + prs->state->posbyte ) ); \ -} \ - \ -static int \ -p_isnot##type(TParser *prs) { \ - return !p_is##type(prs); \ -} - - -static int -p_iseq(TParser *prs, char c) -{ - Assert(prs->state); - return (*(prs->str + prs->state->posbyte) == c) ? 1 : 0; -} - -p_iswhat(alnum) -p_iswhat(alpha) -#endif /* USE_WIDE_UPPER_LOWER */ - -p_iswhat(digit) -p_iswhat(lower) -p_iswhat(print) -p_iswhat(punct) -p_iswhat(space) -p_iswhat(upper) -p_iswhat(xdigit) static int p_isEOF(TParser *prs) @@ -793,8 +706,6 @@ p_isspecial(TParser *prs) if (pg_dsplen(prs->str + prs->state->posbyte) == 0) return 1; -#ifdef USE_WIDE_UPPER_LOWER - /* * Unicode Characters in the 'Mark, Spacing Combining' Category That * characters are not alpha although they are not breakers of word too. 
@@ -1058,7 +969,6 @@ p_isspecial(TParser *prs) StopHigh = StopMiddle; } } -#endif return 0; } @@ -2070,7 +1980,11 @@ typedef struct #undef USE_PHRASE_SEARCH #endif +#if PG_VERSION_NUM >= 130000 +static TSTernaryValue +#else static bool +#endif #ifdef USE_PHRASE_SEARCH checkcondition_HL(void *opaque, QueryOperand *val, ExecPhraseData *data) #else @@ -2087,7 +2001,11 @@ checkcondition_HL(void *opaque, QueryOperand *val) { /* don't need to find all positions */ if (!data) +#if PG_VERSION_NUM >= 130000 + return TS_YES; +#else return true; +#endif if (!data->pos) { @@ -2102,17 +2020,29 @@ checkcondition_HL(void *opaque, QueryOperand *val) data->pos[data->npos++] = checkval->words[i].pos; } } +#else +#if PG_VERSION_NUM >= 130000 + return TS_YES; #else return true; +#endif #endif } #ifdef USE_PHRASE_SEARCH if (data && data->npos > 0) +#if PG_VERSION_NUM >= 130000 + return TS_YES; +#else return true; #endif +#endif +#if PG_VERSION_NUM >= 130000 + return TS_NO; +#else return false; +#endif } @@ -2610,13 +2540,13 @@ tsparser_headline(PG_FUNCTION_ARGS) char *val = defGetString(defel); if (pg_strcasecmp(defel->defname, "MaxWords") == 0) - max_words = pg_atoi(val, sizeof(int32), 0); + max_words = pg_strtoint32(val); else if (pg_strcasecmp(defel->defname, "MinWords") == 0) - min_words = pg_atoi(val, sizeof(int32), 0); + min_words = pg_strtoint32(val); else if (pg_strcasecmp(defel->defname, "ShortWord") == 0) - shortword = pg_atoi(val, sizeof(int32), 0); + shortword = pg_strtoint32(val); else if (pg_strcasecmp(defel->defname, "MaxFragments") == 0) - max_fragments = pg_atoi(val, sizeof(int32), 0); + max_fragments = pg_strtoint32(val); else if (pg_strcasecmp(defel->defname, "StartSel") == 0) prs->startsel = pstrdup(val); else if (pg_strcasecmp(defel->defname, "StopSel") == 0)