Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit ca450a0

Browse files
committed
Add an Accept parameter to "simple" dictionaries. The default of true
gives the old behavior; selecting false allows the dictionary to be used as a filter ahead of other dictionaries, because it will pass on rather than accept words that aren't in its stopword list. Patch by Jan Urbanski.
1 parent a44c81d commit ca450a0

File tree

2 files changed

+67
-9
lines changed

2 files changed

+67
-9
lines changed

doc/src/sgml/textsearch.sgml

Lines changed: 34 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
<!-- $PostgreSQL: pgsql/doc/src/sgml/textsearch.sgml,v 1.32 2007/11/14 03:26:24 tgl Exp $ -->
1+
<!-- $PostgreSQL: pgsql/doc/src/sgml/textsearch.sgml,v 1.33 2007/11/14 18:36:37 tgl Exp $ -->
22

33
<chapter id="textsearch">
44
<title id="textsearch-title">Full Text Search</title>
@@ -2093,9 +2093,11 @@ SELECT ts_rank_cd (to_tsvector('english','list stop words'), to_tsquery('list &a
20932093
<para>
20942094
The <literal>simple</> dictionary template operates by converting the
20952095
input token to lower case and checking it against a file of stop words.
2096-
If it is found in the file then <literal>NULL</> is returned, causing
2096+
If it is found in the file then an empty array is returned, causing
20972097
the token to be discarded. If not, the lower-cased form of the word
2098-
is returned as the normalized lexeme.
2098+
is returned as the normalized lexeme. Alternatively, the dictionary
2099+
can be configured to report non-stop-words as unrecognized, allowing
2100+
them to be passed on to the next dictionary in the list.
20992101
</para>
21002102

21012103
<para>
@@ -2138,6 +2140,35 @@ SELECT ts_lexize('public.simple_dict','The');
21382140
</programlisting>
21392141
</para>
21402142

2143+
<para>
2144+
We can also choose to return <literal>NULL</>, instead of the lower-cased
2145+
word, if it is not found in the stop words file. This behavior is
2146+
selected by setting the dictionary's <literal>Accept</> parameter to
2147+
<literal>false</>. Continuing the example:
2148+
2149+
<programlisting>
2150+
ALTER TEXT SEARCH DICTIONARY public.simple_dict ( Accept = false );
2151+
2152+
SELECT ts_lexize('public.simple_dict','YeS');
2153+
ts_lexize
2154+
-----------
2155+
2156+
2157+
SELECT ts_lexize('public.simple_dict','The');
2158+
ts_lexize
2159+
-----------
2160+
{}
2161+
</programlisting>
2162+
</para>
2163+
2164+
<para>
2165+
With the default setting of <literal>Accept</> = <literal>true</>,
2166+
it is only useful to place a <literal>simple</> dictionary at the end
2167+
of a list of dictionaries, since it will never pass on any token to
2168+
a following dictionary. Conversely, <literal>Accept</> = <literal>false</>
2169+
is only useful when there is at least one following dictionary.
2170+
</para>
2171+
21412172
<caution>
21422173
<para>
21432174
Most types of dictionaries rely on configuration files, such as files of

src/backend/tsearch/dict_simple.c

Lines changed: 33 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
*
88
*
99
* IDENTIFICATION
10-
* $PostgreSQL: pgsql/src/backend/tsearch/dict_simple.c,v 1.3 2007/08/25 00:03:59 tgl Exp $
10+
* $PostgreSQL: pgsql/src/backend/tsearch/dict_simple.c,v 1.4 2007/11/14 18:36:37 tgl Exp $
1111
*
1212
*-------------------------------------------------------------------------
1313
*/
@@ -23,6 +23,7 @@
2323
typedef struct
2424
{
2525
StopList stoplist;
26+
bool accept;
2627
} DictSimple;
2728

2829

@@ -31,9 +32,12 @@ dsimple_init(PG_FUNCTION_ARGS)
3132
{
3233
List *dictoptions = (List *) PG_GETARG_POINTER(0);
3334
DictSimple *d = (DictSimple *) palloc0(sizeof(DictSimple));
34-
bool stoploaded = false;
35+
bool stoploaded = false,
36+
acceptloaded = false;
3537
ListCell *l;
3638

39+
d->accept = true; /* default */
40+
3741
foreach(l, dictoptions)
3842
{
3943
DefElem *defel = (DefElem *) lfirst(l);
@@ -47,6 +51,15 @@ dsimple_init(PG_FUNCTION_ARGS)
4751
readstoplist(defGetString(defel), &d->stoplist, lowerstr);
4852
stoploaded = true;
4953
}
54+
else if (pg_strcasecmp("Accept", defel->defname) == 0)
55+
{
56+
if (acceptloaded)
57+
ereport(ERROR,
58+
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
59+
errmsg("multiple Accept parameters")));
60+
d->accept = defGetBoolean(defel);
61+
acceptloaded = true;
62+
}
5063
else
5164
{
5265
ereport(ERROR,
@@ -66,14 +79,28 @@ dsimple_lexize(PG_FUNCTION_ARGS)
6679
char *in = (char *) PG_GETARG_POINTER(1);
6780
int32 len = PG_GETARG_INT32(2);
6881
char *txt;
69-
TSLexeme *res = palloc0(sizeof(TSLexeme) * 2);
82+
TSLexeme *res;
7083

7184
txt = lowerstr_with_len(in, len);
7285

7386
if (*txt == '\0' || searchstoplist(&(d->stoplist), txt))
87+
{
88+
/* reject as stopword */
7489
pfree(txt);
75-
else
90+
res = palloc0(sizeof(TSLexeme) * 2);
91+
PG_RETURN_POINTER(res);
92+
}
93+
else if (d->accept)
94+
{
95+
/* accept */
96+
res = palloc0(sizeof(TSLexeme) * 2);
7697
res[0].lexeme = txt;
77-
78-
PG_RETURN_POINTER(res);
98+
PG_RETURN_POINTER(res);
99+
}
100+
else
101+
{
102+
/* report as unrecognized */
103+
pfree(txt);
104+
PG_RETURN_POINTER(NULL);
105+
}
79106
}

0 commit comments

Comments
 (0)