Thanks to visit codestin.com
Credit goes to doxygen.postgresql.org

PostgreSQL Source Code git master
pg_locale_libc.c
Go to the documentation of this file.
1/*-----------------------------------------------------------------------
2 *
3 * PostgreSQL locale utilities for libc
4 *
5 * Portions Copyright (c) 2002-2025, PostgreSQL Global Development Group
6 *
7 * src/backend/utils/adt/pg_locale_libc.c
8 *
9 *-----------------------------------------------------------------------
10 */
11
12#include "postgres.h"
13
14#include <limits.h>
15#include <wctype.h>
16
17#include "access/htup_details.h"
18#include "catalog/pg_database.h"
20#include "mb/pg_wchar.h"
21#include "miscadmin.h"
22#include "utils/builtins.h"
23#include "utils/formatting.h"
24#include "utils/memutils.h"
25#include "utils/pg_locale.h"
26#include "utils/syscache.h"
27
28#ifdef __GLIBC__
29#include <gnu/libc-version.h>
30#endif
31
32#ifdef WIN32
33#include <shlwapi.h>
34#endif
35
36/*
37 * For the libc provider, to provide as much functionality as possible on a
38 * variety of platforms without going so far as to implement everything from
39 * scratch, we use several implementation strategies depending on the
40 * situation:
41 *
42 * 1. In C/POSIX collations, we use hard-wired code. We can't depend on
43 * the <ctype.h> functions since those will obey LC_CTYPE. Note that these
44 * collations don't give a fig about multibyte characters.
45 *
46 * 2. When working in UTF8 encoding, we use the <wctype.h> functions.
47 * This assumes that every platform uses Unicode codepoints directly
48 * as the wchar_t representation of Unicode. (XXX: ICU makes this assumption
49 * even for non-UTF8 encodings, which may be a problem.) On some platforms
50 * wchar_t is only 16 bits wide, so we have to punt for codepoints > 0xFFFF.
51 *
52 * 3. In all other encodings, we use the <ctype.h> functions for pg_wchar
53 * values up to 255, and punt for values above that. This is 100% correct
54 * only in single-byte encodings such as LATINn. However, non-Unicode
55 * multibyte encodings are mostly Far Eastern character sets for which the
56 * properties being tested here aren't very relevant for higher code values
57 * anyway. The difficulty with using the <wctype.h> functions with
58 * non-Unicode multibyte encodings is that we can have no certainty that
59 * the platform's wchar_t representation matches what we do in pg_wchar
60 * conversions.
61 *
62 * As a special case, in the "default" collation, (2) and (3) force ASCII
63 * letters to follow ASCII upcase/downcase rules, while in a non-default
64 * collation we just let the library functions do what they will. The case
65 * where this matters is treatment of I/i in Turkish, and the behavior is
66 * meant to match the upper()/lower() SQL functions.
67 *
68 * We store the active collation setting in static variables. In principle
69 * it could be passed down to here via the regex library's "struct vars" data
70 * structure; but that would require somewhat invasive changes in the regex
71 * library, and right now there's no real benefit to be gained from that.
72 *
73 * NB: the coding here assumes pg_wchar is an unsigned type.
74 */
75
76/*
77 * Size of stack buffer to use for string transformations, used to avoid heap
78 * allocations in typical cases. This should be large enough that most strings
79 * will fit, but small enough that we feel comfortable putting it on the
80 * stack.
81 */
82#define TEXTBUFLEN 1024
83
85
86static int strncoll_libc(const char *arg1, ssize_t len1,
87 const char *arg2, ssize_t len2,
89static size_t strnxfrm_libc(char *dest, size_t destsize,
90 const char *src, ssize_t srclen,
92extern char *get_collation_actual_version_libc(const char *collcollate);
93static locale_t make_libc_collator(const char *collate,
94 const char *ctype);
95
96#ifdef WIN32
97static int strncoll_libc_win32_utf8(const char *arg1, ssize_t len1,
98 const char *arg2, ssize_t len2,
100#endif
101
102static size_t strlower_libc_sb(char *dest, size_t destsize,
103 const char *src, ssize_t srclen,
105static size_t strlower_libc_mb(char *dest, size_t destsize,
106 const char *src, ssize_t srclen,
108static size_t strtitle_libc_sb(char *dest, size_t destsize,
109 const char *src, ssize_t srclen,
111static size_t strtitle_libc_mb(char *dest, size_t destsize,
112 const char *src, ssize_t srclen,
114static size_t strupper_libc_sb(char *dest, size_t destsize,
115 const char *src, ssize_t srclen,
117static size_t strupper_libc_mb(char *dest, size_t destsize,
118 const char *src, ssize_t srclen,
120
121static bool
123{
124 return isdigit_l((unsigned char) wc, locale->info.lt);
125}
126
127static bool
129{
130 return isalpha_l((unsigned char) wc, locale->info.lt);
131}
132
133static bool
135{
136 return isalnum_l((unsigned char) wc, locale->info.lt);
137}
138
139static bool
141{
142 return isupper_l((unsigned char) wc, locale->info.lt);
143}
144
145static bool
147{
148 return islower_l((unsigned char) wc, locale->info.lt);
149}
150
151static bool
153{
154 return isgraph_l((unsigned char) wc, locale->info.lt);
155}
156
157static bool
159{
160 return isprint_l((unsigned char) wc, locale->info.lt);
161}
162
163static bool
165{
166 return ispunct_l((unsigned char) wc, locale->info.lt);
167}
168
169static bool
171{
172 return isspace_l((unsigned char) wc, locale->info.lt);
173}
174
175static bool
177{
178 return iswdigit_l((wint_t) wc, locale->info.lt);
179}
180
181static bool
183{
184 return iswalpha_l((wint_t) wc, locale->info.lt);
185}
186
187static bool
189{
190 return iswalnum_l((wint_t) wc, locale->info.lt);
191}
192
193static bool
195{
196 return iswupper_l((wint_t) wc, locale->info.lt);
197}
198
199static bool
201{
202 return iswlower_l((wint_t) wc, locale->info.lt);
203}
204
205static bool
207{
208 return iswgraph_l((wint_t) wc, locale->info.lt);
209}
210
211static bool
213{
214 return iswprint_l((wint_t) wc, locale->info.lt);
215}
216
217static bool
219{
220 return iswpunct_l((wint_t) wc, locale->info.lt);
221}
222
223static bool
225{
226 return iswspace_l((wint_t) wc, locale->info.lt);
227}
228
229static char
231{
233 return tolower_l(ch, locale->info.lt);
234}
235
236static bool
238{
239 bool is_multibyte = pg_database_encoding_max_length() > 1;
240
241 if (is_multibyte && IS_HIGHBIT_SET(ch))
242 return true;
243 else
244 return isalpha_l((unsigned char) ch, locale->info.lt);
245}
246
247static pg_wchar
249{
251
252 /* force C behavior for ASCII characters, per comments above */
253 if (locale->is_default && wc <= (pg_wchar) 127)
254 return pg_ascii_toupper((unsigned char) wc);
255 if (wc <= (pg_wchar) UCHAR_MAX)
256 return toupper_l((unsigned char) wc, locale->info.lt);
257 else
258 return wc;
259}
260
261static pg_wchar
263{
265
266 /* force C behavior for ASCII characters, per comments above */
267 if (locale->is_default && wc <= (pg_wchar) 127)
268 return pg_ascii_toupper((unsigned char) wc);
269 if (sizeof(wchar_t) >= 4 || wc <= (pg_wchar) 0xFFFF)
270 return towupper_l((wint_t) wc, locale->info.lt);
271 else
272 return wc;
273}
274
275static pg_wchar
277{
279
280 /* force C behavior for ASCII characters, per comments above */
281 if (locale->is_default && wc <= (pg_wchar) 127)
282 return pg_ascii_tolower((unsigned char) wc);
283 if (wc <= (pg_wchar) UCHAR_MAX)
284 return tolower_l((unsigned char) wc, locale->info.lt);
285 else
286 return wc;
287}
288
289static pg_wchar
291{
293
294 /* force C behavior for ASCII characters, per comments above */
295 if (locale->is_default && wc <= (pg_wchar) 127)
296 return pg_ascii_tolower((unsigned char) wc);
297 if (sizeof(wchar_t) >= 4 || wc <= (pg_wchar) 0xFFFF)
298 return towlower_l((wint_t) wc, locale->info.lt);
299 else
300 return wc;
301}
302
305 .strtitle = strtitle_libc_sb,
306 .strupper = strupper_libc_sb,
307 .wc_isdigit = wc_isdigit_libc_sb,
308 .wc_isalpha = wc_isalpha_libc_sb,
309 .wc_isalnum = wc_isalnum_libc_sb,
310 .wc_isupper = wc_isupper_libc_sb,
311 .wc_islower = wc_islower_libc_sb,
312 .wc_isgraph = wc_isgraph_libc_sb,
313 .wc_isprint = wc_isprint_libc_sb,
314 .wc_ispunct = wc_ispunct_libc_sb,
315 .wc_isspace = wc_isspace_libc_sb,
316 .char_is_cased = char_is_cased_libc,
317 .char_tolower = char_tolower_libc,
318 .wc_toupper = toupper_libc_sb,
319 .wc_tolower = tolower_libc_sb,
320 .max_chr = UCHAR_MAX,
321};
322
323/*
324 * Non-UTF8 multibyte encodings use multibyte semantics for case mapping, but
325 * single-byte semantics for pattern matching.
326 */
329 .strtitle = strtitle_libc_mb,
330 .strupper = strupper_libc_mb,
331 .wc_isdigit = wc_isdigit_libc_sb,
332 .wc_isalpha = wc_isalpha_libc_sb,
333 .wc_isalnum = wc_isalnum_libc_sb,
334 .wc_isupper = wc_isupper_libc_sb,
335 .wc_islower = wc_islower_libc_sb,
336 .wc_isgraph = wc_isgraph_libc_sb,
337 .wc_isprint = wc_isprint_libc_sb,
338 .wc_ispunct = wc_ispunct_libc_sb,
339 .wc_isspace = wc_isspace_libc_sb,
340 .char_is_cased = char_is_cased_libc,
341 .char_tolower = char_tolower_libc,
342 .wc_toupper = toupper_libc_sb,
343 .wc_tolower = tolower_libc_sb,
344 .max_chr = UCHAR_MAX,
345};
346
349 .strtitle = strtitle_libc_mb,
350 .strupper = strupper_libc_mb,
351 .wc_isdigit = wc_isdigit_libc_mb,
352 .wc_isalpha = wc_isalpha_libc_mb,
353 .wc_isalnum = wc_isalnum_libc_mb,
354 .wc_isupper = wc_isupper_libc_mb,
355 .wc_islower = wc_islower_libc_mb,
356 .wc_isgraph = wc_isgraph_libc_mb,
357 .wc_isprint = wc_isprint_libc_mb,
358 .wc_ispunct = wc_ispunct_libc_mb,
359 .wc_isspace = wc_isspace_libc_mb,
360 .char_is_cased = char_is_cased_libc,
361 .char_tolower = char_tolower_libc,
362 .wc_toupper = toupper_libc_mb,
363 .wc_tolower = tolower_libc_mb,
364};
365
368 .strnxfrm = strnxfrm_libc,
369 .strnxfrm_prefix = NULL,
370
	/*
	 * Unfortunately, it seems that strxfrm() for non-C collations is broken
	 * on many common platforms; testing of multiple versions of glibc reveals
	 * that, for many locales, strcoll() and strxfrm() do not return
	 * consistent results. While no other libc besides Cygwin's has so far
	 * been shown to have a problem, we take the conservative course of action
	 * for right now and disable this categorically. (Users who are certain
	 * this isn't a problem on their system can define TRUST_STRXFRM.)
	 */
380#ifdef TRUST_STRXFRM
381 .strxfrm_is_safe = true,
382#else
383 .strxfrm_is_safe = false,
384#endif
385};
386
#ifdef WIN32
/*
 * Collation method table that compares via strncoll_libc_win32_utf8 (which
 * converts its UTF8 arguments to wide characters) and transforms via
 * strnxfrm_libc.  No prefix-transform routine is provided.
 *
 * strxfrm_is_safe is only set when the build defines TRUST_STRXFRM.
 */
static const struct collate_methods collate_methods_libc_win32_utf8 = {
	.strncoll = strncoll_libc_win32_utf8,
	.strnxfrm = strnxfrm_libc,
	.strnxfrm_prefix = NULL,
#ifndef TRUST_STRXFRM
	.strxfrm_is_safe = false,
#else
	.strxfrm_is_safe = true,
#endif
};
#endif							/* WIN32 */
399
400static size_t
401strlower_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen,
403{
404 if (srclen < 0)
405 srclen = strlen(src);
406
407 if (srclen + 1 <= destsize)
408 {
409 locale_t loc = locale->info.lt;
410 char *p;
411
412 if (srclen + 1 > destsize)
413 return srclen;
414
415 memcpy(dest, src, srclen);
416 dest[srclen] = '\0';
417
418 /*
419 * Note: we assume that tolower_l() will not be so broken as to need
420 * an isupper_l() guard test. When using the default collation, we
421 * apply the traditional Postgres behavior that forces ASCII-style
422 * treatment of I/i, but in non-default collations you get exactly
423 * what the collation says.
424 */
425 for (p = dest; *p; p++)
426 {
427 if (locale->is_default)
428 *p = pg_tolower((unsigned char) *p);
429 else
430 *p = tolower_l((unsigned char) *p, loc);
431 }
432 }
433
434 return srclen;
435}
436
437static size_t
438strlower_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen,
440{
441 locale_t loc = locale->info.lt;
442 size_t result_size;
443 wchar_t *workspace;
444 char *result;
445 size_t curr_char;
446 size_t max_size;
447
448 if (srclen < 0)
449 srclen = strlen(src);
450
451 /* Overflow paranoia */
452 if ((srclen + 1) > (INT_MAX / sizeof(wchar_t)))
454 (errcode(ERRCODE_OUT_OF_MEMORY),
455 errmsg("out of memory")));
456
457 /* Output workspace cannot have more codes than input bytes */
458 workspace = (wchar_t *) palloc((srclen + 1) * sizeof(wchar_t));
459
460 char2wchar(workspace, srclen + 1, src, srclen, loc);
461
462 for (curr_char = 0; workspace[curr_char] != 0; curr_char++)
463 workspace[curr_char] = towlower_l(workspace[curr_char], loc);
464
465 /*
466 * Make result large enough; case change might change number of bytes
467 */
468 max_size = curr_char * pg_database_encoding_max_length();
469 result = palloc(max_size + 1);
470
471 result_size = wchar2char(result, workspace, max_size + 1, loc);
472
473 if (result_size + 1 > destsize)
474 return result_size;
475
476 memcpy(dest, result, result_size);
477 dest[result_size] = '\0';
478
479 pfree(workspace);
480 pfree(result);
481
482 return result_size;
483}
484
485static size_t
486strtitle_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen,
488{
489 if (srclen < 0)
490 srclen = strlen(src);
491
492 if (srclen + 1 <= destsize)
493 {
494 locale_t loc = locale->info.lt;
495 int wasalnum = false;
496 char *p;
497
498 memcpy(dest, src, srclen);
499 dest[srclen] = '\0';
500
501 /*
502 * Note: we assume that toupper_l()/tolower_l() will not be so broken
503 * as to need guard tests. When using the default collation, we apply
504 * the traditional Postgres behavior that forces ASCII-style treatment
505 * of I/i, but in non-default collations you get exactly what the
506 * collation says.
507 */
508 for (p = dest; *p; p++)
509 {
510 if (locale->is_default)
511 {
512 if (wasalnum)
513 *p = pg_tolower((unsigned char) *p);
514 else
515 *p = pg_toupper((unsigned char) *p);
516 }
517 else
518 {
519 if (wasalnum)
520 *p = tolower_l((unsigned char) *p, loc);
521 else
522 *p = toupper_l((unsigned char) *p, loc);
523 }
524 wasalnum = isalnum_l((unsigned char) *p, loc);
525 }
526 }
527
528 return srclen;
529}
530
531static size_t
532strtitle_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen,
534{
535 locale_t loc = locale->info.lt;
536 int wasalnum = false;
537 size_t result_size;
538 wchar_t *workspace;
539 char *result;
540 size_t curr_char;
541 size_t max_size;
542
543 if (srclen < 0)
544 srclen = strlen(src);
545
546 /* Overflow paranoia */
547 if ((srclen + 1) > (INT_MAX / sizeof(wchar_t)))
549 (errcode(ERRCODE_OUT_OF_MEMORY),
550 errmsg("out of memory")));
551
552 /* Output workspace cannot have more codes than input bytes */
553 workspace = (wchar_t *) palloc((srclen + 1) * sizeof(wchar_t));
554
555 char2wchar(workspace, srclen + 1, src, srclen, loc);
556
557 for (curr_char = 0; workspace[curr_char] != 0; curr_char++)
558 {
559 if (wasalnum)
560 workspace[curr_char] = towlower_l(workspace[curr_char], loc);
561 else
562 workspace[curr_char] = towupper_l(workspace[curr_char], loc);
563 wasalnum = iswalnum_l(workspace[curr_char], loc);
564 }
565
566 /*
567 * Make result large enough; case change might change number of bytes
568 */
569 max_size = curr_char * pg_database_encoding_max_length();
570 result = palloc(max_size + 1);
571
572 result_size = wchar2char(result, workspace, max_size + 1, loc);
573
574 if (result_size + 1 > destsize)
575 return result_size;
576
577 memcpy(dest, result, result_size);
578 dest[result_size] = '\0';
579
580 pfree(workspace);
581 pfree(result);
582
583 return result_size;
584}
585
586static size_t
587strupper_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen,
589{
590 if (srclen < 0)
591 srclen = strlen(src);
592
593 if (srclen + 1 <= destsize)
594 {
595 locale_t loc = locale->info.lt;
596 char *p;
597
598 memcpy(dest, src, srclen);
599 dest[srclen] = '\0';
600
601 /*
602 * Note: we assume that toupper_l() will not be so broken as to need
603 * an islower_l() guard test. When using the default collation, we
604 * apply the traditional Postgres behavior that forces ASCII-style
605 * treatment of I/i, but in non-default collations you get exactly
606 * what the collation says.
607 */
608 for (p = dest; *p; p++)
609 {
610 if (locale->is_default)
611 *p = pg_toupper((unsigned char) *p);
612 else
613 *p = toupper_l((unsigned char) *p, loc);
614 }
615 }
616
617 return srclen;
618}
619
620static size_t
621strupper_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen,
623{
624 locale_t loc = locale->info.lt;
625 size_t result_size;
626 wchar_t *workspace;
627 char *result;
628 size_t curr_char;
629 size_t max_size;
630
631 if (srclen < 0)
632 srclen = strlen(src);
633
634 /* Overflow paranoia */
635 if ((srclen + 1) > (INT_MAX / sizeof(wchar_t)))
637 (errcode(ERRCODE_OUT_OF_MEMORY),
638 errmsg("out of memory")));
639
640 /* Output workspace cannot have more codes than input bytes */
641 workspace = (wchar_t *) palloc((srclen + 1) * sizeof(wchar_t));
642
643 char2wchar(workspace, srclen + 1, src, srclen, loc);
644
645 for (curr_char = 0; workspace[curr_char] != 0; curr_char++)
646 workspace[curr_char] = towupper_l(workspace[curr_char], loc);
647
648 /*
649 * Make result large enough; case change might change number of bytes
650 */
651 max_size = curr_char * pg_database_encoding_max_length();
652 result = palloc(max_size + 1);
653
654 result_size = wchar2char(result, workspace, max_size + 1, loc);
655
656 if (result_size + 1 > destsize)
657 return result_size;
658
659 memcpy(dest, result, result_size);
660 dest[result_size] = '\0';
661
662 pfree(workspace);
663 pfree(result);
664
665 return result_size;
666}
667
670{
671 const char *collate;
672 const char *ctype;
673 locale_t loc;
674 pg_locale_t result;
675
676 if (collid == DEFAULT_COLLATION_OID)
677 {
678 HeapTuple tp;
679 Datum datum;
680
682 if (!HeapTupleIsValid(tp))
683 elog(ERROR, "cache lookup failed for database %u", MyDatabaseId);
684 datum = SysCacheGetAttrNotNull(DATABASEOID, tp,
685 Anum_pg_database_datcollate);
686 collate = TextDatumGetCString(datum);
687 datum = SysCacheGetAttrNotNull(DATABASEOID, tp,
688 Anum_pg_database_datctype);
689 ctype = TextDatumGetCString(datum);
690
691 ReleaseSysCache(tp);
692 }
693 else
694 {
695 HeapTuple tp;
696 Datum datum;
697
699 if (!HeapTupleIsValid(tp))
700 elog(ERROR, "cache lookup failed for collation %u", collid);
701
702 datum = SysCacheGetAttrNotNull(COLLOID, tp,
703 Anum_pg_collation_collcollate);
704 collate = TextDatumGetCString(datum);
705 datum = SysCacheGetAttrNotNull(COLLOID, tp,
706 Anum_pg_collation_collctype);
707 ctype = TextDatumGetCString(datum);
708
709 ReleaseSysCache(tp);
710 }
711
712
713 loc = make_libc_collator(collate, ctype);
714
715 result = MemoryContextAllocZero(context, sizeof(struct pg_locale_struct));
716 result->deterministic = true;
717 result->collate_is_c = (strcmp(collate, "C") == 0) ||
718 (strcmp(collate, "POSIX") == 0);
719 result->ctype_is_c = (strcmp(ctype, "C") == 0) ||
720 (strcmp(ctype, "POSIX") == 0);
721 result->info.lt = loc;
722 if (!result->collate_is_c)
723 {
724#ifdef WIN32
726 result->collate = &collate_methods_libc_win32_utf8;
727 else
728#endif
729 result->collate = &collate_methods_libc;
730 }
731 if (!result->ctype_is_c)
732 {
737 else
738 result->ctype = &ctype_methods_libc_sb;
739 }
740
741 return result;
742}
743
744/*
745 * Create a locale_t with the given collation and ctype.
746 *
747 * The "C" and "POSIX" locales are not actually handled by libc, so return
748 * NULL.
749 *
750 * Ensure that no path leaks a locale_t.
751 */
752static locale_t
753make_libc_collator(const char *collate, const char *ctype)
754{
755 locale_t loc = 0;
756
757 if (strcmp(collate, ctype) == 0)
758 {
759 if (strcmp(ctype, "C") != 0 && strcmp(ctype, "POSIX") != 0)
760 {
761 /* Normal case where they're the same */
762 errno = 0;
763#ifndef WIN32
764 loc = newlocale(LC_COLLATE_MASK | LC_CTYPE_MASK, collate,
765 NULL);
766#else
767 loc = _create_locale(LC_ALL, collate);
768#endif
769 if (!loc)
771 }
772 }
773 else
774 {
775#ifndef WIN32
776 /* We need two newlocale() steps */
777 locale_t loc1 = 0;
778
779 if (strcmp(collate, "C") != 0 && strcmp(collate, "POSIX") != 0)
780 {
781 errno = 0;
782 loc1 = newlocale(LC_COLLATE_MASK, collate, NULL);
783 if (!loc1)
785 }
786
787 if (strcmp(ctype, "C") != 0 && strcmp(ctype, "POSIX") != 0)
788 {
789 errno = 0;
790 loc = newlocale(LC_CTYPE_MASK, ctype, loc1);
791 if (!loc)
792 {
793 if (loc1)
794 freelocale(loc1);
796 }
797 }
798 else
799 loc = loc1;
800#else
801
802 /*
803 * XXX The _create_locale() API doesn't appear to support this. Could
804 * perhaps be worked around by changing pg_locale_t to contain two
805 * separate fields.
806 */
808 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
809 errmsg("collations with different collate and ctype values are not supported on this platform")));
810#endif
811 }
812
813 return loc;
814}
815
816/*
817 * strncoll_libc
818 *
819 * NUL-terminate arguments, if necessary, and pass to strcoll_l().
820 *
821 * An input string length of -1 means that it's already NUL-terminated.
822 */
823int
824strncoll_libc(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2,
826{
827 char sbuf[TEXTBUFLEN];
828 char *buf = sbuf;
829 size_t bufsize1 = (len1 == -1) ? 0 : len1 + 1;
830 size_t bufsize2 = (len2 == -1) ? 0 : len2 + 1;
831 const char *arg1n;
832 const char *arg2n;
833 int result;
834
835 if (bufsize1 + bufsize2 > TEXTBUFLEN)
836 buf = palloc(bufsize1 + bufsize2);
837
838 /* nul-terminate arguments if necessary */
839 if (len1 == -1)
840 {
841 arg1n = arg1;
842 }
843 else
844 {
845 char *buf1 = buf;
846
847 memcpy(buf1, arg1, len1);
848 buf1[len1] = '\0';
849 arg1n = buf1;
850 }
851
852 if (len2 == -1)
853 {
854 arg2n = arg2;
855 }
856 else
857 {
858 char *buf2 = buf + bufsize1;
859
860 memcpy(buf2, arg2, len2);
861 buf2[len2] = '\0';
862 arg2n = buf2;
863 }
864
865 result = strcoll_l(arg1n, arg2n, locale->info.lt);
866
867 if (buf != sbuf)
868 pfree(buf);
869
870 return result;
871}
872
873/*
874 * strnxfrm_libc
875 *
876 * NUL-terminate src, if necessary, and pass to strxfrm_l().
877 *
878 * A source length of -1 means that it's already NUL-terminated.
879 */
880size_t
881strnxfrm_libc(char *dest, size_t destsize, const char *src, ssize_t srclen,
883{
884 char sbuf[TEXTBUFLEN];
885 char *buf = sbuf;
886 size_t bufsize = srclen + 1;
887 size_t result;
888
889 if (srclen == -1)
890 return strxfrm_l(dest, src, destsize, locale->info.lt);
891
892 if (bufsize > TEXTBUFLEN)
893 buf = palloc(bufsize);
894
895 /* nul-terminate argument */
896 memcpy(buf, src, srclen);
897 buf[srclen] = '\0';
898
899 result = strxfrm_l(dest, buf, destsize, locale->info.lt);
900
901 if (buf != sbuf)
902 pfree(buf);
903
904 /* if dest is defined, it should be nul-terminated */
905 Assert(result >= destsize || dest[result] == '\0');
906
907 return result;
908}
909
910char *
911get_collation_actual_version_libc(const char *collcollate)
912{
913 char *collversion = NULL;
914
915 if (pg_strcasecmp("C", collcollate) != 0 &&
916 pg_strncasecmp("C.", collcollate, 2) != 0 &&
917 pg_strcasecmp("POSIX", collcollate) != 0)
918 {
919#if defined(__GLIBC__)
920 /* Use the glibc version because we don't have anything better. */
921 collversion = pstrdup(gnu_get_libc_version());
922#elif defined(LC_VERSION_MASK)
923 locale_t loc;
924
925 /* Look up FreeBSD collation version. */
926 loc = newlocale(LC_COLLATE_MASK, collcollate, NULL);
927 if (loc)
928 {
929 collversion =
930 pstrdup(querylocale(LC_COLLATE_MASK | LC_VERSION_MASK, loc));
931 freelocale(loc);
932 }
933 else
935 (errmsg("could not load locale \"%s\"", collcollate)));
936#elif defined(WIN32)
		/*
		 * If we are targeting Windows Vista and above, we can ask for a
		 * version given a collation name (earlier versions required a
		 * location code that we don't have).
		 */
942 NLSVERSIONINFOEX version = {sizeof(NLSVERSIONINFOEX)};
943 WCHAR wide_collcollate[LOCALE_NAME_MAX_LENGTH];
944
945 MultiByteToWideChar(CP_ACP, 0, collcollate, -1, wide_collcollate,
946 LOCALE_NAME_MAX_LENGTH);
947 if (!GetNLSVersionEx(COMPARE_STRING, wide_collcollate, &version))
948 {
949 /*
950 * GetNLSVersionEx() wants a language tag such as "en-US", not a
951 * locale name like "English_United States.1252". Until those
952 * values can be prevented from entering the system, or 100%
953 * reliably converted to the more useful tag format, tolerate the
954 * resulting error and report that we have no version data.
955 */
956 if (GetLastError() == ERROR_INVALID_PARAMETER)
957 return NULL;
958
960 (errmsg("could not get collation version for locale \"%s\": error code %lu",
961 collcollate,
962 GetLastError())));
963 }
964 collversion = psprintf("%lu.%lu,%lu.%lu",
965 (version.dwNLSVersion >> 8) & 0xFFFF,
966 version.dwNLSVersion & 0xFF,
967 (version.dwDefinedVersion >> 8) & 0xFFFF,
968 version.dwDefinedVersion & 0xFF);
969#endif
970 }
971
972 return collversion;
973}
974
975/*
976 * strncoll_libc_win32_utf8
977 *
978 * Win32 does not have UTF-8. Convert UTF8 arguments to wide characters and
979 * invoke wcscoll_l().
980 *
981 * An input string length of -1 means that it's NUL-terminated.
982 */
983#ifdef WIN32
984static int
985strncoll_libc_win32_utf8(const char *arg1, ssize_t len1, const char *arg2,
986 ssize_t len2, pg_locale_t locale)
987{
988 char sbuf[TEXTBUFLEN];
989 char *buf = sbuf;
990 char *a1p,
991 *a2p;
992 int a1len;
993 int a2len;
994 int r;
995 int result;
996
998
999 if (len1 == -1)
1000 len1 = strlen(arg1);
1001 if (len2 == -1)
1002 len2 = strlen(arg2);
1003
1004 a1len = len1 * 2 + 2;
1005 a2len = len2 * 2 + 2;
1006
1007 if (a1len + a2len > TEXTBUFLEN)
1008 buf = palloc(a1len + a2len);
1009
1010 a1p = buf;
1011 a2p = buf + a1len;
1012
1013 /* API does not work for zero-length input */
1014 if (len1 == 0)
1015 r = 0;
1016 else
1017 {
1018 r = MultiByteToWideChar(CP_UTF8, 0, arg1, len1,
1019 (LPWSTR) a1p, a1len / 2);
1020 if (!r)
1021 ereport(ERROR,
1022 (errmsg("could not convert string to UTF-16: error code %lu",
1023 GetLastError())));
1024 }
1025 ((LPWSTR) a1p)[r] = 0;
1026
1027 if (len2 == 0)
1028 r = 0;
1029 else
1030 {
1031 r = MultiByteToWideChar(CP_UTF8, 0, arg2, len2,
1032 (LPWSTR) a2p, a2len / 2);
1033 if (!r)
1034 ereport(ERROR,
1035 (errmsg("could not convert string to UTF-16: error code %lu",
1036 GetLastError())));
1037 }
1038 ((LPWSTR) a2p)[r] = 0;
1039
1040 errno = 0;
1041 result = wcscoll_l((LPWSTR) a1p, (LPWSTR) a2p, locale->info.lt);
1042 if (result == 2147483647) /* _NLSCMPERROR; missing from mingw headers */
1043 ereport(ERROR,
1044 (errmsg("could not compare Unicode strings: %m")));
1045
1046 if (buf != sbuf)
1047 pfree(buf);
1048
1049 return result;
1050}
1051#endif /* WIN32 */
1052
1053/* simple subroutine for reporting errors from newlocale() */
1054void
1055report_newlocale_failure(const char *localename)
1056{
1057 int save_errno;
1058
1059 /*
1060 * Windows doesn't provide any useful error indication from
1061 * _create_locale(), and BSD-derived platforms don't seem to feel they
1062 * need to set errno either (even though POSIX is pretty clear that
1063 * newlocale should do so). So, if errno hasn't been set, assume ENOENT
1064 * is what to report.
1065 */
1066 if (errno == 0)
1067 errno = ENOENT;
1068
1069 /*
1070 * ENOENT means "no such locale", not "no such file", so clarify that
1071 * errno with an errdetail message.
1072 */
1073 save_errno = errno; /* auxiliary funcs might change errno */
1074 ereport(ERROR,
1075 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1076 errmsg("could not create locale \"%s\": %m",
1077 localename),
1078 (save_errno == ENOENT ?
1079 errdetail("The operating system could not find any locale data for the locale name \"%s\".",
1080 localename) : 0)));
1081}
1082
1083/*
1084 * POSIX doesn't define _l-variants of these functions, but several systems
1085 * have them. We provide our own replacements here.
1086 */
#ifndef HAVE_MBSTOWCS_L
/*
 * Replacement for mbstowcs_l() on platforms that lack it: same arguments and
 * result as mbstowcs(), but the conversion is done under locale loc.
 *
 * On Windows this forwards to _mbstowcs_l().  Elsewhere it switches locale
 * with uselocale(), calls mbstowcs(), and then hands the value returned by
 * the first uselocale() call back to uselocale().
 */
static size_t
mbstowcs_l(wchar_t *dest, const char *src, size_t n, locale_t loc)
{
#ifndef WIN32
	locale_t	prev_locale = uselocale(loc);
	size_t		converted = mbstowcs(dest, src, n);

	uselocale(prev_locale);
	return converted;
#else
	return _mbstowcs_l(dest, src, n, loc);
#endif
}
#endif
#ifndef HAVE_WCSTOMBS_L
/*
 * Replacement for wcstombs_l() on platforms that lack it: same arguments and
 * result as wcstombs(), but the conversion is done under locale loc.
 *
 * On Windows this forwards to _wcstombs_l().  Elsewhere it switches locale
 * with uselocale(), calls wcstombs(), and then hands the value returned by
 * the first uselocale() call back to uselocale().
 */
static size_t
wcstombs_l(char *dest, const wchar_t *src, size_t n, locale_t loc)
{
#ifndef WIN32
	locale_t	prev_locale = uselocale(loc);
	size_t		converted = wcstombs(dest, src, n);

	uselocale(prev_locale);
	return converted;
#else
	return _wcstombs_l(dest, src, n, loc);
#endif
}
#endif
1119
/*
 * These functions convert from/to libc's wchar_t, *not* pg_wchar.
 * Therefore we keep them here rather than with the mbutils code.
 */
1124
1125/*
1126 * wchar2char --- convert wide characters to multibyte format
1127 *
1128 * This has the same API as the standard wcstombs_l() function; in particular,
1129 * tolen is the maximum number of bytes to store at *to, and *from must be
1130 * zero-terminated. The output will be zero-terminated iff there is room.
1131 */
1132size_t
1133wchar2char(char *to, const wchar_t *from, size_t tolen, locale_t loc)
1134{
1135 size_t result;
1136
1137 if (tolen == 0)
1138 return 0;
1139
1140#ifdef WIN32
1141
	/*
	 * On Windows, the "Unicode" locales assume UTF16 not UTF8 encoding, and
	 * for some reason mbstowcs and wcstombs won't do this for us, so we use
	 * WideCharToMultiByte() here (and MultiByteToWideChar() in char2wchar).
	 */
1148 {
1149 result = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, tolen,
1150 NULL, NULL);
1151 /* A zero return is failure */
1152 if (result <= 0)
1153 result = -1;
1154 else
1155 {
1156 Assert(result <= tolen);
1157 /* Microsoft counts the zero terminator in the result */
1158 result--;
1159 }
1160 }
1161 else
1162#endif /* WIN32 */
1163 if (loc == (locale_t) 0)
1164 {
1165 /* Use wcstombs directly for the default locale */
1166 result = wcstombs(to, from, tolen);
1167 }
1168 else
1169 {
1170 /* Use wcstombs_l for nondefault locales */
1171 result = wcstombs_l(to, from, tolen, loc);
1172 }
1173
1174 return result;
1175}
1176
1177/*
1178 * char2wchar --- convert multibyte characters to wide characters
1179 *
1180 * This has almost the API of mbstowcs_l(), except that *from need not be
1181 * null-terminated; instead, the number of input bytes is specified as
1182 * fromlen. Also, we ereport() rather than returning -1 for invalid
1183 * input encoding. tolen is the maximum number of wchar_t's to store at *to.
1184 * The output will be zero-terminated iff there is room.
1185 */
1186size_t
1187char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen,
1188 locale_t loc)
1189{
1190 size_t result;
1191
1192 if (tolen == 0)
1193 return 0;
1194
1195#ifdef WIN32
1196 /* See WIN32 "Unicode" comment above */
1198 {
1199 /* Win32 API does not work for zero-length input */
1200 if (fromlen == 0)
1201 result = 0;
1202 else
1203 {
1204 result = MultiByteToWideChar(CP_UTF8, 0, from, fromlen, to, tolen - 1);
1205 /* A zero return is failure */
1206 if (result == 0)
1207 result = -1;
1208 }
1209
1210 if (result != -1)
1211 {
1212 Assert(result < tolen);
1213 /* Append trailing null wchar (MultiByteToWideChar() does not) */
1214 to[result] = 0;
1215 }
1216 }
1217 else
1218#endif /* WIN32 */
1219 {
1220 /* mbstowcs requires ending '\0' */
1221 char *str = pnstrdup(from, fromlen);
1222
1223 if (loc == (locale_t) 0)
1224 {
1225 /* Use mbstowcs directly for the default locale */
1226 result = mbstowcs(to, str, tolen);
1227 }
1228 else
1229 {
1230 /* Use mbstowcs_l for nondefault locales */
1231 result = mbstowcs_l(to, str, tolen, loc);
1232 }
1233
1234 pfree(str);
1235 }
1236
1237 if (result == -1)
1238 {
1239 /*
1240 * Invalid multibyte character encountered. We try to give a useful
1241 * error message by letting pg_verifymbstr check the string. But it's
1242 * possible that the string is OK to us, and not OK to mbstowcs ---
1243 * this suggests that the LC_CTYPE locale is different from the
1244 * database encoding. Give a generic error message if pg_verifymbstr
1245 * can't find anything wrong.
1246 */
1247 pg_verifymbstr(from, fromlen, false); /* might not return */
1248 /* but if it does ... */
1249 ereport(ERROR,
1250 (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
1251 errmsg("invalid multibyte character for locale"),
1252 errhint("The server's LC_CTYPE locale is probably incompatible with the database encoding.")));
1253 }
1254
1255 return result;
1256}
#define TextDatumGetCString(d)
Definition: builtins.h:98
#define IS_HIGHBIT_SET(ch)
Definition: c.h:1155
Oid collid
int errdetail(const char *fmt,...)
Definition: elog.c:1207
int errhint(const char *fmt,...)
Definition: elog.c:1321
int errcode(int sqlerrcode)
Definition: elog.c:854
int errmsg(const char *fmt,...)
Definition: elog.c:1071
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:226
#define ereport(elevel,...)
Definition: elog.h:150
Oid MyDatabaseId
Definition: globals.c:94
Assert(PointerIsAligned(start, uint64))
const char * str
#define HeapTupleIsValid(tuple)
Definition: htup.h:78
#define bufsize
Definition: indent_globs.h:36
static char * locale
Definition: initdb.c:140
if(TABLE==NULL||TABLE_index==NULL)
Definition: isn.c:81
unsigned int pg_wchar
Definition: mbprint.c:31
int GetDatabaseEncoding(void)
Definition: mbutils.c:1262
bool pg_verifymbstr(const char *mbstr, int len, bool noError)
Definition: mbutils.c:1557
int pg_database_encoding_max_length(void)
Definition: mbutils.c:1547
void * MemoryContextAllocZero(MemoryContext context, Size size)
Definition: mcxt.c:1263
char * pstrdup(const char *in)
Definition: mcxt.c:1759
void pfree(void *pointer)
Definition: mcxt.c:1594
void * palloc(Size size)
Definition: mcxt.c:1365
char * pnstrdup(const char *in, Size len)
Definition: mcxt.c:1770
static bool wc_isalpha_libc_mb(pg_wchar wc, pg_locale_t locale)
static size_t strtitle_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale)
static bool wc_ispunct_libc_sb(pg_wchar wc, pg_locale_t locale)
static const struct ctype_methods ctype_methods_libc_other_mb
static const struct ctype_methods ctype_methods_libc_utf8
static pg_wchar toupper_libc_mb(pg_wchar wc, pg_locale_t locale)
static size_t strupper_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale)
static bool wc_isprint_libc_mb(pg_wchar wc, pg_locale_t locale)
static bool wc_isdigit_libc_sb(pg_wchar wc, pg_locale_t locale)
pg_locale_t create_pg_locale_libc(Oid collid, MemoryContext context)
size_t wchar2char(char *to, const wchar_t *from, size_t tolen, locale_t loc)
static bool wc_isspace_libc_sb(pg_wchar wc, pg_locale_t locale)
size_t char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen, locale_t loc)
static bool wc_islower_libc_sb(pg_wchar wc, pg_locale_t locale)
static bool wc_isupper_libc_mb(pg_wchar wc, pg_locale_t locale)
static pg_wchar toupper_libc_sb(pg_wchar wc, pg_locale_t locale)
static bool wc_isgraph_libc_mb(pg_wchar wc, pg_locale_t locale)
static bool wc_isalnum_libc_mb(pg_wchar wc, pg_locale_t locale)
static bool wc_isalnum_libc_sb(pg_wchar wc, pg_locale_t locale)
static bool wc_isalpha_libc_sb(pg_wchar wc, pg_locale_t locale)
static bool wc_isprint_libc_sb(pg_wchar wc, pg_locale_t locale)
char * get_collation_actual_version_libc(const char *collcollate)
static bool wc_isupper_libc_sb(pg_wchar wc, pg_locale_t locale)
static locale_t make_libc_collator(const char *collate, const char *ctype)
static bool wc_isgraph_libc_sb(pg_wchar wc, pg_locale_t locale)
static pg_wchar tolower_libc_sb(pg_wchar wc, pg_locale_t locale)
static size_t wcstombs_l(char *dest, const wchar_t *src, size_t n, locale_t loc)
static const struct collate_methods collate_methods_libc
static bool wc_ispunct_libc_mb(pg_wchar wc, pg_locale_t locale)
static size_t strupper_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale)
static int strncoll_libc(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2, pg_locale_t locale)
static size_t strlower_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale)
static bool wc_islower_libc_mb(pg_wchar wc, pg_locale_t locale)
static const struct ctype_methods ctype_methods_libc_sb
static size_t strnxfrm_libc(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale)
void report_newlocale_failure(const char *localename)
static pg_wchar tolower_libc_mb(pg_wchar wc, pg_locale_t locale)
static bool wc_isdigit_libc_mb(pg_wchar wc, pg_locale_t locale)
static char char_tolower_libc(unsigned char ch, pg_locale_t locale)
static bool char_is_cased_libc(char ch, pg_locale_t locale)
static bool wc_isspace_libc_mb(pg_wchar wc, pg_locale_t locale)
#define TEXTBUFLEN
static size_t strtitle_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale)
static size_t mbstowcs_l(wchar_t *dest, const char *src, size_t n, locale_t loc)
static size_t strlower_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale)
static char * buf
Definition: pg_test_fsync.c:72
@ PG_UTF8
Definition: pg_wchar.h:232
int pg_strcasecmp(const char *s1, const char *s2)
Definition: pgstrcasecmp.c:36
unsigned char pg_toupper(unsigned char ch)
Definition: pgstrcasecmp.c:105
unsigned char pg_tolower(unsigned char ch)
Definition: pgstrcasecmp.c:122
unsigned char pg_ascii_tolower(unsigned char ch)
Definition: pgstrcasecmp.c:146
unsigned char pg_ascii_toupper(unsigned char ch)
Definition: pgstrcasecmp.c:135
int pg_strncasecmp(const char *s1, const char *s2, size_t n)
Definition: pgstrcasecmp.c:69
static Datum ObjectIdGetDatum(Oid X)
Definition: postgres.h:262
uint64_t Datum
Definition: postgres.h:70
unsigned int Oid
Definition: postgres_ext.h:32
char * psprintf(const char *fmt,...)
Definition: psprintf.c:43
int(* strncoll)(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2, pg_locale_t locale)
Definition: pg_locale.h:64
size_t(* strlower)(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale)
Definition: pg_locale.h:90
const struct ctype_methods * ctype
Definition: pg_locale.h:157
const struct collate_methods * collate
Definition: pg_locale.h:156
union pg_locale_struct::@162 info
void ReleaseSysCache(HeapTuple tuple)
Definition: syscache.c:264
HeapTuple SearchSysCache1(int cacheId, Datum key1)
Definition: syscache.c:220
Datum SysCacheGetAttrNotNull(int cacheId, HeapTuple tup, AttrNumber attributeNumber)
Definition: syscache.c:625
#define locale_t
Definition: win32_port.h:432
#define toupper_l
Definition: win32_port.h:434
#define iswalnum_l
Definition: win32_port.h:442
#define isgraph_l
Definition: win32_port.h:447
#define towupper_l
Definition: win32_port.h:436
#define ispunct_l
Definition: win32_port.h:451
#define isalpha_l
Definition: win32_port.h:439
#define strcoll_l
Definition: win32_port.h:455
#define iswgraph_l
Definition: win32_port.h:448
#define strxfrm_l
Definition: win32_port.h:456
#define towlower_l
Definition: win32_port.h:435
#define iswspace_l
Definition: win32_port.h:454
#define isdigit_l
Definition: win32_port.h:437
#define wcscoll_l
Definition: win32_port.h:457
#define tolower_l
Definition: win32_port.h:433
#define iswupper_l
Definition: win32_port.h:444
#define iswalpha_l
Definition: win32_port.h:440
#define isprint_l
Definition: win32_port.h:449
#define iswprint_l
Definition: win32_port.h:450
#define isupper_l
Definition: win32_port.h:443
#define isalnum_l
Definition: win32_port.h:441
#define islower_l
Definition: win32_port.h:445
#define iswlower_l
Definition: win32_port.h:446
#define iswpunct_l
Definition: win32_port.h:452
#define isspace_l
Definition: win32_port.h:453
#define iswdigit_l
Definition: win32_port.h:438