Thanks to visit codestin.com
Credit goes to chromium.googlesource.com

blob: 2efcd67912b7587d041734c7fb756143d5cdcd07 [file] [log] [blame]
/*
** 2004 April 13
**
** The author disclaims copyright to this source code.  In place of
** a legal notice, here is a blessing:
**
**    May you do good and not evil.
**    May you find forgiveness for yourself and forgive others.
**    May you share freely, never taking more than you give.
**
*************************************************************************
** This file contains routines used to translate between UTF-8,
** UTF-16, UTF-16BE, and UTF-16LE.
**
** Notes on UTF-8:
**
**   Byte-0    Byte-1    Byte-2    Byte-3    Value
**  0xxxxxxx                                 00000000 00000000 0xxxxxxx
**  110yyyyy  10xxxxxx                       00000000 00000yyy yyxxxxxx
**  1110zzzz  10yyyyyy  10xxxxxx             00000000 zzzzyyyy yyxxxxxx
**  11110uuu  10uuzzzz  10yyyyyy  10xxxxxx   000uuuuu zzzzyyyy yyxxxxxx
**
**
** Notes on UTF-16:  (with wwww+1==uuuuu)
**
**      Word-0               Word-1          Value
**  110110ww wwzzzzyy   110111yy yyxxxxxx    000uuuuu zzzzyyyy yyxxxxxx
**  zzzzyyyy yyxxxxxx                        00000000 zzzzyyyy yyxxxxxx
**
**
** BOM or Byte Order Mark:
**     0xff 0xfe   little-endian utf-16 follows
**     0xfe 0xff   big-endian utf-16 follows
**
*/
#include "sqliteInt.h"
#include <assert.h>
#include "vdbeInt.h"
#if !defined(SQLITE_AMALGAMATION) && SQLITE_BYTEORDER==0
/*
** The following constant value is used by the SQLITE_BIGENDIAN and
** SQLITE_LITTLEENDIAN macros.
**
** It is only defined here when this file is compiled outside of the
** amalgamation and SQLITE_BYTEORDER evaluates to 0.
*/
const int sqlite3one = 1;
#endif /* !SQLITE_AMALGAMATION && SQLITE_BYTEORDER==0 */
/*
** This lookup table is used to help decode the first byte of
** a multi-byte UTF8 character.
**
** The table is indexed by (firstByte - 0xc0) and therefore has 64
** entries, one for each lead byte in the range 0xc0 through 0xff.
** Each entry holds the payload bits carried by that lead byte:
**
**    0xc0..0xdf   low 5 bits   (2-byte sequences)
**    0xe0..0xef   low 4 bits   (3-byte sequences)
**    0xf0..0xf7   low 3 bits   (4-byte sequences)
**    0xf8..0xff   low 2, 1 or 0 bits
*/
static const unsigned char sqlite3Utf8Trans1[] = {
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
  0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00,
};
/*
** Append the UTF-8 encoding of the value c to the buffer at zOut and
** advance zOut past the bytes written.  Between one and four bytes are
** written:
**
**    c <  0x00080     1 byte
**    c <  0x00800     2 bytes
**    c <  0x10000     3 bytes
**    otherwise        4 bytes (only the low 21 bits of c are encoded)
**
** zOut must be a modifiable pointer lvalue.  The c argument is evaluated
** several times and so must not have side effects.  The body is wrapped
** in do{...}while(0) so that "WRITE_UTF8(z,c);" behaves as exactly one
** statement, including directly under an unbraced if/else.
*/
#define WRITE_UTF8(zOut, c) do{                            \
  if( (c)<0x00080 ){                                       \
    *(zOut)++ = (u8)((c)&0xFF);                            \
  }                                                        \
  else if( (c)<0x00800 ){                                  \
    *(zOut)++ = 0xC0 + (u8)(((c)>>6)&0x1F);                \
    *(zOut)++ = 0x80 + (u8)((c) & 0x3F);                   \
  }                                                        \
  else if( (c)<0x10000 ){                                  \
    *(zOut)++ = 0xE0 + (u8)(((c)>>12)&0x0F);               \
    *(zOut)++ = 0x80 + (u8)(((c)>>6) & 0x3F);              \
    *(zOut)++ = 0x80 + (u8)((c) & 0x3F);                   \
  }else{                                                   \
    *(zOut)++ = 0xF0 + (u8)(((c)>>18) & 0x07);             \
    *(zOut)++ = 0x80 + (u8)(((c)>>12) & 0x3F);             \
    *(zOut)++ = 0x80 + (u8)(((c)>>6) & 0x3F);              \
    *(zOut)++ = 0x80 + (u8)((c) & 0x3F);                   \
  }                                                        \
}while(0)
/*
** Append the little-endian UTF-16 encoding of the value c to the buffer
** at zOut and advance zOut past the bytes written.
**
** Values up to 0xFFFF are written as a single 16-bit code unit (2 bytes,
** low byte first).  Larger values are written as a surrogate pair
** (4 bytes): the high surrogate 0xD800+((c-0x10000)>>10) followed by the
** low surrogate 0xDC00+(c&0x3FF), each with its low byte first.
**
** zOut must be a modifiable pointer lvalue.  The c argument is evaluated
** several times and so must not have side effects.  The body is wrapped
** in do{...}while(0) so the macro behaves as exactly one statement.
*/
#define WRITE_UTF16LE(zOut, c) do{                                        \
  if( (c)<=0xFFFF ){                                                      \
    *(zOut)++ = (u8)((c)&0x00FF);                                         \
    *(zOut)++ = (u8)(((c)>>8)&0x00FF);                                    \
  }else{                                                                  \
    *(zOut)++ = (u8)((((c)>>10)&0x003F) + ((((c)-0x10000)>>10)&0x00C0));  \
    *(zOut)++ = (u8)(0x00D8 + ((((c)-0x10000)>>18)&0x03));                \
    *(zOut)++ = (u8)((c)&0x00FF);                                         \
    *(zOut)++ = (u8)(0x00DC + (((c)>>8)&0x03));                           \
  }                                                                       \
}while(0)
/*
** Append the big-endian UTF-16 encoding of the value c to the buffer
** at zOut and advance zOut past the bytes written.
**
** Values up to 0xFFFF are written as a single 16-bit code unit (2 bytes,
** high byte first).  Larger values are written as a surrogate pair
** (4 bytes): the high surrogate 0xD800+((c-0x10000)>>10) followed by the
** low surrogate 0xDC00+(c&0x3FF), each with its high byte first.
**
** zOut must be a modifiable pointer lvalue.  The c argument is evaluated
** several times and so must not have side effects.  The body is wrapped
** in do{...}while(0) so the macro behaves as exactly one statement.
*/
#define WRITE_UTF16BE(zOut, c) do{                                        \
  if( (c)<=0xFFFF ){                                                      \
    *(zOut)++ = (u8)(((c)>>8)&0x00FF);                                    \
    *(zOut)++ = (u8)((c)&0x00FF);                                         \
  }else{                                                                  \
    *(zOut)++ = (u8)(0x00D8 + ((((c)-0x10000)>>18)&0x03));                \
    *(zOut)++ = (u8)((((c)>>10)&0x003F) + ((((c)-0x10000)>>10)&0x00C0));  \
    *(zOut)++ = (u8)(0x00DC + (((c)>>8)&0x03));                           \
    *(zOut)++ = (u8)((c)&0x00FF);                                         \
  }                                                                       \
}while(0)
danielk1977bfd6cce2004-06-18 04:24:54108/*
drha357a902025-02-25 11:47:34109** Write a single UTF8 character whose value is v into the
110** buffer starting at zOut. zOut must be sized to hold at
drha58208a2025-05-19 12:34:11111** least four bytes. Return the number of bytes needed
drha357a902025-02-25 11:47:34112** to encode the new character.
113*/
114int sqlite3AppendOneUtf8Character(char *zOut, u32 v){
115 if( v<0x00080 ){
116 zOut[0] = (u8)(v & 0xff);
117 return 1;
118 }
119 if( v<0x00800 ){
120 zOut[0] = 0xc0 + (u8)((v>>6) & 0x1f);
121 zOut[1] = 0x80 + (u8)(v & 0x3f);
122 return 2;
123 }
124 if( v<0x10000 ){
125 zOut[0] = 0xe0 + (u8)((v>>12) & 0x0f);
126 zOut[1] = 0x80 + (u8)((v>>6) & 0x3f);
127 zOut[2] = 0x80 + (u8)(v & 0x3f);
128 return 3;
129 }
130 zOut[0] = 0xf0 + (u8)((v>>18) & 0x07);
131 zOut[1] = 0x80 + (u8)((v>>12) & 0x3f);
132 zOut[2] = 0x80 + (u8)((v>>6) & 0x3f);
133 zOut[3] = 0x80 + (u8)(v & 0x3f);
134 return 4;
135}
136
/*
** Translate a single UTF-8 character.  The unicode value is stored
** in c and zIn is advanced to point at the next unread byte.
**
** Continuation bytes are never read at or beyond zTerm; translation
** behaves as if the byte that zTerm points to were a 0x00.
**
** zIn and c must be modifiable lvalues.  All three arguments are
** evaluated more than once and so must not have side effects.  The body
** is wrapped in do{...}while(0) so the macro behaves as exactly one
** statement.
**
** Notes On Invalid UTF-8:
**
**  *  This routine never allows a 7-bit character (0x00 through 0x7f) to
**     be encoded as a multi-byte character.  Any multi-byte character that
**     attempts to encode a value between 0x00 and 0x7f is rendered as 0xfffd.
**
**  *  This routine never allows a UTF16 surrogate value to be encoded.
**     If a multi-byte character attempts to encode a value between
**     0xd800 and 0xdfff then it is rendered as 0xfffd.
**
**  *  A multi-byte character that encodes 0xfffe or 0xffff is rendered
**     as 0xfffd.
**
**  *  Bytes in the range of 0x80 through 0xbf which occur as the first
**     byte of a character are interpreted as single-byte characters
**     and rendered as themselves even though they are technically
**     invalid characters.
**
**  *  This routine accepts over-length UTF8 encodings
**     for unicode values 0x80 and greater.  It does not change over-length
**     encodings to 0xfffd as some systems recommend.
*/
#define READ_UTF8(zIn, zTerm, c) do{                       \
  (c) = *((zIn)++);                                        \
  if( (c)>=0xc0 ){                                         \
    (c) = sqlite3Utf8Trans1[(c)-0xc0];                     \
    while( (zIn)<(zTerm) && (*(zIn) & 0xc0)==0x80 ){       \
      (c) = ((c)<<6) + (0x3f & *((zIn)++));                \
    }                                                      \
    if( (c)<0x80                                           \
        || ((c)&0xFFFFF800)==0xD800                        \
        || ((c)&0xFFFFFFFE)==0xFFFE ){  (c) = 0xFFFD; }    \
  }                                                        \
}while(0)
drh0a32fa62011-06-13 12:19:21175u32 sqlite3Utf8Read(
drh42610962012-09-17 18:56:32176 const unsigned char **pz /* Pointer to string from which to read char */
drh66150952007-07-23 19:12:41177){
shanehdba2cc42011-03-24 17:43:18178 unsigned int c;
drh769e97e2009-04-01 16:33:37179
180 /* Same as READ_UTF8() above but without the zTerm parameter.
181 ** For this routine, we assume the UTF8 string is always zero-terminated.
182 */
drh42610962012-09-17 18:56:32183 c = *((*pz)++);
drh769e97e2009-04-01 16:33:37184 if( c>=0xc0 ){
185 c = sqlite3Utf8Trans1[c-0xc0];
drh42610962012-09-17 18:56:32186 while( (*(*pz) & 0xc0)==0x80 ){
187 c = (c<<6) + (0x3f & *((*pz)++));
drh769e97e2009-04-01 16:33:37188 }
189 if( c<0x80
190 || (c&0xFFFFF800)==0xD800
191 || (c&0xFFFFFFFE)==0xFFFE ){ c = 0xFFFD; }
192 }
drh66150952007-07-23 19:12:41193 return c;
194}
195
drh001d1e72023-12-13 14:31:15196/*
197** Read a single UTF8 character out of buffer z[], but reading no
198** more than n characters from the buffer. z[] is not zero-terminated.
199**
200** Return the number of bytes used to construct the character.
201**
202** Invalid UTF8 might generate a strange result. No effort is made
203** to detect invalid UTF8.
204**
205** At most 4 bytes will be read out of z[]. The return value will always
206** be between 1 and 4.
207*/
208int sqlite3Utf8ReadLimited(
209 const u8 *z,
210 int n,
211 u32 *piOut
212){
213 u32 c;
214 int i = 1;
215 assert( n>0 );
216 c = z[0];
217 if( c>=0xc0 ){
218 c = sqlite3Utf8Trans1[c-0xc0];
219 if( n>4 ) n = 4;
220 while( i<n && (z[i] & 0xc0)==0x80 ){
221 c = (c<<6) + (0x3f & z[i]);
222 i++;
223 }
224 }
225 *piOut = c;
226 return i;
227}
drh66150952007-07-23 19:12:41228
danielk1977ad76a81e2008-07-29 11:25:14229
/*
** If the TRANSLATE_TRACE macro is defined, the value of each Mem is
** printed on stderr on the way into and out of sqlite3VdbeMemTranslate().
*/
/* #define TRANSLATE_TRACE 1 */
drh6c626082004-11-14 21:56:29236#ifndef SQLITE_OMIT_UTF16
danielk1977bfd6cce2004-06-18 04:24:54237/*
238** This routine transforms the internal text encoding used by pMem to
239** desiredEnc. It is an error if the string is already of the desired
240** encoding, or if *pMem does not contain a string value.
241*/
drh4274dae2014-08-24 02:53:23242SQLITE_NOINLINE int sqlite3VdbeMemTranslate(Mem *pMem, u8 desiredEnc){
drhd4de9f72019-04-14 00:34:20243 sqlite3_int64 len; /* Maximum length of output string in bytes */
244 unsigned char *zOut; /* Output buffer */
245 unsigned char *zIn; /* Input iterator */
246 unsigned char *zTerm; /* End of input */
247 unsigned char *z; /* Output iterator */
drha39f4c52006-10-04 15:23:21248 unsigned int c;
danielk1977bfd6cce2004-06-18 04:24:54249
drhb21c8cd2007-08-21 19:33:56250 assert( pMem->db==0 || sqlite3_mutex_held(pMem->db->mutex) );
danielk1977bfd6cce2004-06-18 04:24:54251 assert( pMem->flags&MEM_Str );
252 assert( pMem->enc!=desiredEnc );
253 assert( pMem->enc!=0 );
254 assert( pMem->n>=0 );
255
danielk1977b5402fb2005-01-12 07:15:04256#if defined(TRANSLATE_TRACE) && defined(SQLITE_DEBUG)
danielk1977bfd6cce2004-06-18 04:24:54257 {
drh5ca06322020-01-06 19:23:41258 StrAccum acc;
259 char zBuf[1000];
260 sqlite3StrAccumInit(&acc, 0, zBuf, sizeof(zBuf), 0);
261 sqlite3VdbeMemPrettyPrint(pMem, &acc);
262 fprintf(stderr, "INPUT: %s\n", sqlite3StrAccumFinish(&acc));
danielk1977ad7dd422004-06-06 12:41:49263 }
264#endif
265
danielk1977bfd6cce2004-06-18 04:24:54266 /* If the translation is between UTF-16 little and big endian, then
267 ** all that is required is to swap the byte order. This case is handled
268 ** differently from the others.
danielk1977998b56c2004-05-06 23:37:52269 */
danielk1977bfd6cce2004-06-18 04:24:54270 if( pMem->enc!=SQLITE_UTF8 && desiredEnc!=SQLITE_UTF8 ){
271 u8 temp;
drh71c697e2004-08-08 23:39:19272 int rc;
drhb21c8cd2007-08-21 19:33:56273 rc = sqlite3VdbeMemMakeWriteable(pMem);
drh71c697e2004-08-08 23:39:19274 if( rc!=SQLITE_OK ){
275 assert( rc==SQLITE_NOMEM );
mistachkinfad30392016-02-13 23:43:46276 return SQLITE_NOMEM_BKPT;
drh71c697e2004-08-08 23:39:19277 }
drh2646da72005-12-09 20:02:05278 zIn = (u8*)pMem->z;
drhbbf695d2008-11-07 03:29:33279 zTerm = &zIn[pMem->n&~1];
danielk1977bfd6cce2004-06-18 04:24:54280 while( zIn<zTerm ){
281 temp = *zIn;
282 *zIn = *(zIn+1);
283 zIn++;
284 *zIn++ = temp;
285 }
286 pMem->enc = desiredEnc;
287 goto translate_out;
288 }
289
danielk1977d7e69642004-06-23 00:23:49290 /* Set len to the maximum number of bytes required in the output buffer. */
291 if( desiredEnc==SQLITE_UTF8 ){
292 /* When converting from UTF-16, the maximum growth results from
drha49b8612006-04-16 12:05:03293 ** translating a 2-byte character to a 4-byte UTF-8 character.
294 ** A single byte is required for the output string
danielk1977d7e69642004-06-23 00:23:49295 ** nul-terminator.
296 */
drhbbf695d2008-11-07 03:29:33297 pMem->n &= ~1;
drhd4de9f72019-04-14 00:34:20298 len = 2 * (sqlite3_int64)pMem->n + 1;
danielk1977d7e69642004-06-23 00:23:49299 }else{
300 /* When converting from UTF-8 to UTF-16 the maximum growth is caused
301 ** when a 1-byte UTF-8 character is translated into a 2-byte UTF-16
302 ** character. Two bytes are required in the output buffer for the
303 ** nul-terminator.
304 */
drhd4de9f72019-04-14 00:34:20305 len = 2 * (sqlite3_int64)pMem->n + 2;
danielk1977d7e69642004-06-23 00:23:49306 }
307
danielk1977bfd6cce2004-06-18 04:24:54308 /* Set zIn to point at the start of the input buffer and zTerm to point 1
309 ** byte past the end.
310 **
danielk1977a7a8e142008-02-13 18:25:27311 ** Variable zOut is set to point at the output buffer, space obtained
312 ** from sqlite3_malloc().
danielk1977bfd6cce2004-06-18 04:24:54313 */
drh2646da72005-12-09 20:02:05314 zIn = (u8*)pMem->z;
danielk1977bfd6cce2004-06-18 04:24:54315 zTerm = &zIn[pMem->n];
danielk1977a7a8e142008-02-13 18:25:27316 zOut = sqlite3DbMallocRaw(pMem->db, len);
317 if( !zOut ){
mistachkinfad30392016-02-13 23:43:46318 return SQLITE_NOMEM_BKPT;
danielk1977bfd6cce2004-06-18 04:24:54319 }
320 z = zOut;
321
322 if( pMem->enc==SQLITE_UTF8 ){
323 if( desiredEnc==SQLITE_UTF16LE ){
324 /* UTF-8 -> UTF-16 Little-endian */
325 while( zIn<zTerm ){
danielk1977ad76a81e2008-07-29 11:25:14326 READ_UTF8(zIn, zTerm, c);
danielk1977bfd6cce2004-06-18 04:24:54327 WRITE_UTF16LE(z, c);
328 }
drhb8dd3152004-09-24 23:20:51329 }else{
330 assert( desiredEnc==SQLITE_UTF16BE );
danielk1977bfd6cce2004-06-18 04:24:54331 /* UTF-8 -> UTF-16 Big-endian */
332 while( zIn<zTerm ){
danielk1977ad76a81e2008-07-29 11:25:14333 READ_UTF8(zIn, zTerm, c);
danielk1977bfd6cce2004-06-18 04:24:54334 WRITE_UTF16BE(z, c);
335 }
danielk1977bfd6cce2004-06-18 04:24:54336 }
drhea678832008-12-10 19:26:22337 pMem->n = (int)(z - zOut);
drhb8dd3152004-09-24 23:20:51338 *z++ = 0;
danielk1977bfd6cce2004-06-18 04:24:54339 }else{
340 assert( desiredEnc==SQLITE_UTF8 );
341 if( pMem->enc==SQLITE_UTF16LE ){
342 /* UTF-16 Little-endian -> UTF-8 */
343 while( zIn<zTerm ){
drh0184a252020-02-17 23:08:16344 c = *(zIn++);
345 c += (*(zIn++))<<8;
346 if( c>=0xd800 && c<0xe000 ){
drh4f1315a2020-05-20 15:02:04347#ifdef SQLITE_REPLACE_INVALID_UTF
drh0184a252020-02-17 23:08:16348 if( c>=0xdc00 || zIn>=zTerm ){
349 c = 0xfffd;
350 }else{
351 int c2 = *(zIn++);
352 c2 += (*(zIn++))<<8;
353 if( c2<0xdc00 || c2>=0xe000 ){
354 zIn -= 2;
355 c = 0xfffd;
356 }else{
357 c = ((c&0x3ff)<<10) + (c2&0x3ff) + 0x10000;
358 }
359 }
drh4f1315a2020-05-20 15:02:04360#else
361 if( zIn<zTerm ){
362 int c2 = (*zIn++);
363 c2 += ((*zIn++)<<8);
364 c = (c2&0x03FF) + ((c&0x003F)<<10) + (((c&0x03C0)+0x0040)<<10);
365 }
366#endif
drh0184a252020-02-17 23:08:16367 }
danielk1977bfd6cce2004-06-18 04:24:54368 WRITE_UTF8(z, c);
369 }
danielk1977bfd6cce2004-06-18 04:24:54370 }else{
mihailim7ffb2b52008-06-27 18:59:44371 /* UTF-16 Big-endian -> UTF-8 */
danielk1977bfd6cce2004-06-18 04:24:54372 while( zIn<zTerm ){
drh0184a252020-02-17 23:08:16373 c = (*(zIn++))<<8;
374 c += *(zIn++);
375 if( c>=0xd800 && c<0xe000 ){
drh4f1315a2020-05-20 15:02:04376#ifdef SQLITE_REPLACE_INVALID_UTF
drh0184a252020-02-17 23:08:16377 if( c>=0xdc00 || zIn>=zTerm ){
378 c = 0xfffd;
379 }else{
380 int c2 = (*(zIn++))<<8;
381 c2 += *(zIn++);
382 if( c2<0xdc00 || c2>=0xe000 ){
383 zIn -= 2;
384 c = 0xfffd;
385 }else{
386 c = ((c&0x3ff)<<10) + (c2&0x3ff) + 0x10000;
387 }
388 }
drh4f1315a2020-05-20 15:02:04389#else
390 if( zIn<zTerm ){
391 int c2 = ((*zIn++)<<8);
392 c2 += (*zIn++);
393 c = (c2&0x03FF) + ((c&0x003F)<<10) + (((c&0x03C0)+0x0040)<<10);
394 }
395#endif
drh0184a252020-02-17 23:08:16396 }
danielk1977bfd6cce2004-06-18 04:24:54397 WRITE_UTF8(z, c);
398 }
danielk1977998b56c2004-05-06 23:37:52399 }
drhaa78bec2008-12-09 03:55:14400 pMem->n = (int)(z - zOut);
danielk1977998b56c2004-05-06 23:37:52401 }
drhb8dd3152004-09-24 23:20:51402 *z = 0;
danielk1977d7e69642004-06-23 00:23:49403 assert( (pMem->n+(desiredEnc==SQLITE_UTF8?1:2))<=len );
danielk1977998b56c2004-05-06 23:37:52404
drh21b473d2020-06-04 02:50:47405 c = MEM_Str|MEM_Term|(pMem->flags&(MEM_AffMask|MEM_Subtype));
danielk1977bfd6cce2004-06-18 04:24:54406 sqlite3VdbeMemRelease(pMem);
drh21b473d2020-06-04 02:50:47407 pMem->flags = c;
danielk1977bfd6cce2004-06-18 04:24:54408 pMem->enc = desiredEnc;
drh2646da72005-12-09 20:02:05409 pMem->z = (char*)zOut;
danielk19775f096132008-03-28 15:44:09410 pMem->zMalloc = pMem->z;
drh17bcb102014-09-18 21:25:33411 pMem->szMalloc = sqlite3DbMallocSize(pMem->db, pMem->z);
danielk1977bfd6cce2004-06-18 04:24:54412
413translate_out:
danielk1977b5402fb2005-01-12 07:15:04414#if defined(TRANSLATE_TRACE) && defined(SQLITE_DEBUG)
danielk1977bfd6cce2004-06-18 04:24:54415 {
drh5ca06322020-01-06 19:23:41416 StrAccum acc;
417 char zBuf[1000];
418 sqlite3StrAccumInit(&acc, 0, zBuf, sizeof(zBuf), 0);
419 sqlite3VdbeMemPrettyPrint(pMem, &acc);
420 fprintf(stderr, "OUTPUT: %s\n", sqlite3StrAccumFinish(&acc));
danielk1977bfd6cce2004-06-18 04:24:54421 }
422#endif
423 return SQLITE_OK;
danielk1977998b56c2004-05-06 23:37:52424}
drhf0f44b72017-07-12 12:19:33425#endif /* SQLITE_OMIT_UTF16 */
danielk1977998b56c2004-05-06 23:37:52426
drhf0f44b72017-07-12 12:19:33427#ifndef SQLITE_OMIT_UTF16
danielk197793d46752004-05-23 13:30:58428/*
danielk1977bfd6cce2004-06-18 04:24:54429** This routine checks for a byte-order mark at the beginning of the
430** UTF-16 string stored in *pMem. If one is present, it is removed and
431** the encoding of the Mem adjusted. This routine does not do any
432** byte-swapping, it just sets Mem.enc appropriately.
433**
434** The allocation (static, dynamic etc.) and encoding of the Mem may be
435** changed by this function.
danielk197793d46752004-05-23 13:30:58436*/
drhb21c8cd2007-08-21 19:33:56437int sqlite3VdbeMemHandleBom(Mem *pMem){
danielk1977bfd6cce2004-06-18 04:24:54438 int rc = SQLITE_OK;
439 u8 bom = 0;
440
drh769e97e2009-04-01 16:33:37441 assert( pMem->n>=0 );
442 if( pMem->n>1 ){
danielk1977bfd6cce2004-06-18 04:24:54443 u8 b1 = *(u8 *)pMem->z;
444 u8 b2 = *(((u8 *)pMem->z) + 1);
danielk197793d46752004-05-23 13:30:58445 if( b1==0xFE && b2==0xFF ){
danielk1977bfd6cce2004-06-18 04:24:54446 bom = SQLITE_UTF16BE;
danielk197793d46752004-05-23 13:30:58447 }
448 if( b1==0xFF && b2==0xFE ){
danielk1977bfd6cce2004-06-18 04:24:54449 bom = SQLITE_UTF16LE;
danielk197793d46752004-05-23 13:30:58450 }
451 }
danielk1977bfd6cce2004-06-18 04:24:54452
453 if( bom ){
danielk1977a7a8e142008-02-13 18:25:27454 rc = sqlite3VdbeMemMakeWriteable(pMem);
455 if( rc==SQLITE_OK ){
456 pMem->n -= 2;
457 memmove(pMem->z, &pMem->z[2], pMem->n);
458 pMem->z[pMem->n] = '\0';
459 pMem->z[pMem->n+1] = '\0';
460 pMem->flags |= MEM_Term;
461 pMem->enc = bom;
danielk1977bfd6cce2004-06-18 04:24:54462 }
danielk1977998b56c2004-05-06 23:37:52463 }
danielk1977bfd6cce2004-06-18 04:24:54464 return rc;
danielk1977998b56c2004-05-06 23:37:52465}
drh6c626082004-11-14 21:56:29466#endif /* SQLITE_OMIT_UTF16 */
danielk1977998b56c2004-05-06 23:37:52467
/*
** zIn is a UTF-8 encoded unicode string. If nByte is less than zero,
** return the number of unicode characters in zIn up to (but not including)
** the first 0x00 byte. If nByte is not less than zero, return the
** number of unicode characters in the first nByte of zIn (or up to
** the first 0x00, whichever comes first).
**
** When nByte is non-negative, no byte at or beyond zIn[nByte] is ever
** read.  The end-of-buffer test is made before each byte is examined,
** and the skip over continuation bytes is done by hand (rather than
** with SQLITE_SKIP_UTF8) so that it too can be bounded.  A character
** that is cut short by the nByte limit still counts as one character.
*/
int sqlite3Utf8CharLen(const char *zIn, int nByte){
  int r = 0;
  const unsigned char *z = (const unsigned char*)zIn;
  const unsigned char *zTerm;
  if( nByte>=0 ){
    zTerm = &z[nByte];
  }else{
    /* No byte limit: use the largest pointer value as the bound */
    zTerm = (const unsigned char*)(-1);
  }
  assert( z<=zTerm );
  while( z<zTerm && *z!=0 ){
    if( *(z++)>=0xc0 ){
      /* Lead byte of a multi-byte character: skip its continuation bytes */
      while( z<zTerm && (*z & 0xc0)==0x80 ){ z++; }
    }
    r++;
  }
  return r;
}
/* This test function is not currently used by the automated test-suite.
** Hence it is only available in debug builds.
*/
#if defined(SQLITE_TEST) && defined(SQLITE_DEBUG)
/*
** Translate UTF-8 to UTF-8.
**
** This has the effect of making sure that the string is well-formed
** UTF-8.  Miscoded characters are removed: every character that
** sqlite3Utf8Read() reports as 0xfffd is dropped from the output.
**
** The translation is done in-place and aborted if the output
** overruns the input.
**
** The result is zero-terminated.  The return value is the length of
** the result in bytes, not counting the terminator.
*/
int sqlite3Utf8To8(unsigned char *zIn){
  unsigned char *zOut = zIn;
  unsigned char *zStart = zIn;
  u32 c;

  while( zIn[0] && zOut<=zIn ){
    c = sqlite3Utf8Read((const u8**)&zIn);
    if( c!=0xfffd ){
      WRITE_UTF8(zOut, c);
    }
  }
  *zOut = 0;
  return (int)(zOut - zStart);
}
#endif
#ifndef SQLITE_OMIT_UTF16
danielk19776622cce2004-05-20 11:00:52522/*
drhaf9a7c22005-12-15 03:04:10523** Convert a UTF-16 string in the native encoding into a UTF-8 string.
drh17435752007-08-16 04:30:38524** Memory to hold the UTF-8 string is obtained from sqlite3_malloc and must
525** be freed by the calling function.
drhaf9a7c22005-12-15 03:04:10526**
527** NULL is returned if there is an allocation error.
528*/
danb7dca7d2010-03-05 16:32:12529char *sqlite3Utf16to8(sqlite3 *db, const void *z, int nByte, u8 enc){
drhaf9a7c22005-12-15 03:04:10530 Mem m;
531 memset(&m, 0, sizeof(m));
drhb21c8cd2007-08-21 19:33:56532 m.db = db;
danb7dca7d2010-03-05 16:32:12533 sqlite3VdbeMemSetStr(&m, z, nByte, enc, SQLITE_STATIC);
drhb21c8cd2007-08-21 19:33:56534 sqlite3VdbeChangeEncoding(&m, SQLITE_UTF8);
danielk1977ae72d982007-10-03 08:46:44535 if( db->mallocFailed ){
536 sqlite3VdbeMemRelease(&m);
537 m.z = 0;
538 }
drh17435752007-08-16 04:30:38539 assert( (m.flags & MEM_Term)!=0 || db->mallocFailed );
540 assert( (m.flags & MEM_Str)!=0 || db->mallocFailed );
danb7dca7d2010-03-05 16:32:12541 assert( m.z || db->mallocFailed );
542 return m.z;
drhaf9a7c22005-12-15 03:04:10543}
544
545/*
drhf8305e42024-09-19 13:39:06546** zIn is a UTF-16 encoded unicode string at least nByte bytes long.
drhaed382f2009-04-01 18:40:32547** Return the number of bytes in the first nChar unicode characters
drhf8305e42024-09-19 13:39:06548** in pZ. nChar must be non-negative. Surrogate pairs count as a single
549** character.
danielk19776622cce2004-05-20 11:00:52550*/
drhf8305e42024-09-19 13:39:06551int sqlite3Utf16ByteLen(const void *zIn, int nByte, int nChar){
drhaed382f2009-04-01 18:40:32552 int c;
553 unsigned char const *z = zIn;
drhf8305e42024-09-19 13:39:06554 unsigned char const *zEnd = &z[nByte-1];
danielk1977bfd6cce2004-06-18 04:24:54555 int n = 0;
drh6d116ca2009-10-24 01:55:14556
drh0184a252020-02-17 23:08:16557 if( SQLITE_UTF16NATIVE==SQLITE_UTF16LE ) z++;
drhfda6e502025-04-08 20:00:33558 while( n<nChar && z<=zEnd ){
drh0184a252020-02-17 23:08:16559 c = z[0];
560 z += 2;
drhf8305e42024-09-19 13:39:06561 if( c>=0xd8 && c<0xdc && z<=zEnd && z[0]>=0xdc && z[0]<0xe0 ) z += 2;
drh0184a252020-02-17 23:08:16562 n++;
danielk19776622cce2004-05-20 11:00:52563 }
drh0184a252020-02-17 23:08:16564 return (int)(z-(unsigned char const *)zIn)
565 - (SQLITE_UTF16NATIVE==SQLITE_UTF16LE);
danielk1977998b56c2004-05-06 23:37:52566}
567
#if defined(SQLITE_TEST)
/*
** This routine is called from the TCL test function "translate_selftest".
** It checks that the primitives for serializing and deserializing
** characters in each encoding are inverses of each other.
**
** Every value from 0 up to (but not including) 0x110000 is written with
** WRITE_UTF8() and read back with sqlite3Utf8Read().  Surrogates
** (0xD800 through 0xDFFF) and the values 0xFFFE and 0xFFFF are expected
** to read back as 0xFFFD; every other value must round-trip unchanged.
** Failures are reported through assert().
*/
void sqlite3UtfSelfTest(void){
  unsigned int i, t;
  unsigned char zBuf[20];
  unsigned char *z;
  int n;
  unsigned int c;

  for(i=0; i<0x00110000; i++){
    z = zBuf;
    WRITE_UTF8(z, i);
    n = (int)(z-zBuf);
    assert( n>0 && n<=4 );
    z[0] = 0;
    z = zBuf;
    c = sqlite3Utf8Read((const u8**)&z);
    t = i;
    if( i>=0xD800 && i<=0xDFFF ) t = 0xFFFD;
    if( (i&0xFFFFFFFE)==0xFFFE ) t = 0xFFFD;
    assert( c==t );
    assert( (z-zBuf)==n );
  }
}
#endif /* SQLITE_TEST */
#endif /* SQLITE_OMIT_UTF16 */