Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit a0dd021

Browse files
committed
Close #17693: Rewrite CJK decoders to use the _PyUnicodeWriter API instead of
the legacy Py_UNICODE API. Also add a new _PyUnicodeWriter_WriteChar() function.
1 parent d8a5cc9 commit a0dd021

15 files changed

Lines changed: 401 additions & 440 deletions

Include/unicodeobject.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -933,6 +933,13 @@ PyAPI_FUNC(int)
933933
_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
934934
Py_ssize_t length, Py_UCS4 maxchar);
935935

936+
/* Append a Unicode character.
937+
Return 0 on success, raise an exception and return -1 on error. */
938+
PyAPI_FUNC(int)
939+
_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer,
940+
Py_UCS4 ch
941+
);
942+
936943
/* Append a Unicode string.
937944
Return 0 on success, raise an exception and return -1 on error. */
938945
PyAPI_FUNC(int)

Modules/cjkcodecs/_codecs_cn.c

Lines changed: 41 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -23,12 +23,12 @@
2323
* A844 undefined U+2015 HORIZONTAL BAR
2424
*/
2525

26-
#define GBK_DECODE(dc1, dc2, assi) \
27-
if ((dc1) == 0xa1 && (dc2) == 0xaa) (assi) = 0x2014; \
28-
else if ((dc1) == 0xa8 && (dc2) == 0x44) (assi) = 0x2015; \
29-
else if ((dc1) == 0xa1 && (dc2) == 0xa4) (assi) = 0x00b7; \
30-
else TRYMAP_DEC(gb2312, assi, dc1 ^ 0x80, dc2 ^ 0x80); \
31-
else TRYMAP_DEC(gbkext, assi, dc1, dc2);
26+
#define GBK_DECODE(dc1, dc2, writer) \
27+
if ((dc1) == 0xa1 && (dc2) == 0xaa) OUTCHAR(0x2014); \
28+
else if ((dc1) == 0xa8 && (dc2) == 0x44) OUTCHAR(0x2015); \
29+
else if ((dc1) == 0xa1 && (dc2) == 0xa4) OUTCHAR(0x00b7); \
30+
else TRYMAP_DEC(gb2312, writer, dc1 ^ 0x80, dc2 ^ 0x80); \
31+
else TRYMAP_DEC(gbkext, writer, dc1, dc2);
3232

3333
#define GBK_ENCODE(code, assi) \
3434
if ((code) == 0x2014) (assi) = 0xa1aa; \
@@ -43,7 +43,7 @@
4343
ENCODER(gb2312)
4444
{
4545
while (inleft > 0) {
46-
Py_UNICODE c = IN1;
46+
Py_UCS4 c = IN1;
4747
DBCHAR code;
4848

4949
if (c < 0x80) {
@@ -73,17 +73,15 @@ DECODER(gb2312)
7373
while (inleft > 0) {
7474
unsigned char c = **inbuf;
7575

76-
REQUIRE_OUTBUF(1)
77-
7876
if (c < 0x80) {
79-
OUT1(c)
80-
NEXT(1, 1)
77+
OUTCHAR(c);
78+
NEXT_IN(1);
8179
continue;
8280
}
8381

8482
REQUIRE_INBUF(2)
85-
TRYMAP_DEC(gb2312, **outbuf, c ^ 0x80, IN2 ^ 0x80) {
86-
NEXT(2, 1)
83+
TRYMAP_DEC(gb2312, writer, c ^ 0x80, IN2 ^ 0x80) {
84+
NEXT_IN(2);
8785
}
8886
else return 1;
8987
}
@@ -99,7 +97,7 @@ DECODER(gb2312)
9997
ENCODER(gbk)
10098
{
10199
while (inleft > 0) {
102-
Py_UNICODE c = IN1;
100+
Py_UCS4 c = IN1;
103101
DBCHAR code;
104102

105103
if (c < 0x80) {
@@ -130,20 +128,18 @@ DECODER(gbk)
130128
while (inleft > 0) {
131129
unsigned char c = IN1;
132130

133-
REQUIRE_OUTBUF(1)
134-
135131
if (c < 0x80) {
136-
OUT1(c)
137-
NEXT(1, 1)
132+
OUTCHAR(c);
133+
NEXT_IN(1);
138134
continue;
139135
}
140136

141137
REQUIRE_INBUF(2)
142138

143-
GBK_DECODE(c, IN2, **outbuf)
139+
GBK_DECODE(c, IN2, writer)
144140
else return 1;
145141

146-
NEXT(2, 1)
142+
NEXT_IN(2);
147143
}
148144

149145
return 0;
@@ -157,7 +153,7 @@ DECODER(gbk)
157153
ENCODER(gb18030)
158154
{
159155
while (inleft > 0) {
160-
ucs4_t c = IN1;
156+
Py_UCS4 c = IN1;
161157
DBCHAR code;
162158

163159
if (c < 0x80) {
@@ -174,7 +170,7 @@ ENCODER(gb18030)
174170
return 1;
175171
#endif
176172
else if (c >= 0x10000) {
177-
ucs4_t tc = c - 0x10000;
173+
Py_UCS4 tc = c - 0x10000;
178174

179175
REQUIRE_OUTBUF(4)
180176

@@ -208,7 +204,7 @@ ENCODER(gb18030)
208204
utrrange++)
209205
if (utrrange->first <= c &&
210206
c <= utrrange->last) {
211-
Py_UNICODE tc;
207+
Py_UCS4 tc;
212208

213209
tc = c - utrrange->first +
214210
utrrange->base;
@@ -247,11 +243,9 @@ DECODER(gb18030)
247243
while (inleft > 0) {
248244
unsigned char c = IN1, c2;
249245

250-
REQUIRE_OUTBUF(1)
251-
252246
if (c < 0x80) {
253-
OUT1(c)
254-
NEXT(1, 1)
247+
OUTCHAR(c);
248+
NEXT_IN(1);
255249
continue;
256250
}
257251

@@ -261,7 +255,7 @@ DECODER(gb18030)
261255
if (c2 >= 0x30 && c2 <= 0x39) { /* 4 bytes seq */
262256
const struct _gb18030_to_unibmp_ranges *utr;
263257
unsigned char c3, c4;
264-
ucs4_t lseq;
258+
Py_UCS4 lseq;
265259

266260
REQUIRE_INBUF(4)
267261
c3 = IN3;
@@ -272,34 +266,34 @@ DECODER(gb18030)
272266
c3 -= 0x81; c4 -= 0x30;
273267

274268
if (c < 4) { /* U+0080 - U+FFFF */
275-
lseq = ((ucs4_t)c * 10 + c2) * 1260 +
276-
(ucs4_t)c3 * 10 + c4;
269+
lseq = ((Py_UCS4)c * 10 + c2) * 1260 +
270+
(Py_UCS4)c3 * 10 + c4;
277271
if (lseq < 39420) {
278272
for (utr = gb18030_to_unibmp_ranges;
279273
lseq >= (utr + 1)->base;
280274
utr++) ;
281-
OUT1(utr->first - utr->base + lseq)
282-
NEXT(4, 1)
275+
OUTCHAR(utr->first - utr->base + lseq);
276+
NEXT_IN(4);
283277
continue;
284278
}
285279
}
286280
else if (c >= 15) { /* U+10000 - U+10FFFF */
287-
lseq = 0x10000 + (((ucs4_t)c-15) * 10 + c2)
288-
* 1260 + (ucs4_t)c3 * 10 + c4;
281+
lseq = 0x10000 + (((Py_UCS4)c-15) * 10 + c2)
282+
* 1260 + (Py_UCS4)c3 * 10 + c4;
289283
if (lseq <= 0x10FFFF) {
290-
WRITEUCS4(lseq);
291-
NEXT_IN(4)
284+
OUTCHAR(lseq);
285+
NEXT_IN(4);
292286
continue;
293287
}
294288
}
295289
return 1;
296290
}
297291

298-
GBK_DECODE(c, c2, **outbuf)
299-
else TRYMAP_DEC(gb18030ext, **outbuf, c, c2);
292+
GBK_DECODE(c, c2, writer)
293+
else TRYMAP_DEC(gb18030ext, writer, c, c2);
300294
else return 1;
301295

302-
NEXT(2, 1)
296+
NEXT_IN(2);
303297
}
304298

305299
return 0;
@@ -329,7 +323,7 @@ ENCODER_RESET(hz)
329323
ENCODER(hz)
330324
{
331325
while (inleft > 0) {
332-
Py_UNICODE c = IN1;
326+
Py_UCS4 c = IN1;
333327
DBCHAR code;
334328

335329
if (c < 0x80) {
@@ -389,8 +383,8 @@ DECODER(hz)
389383

390384
REQUIRE_INBUF(2)
391385
if (c2 == '~') {
392-
WRITE1('~')
393-
NEXT(2, 1)
386+
OUTCHAR('~');
387+
NEXT_IN(2);
394388
continue;
395389
}
396390
else if (c2 == '{' && state->i == 0)
@@ -401,22 +395,21 @@ DECODER(hz)
401395
; /* line-continuation */
402396
else
403397
return 1;
404-
NEXT(2, 0);
398+
NEXT_IN(2);
405399
continue;
406400
}
407401

408402
if (c & 0x80)
409403
return 1;
410404

411405
if (state->i == 0) { /* ASCII mode */
412-
WRITE1(c)
413-
NEXT(1, 1)
406+
OUTCHAR(c);
407+
NEXT_IN(1);
414408
}
415409
else { /* GB mode */
416410
REQUIRE_INBUF(2)
417-
REQUIRE_OUTBUF(1)
418-
TRYMAP_DEC(gb2312, **outbuf, c, IN2) {
419-
NEXT(2, 1)
411+
TRYMAP_DEC(gb2312, writer, c, IN2) {
412+
NEXT_IN(2);
420413
}
421414
else
422415
return 1;

Modules/cjkcodecs/_codecs_hk.c

Lines changed: 16 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ static const DBCHAR big5hkscs_pairenc_table[4] = {0x8862, 0x8864, 0x88a3, 0x88a5
3939
ENCODER(big5hkscs)
4040
{
4141
while (inleft > 0) {
42-
ucs4_t c = **inbuf;
42+
Py_UCS4 c = **inbuf;
4343
DBCHAR code;
4444
Py_ssize_t insize;
4545

@@ -103,26 +103,24 @@ DECODER(big5hkscs)
103103
{
104104
while (inleft > 0) {
105105
unsigned char c = IN1;
106-
ucs4_t decoded;
107-
108-
REQUIRE_OUTBUF(1)
106+
Py_UCS4 decoded;
109107

110108
if (c < 0x80) {
111-
OUT1(c)
112-
NEXT(1, 1)
109+
OUTCHAR(c);
110+
NEXT_IN(1);
113111
continue;
114112
}
115113

116114
REQUIRE_INBUF(2)
117115

118116
if (0xc6 > c || c > 0xc8 || (c < 0xc7 && IN2 < 0xa1)) {
119-
TRYMAP_DEC(big5, **outbuf, c, IN2) {
120-
NEXT(2, 1)
117+
TRYMAP_DEC(big5, writer, c, IN2) {
118+
NEXT_IN(2);
121119
continue;
122120
}
123121
}
124122

125-
TRYMAP_DEC(big5hkscs, decoded, c, IN2)
123+
TRYMAP_DEC_CHAR(big5hkscs, decoded, c, IN2)
126124
{
127125
int s = BH2S(c, IN2);
128126
const unsigned char *hintbase;
@@ -146,25 +144,25 @@ DECODER(big5hkscs)
146144
return MBERR_INTERNAL;
147145

148146
if (hintbase[s >> 3] & (1 << (s & 7))) {
149-
WRITEUCS4(decoded | 0x20000)
150-
NEXT_IN(2)
147+
OUTCHAR(decoded | 0x20000);
148+
NEXT_IN(2);
151149
}
152150
else {
153-
OUT1(decoded)
154-
NEXT(2, 1)
151+
OUTCHAR(decoded);
152+
NEXT_IN(2);
155153
}
156154
continue;
157155
}
158156

159157
switch ((c << 8) | IN2) {
160-
case 0x8862: WRITE2(0x00ca, 0x0304); break;
161-
case 0x8864: WRITE2(0x00ca, 0x030c); break;
162-
case 0x88a3: WRITE2(0x00ea, 0x0304); break;
163-
case 0x88a5: WRITE2(0x00ea, 0x030c); break;
158+
case 0x8862: OUTCHAR2(0x00ca, 0x0304); break;
159+
case 0x8864: OUTCHAR2(0x00ca, 0x030c); break;
160+
case 0x88a3: OUTCHAR2(0x00ea, 0x0304); break;
161+
case 0x88a5: OUTCHAR2(0x00ea, 0x030c); break;
164162
default: return 1;
165163
}
166164

167-
NEXT(2, 2) /* all decoded codepoints are pairs, above. */
165+
NEXT_IN(2); /* all decoded codepoints are pairs, above. */
168166
}
169167

170168
return 0;

0 commit comments

Comments
 (0)