Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit d949126

Browse files
committed
Issue #17693: CJK encoders now use the new Unicode API (PEP 393)
1 parent 7155759 commit d949126

9 files changed

Lines changed: 430 additions & 418 deletions

File tree

Modules/cjkcodecs/_codecs_cn.c

Lines changed: 65 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -42,16 +42,18 @@
4242

4343
ENCODER(gb2312)
4444
{
45-
while (inleft > 0) {
46-
Py_UCS4 c = IN1;
45+
while (*inpos < inlen) {
46+
Py_UCS4 c = INCHAR1;
4747
DBCHAR code;
4848

4949
if (c < 0x80) {
50-
WRITE1((unsigned char)c)
51-
NEXT(1, 1)
50+
WRITEBYTE1((unsigned char)c)
51+
NEXT(1, 1);
5252
continue;
5353
}
54-
UCS4INVALID(c)
54+
55+
if (c > 0xFFFF)
56+
return 1;
5557

5658
REQUIRE_OUTBUF(2)
5759
TRYMAP_ENC(gbcommon, code, c);
@@ -60,9 +62,9 @@ ENCODER(gb2312)
6062
if (code & 0x8000) /* MSB set: GBK */
6163
return 1;
6264

63-
OUT1((code >> 8) | 0x80)
64-
OUT2((code & 0xFF) | 0x80)
65-
NEXT(1, 2)
65+
OUTBYTE1((code >> 8) | 0x80)
66+
OUTBYTE2((code & 0xFF) | 0x80)
67+
NEXT(1, 2);
6668
}
6769

6870
return 0;
@@ -80,7 +82,7 @@ DECODER(gb2312)
8082
}
8183

8284
REQUIRE_INBUF(2)
83-
TRYMAP_DEC(gb2312, writer, c ^ 0x80, IN2 ^ 0x80) {
85+
TRYMAP_DEC(gb2312, writer, c ^ 0x80, INBYTE2 ^ 0x80) {
8486
NEXT_IN(2);
8587
}
8688
else return 1;
@@ -96,28 +98,30 @@ DECODER(gb2312)
9698

9799
ENCODER(gbk)
98100
{
99-
while (inleft > 0) {
100-
Py_UCS4 c = IN1;
101+
while (*inpos < inlen) {
102+
Py_UCS4 c = INCHAR1;
101103
DBCHAR code;
102104

103105
if (c < 0x80) {
104-
WRITE1((unsigned char)c)
105-
NEXT(1, 1)
106+
WRITEBYTE1((unsigned char)c)
107+
NEXT(1, 1);
106108
continue;
107109
}
108-
UCS4INVALID(c)
110+
111+
if (c > 0xFFFF)
112+
return 1;
109113

110114
REQUIRE_OUTBUF(2)
111115

112116
GBK_ENCODE(c, code)
113117
else return 1;
114118

115-
OUT1((code >> 8) | 0x80)
119+
OUTBYTE1((code >> 8) | 0x80)
116120
if (code & 0x8000)
117-
OUT2((code & 0xFF)) /* MSB set: GBK */
121+
OUTBYTE2((code & 0xFF)) /* MSB set: GBK */
118122
else
119-
OUT2((code & 0xFF) | 0x80) /* MSB unset: GB2312 */
120-
NEXT(1, 2)
123+
OUTBYTE2((code & 0xFF) | 0x80) /* MSB unset: GB2312 */
124+
NEXT(1, 2);
121125
}
122126

123127
return 0;
@@ -126,7 +130,7 @@ ENCODER(gbk)
126130
DECODER(gbk)
127131
{
128132
while (inleft > 0) {
129-
unsigned char c = IN1;
133+
unsigned char c = INBYTE1;
130134

131135
if (c < 0x80) {
132136
OUTCHAR(c);
@@ -136,7 +140,7 @@ DECODER(gbk)
136140

137141
REQUIRE_INBUF(2)
138142

139-
GBK_DECODE(c, IN2, writer)
143+
GBK_DECODE(c, INBYTE2, writer)
140144
else return 1;
141145

142146
NEXT_IN(2);
@@ -152,41 +156,31 @@ DECODER(gbk)
152156

153157
ENCODER(gb18030)
154158
{
155-
while (inleft > 0) {
156-
Py_UCS4 c = IN1;
159+
while (*inpos < inlen) {
160+
Py_UCS4 c = INCHAR1;
157161
DBCHAR code;
158162

159163
if (c < 0x80) {
160-
WRITE1(c)
161-
NEXT(1, 1)
164+
WRITEBYTE1(c)
165+
NEXT(1, 1);
162166
continue;
163167
}
164168

165-
DECODE_SURROGATE(c)
166-
if (c > 0x10FFFF)
167-
#if Py_UNICODE_SIZE == 2
168-
return 2; /* surrogates pair */
169-
#else
170-
return 1;
171-
#endif
172-
else if (c >= 0x10000) {
169+
if (c >= 0x10000) {
173170
Py_UCS4 tc = c - 0x10000;
171+
assert (c <= 0x10FFFF);
174172

175173
REQUIRE_OUTBUF(4)
176174

177-
OUT4((unsigned char)(tc % 10) + 0x30)
175+
OUTBYTE4((unsigned char)(tc % 10) + 0x30)
178176
tc /= 10;
179-
OUT3((unsigned char)(tc % 126) + 0x81)
177+
OUTBYTE3((unsigned char)(tc % 126) + 0x81)
180178
tc /= 126;
181-
OUT2((unsigned char)(tc % 10) + 0x30)
179+
OUTBYTE2((unsigned char)(tc % 10) + 0x30)
182180
tc /= 10;
183-
OUT1((unsigned char)(tc + 0x90))
181+
OUTBYTE1((unsigned char)(tc + 0x90))
184182

185-
#if Py_UNICODE_SIZE == 2
186-
NEXT(2, 4) /* surrogates pair */
187-
#else
188-
NEXT(1, 4)
189-
#endif
183+
NEXT(1, 4);
190184
continue;
191185
}
192186

@@ -209,15 +203,15 @@ ENCODER(gb18030)
209203
tc = c - utrrange->first +
210204
utrrange->base;
211205

212-
OUT4((unsigned char)(tc % 10) + 0x30)
206+
OUTBYTE4((unsigned char)(tc % 10) + 0x30)
213207
tc /= 10;
214-
OUT3((unsigned char)(tc % 126) + 0x81)
208+
OUTBYTE3((unsigned char)(tc % 126) + 0x81)
215209
tc /= 126;
216-
OUT2((unsigned char)(tc % 10) + 0x30)
210+
OUTBYTE2((unsigned char)(tc % 10) + 0x30)
217211
tc /= 10;
218-
OUT1((unsigned char)tc + 0x81)
212+
OUTBYTE1((unsigned char)tc + 0x81)
219213

220-
NEXT(1, 4)
214+
NEXT(1, 4);
221215
break;
222216
}
223217

@@ -226,13 +220,13 @@ ENCODER(gb18030)
226220
continue;
227221
}
228222

229-
OUT1((code >> 8) | 0x80)
223+
OUTBYTE1((code >> 8) | 0x80)
230224
if (code & 0x8000)
231-
OUT2((code & 0xFF)) /* MSB set: GBK or GB18030ext */
225+
OUTBYTE2((code & 0xFF)) /* MSB set: GBK or GB18030ext */
232226
else
233-
OUT2((code & 0xFF) | 0x80) /* MSB unset: GB2312 */
227+
OUTBYTE2((code & 0xFF) | 0x80) /* MSB unset: GB2312 */
234228

235-
NEXT(1, 2)
229+
NEXT(1, 2);
236230
}
237231

238232
return 0;
@@ -241,7 +235,7 @@ ENCODER(gb18030)
241235
DECODER(gb18030)
242236
{
243237
while (inleft > 0) {
244-
unsigned char c = IN1, c2;
238+
unsigned char c = INBYTE1, c2;
245239

246240
if (c < 0x80) {
247241
OUTCHAR(c);
@@ -251,15 +245,15 @@ DECODER(gb18030)
251245

252246
REQUIRE_INBUF(2)
253247

254-
c2 = IN2;
248+
c2 = INBYTE2;
255249
if (c2 >= 0x30 && c2 <= 0x39) { /* 4 bytes seq */
256250
const struct _gb18030_to_unibmp_ranges *utr;
257251
unsigned char c3, c4;
258252
Py_UCS4 lseq;
259253

260254
REQUIRE_INBUF(4)
261-
c3 = IN3;
262-
c4 = IN4;
255+
c3 = INBYTE3;
256+
c4 = INBYTE4;
263257
if (c < 0x81 || c3 < 0x81 || c4 < 0x30 || c4 > 0x39)
264258
return 1;
265259
c -= 0x81; c2 -= 0x30;
@@ -313,33 +307,34 @@ ENCODER_INIT(hz)
313307
ENCODER_RESET(hz)
314308
{
315309
if (state->i != 0) {
316-
WRITE2('~', '}')
310+
WRITEBYTE2('~', '}')
317311
state->i = 0;
318-
NEXT_OUT(2)
312+
NEXT_OUT(2);
319313
}
320314
return 0;
321315
}
322316

323317
ENCODER(hz)
324318
{
325-
while (inleft > 0) {
326-
Py_UCS4 c = IN1;
319+
while (*inpos < inlen) {
320+
Py_UCS4 c = INCHAR1;
327321
DBCHAR code;
328322

329323
if (c < 0x80) {
330324
if (state->i == 0) {
331-
WRITE1((unsigned char)c)
332-
NEXT(1, 1)
325+
WRITEBYTE1((unsigned char)c)
326+
NEXT(1, 1);
333327
}
334328
else {
335-
WRITE3('~', '}', (unsigned char)c)
336-
NEXT(1, 3)
329+
WRITEBYTE3('~', '}', (unsigned char)c)
330+
NEXT(1, 3);
337331
state->i = 0;
338332
}
339333
continue;
340334
}
341335

342-
UCS4INVALID(c)
336+
if (c > 0xFFFF)
337+
return 1;
343338

344339
TRYMAP_ENC(gbcommon, code, c);
345340
else return 1;
@@ -348,13 +343,13 @@ ENCODER(hz)
348343
return 1;
349344

350345
if (state->i == 0) {
351-
WRITE4('~', '{', code >> 8, code & 0xff)
352-
NEXT(1, 4)
346+
WRITEBYTE4('~', '{', code >> 8, code & 0xff)
347+
NEXT(1, 4);
353348
state->i = 1;
354349
}
355350
else {
356-
WRITE2(code >> 8, code & 0xff)
357-
NEXT(1, 2)
351+
WRITEBYTE2(code >> 8, code & 0xff)
352+
NEXT(1, 2);
358353
}
359354
}
360355

@@ -376,10 +371,10 @@ DECODER_RESET(hz)
376371
DECODER(hz)
377372
{
378373
while (inleft > 0) {
379-
unsigned char c = IN1;
374+
unsigned char c = INBYTE1;
380375

381376
if (c == '~') {
382-
unsigned char c2 = IN2;
377+
unsigned char c2 = INBYTE2;
383378

384379
REQUIRE_INBUF(2)
385380
if (c2 == '~') {
@@ -408,7 +403,7 @@ DECODER(hz)
408403
}
409404
else { /* GB mode */
410405
REQUIRE_INBUF(2)
411-
TRYMAP_DEC(gb2312, writer, c, IN2) {
406+
TRYMAP_DEC(gb2312, writer, c, INBYTE2) {
412407
NEXT_IN(2);
413408
}
414409
else

0 commit comments

Comments
 (0)