Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Commit 7b7dd10

Browse files
author
Fredrik Lundh
committed
compress unicode decomposition tables (this saves another 55k)
1 parent f75c9d9 commit 7b7dd10

7 files changed

Lines changed: 7505 additions & 10721 deletions

File tree

Modules/ucnhash.c

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ getname(Py_UCS4 code, char* buffer, int buflen)
3838

3939
/* get offset into phrasebook */
4040
offset = phrasebook_offset1[(code>>phrasebook_shift)];
41-
offset = phrasebook_offset2[(offset<<phrasebook_shift)+
41+
offset = phrasebook_offset2[(offset<<phrasebook_shift) +
4242
(code&((1<<phrasebook_shift)-1))];
4343
if (!offset)
4444
return 0;
@@ -47,13 +47,12 @@ getname(Py_UCS4 code, char* buffer, int buflen)
4747

4848
for (;;) {
4949
/* get word index */
50-
if (phrasebook[offset] & 128) {
51-
word = phrasebook[offset] & 127;
52-
offset++;
53-
} else {
54-
word = (phrasebook[offset]<<8) + phrasebook[offset+1];
55-
offset+=2;
56-
}
50+
word = phrasebook[offset] - phrasebook_short;
51+
if (word >= 0) {
52+
word = (word << 8) + phrasebook[offset+1];
53+
offset += 2;
54+
} else
55+
word = phrasebook[offset++];
5756
if (i) {
5857
if (i > buflen)
5958
return 0; /* buffer overflow */

Modules/unicodedata.c

Lines changed: 94 additions & 79 deletions
Original file line numberDiff line numberDiff line change
@@ -14,228 +14,243 @@
1414
#include "Python.h"
1515
#include "unicodedatabase.h"
1616

17+
typedef struct {
18+
const unsigned char category; /* index into
19+
_PyUnicode_CategoryNames */
20+
const unsigned char combining; /* combining class value 0 - 255 */
21+
const unsigned char bidirectional; /* index into
22+
_PyUnicode_BidirectionalNames */
23+
const unsigned char mirrored; /* true if mirrored in bidir mode */
24+
} _PyUnicode_DatabaseRecord;
25+
26+
/* data file generated by Tools/unicode/makeunicodedata.py */
27+
#include "unicodedata_db.h"
28+
29+
static const _PyUnicode_DatabaseRecord*
30+
getrecord(PyUnicodeObject* v)
31+
{
32+
int code;
33+
int index;
34+
35+
code = (int) *PyUnicode_AS_UNICODE(v);
36+
37+
if (code < 0 || code >= 65536)
38+
index = 0;
39+
else {
40+
index = index1[(code>>SHIFT)];
41+
index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
42+
}
43+
44+
return &_PyUnicode_Database_Records[index];
45+
}
46+
1747
/* --- Module API --------------------------------------------------------- */
1848

1949
static PyObject *
20-
unicodedata_decimal(PyObject *self,
21-
PyObject *args)
50+
unicodedata_decimal(PyObject *self, PyObject *args)
2251
{
2352
PyUnicodeObject *v;
2453
PyObject *defobj = NULL;
2554
long rc;
2655

2756
if (!PyArg_ParseTuple(args, "O!|O:decimal",
2857
&PyUnicode_Type, &v, &defobj))
29-
goto onError;
58+
return NULL;
3059
if (PyUnicode_GET_SIZE(v) != 1) {
3160
PyErr_SetString(PyExc_TypeError,
3261
"need a single Unicode character as parameter");
33-
goto onError;
62+
return NULL;
3463
}
3564
rc = Py_UNICODE_TODECIMAL(*PyUnicode_AS_UNICODE(v));
3665
if (rc < 0) {
3766
if (defobj == NULL) {
3867
PyErr_SetString(PyExc_ValueError,
3968
"not a decimal");
40-
goto onError;
69+
return NULL;
4170
}
4271
else {
4372
Py_INCREF(defobj);
4473
return defobj;
4574
}
4675
}
4776
return PyInt_FromLong(rc);
48-
49-
onError:
50-
return NULL;
5177
}
5278

5379
static PyObject *
54-
unicodedata_digit(PyObject *self,
55-
PyObject *args)
80+
unicodedata_digit(PyObject *self, PyObject *args)
5681
{
5782
PyUnicodeObject *v;
5883
PyObject *defobj = NULL;
5984
long rc;
6085

6186
if (!PyArg_ParseTuple(args, "O!|O:digit",
6287
&PyUnicode_Type, &v, &defobj))
63-
goto onError;
88+
return NULL;
6489
if (PyUnicode_GET_SIZE(v) != 1) {
6590
PyErr_SetString(PyExc_TypeError,
6691
"need a single Unicode character as parameter");
67-
goto onError;
92+
return NULL;
6893
}
6994
rc = Py_UNICODE_TODIGIT(*PyUnicode_AS_UNICODE(v));
7095
if (rc < 0) {
7196
if (defobj == NULL) {
7297
PyErr_SetString(PyExc_ValueError,
7398
"not a digit");
74-
goto onError;
99+
return NULL;
75100
}
76101
else {
77102
Py_INCREF(defobj);
78103
return defobj;
79104
}
80105
}
81106
return PyInt_FromLong(rc);
82-
83-
onError:
84-
return NULL;
85107
}
86108

87109
static PyObject *
88-
unicodedata_numeric(PyObject *self,
89-
PyObject *args)
110+
unicodedata_numeric(PyObject *self, PyObject *args)
90111
{
91112
PyUnicodeObject *v;
92113
PyObject *defobj = NULL;
93114
double rc;
94115

95116
if (!PyArg_ParseTuple(args, "O!|O:numeric",
96117
&PyUnicode_Type, &v, &defobj))
97-
goto onError;
118+
return NULL;
98119
if (PyUnicode_GET_SIZE(v) != 1) {
99120
PyErr_SetString(PyExc_TypeError,
100121
"need a single Unicode character as parameter");
101-
goto onError;
122+
return NULL;
102123
}
103124
rc = Py_UNICODE_TONUMERIC(*PyUnicode_AS_UNICODE(v));
104125
if (rc < 0) {
105126
if (defobj == NULL) {
106127
PyErr_SetString(PyExc_ValueError,
107128
"not a numeric character");
108-
goto onError;
129+
return NULL;
109130
}
110131
else {
111132
Py_INCREF(defobj);
112133
return defobj;
113134
}
114135
}
115136
return PyFloat_FromDouble(rc);
116-
117-
onError:
118-
return NULL;
119137
}
120138

121139
static PyObject *
122-
unicodedata_category(PyObject *self,
123-
PyObject *args)
140+
unicodedata_category(PyObject *self, PyObject *args)
124141
{
125142
PyUnicodeObject *v;
126143
int index;
127144

128145
if (!PyArg_ParseTuple(args, "O!:category",
129146
&PyUnicode_Type, &v))
130-
goto onError;
147+
return NULL;
131148
if (PyUnicode_GET_SIZE(v) != 1) {
132149
PyErr_SetString(PyExc_TypeError,
133150
"need a single Unicode character as parameter");
134-
goto onError;
151+
return NULL;
135152
}
136-
index = (int) _PyUnicode_Database_GetRecord(
137-
(int) *PyUnicode_AS_UNICODE(v)
138-
)->category;
153+
index = (int) getrecord(v)->category;
139154
return PyString_FromString(_PyUnicode_CategoryNames[index]);
140-
141-
onError:
142-
return NULL;
143155
}
144156

145157
static PyObject *
146-
unicodedata_bidirectional(PyObject *self,
147-
PyObject *args)
158+
unicodedata_bidirectional(PyObject *self, PyObject *args)
148159
{
149160
PyUnicodeObject *v;
150161
int index;
151162

152163
if (!PyArg_ParseTuple(args, "O!:bidirectional",
153164
&PyUnicode_Type, &v))
154-
goto onError;
165+
return NULL;
155166
if (PyUnicode_GET_SIZE(v) != 1) {
156167
PyErr_SetString(PyExc_TypeError,
157168
"need a single Unicode character as parameter");
158-
goto onError;
169+
return NULL;
159170
}
160-
index = (int) _PyUnicode_Database_GetRecord(
161-
(int) *PyUnicode_AS_UNICODE(v)
162-
)->bidirectional;
171+
index = (int) getrecord(v)->bidirectional;
163172
return PyString_FromString(_PyUnicode_BidirectionalNames[index]);
164-
165-
onError:
166-
return NULL;
167173
}
168174

169175
static PyObject *
170-
unicodedata_combining(PyObject *self,
171-
PyObject *args)
176+
unicodedata_combining(PyObject *self, PyObject *args)
172177
{
173178
PyUnicodeObject *v;
174-
int value;
175179

176180
if (!PyArg_ParseTuple(args, "O!:combining",
177181
&PyUnicode_Type, &v))
178-
goto onError;
182+
return NULL;
179183
if (PyUnicode_GET_SIZE(v) != 1) {
180184
PyErr_SetString(PyExc_TypeError,
181185
"need a single Unicode character as parameter");
182-
goto onError;
186+
return NULL;
183187
}
184-
value = (int) _PyUnicode_Database_GetRecord(
185-
(int) *PyUnicode_AS_UNICODE(v)
186-
)->combining;
187-
return PyInt_FromLong(value);
188-
189-
onError:
190-
return NULL;
188+
return PyInt_FromLong((int) getrecord(v)->combining);
191189
}
192190

193191
static PyObject *
194-
unicodedata_mirrored(PyObject *self,
195-
PyObject *args)
192+
unicodedata_mirrored(PyObject *self, PyObject *args)
196193
{
197194
PyUnicodeObject *v;
198-
int value;
199195

200196
if (!PyArg_ParseTuple(args, "O!:mirrored",
201197
&PyUnicode_Type, &v))
202-
goto onError;
198+
return NULL;
203199
if (PyUnicode_GET_SIZE(v) != 1) {
204200
PyErr_SetString(PyExc_TypeError,
205201
"need a single Unicode character as parameter");
206-
goto onError;
202+
return NULL;
207203
}
208-
value = (int) _PyUnicode_Database_GetRecord(
209-
(int) *PyUnicode_AS_UNICODE(v)
210-
)->mirrored;
211-
return PyInt_FromLong(value);
212-
213-
onError:
214-
return NULL;
204+
return PyInt_FromLong((int) getrecord(v)->mirrored);
215205
}
216206

217207
static PyObject *
218-
unicodedata_decomposition(PyObject *self,
219-
PyObject *args)
208+
unicodedata_decomposition(PyObject *self, PyObject *args)
220209
{
221210
PyUnicodeObject *v;
222-
const char *value;
211+
char decomp[256];
212+
int code, index, count, i;
223213

224214
if (!PyArg_ParseTuple(args, "O!:decomposition",
225215
&PyUnicode_Type, &v))
226-
goto onError;
216+
return NULL;
227217
if (PyUnicode_GET_SIZE(v) != 1) {
228218
PyErr_SetString(PyExc_TypeError,
229219
"need a single Unicode character as parameter");
230-
goto onError;
220+
return NULL;
221+
}
222+
223+
code = (int) *PyUnicode_AS_UNICODE(v);
224+
225+
if (code < 0 || code >= 65536)
226+
index = 0;
227+
else {
228+
index = decomp_index1[(code>>DECOMP_SHIFT)];
229+
index = decomp_index2[(index<<DECOMP_SHIFT)+
230+
(code&((1<<DECOMP_SHIFT)-1))];
231+
}
232+
233+
/* high byte is the number of hex values that follow (usually one or
234+
two), low byte is the prefix code (an index into decomp_prefix) */
235+
count = decomp_data[index] >> 8;
236+
237+
/* XXX: could allocate the PyString up front instead
238+
(strlen(prefix) + 5 * count + 1 bytes) */
239+
240+
/* copy prefix */
241+
i = strlen(decomp_prefix[decomp_data[index] & 255]);
242+
memcpy(decomp, decomp_prefix[decomp_data[index] & 255], i);
243+
244+
while (count-- > 0) {
245+
if (i)
246+
decomp[i++] = ' ';
247+
sprintf(decomp + i, "%04X", decomp_data[++index]);
248+
i += strlen(decomp + i);
231249
}
232-
value = _PyUnicode_Database_GetDecomposition(
233-
(int) *PyUnicode_AS_UNICODE(v)
234-
);
235-
return PyString_FromString(value);
236250

237-
onError:
238-
return NULL;
251+
decomp[i] = '\0';
252+
253+
return PyString_FromString(decomp);
239254
}
240255

241256
/* XXX Add doc strings. */

0 commit comments

Comments
 (0)