Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Commit 7d41e29

Browse files
committed
Patch #626548: Support Hangul syllable names.
1 parent 529ec6a commit 7d41e29

2 files changed

Lines changed: 112 additions & 2 deletions

File tree

Misc/NEWS

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -316,6 +316,9 @@ Extension modules
316316
available in source code, but not built automatically anymore, and
317317
is now named bsddb185.
318318

319+
- unicodedata was updated to Unicode 3.2. It now also supports names
320+
for Hangul syllables.
321+
319322
- resource.getrlimit() now returns longs instead of ints.
320323

321324
- readline now dynamically adjusts its input/output stream if

Modules/unicodedata.c

Lines changed: 109 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,12 @@
11
/* ------------------------------------------------------------------------
22
3-
unicodedata -- Provides access to the Unicode 3.0 data base.
3+
unicodedata -- Provides access to the Unicode 3.2 data base.
44
5-
Data was extracted from the Unicode 3.0 UnicodeData.txt file.
5+
Data was extracted from the Unicode 3.2 UnicodeData.txt file.
66
77
Written by Marc-Andre Lemburg ([email protected]).
88
Modified for Python 2.0 by Fredrik Lundh ([email protected])
9+
Modified by Martin v. Löwis ([email protected])
910
1011
Copyright (c) Corporation for National Research Initiatives.
1112
@@ -276,6 +277,47 @@ _gethash(const char *s, int len, int scale)
276277
return h;
277278
}
278279

280+
/* Constants for arithmetic Hangul syllable (de)composition.
   A precomposed syllable at SBase + ((L * VCount) + V) * TCount + T is
   built from a leading consonant L, a vowel V and a trailing consonant T. */
#define SBase   0xAC00          /* first precomposed Hangul syllable */
#define LBase   0x1100          /* first leading-consonant jamo */
#define VBase   0x1161          /* first vowel jamo */
#define TBase   0x11A7          /* one before the first trailing-consonant jamo */
#define LCount  19              /* number of leading consonants */
#define VCount  21              /* number of vowels */
#define TCount  28              /* number of trailing consonants, incl. "none" */
#define NCount  (VCount*TCount) /* syllables per leading consonant */
#define SCount  (LCount*NCount) /* total number of precomposed syllables */

/* Short names of the jamo used to spell Hangul syllable names.
   Column 0 holds the LCount leading consonants, column 1 the VCount
   vowels and column 2 the TCount trailing consonants.  The columns have
   different lengths, so the shorter ones are padded with NULL; callers
   must not index a column beyond its own count. */
static char *hangul_syllables[][3] = {
    /*        L      V      T    */
    /*  0 */ { "G",  "A",   ""   },
    /*  1 */ { "GG", "AE",  "G"  },
    /*  2 */ { "N",  "YA",  "GG" },
    /*  3 */ { "D",  "YAE", "GS" },
    /*  4 */ { "DD", "EO",  "N"  },
    /*  5 */ { "R",  "E",   "NJ" },
    /*  6 */ { "M",  "YEO", "NH" },
    /*  7 */ { "B",  "YE",  "D"  },
    /*  8 */ { "BB", "O",   "L"  },
    /*  9 */ { "S",  "WA",  "LG" },
    /* 10 */ { "SS", "WAE", "LM" },
    /* 11 */ { "",   "OE",  "LB" },
    /* 12 */ { "J",  "YO",  "LS" },
    /* 13 */ { "JJ", "U",   "LT" },
    /* 14 */ { "C",  "WEO", "LP" },
    /* 15 */ { "K",  "WE",  "LH" },
    /* 16 */ { "T",  "WI",  "M"  },
    /* 17 */ { "P",  "YU",  "B"  },
    /* 18 */ { "H",  "EU",  "BS" },
    /* 19 */ { NULL, "YI",  "S"  },
    /* 20 */ { NULL, "I",   "SS" },
    /* 21 */ { NULL, NULL,  "NG" },
    /* 22 */ { NULL, NULL,  "J"  },
    /* 23 */ { NULL, NULL,  "C"  },
    /* 24 */ { NULL, NULL,  "K"  },
    /* 25 */ { NULL, NULL,  "T"  },
    /* 26 */ { NULL, NULL,  "P"  },
    /* 27 */ { NULL, NULL,  "H"  }
};
320+
279321
static int
280322
_getucname(Py_UCS4 code, char* buffer, int buflen)
281323
{
@@ -284,6 +326,28 @@ _getucname(Py_UCS4 code, char* buffer, int buflen)
284326
int word;
285327
unsigned char* w;
286328

329+
if (SBase <= code && code < SBase+SCount) {
    /* Hangul syllable: the name is not stored in the database but
       composed as "HANGUL SYLLABLE " followed by the short names of
       the leading consonant, vowel and trailing consonant.

       The upper bound is exclusive: SBase+SCount is one past the last
       syllable.  Accepting it would yield L == LCount, whose table
       entry is a null pointer. */
    int SIndex = code - SBase;
    int L = SIndex / NCount;
    int V = (SIndex % NCount) / TCount;
    int T = SIndex % TCount;

    if (buflen < 27)
        /* Worst case: HANGUL SYLLABLE <10chars>. */
        return 0;
    strcpy(buffer, "HANGUL SYLLABLE ");
    buffer += 16;
    strcpy(buffer, hangul_syllables[L][0]);
    buffer += strlen(hangul_syllables[L][0]);
    strcpy(buffer, hangul_syllables[V][1]);
    buffer += strlen(hangul_syllables[V][1]);
    /* The last strcpy also writes the terminating NUL. */
    strcpy(buffer, hangul_syllables[T][2]);
    return 1;
}
350+
287351
if (code >= 0x110000)
288352
return 0;
289353

@@ -343,13 +407,50 @@ _cmpname(int code, const char* name, int namelen)
343407
return buffer[namelen] == '\0';
344408
}
345409

410+
static void
411+
find_syllable(const char *str, int *len, int *pos, int count, int column)
412+
{
413+
int i, len1;
414+
*len = -1;
415+
for (i = 0; i < count; i++) {
416+
char *s = hangul_syllables[i][column];
417+
len1 = strlen(s);
418+
if (len1 <= *len)
419+
continue;
420+
if (strncmp(str, s, len1) == 0) {
421+
*len = len1;
422+
*pos = i;
423+
}
424+
}
425+
if (*len == -1) {
426+
*len = 0;
427+
*pos = -1;
428+
}
429+
}
430+
346431
static int
347432
_getcode(const char* name, int namelen, Py_UCS4* code)
348433
{
349434
unsigned int h, v;
350435
unsigned int mask = code_size-1;
351436
unsigned int i, incr;
352437

438+
/* Check for hangul syllables. */
439+
if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
440+
int L, V, T, len;
441+
const char *pos = name + 16;
442+
find_syllable(pos, &len, &L, LCount, 0);
443+
pos += len;
444+
find_syllable(pos, &len, &V, VCount, 1);
445+
pos += len;
446+
find_syllable(pos, &len, &T, TCount, 2);
447+
pos += len;
448+
if (V != -1 && V != -1 && T != -1 && pos-name == namelen) {
449+
*code = SBase + (L*VCount+V)*TCount + T;
450+
return 1;
451+
}
452+
}
453+
353454
/* the following is the same as python's dictionary lookup, with
354455
only minor changes. see the makeunicodedata script for more
355456
details */
@@ -475,3 +576,9 @@ initunicodedata(void)
475576
if (v != NULL)
476577
PyModule_AddObject(m, "ucnhash_CAPI", v);
477578
}
579+
580+
/*
581+
Local variables:
582+
c-basic-offset: 4
583+
End:
584+
*/

0 commit comments

Comments
 (0)