11/* ------------------------------------------------------------------------
22
3- unicodedata -- Provides access to the Unicode 3.0 data base.
3+ unicodedata -- Provides access to the Unicode 3.2 data base.
44
5- Data was extracted from the Unicode 3.0 UnicodeData.txt file.
5+ Data was extracted from the Unicode 3.2 UnicodeData.txt file.
66
77 Written by Marc-Andre Lemburg ([email protected] ). 88 Modified for Python 2.0 by Fredrik Lundh ([email protected] ) 9+ Modified by Martin v. Löwis ([email protected] ) 910
1011 Copyright (c) Corporation for National Research Initiatives.
1112
@@ -276,6 +277,47 @@ _gethash(const char *s, int len, int scale)
276277 return h ;
277278}
278279
280+ #define SBase 0xAC00
281+ #define LBase 0x1100
282+ #define VBase 0x1161
283+ #define TBase 0x11A7
284+ #define LCount 19
285+ #define VCount 21
286+ #define TCount 28
287+ #define NCount (VCount*TCount)
288+ #define SCount (LCount*NCount)
289+
/* Name fragments used to build and parse "HANGUL SYLLABLE xxx" names.

   The three columns are independent lists that merely share one array:
     column 0 -- fragment for the L index (valid rows 0 .. LCount-1)
     column 1 -- fragment for the V index (valid rows 0 .. VCount-1)
     column 2 -- fragment for the T index (valid rows 0 .. TCount-1)
   Slots beyond a column's valid range are NULL and must never be
   dereferenced.  An empty string is a real entry (it contributes nothing
   to the name), not a terminator. */
static char *hangul_syllables[][3] = {
    /*  0 */ {"G",  "A",   ""},
    /*  1 */ {"GG", "AE",  "G"},
    /*  2 */ {"N",  "YA",  "GG"},
    /*  3 */ {"D",  "YAE", "GS"},
    /*  4 */ {"DD", "EO",  "N"},
    /*  5 */ {"R",  "E",   "NJ"},
    /*  6 */ {"M",  "YEO", "NH"},
    /*  7 */ {"B",  "YE",  "D"},
    /*  8 */ {"BB", "O",   "L"},
    /*  9 */ {"S",  "WA",  "LG"},
    /* 10 */ {"SS", "WAE", "LM"},
    /* 11 */ {"",   "OE",  "LB"},
    /* 12 */ {"J",  "YO",  "LS"},
    /* 13 */ {"JJ", "U",   "LT"},
    /* 14 */ {"C",  "WEO", "LP"},
    /* 15 */ {"K",  "WE",  "LH"},
    /* 16 */ {"T",  "WI",  "M"},
    /* 17 */ {"P",  "YU",  "B"},
    /* 18 */ {"H",  "EU",  "BS"},
    /* 19 */ {NULL, "YI",  "S"},
    /* 20 */ {NULL, "I",   "SS"},
    /* 21 */ {NULL, NULL,  "NG"},
    /* 22 */ {NULL, NULL,  "J"},
    /* 23 */ {NULL, NULL,  "C"},
    /* 24 */ {NULL, NULL,  "K"},
    /* 25 */ {NULL, NULL,  "T"},
    /* 26 */ {NULL, NULL,  "P"},
    /* 27 */ {NULL, NULL,  "H"}
};
320+
279321static int
280322_getucname (Py_UCS4 code , char * buffer , int buflen )
281323{
@@ -284,6 +326,28 @@ _getucname(Py_UCS4 code, char* buffer, int buflen)
284326 int word ;
285327 unsigned char * w ;
286328
329+ if (SBase <= code && code <= SBase + SCount ) {
330+ /* Hangul syllable. */
331+ int SIndex = code - SBase ;
332+ int L = SIndex / NCount ;
333+ int V = (SIndex % NCount ) / TCount ;
334+ int T = SIndex % TCount ;
335+
336+ if (buflen < 27 )
337+ /* Worst case: HANGUL SYLLABLE <10chars>. */
338+ return 0 ;
339+ strcpy (buffer , "HANGUL SYLLABLE " );
340+ buffer += 16 ;
341+ strcpy (buffer , hangul_syllables [L ][0 ]);
342+ buffer += strlen (hangul_syllables [L ][0 ]);
343+ strcpy (buffer , hangul_syllables [V ][1 ]);
344+ buffer += strlen (hangul_syllables [V ][1 ]);
345+ strcpy (buffer , hangul_syllables [T ][2 ]);
346+ buffer += strlen (hangul_syllables [T ][2 ]);
347+ * buffer = '\0' ;
348+ return 1 ;
349+ }
350+
287351 if (code >= 0x110000 )
288352 return 0 ;
289353
@@ -343,13 +407,50 @@ _cmpname(int code, const char* name, int namelen)
343407 return buffer [namelen ] == '\0' ;
344408}
345409
410+ static void
411+ find_syllable (const char * str , int * len , int * pos , int count , int column )
412+ {
413+ int i , len1 ;
414+ * len = -1 ;
415+ for (i = 0 ; i < count ; i ++ ) {
416+ char * s = hangul_syllables [i ][column ];
417+ len1 = strlen (s );
418+ if (len1 <= * len )
419+ continue ;
420+ if (strncmp (str , s , len1 ) == 0 ) {
421+ * len = len1 ;
422+ * pos = i ;
423+ }
424+ }
425+ if (* len == -1 ) {
426+ * len = 0 ;
427+ * pos = -1 ;
428+ }
429+ }
430+
346431static int
347432_getcode (const char * name , int namelen , Py_UCS4 * code )
348433{
349434 unsigned int h , v ;
350435 unsigned int mask = code_size - 1 ;
351436 unsigned int i , incr ;
352437
438+ /* Check for hangul syllables. */
439+ if (strncmp (name , "HANGUL SYLLABLE " , 16 ) == 0 ) {
440+ int L , V , T , len ;
441+ const char * pos = name + 16 ;
442+ find_syllable (pos , & len , & L , LCount , 0 );
443+ pos += len ;
444+ find_syllable (pos , & len , & V , VCount , 1 );
445+ pos += len ;
446+ find_syllable (pos , & len , & T , TCount , 2 );
447+ pos += len ;
448+ if (V != -1 && V != -1 && T != -1 && pos - name == namelen ) {
449+ * code = SBase + (L * VCount + V )* TCount + T ;
450+ return 1 ;
451+ }
452+ }
453+
353454 /* the following is the same as python's dictionary lookup, with
354455 only minor changes. see the makeunicodedata script for more
355456 details */
@@ -475,3 +576,9 @@ initunicodedata(void)
475576 if (v != NULL )
476577 PyModule_AddObject (m , "ucnhash_CAPI" , v );
477578}
579+
580+ /*
581+ Local variables:
582+ c-basic-offset: 4
583+ End:
584+ */
0 commit comments