Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 3e2a306

Browse files
committed
Add CJK codecs support as discussed on python-dev. (SF #873597)
Several style fixes were suggested by Martin v. Loewis and Marc-Andre Lemburg. Thanks!
1 parent cd1f743 commit 3e2a306

88 files changed

Lines changed: 43278 additions & 34 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

Doc/lib/libcodecs.tex

Lines changed: 81 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -212,15 +212,6 @@ \section{\module{codecs} ---
212212
\end{datadesc}
213213

214214

215-
\begin{seealso}
216-
\seeurl{http://sourceforge.net/projects/python-codecs/}{A
217-
SourceForge project working on additional support for Asian
218-
codecs for use with Python. They are in the early stages of
219-
development at the time of this writing --- look in their
220-
FTP area for downloadable files.}
221-
\end{seealso}
222-
223-
224215
\subsection{Codec Base Classes}
225216

226217
The \module{codecs} defines a set of base classes which define the
@@ -553,6 +544,10 @@ \subsection{Standard Encodings}
553544
{646, us-ascii}
554545
{English}
555546

547+
\lineiii{big5}
548+
{big5_tw, csbig5}
549+
{Traditional Chinese}
550+
556551
\lineiii{cp037}
557552
{IBM037, IBM039}
558553
{English}
@@ -633,6 +628,18 @@ \subsection{Standard Encodings}
633628
{}
634629
{Greek}
635630

631+
\lineiii{cp932}
632+
{932, ms932, mskanji, ms_kanji}
633+
{Japanese}
634+
635+
\lineiii{cp949}
636+
{949, ms949, uhc}
637+
{Korean}
638+
639+
\lineiii{cp950}
640+
{950, ms950}
641+
{Traditional Chinese}
642+
636643
\lineiii{cp1006}
637644
{}
638645
{Urdu}
@@ -681,6 +688,59 @@ \subsection{Standard Encodings}
681688
{windows-1258}
682689
{Vietnamese}
683690

691+
\lineiii{euc_jp}
692+
{eucjp, ujis, u_jis}
693+
{Japanese}
694+
695+
\lineiii{euc_jisx0213}
696+
{jisx0213, eucjisx0213}
697+
{Japanese}
698+
699+
\lineiii{euc_kr}
700+
{euckr, korean, ksc5601, ks_c_5601, ks_c_5601_1987, ksx1001, ks_x_1001}
701+
{Korean}
702+
703+
\lineiii{gb2312}
704+
{chinese, csiso58gb231280, euc_cn, euccn, eucgb2312_cn, gb2312_1980,
705+
gb2312_80, iso_ir_58}
706+
{Simplified Chinese}
707+
708+
\lineiii{gbk}
709+
{936, cp936, ms936}
710+
{Unified Chinese}
711+
712+
\lineiii{gb18030}
713+
{gb18030_2000}
714+
{Unified Chinese}
715+
716+
\lineiii{hz}
717+
{hzgb, hz_gb, hz_gb_2312}
718+
{Simplified Chinese}
719+
720+
\lineiii{iso2022_jp}
721+
{csiso2022jp, iso2022jp, iso_2022_jp}
722+
{Japanese}
723+
724+
\lineiii{iso2022_jp_1}
725+
{iso2022jp_1, iso_2022_jp_1}
726+
{Japanese}
727+
728+
\lineiii{iso2022_jp_2}
729+
{iso2022jp_2, iso_2022_jp_2}
730+
{Japanese, Korean, Simplified Chinese, Western Europe, Greek}
731+
732+
\lineiii{iso2022_jp_3}
733+
{iso2022jp_3, iso_2022_jp_3}
734+
{Japanese}
735+
736+
\lineiii{iso2022_jp_ext}
737+
{iso2022jp_ext, iso_2022_jp_ext}
738+
{Japanese}
739+
740+
\lineiii{iso2022_kr}
741+
{csiso2022kr, iso2022kr, iso_2022_kr}
742+
{Korean}
743+
684744
\lineiii{latin_1}
685745
{iso-8859-1, iso8859-1, 8859, cp819, latin, latin1, L1}
686746
{Western Europe}
@@ -733,6 +793,10 @@ \subsection{Standard Encodings}
733793
{iso-8859-15}
734794
{Western Europe}
735795

796+
\lineiii{johab}
797+
{cp1361, ms1361}
798+
{Korean}
799+
736800
\lineiii{koi8_r}
737801
{}
738802
{Russian}
@@ -765,6 +829,14 @@ \subsection{Standard Encodings}
765829
{macturkish}
766830
{Turkish}
767831

832+
\lineiii{shift_jis}
833+
{csshiftjis, shiftjis, sjis, s_jis}
834+
{Japanese}
835+
836+
\lineiii{shift_jisx0213}
837+
{shiftjisx0213, sjisx0213, s_jisx0213}
838+
{Japanese}
839+
768840
\lineiii{utf_16}
769841
{U16, utf16}
770842
{all languages}

Lib/email/test/test_email_codecs.py

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -8,13 +8,6 @@
88
from email.Charset import Charset
99
from email.Header import Header, decode_header
1010

11-
# See if we have the Japanese codecs package installed
12-
try:
13-
unicode('foo', 'japanese.iso-2022-jp')
14-
except LookupError:
15-
raise TestSkipped, 'Optional Japanese codecs not installed'
16-
17-
1811

1912
class TestEmailAsianCodecs(TestEmailBase):
2013
def test_japanese_codecs(self):

Lib/encodings/aliases.py

Lines changed: 100 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -14,12 +14,6 @@
1414
codecs. In addition to these, a few Python specific codec
1515
aliases have also been added.
1616
17-
About the CJK codec aliases:
18-
19-
The codecs for these encodings are not distributed with the
20-
Python core, but are included here for reference, since the
21-
locale module relies on having these aliases available.
22-
2317
"""
2418
aliases = {
2519

@@ -41,6 +35,10 @@
4135
'base64' : 'base64_codec',
4236
'base_64' : 'base64_codec',
4337

38+
# big5 codec
39+
'big5_tw' : 'big5',
40+
'csbig5' : 'big5',
41+
4442
# bz2_codec codec
4543
'bz2' : 'bz2_codec',
4644

@@ -168,9 +166,91 @@
168166
'csibm869' : 'cp869',
169167
'ibm869' : 'cp869',
170168

169+
# cp932 codec
170+
'932' : 'cp932',
171+
'ms932' : 'cp932',
172+
'mskanji' : 'cp932',
173+
'ms_kanji' : 'cp932',
174+
175+
# cp949 codec
176+
'949' : 'cp949',
177+
'ms949' : 'cp949',
178+
'uhc' : 'cp949',
179+
180+
# cp950 codec
181+
'950' : 'cp950',
182+
'ms950' : 'cp950',
183+
184+
# euc_jisx0213 codec
185+
'jisx0213' : 'euc_jisx0213',
186+
'eucjisx0213' : 'euc_jisx0213',
187+
188+
# euc_jp codec
189+
'eucjp' : 'euc_jp',
190+
'ujis' : 'euc_jp',
191+
'u_jis' : 'euc_jp',
192+
193+
# euc_kr codec
194+
'euckr' : 'euc_kr',
195+
'korean' : 'euc_kr',
196+
'ksc5601' : 'euc_kr',
197+
'ks_c_5601' : 'euc_kr',
198+
'ks_c_5601_1987' : 'euc_kr',
199+
'ksx1001' : 'euc_kr',
200+
'ks_x_1001' : 'euc_kr',
201+
202+
# gb18030 codec
203+
'gb18030_2000' : 'gb18030',
204+
205+
# gb2312 codec
206+
'chinese' : 'gb2312',
207+
'csiso58gb231280' : 'gb2312',
208+
'euc_cn' : 'gb2312',
209+
'euccn' : 'gb2312',
210+
'eucgb2312_cn' : 'gb2312',
211+
'gb2312_1980' : 'gb2312',
212+
'gb2312_80' : 'gb2312',
213+
'iso_ir_58' : 'gb2312',
214+
215+
# gbk codec
216+
'936' : 'gbk',
217+
'cp936' : 'gbk',
218+
'ms936' : 'gbk',
219+
171220
# hex_codec codec
172221
'hex' : 'hex_codec',
173222

223+
# hz codec
224+
'hzgb' : 'hz',
225+
'hz_gb' : 'hz',
226+
'hz_gb_2312' : 'hz',
227+
228+
# iso2022_jp codec
229+
'csiso2022jp' : 'iso2022_jp',
230+
'iso2022jp' : 'iso2022_jp',
231+
'iso_2022_jp' : 'iso2022_jp',
232+
233+
# iso2022_jp_1 codec
234+
'iso2022jp_1' : 'iso2022_jp_1',
235+
'iso_2022_jp_1' : 'iso2022_jp_1',
236+
237+
# iso2022_jp_2 codec
238+
'iso2022jp_2' : 'iso2022_jp_2',
239+
'iso_2022_jp_2' : 'iso2022_jp_2',
240+
241+
# iso2022_jp_3 codec
242+
'iso2022jp_3' : 'iso2022_jp_3',
243+
'iso_2022_jp_3' : 'iso2022_jp_3',
244+
245+
# iso2022_jp_ext codec
246+
'iso2022jp_ext' : 'iso2022_jp_ext',
247+
'iso_2022_jp_ext' : 'iso2022_jp_ext',
248+
249+
# iso2022_kr codec
250+
'csiso2022kr' : 'iso2022_kr',
251+
'iso2022kr' : 'iso2022_kr',
252+
'iso_2022_kr' : 'iso2022_kr',
253+
174254
# iso8859_10 codec
175255
'csisolatin6' : 'iso8859_10',
176256
'iso_8859_10' : 'iso8859_10',
@@ -258,9 +338,9 @@
258338
'l5' : 'iso8859_9',
259339
'latin5' : 'iso8859_9',
260340

261-
# jis_7 codec
262-
'csiso2022jp' : 'jis_7',
263-
'iso_2022_jp' : 'jis_7',
341+
# johab codec
342+
'cp1361' : 'johab',
343+
'ms1361' : 'johab',
264344

265345
# koi8_r codec
266346
'cskoi8r' : 'koi8_r',
@@ -308,6 +388,17 @@
308388
# rot_13 codec
309389
'rot13' : 'rot_13',
310390

391+
# shift_jis codec
392+
'csshiftjis' : 'shift_jis',
393+
'shiftjis' : 'shift_jis',
394+
'sjis' : 'shift_jis',
395+
's_jis' : 'shift_jis',
396+
397+
# shift_jisx0213 codec
398+
'shiftjisx0213' : 'shift_jisx0213',
399+
'sjisx0213' : 'shift_jisx0213',
400+
's_jisx0213' : 'shift_jisx0213',
401+
311402
# tactis codec
312403
'tis260' : 'tactis',
313404

Lib/encodings/big5.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
#
2+
# big5.py: Python Unicode Codec for BIG5
3+
#
4+
# Written by Hye-Shik Chang <[email protected]>
5+
# $CJKCodecs: big5.py,v 1.3 2004/01/17 11:26:10 perky Exp $
6+
#
7+
8+
from _codecs_big5 import codec
9+
import codecs
10+
11+
class Codec(codecs.Codec):
12+
encode = codec.encode
13+
decode = codec.decode
14+
15+
class StreamReader(Codec, codecs.StreamReader):
16+
def __init__(self, stream, errors='strict'):
17+
codecs.StreamReader.__init__(self, stream, errors)
18+
__codec = codec.StreamReader(stream, errors)
19+
self.read = __codec.read
20+
self.readline = __codec.readline
21+
self.readlines = __codec.readlines
22+
self.reset = __codec.reset
23+
24+
class StreamWriter(Codec, codecs.StreamWriter):
25+
def __init__(self, stream, errors='strict'):
26+
codecs.StreamWriter.__init__(self, stream, errors)
27+
__codec = codec.StreamWriter(stream, errors)
28+
self.write = __codec.write
29+
self.writelines = __codec.writelines
30+
self.reset = __codec.reset
31+
32+
def getregentry():
33+
return (Codec().encode,Codec().decode,StreamReader,StreamWriter)
34+

Lib/encodings/cp932.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
#
2+
# cp932.py: Python Unicode Codec for CP932
3+
#
4+
# Written by Hye-Shik Chang <[email protected]>
5+
# $CJKCodecs: cp932.py,v 1.3 2004/01/17 11:26:10 perky Exp $
6+
#
7+
8+
from _codecs_cp932 import codec
9+
import codecs
10+
11+
class Codec(codecs.Codec):
12+
encode = codec.encode
13+
decode = codec.decode
14+
15+
class StreamReader(Codec, codecs.StreamReader):
16+
def __init__(self, stream, errors='strict'):
17+
codecs.StreamReader.__init__(self, stream, errors)
18+
__codec = codec.StreamReader(stream, errors)
19+
self.read = __codec.read
20+
self.readline = __codec.readline
21+
self.readlines = __codec.readlines
22+
self.reset = __codec.reset
23+
24+
class StreamWriter(Codec, codecs.StreamWriter):
25+
def __init__(self, stream, errors='strict'):
26+
codecs.StreamWriter.__init__(self, stream, errors)
27+
__codec = codec.StreamWriter(stream, errors)
28+
self.write = __codec.write
29+
self.writelines = __codec.writelines
30+
self.reset = __codec.reset
31+
32+
def getregentry():
33+
return (Codec().encode,Codec().decode,StreamReader,StreamWriter)
34+

Lib/encodings/cp949.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
#
2+
# cp949.py: Python Unicode Codec for CP949
3+
#
4+
# Written by Hye-Shik Chang <[email protected]>
5+
# $CJKCodecs: cp949.py,v 1.3 2004/01/17 11:26:10 perky Exp $
6+
#
7+
8+
from _codecs_cp949 import codec
9+
import codecs
10+
11+
class Codec(codecs.Codec):
12+
encode = codec.encode
13+
decode = codec.decode
14+
15+
class StreamReader(Codec, codecs.StreamReader):
16+
def __init__(self, stream, errors='strict'):
17+
codecs.StreamReader.__init__(self, stream, errors)
18+
__codec = codec.StreamReader(stream, errors)
19+
self.read = __codec.read
20+
self.readline = __codec.readline
21+
self.readlines = __codec.readlines
22+
self.reset = __codec.reset
23+
24+
class StreamWriter(Codec, codecs.StreamWriter):
25+
def __init__(self, stream, errors='strict'):
26+
codecs.StreamWriter.__init__(self, stream, errors)
27+
__codec = codec.StreamWriter(stream, errors)
28+
self.write = __codec.write
29+
self.writelines = __codec.writelines
30+
self.reset = __codec.reset
31+
32+
def getregentry():
33+
return (Codec().encode,Codec().decode,StreamReader,StreamWriter)
34+

0 commit comments

Comments
 (0)