Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit a866df8

Browse files
committed
This patch changes the default behaviour of the builtin charmap
codec to not apply Latin-1 mappings for keys which are not found in the mapping dictionaries, but instead treat them as undefined mappings. The patch was originally written by Martin v. Loewis with some additional (cosmetic) changes and an updated test script by Marc-Andre Lemburg. The standard codecs were recreated from the most current files available at the Unicode.org site using the Tools/scripts/gencodec.py tool. This patch closes the bugs #116285 and #119960.
1 parent b55b7bb commit a866df8

56 files changed

Lines changed: 424 additions & 293 deletions

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

Lib/codecs.py

Lines changed: 15 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -539,6 +539,21 @@ def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):
539539
sr.file_encoding = file_encoding
540540
return sr
541541

542+
### Helpers for charmap-based codecs
543+
544+
def make_identity_dict(rng):
545+
546+
""" make_identity_dict(rng) -> dict
547+
548+
Return a dictionary where elements of the rng sequence are
549+
mapped to themselves.
550+
551+
"""
552+
res = {}
553+
for i in rng:
554+
res[i]=i
555+
return res
556+
542557
### Tests
543558

544559
if __name__ == '__main__':

Lib/encodings/cp037.py

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -1,9 +1,9 @@
1-
""" Python Character Mapping Codec generated from 'CP037.TXT'.
2-
1+
""" Python Character Mapping Codec generated from 'CP037.TXT' with gencodec.py.
32
43
Written by Marc-Andre Lemburg (mal@lemburg.com).
54
65
(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
6+
(c) Copyright 2000 Guido van Rossum.
77
88
"""#"
99

@@ -35,8 +35,8 @@ def getregentry():
3535

3636
### Decoding Map
3737

38-
decoding_map = {
39-
38+
decoding_map = codecs.make_identity_dict(range(256))
39+
decoding_map.update({
4040
0x0004: 0x009c, # CONTROL
4141
0x0005: 0x0009, # HORIZONTAL TABULATION
4242
0x0006: 0x0086, # CONTROL
@@ -273,7 +273,7 @@ def getregentry():
273273
0x00fd: 0x00d9, # LATIN CAPITAL LETTER U WITH GRAVE
274274
0x00fe: 0x00da, # LATIN CAPITAL LETTER U WITH ACUTE
275275
0x00ff: 0x009f, # CONTROL
276-
}
276+
})
277277

278278
### Encoding Map
279279

Lib/encodings/cp1006.py

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -1,9 +1,9 @@
1-
""" Python Character Mapping Codec generated from 'CP1006.TXT'.
2-
1+
""" Python Character Mapping Codec generated from 'CP1006.TXT' with gencodec.py.
32
43
Written by Marc-Andre Lemburg (mal@lemburg.com).
54
65
(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
6+
(c) Copyright 2000 Guido van Rossum.
77
88
"""#"
99

@@ -35,8 +35,8 @@ def getregentry():
3535

3636
### Decoding Map
3737

38-
decoding_map = {
39-
38+
decoding_map = codecs.make_identity_dict(range(256))
39+
decoding_map.update({
4040
0x00a1: 0x06f0, # EXTENDED ARABIC-INDIC DIGIT ZERO
4141
0x00a2: 0x06f1, # EXTENDED ARABIC-INDIC DIGIT ONE
4242
0x00a3: 0x06f2, # EXTENDED ARABIC-INDIC DIGIT TWO
@@ -131,7 +131,7 @@ def getregentry():
131131
0x00fd: 0xfbae, # ARABIC LETTER YEH BARREE ISOLATED FORM
132132
0x00fe: 0xfe7c, # ARABIC SHADDA ISOLATED FORM
133133
0x00ff: 0xfe7d, # ARABIC SHADDA MEDIAL FORM
134-
}
134+
})
135135

136136
### Encoding Map
137137

Lib/encodings/cp1026.py

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -1,9 +1,9 @@
1-
""" Python Character Mapping Codec generated from 'CP1026.TXT'.
2-
1+
""" Python Character Mapping Codec generated from 'CP1026.TXT' with gencodec.py.
32
43
Written by Marc-Andre Lemburg (mal@lemburg.com).
54
65
(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
6+
(c) Copyright 2000 Guido van Rossum.
77
88
"""#"
99

@@ -35,8 +35,8 @@ def getregentry():
3535

3636
### Decoding Map
3737

38-
decoding_map = {
39-
38+
decoding_map = codecs.make_identity_dict(range(256))
39+
decoding_map.update({
4040
0x0004: 0x009c, # CONTROL
4141
0x0005: 0x0009, # HORIZONTAL TABULATION
4242
0x0006: 0x0086, # CONTROL
@@ -273,7 +273,7 @@ def getregentry():
273273
0x00fd: 0x00d9, # LATIN CAPITAL LETTER U WITH GRAVE
274274
0x00fe: 0x00da, # LATIN CAPITAL LETTER U WITH ACUTE
275275
0x00ff: 0x009f, # CONTROL
276-
}
276+
})
277277

278278
### Encoding Map
279279

Lib/encodings/cp1250.py

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -1,9 +1,9 @@
1-
""" Python Character Mapping Codec generated from 'CP1250.TXT'.
2-
1+
""" Python Character Mapping Codec generated from 'CP1250.TXT' with gencodec.py.
32
43
Written by Marc-Andre Lemburg (mal@lemburg.com).
54
65
(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
6+
(c) Copyright 2000 Guido van Rossum.
77
88
"""#"
99

@@ -35,8 +35,8 @@ def getregentry():
3535

3636
### Decoding Map
3737

38-
decoding_map = {
39-
38+
decoding_map = codecs.make_identity_dict(range(256))
39+
decoding_map.update({
4040
0x0080: 0x20ac, # EURO SIGN
4141
0x0081: None, # UNDEFINED
4242
0x0082: 0x201a, # SINGLE LOW-9 QUOTATION MARK
@@ -116,7 +116,7 @@ def getregentry():
116116
0x00fb: 0x0171, # LATIN SMALL LETTER U WITH DOUBLE ACUTE
117117
0x00fe: 0x0163, # LATIN SMALL LETTER T WITH CEDILLA
118118
0x00ff: 0x02d9, # DOT ABOVE
119-
}
119+
})
120120

121121
### Encoding Map
122122

Lib/encodings/cp1251.py

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -1,9 +1,9 @@
1-
""" Python Character Mapping Codec generated from 'CP1251.TXT'.
2-
1+
""" Python Character Mapping Codec generated from 'CP1251.TXT' with gencodec.py.
32
43
Written by Marc-Andre Lemburg (mal@lemburg.com).
54
65
(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
6+
(c) Copyright 2000 Guido van Rossum.
77
88
"""#"
99

@@ -35,8 +35,8 @@ def getregentry():
3535

3636
### Decoding Map
3737

38-
decoding_map = {
39-
38+
decoding_map = codecs.make_identity_dict(range(256))
39+
decoding_map.update({
4040
0x0080: 0x0402, # CYRILLIC CAPITAL LETTER DJE
4141
0x0081: 0x0403, # CYRILLIC CAPITAL LETTER GJE
4242
0x0082: 0x201a, # SINGLE LOW-9 QUOTATION MARK
@@ -150,7 +150,7 @@ def getregentry():
150150
0x00fd: 0x044d, # CYRILLIC SMALL LETTER E
151151
0x00fe: 0x044e, # CYRILLIC SMALL LETTER YU
152152
0x00ff: 0x044f, # CYRILLIC SMALL LETTER YA
153-
}
153+
})
154154

155155
### Encoding Map
156156

Lib/encodings/cp1252.py

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -1,9 +1,9 @@
1-
""" Python Character Mapping Codec generated from 'CP1252.TXT'.
2-
1+
""" Python Character Mapping Codec generated from 'CP1252.TXT' with gencodec.py.
32
43
Written by Marc-Andre Lemburg (mal@lemburg.com).
54
65
(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
6+
(c) Copyright 2000 Guido van Rossum.
77
88
"""#"
99

@@ -35,8 +35,8 @@ def getregentry():
3535

3636
### Decoding Map
3737

38-
decoding_map = {
39-
38+
decoding_map = codecs.make_identity_dict(range(256))
39+
decoding_map.update({
4040
0x0080: 0x20ac, # EURO SIGN
4141
0x0081: None, # UNDEFINED
4242
0x0082: 0x201a, # SINGLE LOW-9 QUOTATION MARK
@@ -69,7 +69,7 @@ def getregentry():
6969
0x009d: None, # UNDEFINED
7070
0x009e: 0x017e, # LATIN SMALL LETTER Z WITH CARON
7171
0x009f: 0x0178, # LATIN CAPITAL LETTER Y WITH DIAERESIS
72-
}
72+
})
7373

7474
### Encoding Map
7575

Lib/encodings/cp1253.py

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -1,9 +1,9 @@
1-
""" Python Character Mapping Codec generated from 'CP1253.TXT'.
2-
1+
""" Python Character Mapping Codec generated from 'CP1253.TXT' with gencodec.py.
32
43
Written by Marc-Andre Lemburg (mal@lemburg.com).
54
65
(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
6+
(c) Copyright 2000 Guido van Rossum.
77
88
"""#"
99

@@ -35,8 +35,8 @@ def getregentry():
3535

3636
### Decoding Map
3737

38-
decoding_map = {
39-
38+
decoding_map = codecs.make_identity_dict(range(256))
39+
decoding_map.update({
4040
0x0080: 0x20ac, # EURO SIGN
4141
0x0081: None, # UNDEFINED
4242
0x0082: 0x201a, # SINGLE LOW-9 QUOTATION MARK
@@ -144,7 +144,7 @@ def getregentry():
144144
0x00fd: 0x03cd, # GREEK SMALL LETTER UPSILON WITH TONOS
145145
0x00fe: 0x03ce, # GREEK SMALL LETTER OMEGA WITH TONOS
146146
0x00ff: None, # UNDEFINED
147-
}
147+
})
148148

149149
### Encoding Map
150150

Lib/encodings/cp1254.py

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -1,9 +1,9 @@
1-
""" Python Character Mapping Codec generated from 'CP1254.TXT'.
2-
1+
""" Python Character Mapping Codec generated from 'CP1254.TXT' with gencodec.py.
32
43
Written by Marc-Andre Lemburg (mal@lemburg.com).
54
65
(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
6+
(c) Copyright 2000 Guido van Rossum.
77
88
"""#"
99

@@ -35,8 +35,8 @@ def getregentry():
3535

3636
### Decoding Map
3737

38-
decoding_map = {
39-
38+
decoding_map = codecs.make_identity_dict(range(256))
39+
decoding_map.update({
4040
0x0080: 0x20ac, # EURO SIGN
4141
0x0081: None, # UNDEFINED
4242
0x0082: 0x201a, # SINGLE LOW-9 QUOTATION MARK
@@ -75,7 +75,7 @@ def getregentry():
7575
0x00f0: 0x011f, # LATIN SMALL LETTER G WITH BREVE
7676
0x00fd: 0x0131, # LATIN SMALL LETTER DOTLESS I
7777
0x00fe: 0x015f, # LATIN SMALL LETTER S WITH CEDILLA
78-
}
78+
})
7979

8080
### Encoding Map
8181

Lib/encodings/cp1255.py

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -1,9 +1,9 @@
1-
""" Python Character Mapping Codec generated from 'CP1255.TXT'.
2-
1+
""" Python Character Mapping Codec generated from 'CP1255.TXT' with gencodec.py.
32
43
Written by Marc-Andre Lemburg (mal@lemburg.com).
54
65
(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
6+
(c) Copyright 2000 Guido van Rossum.
77
88
"""#"
99

@@ -35,8 +35,8 @@ def getregentry():
3535

3636
### Decoding Map
3737

38-
decoding_map = {
39-
38+
decoding_map = codecs.make_identity_dict(range(256))
39+
decoding_map.update({
4040
0x0080: 0x20ac, # EURO SIGN
4141
0x0081: None, # UNDEFINED
4242
0x0082: 0x201a, # SINGLE LOW-9 QUOTATION MARK
@@ -136,7 +136,7 @@ def getregentry():
136136
0x00fd: 0x200e, # LEFT-TO-RIGHT MARK
137137
0x00fe: 0x200f, # RIGHT-TO-LEFT MARK
138138
0x00ff: None, # UNDEFINED
139-
}
139+
})
140140

141141
### Encoding Map
142142

0 commit comments

Comments
 (0)