Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Commit 6dd0d46

Browse files
committed
Issue #17618: Add Base85 and Ascii85 encoding/decoding to the base64 module.
1 parent 1a048f9 commit 6dd0d46

4 files changed

Lines changed: 514 additions & 3 deletions

File tree

Doc/library/base64.rst

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,76 @@ The modern interface provides:
132132
string.
133133

134134

135+
.. function:: a85encode(b, *, foldspaces=False, wrapcol=0, pad=False, adobe=False)
136+
137+
Encode a byte string using Ascii85.
138+
139+
*b* is the byte string to encode. The encoded byte string is returned.
140+
141+
*foldspaces* is an optional flag that uses the special short sequence 'y'
142+
instead of 4 consecutive spaces (ASCII 0x20) as supported by 'btoa'. This
143+
feature is not supported by the "standard" Ascii85 encoding.
144+
145+
*wrapcol* controls whether the output should have newline ('\n')
146+
characters added to it. If this is non-zero, each output line will be
147+
at most this many characters long.
148+
149+
*pad* controls whether the input string is padded to a multiple of 4
150+
before encoding. Note that the ``btoa`` implementation always pads.
151+
152+
*adobe* controls whether the encoded byte sequence is framed with ``<~``
153+
and ``~>``, which is used by the Adobe implementation.
154+
155+
.. versionadded:: 3.4
156+
157+
158+
.. function:: a85decode(b, *, foldspaces=False, adobe=False, ignorechars=b' \t\n\r\v')
159+
160+
Decode an Ascii85 encoded byte string.
161+
162+
*b* is the byte string to decode.
163+
164+
*foldspaces* is a flag that specifies whether the 'y' short sequence
165+
should be accepted as shorthand for 4 consecutive spaces (ASCII 0x20).
166+
This feature is not supported by the "standard" Ascii85 encoding.
167+
168+
*adobe* controls whether the input sequence is in Adobe Ascii85 format
169+
(i.e. is framed with ``<~`` and ``~>``).
170+
171+
*ignorechars* should be a byte string containing characters to ignore
172+
from the input. This should only contain whitespace characters, and by
173+
default contains all whitespace characters in ASCII.
174+
175+
.. versionadded:: 3.4
176+
177+
178+
.. function:: b85encode(b, pad=False)
179+
180+
Encode a byte string using base85, as used in e.g. git-style binary
181+
diffs.
182+
183+
If *pad* is true, the input is padded with "\\0" so its length is a
184+
multiple of 4 characters before encoding.
185+
186+
.. versionadded:: 3.4
187+
188+
189+
.. function:: b85decode(b)
190+
191+
Decode base85-encoded byte string. Padding is implicitly removed, if
192+
necessary.
193+
194+
.. versionadded:: 3.4
195+
196+
197+
.. note::
198+
Both Base85 and Ascii85 have an expansion factor of 5 to 4 (5 Base85 or
199+
Ascii85 characters can encode 4 binary bytes), while the better-known
200+
Base64 has an expansion factor of 6 to 4. They are therefore more
201+
efficient when space is expensive. They differ by details such as the
202+
character map used for encoding.
203+
204+
135205
The legacy interface:
136206

137207
.. function:: decode(input, output)

Lib/base64.py

Lines changed: 190 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
#! /usr/bin/env python3
22

3-
"""RFC 3548: Base16, Base32, Base64 Data Encodings"""
3+
"""Base16, Base32, Base64 (RFC 3548), Base85 and Ascii85 data encodings"""
44

55
# Modified 04-Oct-1995 by Jack Jansen to use binascii module
66
# Modified 30-Dec-2003 by Barry Warsaw to add full RFC 3548 support
@@ -9,6 +9,7 @@
99
import re
1010
import struct
1111
import binascii
12+
import itertools
1213

1314

1415
__all__ = [
@@ -17,6 +18,8 @@
1718
# Generalized interface for other encodings
1819
'b64encode', 'b64decode', 'b32encode', 'b32decode',
1920
'b16encode', 'b16decode',
21+
# Base85 and Ascii85 encodings
22+
'b85encode', 'b85decode', 'a85encode', 'a85decode',
2023
# Standard Base64 encoding
2124
'standard_b64encode', 'standard_b64decode',
2225
# Some common Base64 alternatives. As referenced by RFC 3548, see thread
@@ -268,7 +271,193 @@ def b16decode(s, casefold=False):
268271
raise binascii.Error('Non-base16 digit found')
269272
return binascii.unhexlify(s)
270273

274+
#
275+
# Ascii85 encoding/decoding
276+
#
271277

278+
def _85encode(b, chars, chars2, pad=False, foldnuls=False, foldspaces=False):
    """Shared encoder behind a85encode() and b85encode().

    b is the data to encode; anything that is not an instance of
    bytes_types is converted with memoryview(b).tobytes().

    chars is the table of single encoded digits and chars2 the table of
    encoded digit pairs, both indexed by numeric value.

    If pad is true, the digits produced by the zero bytes appended to
    complete the last 4-byte word are kept; otherwise they are trimmed.

    foldnuls emits b'z' for an all-zero word, and foldspaces emits b'y'
    for a word made of four spaces (0x20202020).

    The encoded data is returned as a bytes object.
    """
    if not isinstance(b, bytes_types):
        b = memoryview(b).tobytes()

    # Zero-fill to a whole number of 32-bit words, then read them big-endian.
    padding = (-len(b)) % 4
    if padding:
        b = b + b'\0' * padding
    words = struct.Struct('!%dI' % (len(b) // 4)).unpack(b)

    # Every word becomes five base-85 digits, fetched as pair + pair + single
    # (614125 == 85**3, 7225 == 85**2).
    chunks = [b'z' if foldnuls and not word else
              b'y' if foldspaces and word == 0x20202020 else
              (chars2[word // 614125] +
               chars2[word // 85 % 7225] +
               chars[word % 85])
              for word in words]

    if padding and not pad:
        if chunks[-1] == b'z':
            # A folded group cannot be shortened: expand it back to five
            # zero digits before trimming.
            chunks[-1] = chars[0] * 5
        chunks[-1] = chunks[-1][:-padding]

    return b''.join(chunks)
303+
304+
# Framing markers used by the Adobe flavour of Ascii85.
_A85START = b"<~"
_A85END = b"~>"
# Ascii85 digit tables: the 85 single digits b'!' (33) through b'u' (117),
# and every ordered pair of digits so two digits can be fetched per lookup.
_a85chars = [bytes([i]) for i in range(33, 118)]
_a85chars2 = [(a + b) for a in _a85chars for b in _a85chars]
308+
309+
def a85encode(b, *, foldspaces=False, wrapcol=0, pad=False, adobe=False):
    """Encode a byte string using Ascii85.

    b is the byte string to encode. The encoded byte string is returned.

    foldspaces is an optional flag that uses the special short sequence 'y'
    instead of 4 consecutive spaces (ASCII 0x20) as supported by 'btoa'. This
    feature is not supported by the "standard" Ascii85 encoding.

    wrapcol controls whether the output should have newline (b'\\n')
    characters added to it. If this is non-zero, each output line will be at
    most this many characters long.

    pad controls whether the input string is padded to a multiple of 4 before
    encoding. Note that the btoa implementation always pads.

    adobe controls whether the encoded byte sequence is framed with <~ and ~>,
    which is used by the Adobe implementation.
    """
    # All-zero words are always folded to b'z' for Ascii85 (foldnuls=True).
    result = _85encode(b, _a85chars, _a85chars2, pad, True, foldspaces)

    if adobe:
        result = _A85START + result
    if wrapcol:
        # Enforce a minimum line width: 2 when framed, 1 otherwise.
        wrapcol = max(2 if adobe else 1, wrapcol)
        chunks = [result[i: i + wrapcol]
                  for i in range(0, len(result), wrapcol)]
        if adobe and len(chunks[-1]) + 2 > wrapcol:
            # The end marker would overflow the last line, so start an
            # empty line for it.
            chunks.append(b'')
        result = b'\n'.join(chunks)
    if adobe:
        result += _A85END

    return result
344+
345+
def a85decode(b, *, foldspaces=False, adobe=False, ignorechars=b' \t\n\r\v'):
    """Decode an Ascii85 encoded byte string.

    b is the byte string to decode.

    foldspaces is a flag that specifies whether the 'y' short sequence should be
    accepted as shorthand for 4 consecutive spaces (ASCII 0x20). This feature is
    not supported by the "standard" Ascii85 encoding.

    adobe controls whether the input sequence is in Adobe Ascii85 format (i.e.
    is framed with <~ and ~>).

    ignorechars should be a byte string containing characters to ignore from the
    input. This should only contain whitespace characters, and by default
    contains all whitespace characters in ASCII.

    ValueError is raised if adobe is true and the framing is missing, if a
    5-digit group does not fit in 32 bits, if 'z' (or 'y' with foldspaces)
    occurs inside a group, or if a byte is neither an Ascii85 digit, an
    accepted short sequence, nor listed in ignorechars.
    """
    b = _bytes_from_decode_data(b)
    if adobe:
        if not (b.startswith(_A85START) and b.endswith(_A85END)):
            # {!r} yields the same text as {} did, but avoids the implicit
            # str(bytes) conversion that triggers BytesWarning under -b/-bb.
            raise ValueError("Ascii85 encoded byte sequences must be bracketed "
                             "by {!r} and {!r}".format(_A85START, _A85END))
        b = b[2:-2]  # Strip off start/end markers
    #
    # We have to go through this stepwise, so as to ignore spaces and handle
    # special short sequences
    #
    packI = struct.Struct('!I').pack
    decoded = []
    decoded_append = decoded.append
    curr = []
    curr_append = curr.append
    curr_clear = curr.clear
    # Four trailing b'u' digits complete a short final group.
    for x in b + b'u' * 4:
        if b'!'[0] <= x <= b'u'[0]:
            curr_append(x)
            if len(curr) == 5:
                acc = 0
                for digit in curr:
                    acc = 85 * acc + (digit - 33)
                try:
                    decoded_append(packI(acc))
                except struct.error:
                    raise ValueError('Ascii85 overflow') from None
                curr_clear()
        elif x == b'z'[0]:
            if curr:
                raise ValueError('z inside Ascii85 5-tuple')
            decoded_append(b'\0\0\0\0')
        elif foldspaces and x == b'y'[0]:
            if curr:
                raise ValueError('y inside Ascii85 5-tuple')
            decoded_append(b'\x20\x20\x20\x20')
        elif x in ignorechars:
            # Skip whitespace
            continue
        else:
            raise ValueError('Non-Ascii85 digit found: %c' % x)

    result = b''.join(decoded)
    # curr holds the fill digits that were not consumed, so 4 - len(curr)
    # fill digits went into the last group.
    padding = 4 - len(curr)
    if padding:
        # Throw away the extra padding
        result = result[:-padding]
    return result
409+
410+
# The following code is originally taken (with permission) from Mercurial
411+
412+
# Base85 alphabet in digit-value order.
_b85chars = (b"0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
             b"abcdefghijklmnopqrstuvwxyz!#$%&()*+-;<=>?@^_`{|}~")
# Rebind as a list of one-byte digits, plus a table of every ordered pair of
# digits, in the layout _85encode() indexes into.
_b85chars = [bytes([i]) for i in _b85chars]
_b85chars2 = [(a + b) for a in _b85chars for b in _b85chars]
# Reverse lookup table (byte value -> digit value); b85decode() builds it on
# first use.
_b85dec = None
417+
418+
def b85encode(b, pad=False):
    """Encode a byte string in base85 format and return the encoded bytes.

    If pad is true, the input is padded with b'\\0' so its length is a
    multiple of 4 bytes before encoding.
    """
    return _85encode(b, _b85chars, _b85chars2, pad)
425+
426+
def b85decode(b):
    """Decode a base85-encoded byte string and return the decoded bytes.

    Input whose length is not a multiple of 5 is completed with b'~' digits
    before decoding, and the same number of bytes is then removed from the
    end of the result.

    ValueError is raised if the input contains a byte that is not a base85
    digit, or if a 5-digit group does not fit in 32 bits.
    """
    global _b85dec
    b = _bytes_from_decode_data(b)
    if _b85dec is None:
        # Build the byte value -> digit value table on first use; entries
        # for bytes outside the alphabet stay None.
        _b85dec = [None] * 256
        for i, c in enumerate(_b85chars):
            _b85dec[c[0]] = i

    padding = (-len(b)) % 5
    b = b + b'~' * padding
    out = []
    packI = struct.Struct('!I').pack
    for i in range(0, len(b), 5):
        chunk = b[i:i + 5]
        acc = 0
        try:
            for c in chunk:
                acc = acc * 85 + _b85dec[c]
        except TypeError:
            # Adding a None table entry raised; locate the offending byte
            # so the error can report its position.
            for j, c in enumerate(chunk):
                if _b85dec[c] is None:
                    raise ValueError('bad base85 character at position %d'
                                     % (i + j)) from None
            # No invalid digit found: propagate the original TypeError.
            raise
        try:
            out.append(packI(acc))
        except struct.error:
            raise ValueError('base85 overflow in hunk starting at byte %d'
                             % i) from None

    result = b''.join(out)
    if padding:
        result = result[:-padding]
    return result
272461

273462
# Legacy interface. This code could be cleaned up since I don't believe
274463
# binascii has any line length limitations. It just doesn't seem worth it

0 commit comments

Comments
 (0)