Skip to content

Commit b4e5bc2

Browse files
authored
gh-146192: Add base32 support to binascii (GH-146193)
Add base32 encoder and decoder functions implemented in C to the binascii module and use them to greatly improve the performance and reduce the memory usage of the existing base32 codec functions in the base64 module.
1 parent a17301a commit b4e5bc2

File tree

7 files changed

+837
-87
lines changed

7 files changed

+837
-87
lines changed

Doc/library/binascii.rst

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -183,6 +183,38 @@ The :mod:`!binascii` module defines the following functions:
183183
.. versionadded:: 3.15
184184

185185

186+
.. function:: a2b_base32(string, /, *, alphabet=BASE32_ALPHABET)
187+
188+
Convert base32 data back to binary and return the binary data.
189+
190+
Valid base32 data contains characters from the base32 alphabet specified
191+
in :rfc:`4648` in groups of eight (if necessary, the final group is padded
192+
to eight characters with ``=``). Each group encodes 40 bits of binary data
193+
in the range from ``0`` to ``2 ** 40 - 1``, inclusive.
194+
195+
.. note::
196+
This function does not map lowercase characters (which are invalid in
197+
standard base32) to their uppercase counterparts, nor does it
198+
contextually map ``0`` to ``O`` and ``1`` to ``I``/``L`` as :rfc:`4648`
199+
allows.
200+
201+
Optional *alphabet* must be a :class:`bytes` object of length 32 which
202+
specifies an alternative alphabet.
203+
204+
Invalid base32 data will raise :exc:`binascii.Error`.
205+
206+
.. versionadded:: next
207+
208+
.. function:: b2a_base32(data, /, *, alphabet=BASE32_ALPHABET)
209+
210+
Convert binary data to a line of ASCII characters in base32 coding,
211+
as specified in :rfc:`4648`. The return value is the converted line.
212+
213+
Optional *alphabet* must be a :term:`bytes-like object` of length 32 which
214+
specifies an alternative alphabet.
215+
216+
.. versionadded:: next
217+
186218
.. function:: a2b_qp(data, header=False)
187219

188220
Convert a block of quoted-printable data back to binary and return the binary
@@ -327,6 +359,20 @@ The :mod:`!binascii` module defines the following functions:
327359

328360
.. versionadded:: next
329361

362+
.. data:: BASE32_ALPHABET
363+
364+
The Base 32 alphabet according to :rfc:`4648`.
365+
366+
.. versionadded:: next
367+
368+
.. data:: BASE32HEX_ALPHABET
369+
370+
The "Extended Hex" Base 32 alphabet according to :rfc:`4648`.
371+
Data encoded with this alphabet maintains its sort order during bitwise
372+
comparisons.
373+
374+
.. versionadded:: next
375+
330376

331377
.. seealso::
332378

Doc/whatsnew/3.15.rst

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -662,6 +662,12 @@ binascii
662662
* Added the *ignorechars* parameter in :func:`~binascii.a2b_base64`.
663663
(Contributed by Serhiy Storchaka in :gh:`144001`.)
664664

665+
* Added functions for Base32 encoding:
666+
667+
- :func:`~binascii.b2a_base32` and :func:`~binascii.a2b_base32`
668+
669+
(Contributed by James Seo in :gh:`146192`.)
670+
665671

666672
calendar
667673
--------
@@ -1279,6 +1285,10 @@ base64 & binascii
12791285
two orders of magnitude less memory.
12801286
(Contributed by James Seo and Serhiy Storchaka in :gh:`101178`.)
12811287

1288+
* Implementation for Base32 has been rewritten in C.
1289+
Encoding and decoding are now two orders of magnitude faster.
1290+
(Contributed by James Seo in :gh:`146192`.)
1291+
12821292

12831293
csv
12841294
---

Lib/base64.py

Lines changed: 12 additions & 84 deletions
Original file line numberDiff line numberDiff line change
@@ -206,54 +206,13 @@ def urlsafe_b64decode(s):
206206
the letter O). For security purposes the default is None, so that
207207
0 and 1 are not allowed in the input.
208208
'''
209-
_b32alphabet = b'ABCDEFGHIJKLMNOPQRSTUVWXYZ234567'
210-
_b32hexalphabet = b'0123456789ABCDEFGHIJKLMNOPQRSTUV'
211-
_b32tab2 = {}
212-
_b32rev = {}
213-
214-
def _b32encode(alphabet, s):
215-
# Delay the initialization of the table to not waste memory
216-
# if the function is never called
217-
if alphabet not in _b32tab2:
218-
b32tab = [bytes((i,)) for i in alphabet]
219-
_b32tab2[alphabet] = [a + b for a in b32tab for b in b32tab]
220-
b32tab = None
221-
222-
if not isinstance(s, bytes_types):
223-
s = memoryview(s).tobytes()
224-
leftover = len(s) % 5
225-
# Pad the last quantum with zero bits if necessary
226-
if leftover:
227-
s = s + b'\0' * (5 - leftover) # Don't use += !
228-
encoded = bytearray()
229-
from_bytes = int.from_bytes
230-
b32tab2 = _b32tab2[alphabet]
231-
for i in range(0, len(s), 5):
232-
c = from_bytes(s[i: i + 5]) # big endian
233-
encoded += (b32tab2[c >> 30] + # bits 1 - 10
234-
b32tab2[(c >> 20) & 0x3ff] + # bits 11 - 20
235-
b32tab2[(c >> 10) & 0x3ff] + # bits 21 - 30
236-
b32tab2[c & 0x3ff] # bits 31 - 40
237-
)
238-
# Adjust for any leftover partial quanta
239-
if leftover == 1:
240-
encoded[-6:] = b'======'
241-
elif leftover == 2:
242-
encoded[-4:] = b'===='
243-
elif leftover == 3:
244-
encoded[-3:] = b'==='
245-
elif leftover == 4:
246-
encoded[-1:] = b'='
247-
return encoded.take_bytes()
248-
249-
def _b32decode(alphabet, s, casefold=False, map01=None):
250-
# Delay the initialization of the table to not waste memory
251-
# if the function is never called
252-
if alphabet not in _b32rev:
253-
_b32rev[alphabet] = {v: k for k, v in enumerate(alphabet)}
209+
210+
def b32encode(s):
211+
return binascii.b2a_base32(s)
212+
b32encode.__doc__ = _B32_ENCODE_DOCSTRING.format(encoding='base32')
213+
214+
def b32decode(s, casefold=False, map01=None):
254215
s = _bytes_from_decode_data(s)
255-
if len(s) % 8:
256-
raise binascii.Error('Incorrect padding')
257216
# Handle section 2.4 zero and one mapping. The flag map01 will be either
258217
# False, or the character to map the digit 1 (one) to. It should be
259218
# either L (el) or I (eye).
@@ -263,51 +222,20 @@ def _b32decode(alphabet, s, casefold=False, map01=None):
263222
s = s.translate(bytes.maketrans(b'01', b'O' + map01))
264223
if casefold:
265224
s = s.upper()
266-
# Strip off pad characters from the right. We need to count the pad
267-
# characters because this will tell us how many null bytes to remove from
268-
# the end of the decoded string.
269-
l = len(s)
270-
s = s.rstrip(b'=')
271-
padchars = l - len(s)
272-
# Now decode the full quanta
273-
decoded = bytearray()
274-
b32rev = _b32rev[alphabet]
275-
for i in range(0, len(s), 8):
276-
quanta = s[i: i + 8]
277-
acc = 0
278-
try:
279-
for c in quanta:
280-
acc = (acc << 5) + b32rev[c]
281-
except KeyError:
282-
raise binascii.Error('Non-base32 digit found') from None
283-
decoded += acc.to_bytes(5) # big endian
284-
# Process the last, partial quanta
285-
if l % 8 or padchars not in {0, 1, 3, 4, 6}:
286-
raise binascii.Error('Incorrect padding')
287-
if padchars and decoded:
288-
acc <<= 5 * padchars
289-
last = acc.to_bytes(5) # big endian
290-
leftover = (43 - 5 * padchars) // 8 # 1: 4, 3: 3, 4: 2, 6: 1
291-
decoded[-5:] = last[:leftover]
292-
return decoded.take_bytes()
293-
294-
295-
def b32encode(s):
296-
return _b32encode(_b32alphabet, s)
297-
b32encode.__doc__ = _B32_ENCODE_DOCSTRING.format(encoding='base32')
298-
299-
def b32decode(s, casefold=False, map01=None):
300-
return _b32decode(_b32alphabet, s, casefold, map01)
225+
return binascii.a2b_base32(s)
301226
b32decode.__doc__ = _B32_DECODE_DOCSTRING.format(encoding='base32',
302227
extra_args=_B32_DECODE_MAP01_DOCSTRING)
303228

304229
def b32hexencode(s):
305-
return _b32encode(_b32hexalphabet, s)
230+
return binascii.b2a_base32(s, alphabet=binascii.BASE32HEX_ALPHABET)
306231
b32hexencode.__doc__ = _B32_ENCODE_DOCSTRING.format(encoding='base32hex')
307232

308233
def b32hexdecode(s, casefold=False):
234+
s = _bytes_from_decode_data(s)
309235
# base32hex does not have the 01 mapping
310-
return _b32decode(_b32hexalphabet, s, casefold)
236+
if casefold:
237+
s = s.upper()
238+
return binascii.a2b_base32(s, alphabet=binascii.BASE32HEX_ALPHABET)
311239
b32hexdecode.__doc__ = _B32_DECODE_DOCSTRING.format(encoding='base32hex',
312240
extra_args='')
313241

0 commit comments

Comments
 (0)