This repository has been archived by the owner on Nov 3, 2018. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
/
buffer.py
87 lines (70 loc) · 2.07 KB
/
buffer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
from __future__ import unicode_literals
# Prefer the faster re2 bindings when they are installed; otherwise use the
# standard library's re module, which exposes the same API used here.
try:
    import re2 as re
except ImportError:
    # Only a missing package should trigger the fallback; anything else
    # (e.g. KeyboardInterrupt) must not be silently swallowed.
    import re
from bs4 import UnicodeDammit
class LineBuffer(object):
    r"""
    Accumulate raw bytes from a connection and hand back whole lines.

    A line ends at ``\n`` or ``\r\n``. Anything after the last terminator
    is kept in the buffer until more data completes it.

    >>> b = LineBuffer()
    >>> len(b)
    0

    >>> b.feed(b'foo\nbar')
    >>> len(b)
    7
    >>> list(b.lines()) == [b'foo']
    True
    >>> len(b)
    3

    >>> b.feed(b'bar\r\nbaz\n')
    >>> list(b.lines()) == [b'barbar', b'baz']
    True
    >>> len(b)
    0

    No decoding is applied; lines come back as bytes.

    >>> b.feed(b'Ol\xe9\n')
    >>> list(b.lines()) == [b'Ol\xe9']
    True

    Iterating over the buffer is the same as calling ``lines()``.

    >>> b.feed(b'iterate\nthis\n')
    >>> for line, expected in zip(b, [b'iterate', b'this']):
    ...    assert line == expected
    """

    # Matches a line terminator: LF, optionally preceded by CR.
    line_sep_exp = re.compile(b'\r?\n')

    def __init__(self):
        # Bytes received so far that have not yet been returned as lines.
        self.buffer = b''

    def feed(self, bytes):
        """Append a chunk of received data to the pending bytes."""
        self.buffer += bytes

    def lines(self):
        """
        Return an iterator over the complete lines currently buffered.

        The returned lines are removed from the buffer; only the trailing
        partial line (possibly empty) is retained.
        """
        pieces = self.line_sep_exp.split(self.buffer)
        # split() always yields at least one piece; the final one is the
        # text following the last terminator, i.e. the unfinished line.
        self.buffer = pieces[-1]
        return iter(pieces[:-1])

    def __iter__(self):
        """Iterate over (and consume) the complete lines, via ``lines()``."""
        return self.lines()

    def __len__(self):
        """Return the number of bytes still held in the buffer."""
        return len(self.buffer)
class DecodingLineBuffer(LineBuffer):
    r"""
    Like LineBuffer, but decode the output (default assumes UTF-8).

    >>> utf8_word = b'Ol\xc3\xa9'
    >>> b = DecodingLineBuffer()
    >>> b.feed(b'bar\r\nbaz\n' + utf8_word + b'\n')
    >>> list(b.lines()) == ['bar', 'baz', utf8_word.decode('utf-8')]
    True
    >>> len(b)
    0

    Each line is first decoded with the ``encoding`` and ``errors``
    attributes. Some clients will feed latin-1 or other encodings; when a
    line cannot be decoded that way, it is handed to UnicodeDammit with
    latin-1 hints instead of letting the UnicodeDecodeError propagate.

    If you would rather keep the configured encoding and have undecodable
    bytes replaced with U+FFFD, set errors='replace':

    >>> b = DecodingLineBuffer()
    >>> b.errors = 'replace'
    >>> b.feed(b'Ol\xe9\n')
    >>> list(b.lines()) == ['Ol\ufffd']
    True
    """

    # Codec and error handler passed to bytes.decode() for every line.
    encoding = 'utf-8'
    errors = 'strict'

    def lines(self):
        """
        Return an iterator over the complete buffered lines, decoded to text.

        The underlying buffer is consumed when this method is called;
        decoding of each line happens lazily as the iterator is advanced.
        """
        # Call the parent eagerly so the buffer is drained at call time,
        # exactly as with LineBuffer.lines().
        raw_lines = super(DecodingLineBuffer, self).lines()
        return (self._decode(line) for line in raw_lines)

    def _decode(self, line):
        """Decode one line, guessing latin-1 if the configured codec fails."""
        try:
            return line.decode(self.encoding, self.errors)
        except UnicodeDecodeError:
            # Legacy clients may send latin-1; fall back to a best-effort
            # guess rather than failing on the line.
            return UnicodeDammit(line, ["iso-8859-1", "latin-1"]).unicode_markup