diff --git a/warc/arc.py b/warc/arc.py index 5889587..fcc986e 100644 --- a/warc/arc.py +++ b/warc/arc.py @@ -4,13 +4,18 @@ :copyright: (c) 2012 Internet Archive """ -import __builtin__ import datetime import os import re -import StringIO import warnings +try: + import __builtin__ + from StringIO import StringIO +except ImportError: + import builtins as __builtin__ + from io import StringIO + from .utils import CaseInsensitiveDict ARC1_HEADER_RE = re.compile('(?P\S*)\s(?P\S*)\s(?P\S*)\s(?P\S*)\s(?P\S*)') @@ -135,7 +140,7 @@ def length(self): return int(self["length"]) def __str__(self): - f = StringIO.StringIO() + f = StringIO() self.write_to(f) return f.getvalue() @@ -184,6 +189,8 @@ def write_to(self, f, version = None): f.write("\n") # This separates the header and the body if isinstance(self.payload, str): #Usually used for small payloads f.write(self.payload) + elif isinstance(self.payload, bytes): + f.write(self.payload.decode('utf-8')) elif hasattr(self.payload, "read"): #Used for large payloads where we give a file like object chunk_size = 10 * 1024 * 1024 # Read 10MB by 10MB d = self.payload.read(chunk_size) @@ -200,7 +207,7 @@ def __setitem__(self, name, value): def __str__(self): - f = StringIO.StringIO() + f = StringIO() self.write_to(f) return f.getvalue() @@ -318,15 +325,19 @@ def _read_file_header(self): # print "--------------------------------------------------" if self.version and int(self.version) != version: raise IOError("Version mismatch. Requested version was '%s' but version in file was '%s'"%(self.version, version)) - - if version == '1': + + if int(version) == 1: url, ip_address, date, content_type, length = header.split() + if isinstance(date, bytes): + date = date.decode('utf-8') self.file_headers = {"ip_address" : ip_address, "date" : datetime.datetime.strptime(date, "%Y%m%d%H%M%S"), "org" : organisation} self.version = 1 - elif version == '2': + elif int(version) == 2: url, ip_address, date, content_type, result_code, checksum, location, offset, filename, length = header.split() + if isinstance(date, bytes): + date = date.decode('utf-8') self.file_headers = {"ip_address" : ip_address, "date" : datetime.datetime.strptime(date, "%Y%m%d%H%M%S"), "org" : organisation} @@ -355,6 +366,8 @@ def _read_arc_record(self): elif int(self.version) == 2: arc_header_re = ARC2_HEADER_RE + if isinstance(header, bytes): + header = header.decode('utf-8') matches = arc_header_re.search(header) headers = matches.groupdict() arc_header = ARCHeader(**headers) diff --git a/warc/gzip2.py b/warc/gzip2.py index fcd6b48..2cf2b1e 100644 --- a/warc/gzip2.py +++ b/warc/gzip2.py @@ -24,18 +24,19 @@ class GzipFile(BaseGzipFile): """ def __init__(self, filename=None, mode=None, compresslevel=9, fileobj=None): - BaseGzipFile.__init__(self, - filename=filename, + BaseGzipFile.__init__(self, + filename=filename, mode=mode, compresslevel=compresslevel, fileobj=fileobj) - + if self.mode == WRITE: # Indicates the start of a new member if value is True. # The BaseGzipFile constructor already wrote the header for new # member, so marking as False. self._new_member = False - + if not hasattr(self, '_new_member'): + self._new_member = True # When _member_lock is True, only one member in gzip file is read self._member_lock = False @@ -49,7 +50,7 @@ def close_member(self): self.fileobj.write(self.compress.flush()) write32u(self.fileobj, self.crc) # self.size may exceed 2GB, or even 4GB - write32u(self.fileobj, self.size & 0xffffffffL) + write32u(self.fileobj, self.size & 0xffffffff) self.size = 0 self.compress = zlib.compressobj(9, zlib.DEFLATED, @@ -95,6 +96,8 @@ def _read(self, size): def read_member(self): """Returns a file-like object to read one member from the gzip file. """ + if hasattr(self, '_buffer'): + return self._buffer if self._member_lock is False: self._member_lock = True diff --git a/warc/tests/test_arc.py b/warc/tests/test_arc.py index 11305e5..1d04bb1 100644 --- a/warc/tests/test_arc.py +++ b/warc/tests/test_arc.py @@ -1,6 +1,9 @@ import datetime import hashlib -import StringIO +try: + from StringIO import StringIO +except ImportError: + from io import StringIO from .. import arc @@ -49,7 +52,7 @@ def test_arc_v1_header_creation(): location = "http://www.archive.org", offset = "300", filename = "sample.arc.gz") - f = StringIO.StringIO() + f = StringIO() header.write_to(f, 1) header_v1_string = f.getvalue() assert header_v1_string == "http://archive.org 127.0.0.1 20120301093000 text/html 500" @@ -67,7 +70,7 @@ def test_arc_v2_header_creation(): location = "http://www.archive.org", offset = "300", filename = "sample.arc.gz") - f = StringIO.StringIO() + f = StringIO() header.write_to(f) header_v2_string = f.getvalue() assert header_v2_string == "http://archive.org 127.0.0.1 20120301093000 text/html 200 a123456 http://www.archive.org 300 sample.arc.gz 500" @@ -86,7 +89,7 @@ def test_arc_v1_record_creation(): offset = "300", filename = "sample.arc.gz") record_v1 = arc.ARCRecord(header, "BlahBlah") - f = StringIO.StringIO() + f = StringIO() record_v1.write_to(f, 1) record_v1_string = f.getvalue() assert record_v1_string == "http://archive.org 127.0.0.1 20120301093000 text/html 500\nBlahBlah\n" @@ -104,7 +107,7 @@ def test_arc_v2_record_creation(): offset = "300", filename = "sample.arc.gz") record_v2 = arc.ARCRecord(payload = "BlahBlah", headers = header) - f = StringIO.StringIO() + f = StringIO() record_v2.write_to(f) record_v2_string = f.getvalue() assert record_v2_string == "http://archive.org 127.0.0.1 20120301093000 text/html 200 a123456 http://www.archive.org 300 sample.arc.gz 500\nBlahBlah\n" @@ -116,7 +119,7 @@ def test_arc_v1_writer(): date = now, org = "Internet Archive") - opfile = StringIO.StringIO() + opfile = StringIO() opfile.name = "sample.arc" # Necessary since only file objects in Python have names. f = arc.ARCFile(fileobj = opfile, version = 1, file_headers = file_headers) @@ -137,7 +140,7 @@ def test_arc1_v1_writer_default_headers(): now = datetime.datetime(year = 2012, month = 3, day = 2, hour = 19, minute = 32, second = 10) file_headers = dict(date = now) - opfile = StringIO.StringIO() + opfile = StringIO() opfile.name = "sample.arc" # Necessary since only file objects in Python have names. f = arc.ARCFile(fileobj = opfile, version = 1, file_headers = file_headers) @@ -161,7 +164,7 @@ def test_arc_v2_writer(): date = now, org = "Internet Archive") - opfile = StringIO.StringIO() + opfile = StringIO() opfile.name = "sample.arc" # Necessary since only file objects in Python have names. f = arc.ARCFile(fileobj = opfile, file_headers = file_headers) @@ -183,8 +186,8 @@ def test_arc_v2_writer(): def test_arc_reader_guess_version(): "Make sure that the ARCFile object automatically detects the file version" - v1 = StringIO.StringIO("filedesc://sample.arc 127.0.0.1 20120302193210 text/plain 68\n1 0 Unknown\nURL IP-address Archive-date Content-type Archive-length\n\n\nhttp://www.archive.org 127.0.0.1 20120302193210 text/html 8\n\nPayload1\nhttp://www.archive.org 127.0.0.1 20120302193210 text/html 8\n\nPayload2") - v2 = StringIO.StringIO("filedesc://sample.arc 127.0.0.1 20120302193210 text/plain 200 - - 0 sample.arc 114\n2 0 Internet Archive\nURL IP-address Archive-date Content-type Result-code Checksum Location Offset Filename Archive-length\n\n\nhttp://archive.org 127.0.0.1 20120301093000 text/html 200 a123456 http://www.archive.org 300 sample.arc.gz 500\n\nPayload1\nhttp://archive.org 127.0.0.1 20120301093000 text/html 200 a123456 http://www.archive.org 300 sample.arc.gz 500\n\nPayload2") + v1 = StringIO("filedesc://sample.arc 127.0.0.1 20120302193210 text/plain 68\n1 0 Unknown\nURL IP-address Archive-date Content-type Archive-length\n\n\nhttp://www.archive.org 127.0.0.1 20120302193210 text/html 8\n\nPayload1\nhttp://www.archive.org 127.0.0.1 20120302193210 text/html 8\n\nPayload2") + v2 = StringIO("filedesc://sample.arc 127.0.0.1 20120302193210 text/plain 200 - - 0 sample.arc 114\n2 0 Internet Archive\nURL IP-address Archive-date Content-type Result-code Checksum Location Offset Filename Archive-length\n\n\nhttp://archive.org 127.0.0.1 20120301093000 text/html 200 a123456 http://www.archive.org 300 sample.arc.gz 500\n\nPayload1\nhttp://archive.org 127.0.0.1 20120301093000 text/html 200 a123456 http://www.archive.org 300 sample.arc.gz 500\n\nPayload2") arc_v1 = arc.ARCFile(fileobj = v1) arc_v2 = arc.ARCFile(fileobj = v2) @@ -197,7 +200,7 @@ def test_arc_reader_guess_version(): def test_arc_reader_read_file_headers(): "Make sure that the parser is reading file headers properly" - ip = StringIO.StringIO("filedesc://sample.arc 127.0.0.1 20120302193210 text/plain 200 - - 0 sample.arc 114\n2 0 Internet Archive\nURL IP-address Archive-date Content-type Result-code Checksum Location Offset Filename Archive-length\n\n\nhttp://archive.org 127.0.0.1 20120301093000 text/html 200 a123456 http://www.archive.org 300 sample.arc.gz 500\n\nPayload1\nhttp://archive.org 127.0.0.1 20120301093000 text/html 200 a123456 http://www.archive.org 300 sample.arc.gz 500\n\nPayload2") + ip = StringIO("filedesc://sample.arc 127.0.0.1 20120302193210 text/plain 200 - - 0 sample.arc 114\n2 0 Internet Archive\nURL IP-address Archive-date Content-type Result-code Checksum Location Offset Filename Archive-length\n\n\nhttp://archive.org 127.0.0.1 20120301093000 text/html 200 a123456 http://www.archive.org 300 sample.arc.gz 500\n\nPayload1\nhttp://archive.org 127.0.0.1 20120301093000 text/html 200 a123456 http://www.archive.org 300 sample.arc.gz 500\n\nPayload2") arc_file = arc.ARCFile(fileobj = ip) arc_file.read() arc_file.file_headers['ip_address'] == "127.0.0.1" @@ -207,7 +210,7 @@ def test_arc_reader_read_file_headers(): def test_arc_reader_v1(): "Make sure that the parser reads out V1 ARC records. (Also tests iterator behaviour)" - v1 = StringIO.StringIO("filedesc://sample.arc 127.0.0.1 20120302193210 text/plain 68\n1 0 Unknown\nURL IP-address Archive-date Content-type Archive-length\n\n\nhttp://www.archive.org 127.0.0.1 20120302193210 text/html 8\nPayload1\nhttp://archive.org 127.0.0.1 20120302193211 text/plain 8\nPayload2") + v1 = StringIO("filedesc://sample.arc 127.0.0.1 20120302193210 text/plain 68\n1 0 Unknown\nURL IP-address Archive-date Content-type Archive-length\n\n\nhttp://www.archive.org 127.0.0.1 20120302193210 text/html 8\nPayload1\nhttp://archive.org 127.0.0.1 20120302193211 text/plain 8\nPayload2") arc_file = arc.ARCFile(fileobj = v1) r1 = arc_file.read() @@ -230,7 +233,7 @@ def test_arc_reader_v1(): def test_arc_reader_v2(): "Make sure that the parser reads out V2 ARC records. (Also tests iterator behaviour)" - v2 = StringIO.StringIO("filedesc://sample.arc 127.0.0.1 20120302193210 text/plain 200 - - 0 sample.arc 114\n2 0 Internet Archive\nURL IP-address Archive-date Content-type Result-code Checksum Location Offset Filename Archive-length\n\n\nhttp://archive.org 127.0.0.1 20120301093000 text/html 200 a123456 http://www.archive.org 300 sample.arc.gz 8\nPayload1\nhttp://archive.org 127.0.0.1 20120301093000 text/html 200 a123456 http://www.archive.org 300 sample.arc.gz 8\nPayload2") + v2 = StringIO("filedesc://sample.arc 127.0.0.1 20120302193210 text/plain 200 - - 0 sample.arc 114\n2 0 Internet Archive\nURL IP-address Archive-date Content-type Result-code Checksum Location Offset Filename Archive-length\n\n\nhttp://archive.org 127.0.0.1 20120301093000 text/html 200 a123456 http://www.archive.org 300 sample.arc.gz 8\nPayload1\nhttp://archive.org 127.0.0.1 20120301093000 text/html 200 a123456 http://www.archive.org 300 sample.arc.gz 8\nPayload2") arc_file = arc.ARCFile(fileobj = v2) r1, r2 = list(arc_file) @@ -288,12 +291,12 @@ def test_arc_record_versions(): filename = "sample.arc.gz") record_1 = arc.ARCRecord(payload = "BlahBlah", headers = header, version = 1) record_2 = arc.ARCRecord(payload = "BlahBlah", headers = header, version = 2) - f = StringIO.StringIO() + f = StringIO() record_1.write_to(f) record_string = f.getvalue() assert record_string == "http://archive.org 127.0.0.1 20120301093000 text/html 500\nBlahBlah\n" - f = StringIO.StringIO() + f = StringIO() record_2.write_to(f) record_string = f.getvalue() assert record_string == "http://archive.org 127.0.0.1 20120301093000 text/html 200 a123456 http://www.archive.org 300 sample.arc.gz 500\nBlahBlah\n" diff --git a/warc/tests/test_common.py b/warc/tests/test_common.py index d2c2353..743e737 100644 --- a/warc/tests/test_common.py +++ b/warc/tests/test_common.py @@ -38,7 +38,3 @@ def test_sample_data(): expected = """http://www.killerjo.net:80/robots.txt 211.111.217.29 20110804181142 39 SSH-2.0-OpenSSH_5.3p1 Debian-3ubuntu3\r\n\n""" assert record == expected - - - - diff --git a/warc/tests/test_utils.py b/warc/tests/test_utils.py index c155e6e..652b20c 100644 --- a/warc/tests/test_utils.py +++ b/warc/tests/test_utils.py @@ -1,5 +1,8 @@ from ..utils import FilePart, CaseInsensitiveDict -from cStringIO import StringIO +try: + from cStringIO import StringIO +except ImportError: + from io import StringIO class TestCaseInsensitiveDict: def test_all(self): @@ -53,4 +56,4 @@ def test_readline(self): def test_iter(self): part = FilePart(StringIO(self.text), 11) - assert list(part) == ["aaaa\n", "bbbb\n", "c"] \ No newline at end of file + assert list(part) == ["aaaa\n", "bbbb\n", "c"] diff --git a/warc/tests/test_warc.py b/warc/tests/test_warc.py index 92545ba..d2ed9ee 100644 --- a/warc/tests/test_warc.py +++ b/warc/tests/test_warc.py @@ -1,6 +1,9 @@ from ..warc import WARCReader, WARCHeader, WARCRecord, WARCFile -from StringIO import StringIO +try: + from cStringIO import StringIO +except ImportError: + from io import StringIO class TestWARCHeader: def test_attrs(self): @@ -99,13 +102,14 @@ def test_read(self): def test_write_gz(self): """Test writing multiple member gzip file.""" - buffer = StringIO() - f = WARCFile(fileobj=buffer, mode="w", compress=True) + from io import BytesIO + buffer = BytesIO() + f = WARCFile(fileobj=buffer, mode="wb", compress=True) for i in range(10): - record = WARCRecord(payload="hello %d" % i) + record = WARCRecord(payload=b"hello %d" % i) f.write_record(record) - GZIP_MAGIC_NUMBER = '\037\213' + GZIP_MAGIC_NUMBER = b'\037\213' assert buffer.getvalue().count(GZIP_MAGIC_NUMBER) == 10 def test_long_header(self): diff --git a/warc/utils.py b/warc/utils.py index 8620e8e..8fb783b 100644 --- a/warc/utils.py +++ b/warc/utils.py @@ -7,7 +7,10 @@ :copyright: (c) 2012 Internet Archive """ -from UserDict import DictMixin +try: + from UserDict import DictMixin +except ImportError: + from collections import MutableMapping as DictMixin class CaseInsensitiveDict(DictMixin): """Almost like a dictionary, but keys are case-insensitive. @@ -23,9 +26,9 @@ class CaseInsensitiveDict(DictMixin): >>> d.keys() ["foo", "bar"] """ - def __init__(self, mapping=None, **kwargs): + def __init__(self, *args, **kwargs): self._d = {} - self.update(mapping, **kwargs) + self.update(*args, **kwargs) def __setitem__(self, name, value): self._d[name.lower()] = value @@ -38,7 +41,14 @@ def __delitem__(self, name): def __eq__(self, other): return isinstance(other, CaseInsensitiveDict) and other._d == self._d - + + def __len__(self): + return len(self._d) + + def __iter__(self): + for i in self._d: + yield i + def keys(self): return self._d.keys() diff --git a/warc/warc.py b/warc/warc.py index 0c762a6..d15d839 100644 --- a/warc/warc.py +++ b/warc/warc.py @@ -7,14 +7,19 @@ :copyright: (c) 2012 Internet Archive """ -import __builtin__ import datetime import uuid import logging import re -from cStringIO import StringIO import hashlib +try: + import __builtin__ + from cStringIO import StringIO +except ImportError: + import builtins as __builtin__ + from io import StringIO + from . import gzip2 from .utils import CaseInsensitiveDict, FilePart @@ -67,7 +72,7 @@ class WARCHeader(CaseInsensitiveDict): } def __init__(self, headers, defaults=False): - self.version = "WARC/1.0" + self.version = b"WARC/1.0" CaseInsensitiveDict.__init__(self, headers) if defaults: self.init_defaults() @@ -90,29 +95,29 @@ def init_defaults(self): def write_to(self, f): """Writes this header to a file, in the format specified by WARC. """ - f.write(self.version + "\r\n") + f.write(self.version + b"\r\n") for name, value in self.items(): name = name.title() # Use standard forms for commonly used patterns name = name.replace("Warc-", "WARC-").replace("-Ip-", "-IP-").replace("-Id", "-ID").replace("-Uri", "-URI") - f.write(name) - f.write(": ") - f.write(value) - f.write("\r\n") + f.write(name.encode('utf-8')) + f.write(b": ") + f.write(value.encode('utf-8')) + f.write(b"\r\n") # Header ends with an extra CRLF - f.write("\r\n") + f.write(b"\r\n") @property def content_length(self): """The Content-Length header as int.""" return int(self['Content-Length']) - + @property def type(self): """The value of WARC-Type header.""" return self.get('WARC-Type') - + @property def record_id(self): """The value of WARC-Record-ID header.""" @@ -124,9 +129,10 @@ def date(self): return self['WARC-Date'] def __str__(self): - f = StringIO() + from io import BytesIO + f = BytesIO() self.write_to(f) - return f.getvalue() + return f.getvalue().decode('utf-8') def __repr__(self): return "" % (self.type, self.record_id) @@ -157,10 +163,14 @@ def _compute_digest(self, payload): return "sha1:" + hashlib.sha1(payload).hexdigest() def write_to(self, f): + if isinstance(self.payload, bytes): + line_break = b"\r\n" + else: + line_break = "\r\n" self.header.write_to(f) f.write(self.payload) - f.write("\r\n") - f.write("\r\n") + f.write(line_break) + f.write(line_break) f.flush() @property @@ -325,7 +335,8 @@ def read_header(self, fileobj): version_line = fileobj.readline() if not version_line: return None - + if isinstance(version_line, bytes): + version_line = version_line.decode('utf-8') m = self.RE_VERSION.match(version_line) if not m: raise IOError("Bad version line: %r" % version_line) @@ -336,6 +347,8 @@ def read_header(self, fileobj): headers = {} while True: line = fileobj.readline() + if isinstance(line, bytes): + line = line.decode('utf-8') if line == "\r\n": # end of headers break m = self.RE_HEADER.match(line)