From b41edd9e505d2e8c44dc53c01d155ac9f2121b3d Mon Sep 17 00:00:00 2001 From: Sanchit Karve Date: Tue, 17 Nov 2015 13:25:42 -0800 Subject: [PATCH 1/2] Removed unnecessary global IMPORTS list Replaced with sys.modules https://docs.python.org/2/library/sys.html#sys.modules --- iocp.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/iocp.py b/iocp.py index 9968cc2..65f0fb2 100755 --- a/iocp.py +++ b/iocp.py @@ -47,10 +47,8 @@ import ConfigParser # Import optional third-party libraries -IMPORTS = [] try: from PyPDF2 import PdfFileReader - IMPORTS.append('pypdf2') except ImportError: pass try: @@ -59,17 +57,14 @@ from pdfminer.converter import TextConverter from pdfminer.pdfinterp import PDFPageInterpreter from pdfminer.layout import LAParams - IMPORTS.append('pdfminer') except ImportError: pass try: from bs4 import BeautifulSoup - IMPORTS.append('beautifulsoup') except ImportError: pass try: import requests - IMPORTS.append('requests') except ImportError: pass @@ -104,11 +99,11 @@ def __init__(self, patterns_ini=None, input_format='pdf', dedup=False, library=' self.library = library if input_format == 'pdf': - if library not in IMPORTS: + if library not in sys.modules: e = 'Selected PDF parser library not found: %s' % (library) raise ImportError(e) elif input_format == 'html': - if 'beautifulsoup' not in IMPORTS: + if 'beautifulsoup' not in sys.modules: e = 'HTML parser library not found: BeautifulSoup' raise ImportError(e) @@ -269,7 +264,7 @@ def parse_html(self, f, fpath): def parse(self, path): try: if path.startswith('http://') or path.startswith('https://'): - if 'requests' not in IMPORTS: + if 'requests' not in sys.modules: e = 'HTTP library not found: requests' raise ImportError(e) headers = { 'User-Agent': 'Mozilla/5.0 Gecko Firefox' } From 432c1b79498198a420ba9c5a1be43bc10a316c55 Mon Sep 17 00:00:00 2001 From: Sanchit Karve Date: Tue, 17 Nov 2015 13:32:15 -0800 Subject: [PATCH 2/2] Added quoting for non-numeric data Fixes CSV 
reading issues (with appropriate quotes) that arise when values (such as filenames) contain spaces. --- output.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/output.py b/output.py index d71c92c..0503774 100644 --- a/output.py +++ b/output.py @@ -31,7 +31,7 @@ def print_error(self, fpath, exception): class OutputHandler_csv(OutputHandler): def __init__(self): - self.csv_writer = csv.writer(sys.stdout, delimiter = '\t') + self.csv_writer = csv.writer(sys.stdout, delimiter = '\t', quoting=csv.QUOTE_NONNUMERIC) def print_match(self, fpath, page, name, match): self.csv_writer.writerow((fpath, page, name, match))