From 1cb10e2c2687626205aa7155d523e8866e33af4c Mon Sep 17 00:00:00 2001 From: Ajda Pretnar Date: Fri, 7 Jun 2019 11:54:53 +0200 Subject: [PATCH] Import Documents sanitizes pdfs --- orangecontrib/text/import_documents.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/orangecontrib/text/import_documents.py b/orangecontrib/text/import_documents.py index 840a66366..a08c15c03 100644 --- a/orangecontrib/text/import_documents.py +++ b/orangecontrib/text/import_documents.py @@ -138,8 +138,7 @@ def read_file(self): for lt_obj in layout: if isinstance(lt_obj, LTTextBox) or isinstance(lt_obj, LTTextLine): extracted_text.append(lt_obj.get_text()) - self.content = ' '.join(extracted_text) - + self.content = ' '.join(extracted_text).replace('\x00', '') class XmlReader(Reader): ext = [".xml"]