Feature: added support for processing PDFs stored in Google Drive

mahmoodfathy · mahmoodfathy · commit fcb1090d7828 · 2023-04-18T10:27:19.000+02:00
diff --git a/app/data_source/api/basic_document.py b/app/data_source/api/basic_document.py
@@ -2,6 +2,7 @@
 from dataclasses import dataclass
 from enum import Enum
 from typing import Union, List
+from langchain.schema import Document as PDFDocument
 
 
 class DocumentType(Enum):
@@ -24,6 +25,7 @@ class FileType(Enum):
     DOCX = "docx"
     PPTX = "pptx"
     TXT = "txt"
+    PDF = "pdf"
 
     @classmethod
     def from_mime_type(cls, mime_type: str):
@@ -35,6 +37,8 @@ def from_mime_type(cls, mime_type: str):
             return cls.PPTX
         elif mime_type == 'text/plain':
             return cls.TXT
+        elif mime_type == 'application/pdf':
+            return cls.PDF
         else:
             return None
 
@@ -45,7 +49,7 @@ class BasicDocument:
     data_source_id: int  # data source id in database
     type: DocumentType
     title: str
-    content: str
+    content: Union[str, List[PDFDocument]]
     timestamp: datetime
     author: str
     author_image_url: str
diff --git a/app/data_source/sources/google_drive/google_drive.py b/app/data_source/sources/google_drive/google_drive.py
@@ -18,6 +18,7 @@
 from parsers.docx import docx_to_html
 from parsers.html import html_to_text
 from parsers.pptx import pptx_to_text
+from parsers.pdf import pdf_to_textV2
 from queues.index_queue import IndexQueue
 
 logger = logging.getLogger(__name__)
@@ -33,6 +34,7 @@ class GoogleDriveDataSource(BaseDataSource):
         'application/vnd.openxmlformats-officedocument.wordprocessingml.document': lambda content: html_to_text(
             docx_to_html(content)),
         'application/vnd.openxmlformats-officedocument.presentationml.presentation': pptx_to_text,
+        'application/pdf': pdf_to_textV2
     }
 
     @staticmethod
@@ -68,7 +70,8 @@ def __init__(self, *args, **kwargs):
         self._supported_mime_types = [
             'application/vnd.google-apps.document',
             'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
-            'application/vnd.openxmlformats-officedocument.presentationml.presentation'
+            'application/vnd.openxmlformats-officedocument.presentationml.presentation',
+            'application/pdf'
         ]
 
     def _should_index_file(self, file):
@@ -159,6 +162,8 @@ def _feed_file(self, file):
                 elif file['mimeType'] == 'application/vnd.openxmlformats-officedocument.wordprocessingml.document':
                     content = docx_to_html(file_to_download)
                     content = html_to_text(content)
+                elif file['mimeType'] == 'application/pdf':
+                    content = pdf_to_textV2(file_to_download)
                 else:
                     logger.error(f'Unsupported mime type {file["mimeType"]}')
                     return
@@ -181,7 +186,6 @@ def _feed_file(self, file):
 
         # title is file name without extension
         title = file['name'].split('.')[0]
-
         doc = BasicDocument(
             id=file_id,
             data_source_id=self._data_source_id,
diff --git a/app/indexing/index_documents.py b/app/indexing/index_documents.py
@@ -3,13 +3,15 @@
 from enum import Enum
 from typing import List, Optional
 
-from data_source.api.basic_document import BasicDocument
+from data_source.api.basic_document import BasicDocument, FileType
 from db_engine import Session
 from indexing.bm25_index import Bm25Index
 from indexing.faiss_index import FaissIndex
 from models import bi_encoder
 from paths import IS_IN_DOCKER
 from schemas import Document, Paragraph
+from langchain.schema import Document as PDFDocument
+
 
 logger = logging.getLogger(__name__)
 
@@ -25,7 +27,10 @@ class Indexer:
 
     @staticmethod
     def basic_to_document(document: BasicDocument, parent: Document = None) -> Document:
-        paragraphs = Indexer._split_into_paragraphs(document.content)
+        if document.file_type != FileType.PDF:
+            paragraphs = Indexer._split_into_paragraphs(document.content)
+        else:
+            paragraphs = Indexer._split_PDF_into_paragraphs(document.content)
         return Document(
             data_source_id=document.data_source_id,
             id_in_data_source=document.id_in_data_source,
@@ -68,7 +73,10 @@ def index_documents(documents: List[BasicDocument]):
             db_documents = []
             for document in documents:
                 # Split the content into paragraphs that fit inside the database
-                paragraphs = Indexer._split_into_paragraphs(document.content)
+                if document.file_type != FileType.PDF:
+                    paragraphs = Indexer._split_into_paragraphs(document.content)
+                else:
+                    paragraphs = Indexer._split_PDF_into_paragraphs(document.content)
                 # Create a new document in the database
                 db_document = Indexer.basic_to_document(document)
                 children = []
@@ -128,7 +136,6 @@ def _split_into_paragraphs(text, minimum_length=256):
 
         if len(current_paragraph) > 0:
             paragraphs.append(current_paragraph)
-
         return paragraphs
 
     @staticmethod
@@ -155,3 +162,26 @@ def remove_documents(documents: List[Document], session=None):
         Bm25Index.get().update(session=session)
 
         logger.info(f"Finished removing {len(documents)} documents => {len(db_paragraphs)} paragraphs")
+
+
+    @staticmethod
+    def _split_PDF_into_paragraphs(texts:List[PDFDocument],minimum_length=256):
+        if texts is None:
+            return []
+        paragraphs= []
+        current_paragraph = ''
+        for text in texts:
+            paragraph = text.page_content
+            if len(current_paragraph) > 0:
+                current_paragraph += ' '
+            current_paragraph += paragraph.strip()
+            if len(current_paragraph) > minimum_length:
+                paragraphs.append(current_paragraph)
+                current_paragraph = ''
+
+        if len(current_paragraph) > 0:
+            paragraphs.append(current_paragraph)
+        return paragraphs
+
+
+
diff --git a/app/parsers/pdf.py b/app/parsers/pdf.py
@@ -1,10 +1,22 @@
 from PyPDF2 import PdfReader
-
+from typing import List
+from langchain.document_loaders import PyPDFLoader
+from langchain.schema import Document
+from langchain.text_splitter import CharacterTextSplitter
 def pdf_to_text(input_filename: str) -> str:
 	pdf_file = PdfReader(input_filename)
 	text=''
 	
 	for page in pdf_file.pages:
 		text = text + page.extract_text()
 	
-	return text
+	return text
+
+
+def pdf_to_textV2(input_filename: str) -> List[Document]:
+	loader = PyPDFLoader(input_filename)
+	documents = loader.load()
+	text_split = CharacterTextSplitter(chunk_size=100, chunk_overlap=0)
+	texts = text_split.split_documents(documents)
+
+	return texts
diff --git a/app/requirements.txt b/app/requirements.txt
@@ -26,4 +26,14 @@ persistqueue
 retry
 PyPDF2
 pytz
-aiosqlite
+aiosqlite
+starlette
+torch
+langchain~=0.0.141
+nltk
+numpy
+requests
+python-dateutil
+httplib2
+pypdf
+pycryptodome
diff --git a/ui/src/assets/images/pdf.svg b/ui/src/assets/images/pdf.svg
@@ -0,0 +1 @@
+<svg xmlns="http://www.w3.org/2000/svg" aria-label="PDF" viewBox="0 0 512 512" id="pdf"><rect width="512" height="512" fill="#c80a0a" rx="15%"></rect><path fill="#fff" d="M413 302c-9-10-29-15-56-15-16 0-33 2-53 5a252 252 0 0 1-52-69c10-30 17-59 17-81 0-17-6-44-30-44-7 0-13 4-17 10-10 18-6 58 13 100a898 898 0 0 1-50 117c-53 22-88 46-91 65-2 9 4 24 25 24 31 0 65-45 91-91a626 626 0 0 1 92-24c38 33 71 38 87 38 32 0 35-23 24-35zM227 111c8-12 26-8 26 16 0 16-5 42-15 72-18-42-18-75-11-88zM100 391c3-16 33-38 80-57-26 44-52 72-68 72-10 0-13-9-12-15zm197-98a574 574 0 0 0-83 22 453 453 0 0 0 36-84 327 327 0 0 0 47 62zm13 4c32-5 59-4 71-2 29 6 19 41-13 33-23-5-42-18-58-31z"></path></svg>
diff --git a/ui/src/components/search-result.tsx b/ui/src/components/search-result.tsx
@@ -5,6 +5,7 @@ import { Img } from 'react-image'
 import PurpleFolder from '../assets/images/pur-dir.svg';
 import GoogleDoc from '../assets/images/google-doc.svg';
 import Docx from '../assets/images/docx.svg';
+import Pdf from '../assets/images/pdf.svg';
 import Pptx from '../assets/images/pptx.svg';
 import DefaultUserImage from '../assets/images/user.webp';
 import Calendar from '../assets/images/calendar.svg';
@@ -32,6 +33,7 @@ export enum FileType {
     Docx = "docx",
     Pptx = "pptx",
     GoogleDoc = "doc",
+    Pdf = "pdf",
 }
 
 export interface SearchResultDetails {
@@ -242,6 +244,9 @@ function getBigIcon(props: SearchResultProps) {
                     case FileType.GoogleDoc:
                         containingImage = GoogleDoc;
                         break;
+                    case FileType.Pdf:
+                        containingImage = Pdf;
+                        break;
                 }
             }
             break;