Skip to content

Commit fcb1090

Browse files
committed
Feature: added support for processing PDFs stored in Google Drive
1 parent 579aa4a commit fcb1090

File tree

7 files changed

+76
-10
lines changed

7 files changed

+76
-10
lines changed

app/data_source/api/basic_document.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
from dataclasses import dataclass
33
from enum import Enum
44
from typing import Union, List
5+
from langchain.schema import Document as PDFDocument
56

67

78
class DocumentType(Enum):
@@ -24,6 +25,7 @@ class FileType(Enum):
2425
DOCX = "docx"
2526
PPTX = "pptx"
2627
TXT = "txt"
28+
PDF = "pdf"
2729

2830
@classmethod
2931
def from_mime_type(cls, mime_type: str):
@@ -35,6 +37,8 @@ def from_mime_type(cls, mime_type: str):
3537
return cls.PPTX
3638
elif mime_type == 'text/plain':
3739
return cls.TXT
40+
elif mime_type == 'application/pdf':
41+
return cls.PDF
3842
else:
3943
return None
4044

@@ -45,7 +49,7 @@ class BasicDocument:
4549
data_source_id: int # data source id in database
4650
type: DocumentType
4751
title: str
48-
content: str
52+
content: Union[str, List[PDFDocument]]
4953
timestamp: datetime
5054
author: str
5155
author_image_url: str

app/data_source/sources/google_drive/google_drive.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
from parsers.docx import docx_to_html
1919
from parsers.html import html_to_text
2020
from parsers.pptx import pptx_to_text
21+
from parsers.pdf import pdf_to_textV2
2122
from queues.index_queue import IndexQueue
2223

2324
logger = logging.getLogger(__name__)
@@ -33,6 +34,7 @@ class GoogleDriveDataSource(BaseDataSource):
3334
'application/vnd.openxmlformats-officedocument.wordprocessingml.document': lambda content: html_to_text(
3435
docx_to_html(content)),
3536
'application/vnd.openxmlformats-officedocument.presentationml.presentation': pptx_to_text,
37+
'application/pdf': pdf_to_textV2
3638
}
3739

3840
@staticmethod
@@ -68,7 +70,8 @@ def __init__(self, *args, **kwargs):
6870
self._supported_mime_types = [
6971
'application/vnd.google-apps.document',
7072
'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
71-
'application/vnd.openxmlformats-officedocument.presentationml.presentation'
73+
'application/vnd.openxmlformats-officedocument.presentationml.presentation',
74+
'application/pdf'
7275
]
7376

7477
def _should_index_file(self, file):
@@ -159,6 +162,8 @@ def _feed_file(self, file):
159162
elif file['mimeType'] == 'application/vnd.openxmlformats-officedocument.wordprocessingml.document':
160163
content = docx_to_html(file_to_download)
161164
content = html_to_text(content)
165+
elif file['mimeType'] == 'application/pdf':
166+
content = pdf_to_textV2(file_to_download)
162167
else:
163168
logger.error(f'Unsupported mime type {file["mimeType"]}')
164169
return
@@ -181,7 +186,6 @@ def _feed_file(self, file):
181186

182187
# title is file name without extension
183188
title = file['name'].split('.')[0]
184-
185189
doc = BasicDocument(
186190
id=file_id,
187191
data_source_id=self._data_source_id,

app/indexing/index_documents.py

Lines changed: 34 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,15 @@
33
from enum import Enum
44
from typing import List, Optional
55

6-
from data_source.api.basic_document import BasicDocument
6+
from data_source.api.basic_document import BasicDocument, FileType
77
from db_engine import Session
88
from indexing.bm25_index import Bm25Index
99
from indexing.faiss_index import FaissIndex
1010
from models import bi_encoder
1111
from paths import IS_IN_DOCKER
1212
from schemas import Document, Paragraph
13+
from langchain.schema import Document as PDFDocument
14+
1315

1416
logger = logging.getLogger(__name__)
1517

@@ -25,7 +27,10 @@ class Indexer:
2527

2628
@staticmethod
2729
def basic_to_document(document: BasicDocument, parent: Document = None) -> Document:
28-
paragraphs = Indexer._split_into_paragraphs(document.content)
30+
if document.file_type != FileType.PDF:
31+
paragraphs = Indexer._split_into_paragraphs(document.content)
32+
else:
33+
paragraphs = Indexer._split_PDF_into_paragraphs(document.content)
2934
return Document(
3035
data_source_id=document.data_source_id,
3136
id_in_data_source=document.id_in_data_source,
@@ -68,7 +73,10 @@ def index_documents(documents: List[BasicDocument]):
6873
db_documents = []
6974
for document in documents:
7075
# Split the content into paragraphs that fit inside the database
71-
paragraphs = Indexer._split_into_paragraphs(document.content)
76+
if document.file_type != FileType.PDF:
77+
paragraphs = Indexer._split_into_paragraphs(document.content)
78+
else:
79+
paragraphs = Indexer._split_PDF_into_paragraphs(document.content)
7280
# Create a new document in the database
7381
db_document = Indexer.basic_to_document(document)
7482
children = []
@@ -128,7 +136,6 @@ def _split_into_paragraphs(text, minimum_length=256):
128136

129137
if len(current_paragraph) > 0:
130138
paragraphs.append(current_paragraph)
131-
132139
return paragraphs
133140

134141
@staticmethod
@@ -155,3 +162,26 @@ def remove_documents(documents: List[Document], session=None):
155162
Bm25Index.get().update(session=session)
156163

157164
logger.info(f"Finished removing {len(documents)} documents => {len(db_paragraphs)} paragraphs")
165+
166+
167+
@staticmethod
def _split_PDF_into_paragraphs(texts: "List[PDFDocument]", minimum_length=256):
    """Merge PDF text chunks into paragraphs longer than ``minimum_length``.

    Each chunk's ``page_content`` is stripped and appended to the paragraph
    in progress, separated by a single space. As soon as the paragraph in
    progress is longer than ``minimum_length`` characters it is emitted and
    a new one is started. Whatever is left over at the end is emitted as a
    final (possibly shorter) paragraph.

    Args:
        texts: Chunks exposing a ``page_content`` attribute, or ``None``.
        minimum_length: Length a paragraph must exceed before it is emitted.

    Returns:
        The list of paragraph strings; empty when ``texts`` is ``None`` or
        contains no text.
    """
    if texts is None:
        return []

    paragraphs = []
    pending = []        # stripped chunks collected for the paragraph in progress
    pending_length = 0  # length of ' '.join(pending), tracked incrementally

    for text in texts:
        chunk = (text.page_content or '').strip()
        if not chunk:
            # A blank (or missing) chunk carries no text. Skipping it keeps a
            # dangling separator space out of the paragraph and stops that
            # space from pushing the paragraph over the length threshold.
            continue

        if pending:
            pending_length += 1  # the joining space
        pending.append(chunk)
        pending_length += len(chunk)

        if pending_length > minimum_length:
            paragraphs.append(' '.join(pending))
            pending = []
            pending_length = 0

    if pending:
        paragraphs.append(' '.join(pending))
    return paragraphs
185+
186+
187+

app/parsers/pdf.py

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,22 @@
11
from PyPDF2 import PdfReader
2-
2+
from typing import List
3+
from langchain.document_loaders import PyPDFLoader
4+
from langchain.schema import Document
5+
from langchain.text_splitter import CharacterTextSplitter
36
def pdf_to_text(input_filename: str) -> str:
47
pdf_file = PdfReader(input_filename)
58
text=''
69

710
for page in pdf_file.pages:
811
text = text + page.extract_text()
912

10-
return text
13+
return text
14+
15+
16+
def pdf_to_textV2(input_filename: str, chunk_size: int = 100, chunk_overlap: int = 0) -> List[Document]:
    """Load a PDF with langchain and split it into ``Document`` chunks.

    Args:
        input_filename: Path of the PDF file handed to ``PyPDFLoader``.
        chunk_size: ``chunk_size`` passed to ``CharacterTextSplitter``.
            Defaults to 100, the value that was previously hard-coded.
        chunk_overlap: ``chunk_overlap`` passed to ``CharacterTextSplitter``.
            Defaults to 0, the value that was previously hard-coded.

    Returns:
        The documents produced by ``CharacterTextSplitter.split_documents``
        for the loaded PDF.
    """
    loaded_documents = PyPDFLoader(input_filename).load()
    splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    return splitter.split_documents(loaded_documents)

app/requirements.txt

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,4 +26,14 @@ persistqueue
2626
retry
2727
PyPDF2
2828
pytz
29-
aiosqlite
29+
aiosqlite
30+
starlette
31+
torch
32+
langchain~=0.0.141
33+
nltk
34+
numpy
35+
requests
36+
python-dateutil
37+
httplib2
38+
pypdf
39+
pycryptodome

ui/src/assets/images/pdf.svg

Lines changed: 1 addition & 0 deletions
Loading

ui/src/components/search-result.tsx

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import { Img } from 'react-image'
55
import PurpleFolder from '../assets/images/pur-dir.svg';
66
import GoogleDoc from '../assets/images/google-doc.svg';
77
import Docx from '../assets/images/docx.svg';
8+
import Pdf from '../assets/images/pdf.svg';
89
import Pptx from '../assets/images/pptx.svg';
910
import DefaultUserImage from '../assets/images/user.webp';
1011
import Calendar from '../assets/images/calendar.svg';
@@ -32,6 +33,7 @@ export enum FileType {
3233
Docx = "docx",
3334
Pptx = "pptx",
3435
GoogleDoc = "doc",
36+
Pdf = "pdf",
3537
}
3638

3739
export interface SearchResultDetails {
@@ -242,6 +244,9 @@ function getBigIcon(props: SearchResultProps) {
242244
case FileType.GoogleDoc:
243245
containingImage = GoogleDoc;
244246
break;
247+
case FileType.Pdf:
248+
containingImage = Pdf;
249+
break;
245250
}
246251
}
247252
break;

0 commit comments

Comments
 (0)