improve extracting PDF text #572

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open · wants to merge 1 commit into base: main · Changes from all commits
@@ -10,11 +10,8 @@
)

import numpy as np

import tiledb

# from tiledb.vector_search.object_readers import ObjectPartition, ObjectReader

import re

# Util functions for file matching and recursive listing
def match_uri(
@@ -39,7 +36,6 @@ def match_uri(
return True
return True


def find_uris_vfs(
search_uri: str,
include: str = "*",
@@ -103,7 +99,6 @@ def find(
else:
return results


def find_uris_tiledb_group(
search_uri: str,
include: str = "*",
@@ -172,7 +167,6 @@ def find(
else:
return results


def find_uris_aws(
search_uri: str,
include: str = "*",
@@ -187,8 +181,7 @@ def find_uris_aws(
import boto3
except ImportError:
raise ImportError(
"Could not import boto3 python package. "
"Please install it with `pip install boto3`."
"Could not import boto3 python package. Please install it with `pip install boto3`."
)

search_uri = search_uri.rstrip("/") + "/"
@@ -241,8 +234,7 @@ def find_uris_aws(
return results
return results

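# Hedged sketch, not part of the diff: the elided body of find_uris_aws
# presumably pages through an S3 prefix listing; the standalone helper below
# shows the standard boto3 pagination pattern such a listing relies on. The
# helper name, the bucket/prefix split, and the fnmatch filtering are
# illustrative assumptions, not the PR's actual code.
def _demo_list_s3_uris(search_uri: str, include: str = "*") -> List[str]:
    import fnmatch
    from urllib.parse import urlparse

    import boto3

    parsed = urlparse(search_uri)  # e.g. "s3://bucket/prefix/"
    bucket, prefix = parsed.netloc, parsed.path.lstrip("/")
    paginator = boto3.client("s3").get_paginator("list_objects_v2")
    results = []
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        for obj in page.get("Contents", []):
            if fnmatch.fnmatch(obj["Key"], include):
                results.append(f"s3://{bucket}/{obj['Key']}")
    return results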

# class DirectoryPartition(ObjectPartition):
# DirectoryPartition class
class DirectoryPartition:
def __init__(
self,
@@ -264,8 +256,7 @@ def init_kwargs(self) -> Dict:
def id(self) -> int:
return self.partition_id


# class DirectoryReader(ObjectReader):
# DirectoryReader class
class DirectoryReader:
def __init__(
self,
@@ -275,14 +266,12 @@ def __init__(
suffixes: Optional[Sequence[str]] = None,
max_files: Optional[int] = None,
):
"""Initialize with a path to directory and how to glob over it.

"""
Initialize with a path to directory and how to glob over it.
Args:
search_uri: Path of directory to load files from.
include: File pattern to include relative to `search_uri`. By default set
to include all files.
exclude: File patterns to exclude relative to `search_uri`. By default set
to ignore all hidden files.
include: File pattern to include relative to `search_uri`. By default set to include all files.
exclude: File patterns to exclude relative to `search_uri`. By default set to ignore all hidden files.
suffixes: Provide to keep only files with these suffixes.
Useful for keeping files with several different suffixes.
Suffixes must include the dot, e.g. ".txt".
@@ -348,31 +337,101 @@ def get_partitions(
partition_id += 1
return partitions

# Custom PyMuPDFParser to add unique page markers to PDF content
from langchain_community.docstore.document import Document
from langchain_community.document_loaders.parsers.pdf import PyMuPDFParser
import pymupdf

class CustomPyMuPDFParser(PyMuPDFParser):
def _lazy_parse(self, blob, text_kwargs: Optional[Dict] = None) -> Iterator[Document]:
# Acquire the parent lock for thread safety
with PyMuPDFParser._lock:
with blob.as_bytes_io() as file_path:
# Open the document using pymupdf
doc = pymupdf.open(stream=file_path, filetype="pdf") if blob.data else pymupdf.open(file_path)
full_content = []
for page in doc:
page_content = self._get_page_content(doc, page, text_kwargs)
# Only add page marker and content if there's actual content
if page_content.strip():
full_content.append(f"[[page{page.number + 1}]]{page_content}")
yield Document(
page_content="".join(full_content),
metadata=self._extract_metadata(doc, blob)
)

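# Illustrative usage sketch, not part of the diff: feed a local PDF through
# CustomPyMuPDFParser and inspect the page-marked text it yields. Assumes the
# langchain_community release this PR targets (where PyMuPDFParser exposes
# _lock, _get_page_content and _extract_metadata as used above); the path
# "example.pdf" and the helper name are placeholders.
def _demo_custom_parser(pdf_path: str = "example.pdf") -> None:
    from pathlib import Path

    from langchain_community.document_loaders.blob_loaders import Blob

    blob = Blob.from_data(Path(pdf_path).read_bytes(), mime_type="application/pdf")
    document = next(CustomPyMuPDFParser()._lazy_parse(blob))
    # Non-empty pages are concatenated, each prefixed with its marker, e.g.
    # "[[page1]]First page text...[[page2]]Second page text..."
    print(document.page_content[:120])
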
# Custom PyMuPDFLoader using the custom parser
from langchain_community.document_loaders import PyMuPDFLoader
class CustomPyMuPDFLoader(PyMuPDFLoader):
def __init__(self, file_path, **kwargs):
super().__init__(file_path, **kwargs)
self.parser = CustomPyMuPDFParser(
text_kwargs=kwargs.get('text_kwargs'),
extract_images=kwargs.get('extract_images', False)
)

# Custom splitting function to assign page metadata from custom page markers
def add_pymupdf_page_metadata(doc: Document, chunk_size: int = 1200, chunk_overlap: int = 600) -> List[Document]:
"""
Chunks the body of text returned by the custom PyMuPDFParser.
Uses custom page markers (e.g. [[page1]]) to:
1. Remove the markers from the text.
2. Split the clean text into chunks.
3. Determine the originating page for each chunk based on marker positions.
Returns a list of Documents with updated metadata.
"""
def split_text(text: str, chunk_size: int, chunk_overlap: int) -> List[Tuple[str, int]]:
# Record marker positions in clean-text coordinates (subtract the cumulative
# length of the markers seen so far) so they compare correctly against the
# chunk offsets computed over `clean_text` below.
page_markers = []
offset = 0
for m in re.finditer(r'\[\[page(\d+)\]\]', text):
page_markers.append((m.start() - offset, int(m.group(1))))
offset += len(m.group(0))
clean_text = re.sub(r'\[\[page\d+\]\]', '', text)
chunks = []
start = 0
while start < len(clean_text):
end = start + chunk_size
if end > len(clean_text):
end = len(clean_text)
chunk = clean_text[start:end].strip()
page_num = None
for marker_pos, page in reversed(page_markers):
if marker_pos <= start:
page_num = page
break
if chunk and page_num is not None:
chunks.append((chunk, page_num))
start += chunk_size - chunk_overlap
return chunks

chunks = split_text(doc.page_content, chunk_size, chunk_overlap)
new_docs = []
for chunk, page_num in chunks:
new_metadata = doc.metadata.copy()
new_metadata['page'] = page_num
new_doc = Document(
page_content=chunk,
metadata=new_metadata
)
new_docs.append(new_doc)
return new_docs

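# Quick illustrative check, not part of the diff: run the chunker over a
# synthetic page-marked document to show that each chunk inherits the page
# number of the last marker at or before its start offset.
def _demo_page_metadata() -> None:
    marked = "[[page1]]" + "a" * 1500 + "[[page2]]" + "b" * 1500
    doc = Document(page_content=marked, metadata={"source": "demo.pdf"})
    for chunk in add_pymupdf_page_metadata(doc, chunk_size=1200, chunk_overlap=600):
        # Prints page 1 for the first three chunks and page 2 for the last two.
        print(chunk.metadata["page"], len(chunk.page_content))
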
# TileDBLoader class using the custom parser for PDFs
class TileDBLoader:
def __init__(
self,
uri: str,
) -> None:
from langchain_community.document_loaders.parsers.generic import (
MimeTypeBasedParser,
)
from langchain_community.document_loaders.parsers.generic import MimeTypeBasedParser
from langchain_community.document_loaders.parsers.html import BS4HTMLParser
from langchain_community.document_loaders.parsers.msword import MsWordParser
from langchain_community.document_loaders.parsers.pdf import PyMuPDFParser
from langchain_community.document_loaders.parsers.txt import TextParser

self.uri = uri
self.parser = MimeTypeBasedParser(
handlers={
"application/pdf": PyMuPDFParser(),
"application/pdf": CustomPyMuPDFParser(),
"text/plain": TextParser(),
"text/markdown": TextParser(),
"text/html": BS4HTMLParser(),
"application/msword": MsWordParser(),
"application/vnd.openxmlformats-officedocument.wordprocessingml.document": (
MsWordParser()
),
"application/vnd.openxmlformats-officedocument.wordprocessingml.document": MsWordParser(),
},
fallback_parser=None,
)
@@ -386,7 +445,6 @@ def lazy_load(
) -> Iterator:
"""A lazy loader for Documents."""
import mimetypes

from langchain_community.document_loaders.blob_loaders import Blob

vfs = tiledb.VFS()
@@ -415,7 +473,7 @@ def load_and_split(self, text_splitter) -> List:
"""Load all documents and split them into sentences."""
return text_splitter.split_documents(self.load())

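# Hedged end-to-end sketch, not part of the diff: TileDBLoader routes PDFs
# through CustomPyMuPDFParser, so the single Document it yields still carries
# the [[pageN]] markers and can be chunked with add_pymupdf_page_metadata.
# The URI is a placeholder; any URI readable through tiledb.VFS should work,
# assuming parsing is dispatched through the parser's lazy_parse as above.
def _demo_tiledb_loader(uri: str = "s3://bucket/example.pdf") -> None:
    loader = TileDBLoader(uri=uri)
    document = next(loader.lazy_load())
    chunks = add_pymupdf_page_metadata(document, chunk_size=1200, chunk_overlap=600)
    for c in chunks[:3]:
        print(c.metadata.get("page"), c.page_content[:40])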

# DirectoryTextReader class
class DirectoryTextReader(DirectoryReader):
MAX_OBJECTS_PER_FILE = 10000

@@ -477,7 +535,16 @@ def read_objects(
for uri in partition.paths:
try:
loader = TileDBLoader(uri=uri)
documents = loader.load_and_split(text_splitter=text_splitter)
# For PDFs, use the custom page-marker based splitting
if uri.lower().endswith(".pdf"):
document = next(loader.lazy_load())
documents = add_pymupdf_page_metadata(
document,
# Fall back to the chunker's defaults when these keys are not configured.
chunk_size=self.text_splitter_kwargs.get("chunk_size", 1200),
chunk_overlap=self.text_splitter_kwargs.get("chunk_overlap", 600),
)
else:
documents = loader.load_and_split(text_splitter=text_splitter)
text_chunk_id = 0
for d in documents:
file_paths[write_id] = str(uri)
@@ -503,7 +570,7 @@ def read_objects_by_external_ids(self, ids: List[int]) -> OrderedDict:
# TODO implement
return None


# DirectoryImageReader class
class DirectoryImageReader(DirectoryReader):
def __init__(
self,
@@ -546,9 +613,7 @@ def read_objects(
self, partition: DirectoryPartition
) -> Tuple[OrderedDict, OrderedDict]:
import mimetypes

from PIL import Image

import tiledb

max_size = DirectoryTextReader.MAX_OBJECTS_PER_FILE * len(partition.paths)
@@ -569,16 +634,13 @@ def read_objects(
fp = vfs.open(path)
if path.endswith(".pdf") or mime_type == "application/pdf":
from io import BytesIO

import fitz

doc = fitz.open(stream=fp.read())
p = 0
for page in doc.pages():
zoom = 1
mat = fitz.Matrix(zoom, zoom)
pix = page.get_pixmap(matrix=mat)

image = np.array(
Image.open(BytesIO(pix.tobytes(output="png", jpg_quality=95)))
)[:, :, :3]
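
# Minimal standalone sketch, not part of the diff: render the first page of a
# PDF to an RGB numpy array the same way read_objects does above. The path
# "example.pdf" and the helper name are placeholders; a zoom factor of 1
# matches the fitz.Matrix(zoom, zoom) used in the loop.
def _demo_render_pdf_page(pdf_path: str = "example.pdf"):
    from io import BytesIO

    import fitz
    from PIL import Image

    doc = fitz.open(pdf_path)
    page = next(doc.pages())
    pix = page.get_pixmap(matrix=fitz.Matrix(1, 1))
    return np.array(Image.open(BytesIO(pix.tobytes(output="png"))))[:, :, :3]
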
@@ -614,7 +676,6 @@

def read_objects_by_external_ids(self, ids: List[int]) -> OrderedDict:
from PIL import Image

import tiledb

if self.paths is None: