From 87705eca43f3c52c3ec6bd33aabbb8e2efbc2874 Mon Sep 17 00:00:00 2001
From: "William W."
Date: Wed, 19 Nov 2025 08:54:42 -0800
Subject: [PATCH 1/3] Fix quotation marks in installation instructions

Replaced the single quotes with double quotes because the original command, when copied and pasted, did not install the package. Quality-of-life update.
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 652afc057..2f84c52c7 100644
--- a/README.md
+++ b/README.md
@@ -66,7 +66,7 @@ conda activate markitdown
 
 ## Installation
 
-To install MarkItDown, use pip: `pip install 'markitdown[all]'`. Alternatively, you can install it from the source:
+To install MarkItDown, use pip: `pip install "markitdown[all]"`. Alternatively, you can install it from the source:
 
 ```bash
 git clone git@github.com:microsoft/markitdown.git

From 61327b25420c8e52a64286c15336ee7d67d067fd Mon Sep 17 00:00:00 2001
From: "William W."
Date: Wed, 19 Nov 2025 09:00:34 -0800
Subject: [PATCH 2/3] fix-install-instructions

Fix incorrect pip install syntax in README
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 2f84c52c7..11a3b30b4 100644
--- a/README.md
+++ b/README.md
@@ -71,7 +71,7 @@ To install MarkItDown, use pip: `pip install "markitdown[all]"`. Alternatively,
 ```bash
 git clone git@github.com:microsoft/markitdown.git
 cd markitdown
-pip install -e 'packages/markitdown[all]'
+pip install -e "packages/markitdown[all]"
 ```
 
 ## Usage

From 24da8ef5c36b58c8d724321acfb93f2c4171ddc1 Mon Sep 17 00:00:00 2001
From: "William W."
Date: Wed, 19 Nov 2025 09:49:22 -0800
Subject: [PATCH 3/3] Switch PDF converter to pdfplumber implementation

Updated PDF conversion to utilize pdfplumber for improved layout support and added smart header/footer detection.
---
 .../markitdown/converters/_pdf_converter.py | 98 +++++++++++++++++-
 1 file changed, 93 insertions(+), 5 deletions(-)

diff --git a/packages/markitdown/src/markitdown/converters/_pdf_converter.py b/packages/markitdown/src/markitdown/converters/_pdf_converter.py
--- a/packages/markitdown/src/markitdown/converters/_pdf_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_pdf_converter.py
@@ -1,7 +1,9 @@
 import sys
 import io
+import re
 
-from typing import BinaryIO, Any
+from collections import Counter
+from typing import BinaryIO, Any, List, Optional
 
 
 from .._base_converter import DocumentConverter, DocumentConverterResult
@@ -13,8 +15,7 @@
 # Save reporting of any exceptions for later
 _dependency_exc_info = None
 try:
-    import pdfminer
-    import pdfminer.high_level
+    import pdfplumber
 except ImportError:
     # Preserve the error and stack trace for later
     _dependency_exc_info = sys.exc_info()
@@ -26,12 +27,57 @@
 ]
 
 ACCEPTED_FILE_EXTENSIONS = [".pdf"]
+
+# Running headers/footers are only searched for in documents with at least
+# this many pages, sampling no more than the first _MAX_SAMPLE_PAGES pages.
+_MIN_PAGES_FOR_DETECTION = 4
+_MAX_SAMPLE_PAGES = 20
+
+# A first/last line is treated as a running header/footer only if it repeats on
+# at least this many sampled pages and on at least half of them.
+_MIN_REPEATS = 3
+
+# Fraction of the page height, at the top and at the bottom, treated as margin.
+_MARGIN_RATIO = 0.08
+
+# Emitted between the text of consecutive pages.
+_PAGE_SEPARATOR = "\n\n---\n\n"
+
+# A line that is nothing but a page number: "12", "Page 12", "Page 12 of 40",
+# "Page 12/40", "- 12 -" or "12 of 40".
+_PAGE_NUMBER_RE = re.compile(
+    r"^(?:\d+|Page\s*\d+(?:\s*(?:of|/)\s*\d+)?|-\s*\d+\s*-|\d+\s+of\s+\d+)$"
+)
+
+
+def _is_margin_page_number(text: str, top: float, page_height: float) -> bool:
+    """Return True if text is a bare page number lying in the top/bottom margin."""
+    in_margin = (
+        top < page_height * _MARGIN_RATIO
+        or top > page_height * (1 - _MARGIN_RATIO)
+    )
+    return in_margin and _PAGE_NUMBER_RE.match(text.strip()) is not None
+
+
+def _find_running_line(candidates: List[str]) -> Optional[str]:
+    """
+    Return the most frequent candidate if it repeats often enough to be a running
+    header/footer, otherwise None. candidates holds one line per sampled page.
+    """
+    if not candidates:
+        return None
+    text, count = Counter(candidates).most_common(1)[0]
+    if count >= _MIN_REPEATS and count * 2 >= len(candidates):
+        return text
+    return None
 
 
 class PdfConverter(DocumentConverter):
     """
-    Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text.
+    Converts PDFs to Markdown using pdfplumber. Most style information is ignored, so
+    the results are essentially plain-text. Pages are separated by a horizontal rule,
+    and repeated headers, footers and page numbers can be stripped.
     """
 
     def accepts(
         self,
@@ -55,8 +101,20 @@
         self,
         file_stream: BinaryIO,
         stream_info: StreamInfo,
+        remove_headers_footers: bool = True,
         **kwargs: Any,  # Options to pass to the converter
     ) -> DocumentConverterResult:
+        """
+        Extract the text of the PDF, page by page, as Markdown.
+
+        remove_headers_footers: when true, drop bare page-number lines found in
+        the page margins, and drop a running header/footer, i.e. a first/last
+        line that repeats across most of the sampled pages. Body text is never
+        dropped because of its position alone.
+
+        Returns an empty document if no text can be extracted. Raises
+        MissingDependencyException if pdfplumber is not installed.
+        """
         # Check the dependencies
         if _dependency_exc_info is not None:
             raise MissingDependencyException(
@@ -72,6 +130,36 @@
             )
 
         assert isinstance(file_stream, io.IOBase)  # for mypy
+
+        file_stream.seek(0)  # Read the PDF from the start of the stream
+        with pdfplumber.open(file_stream) as pdf:
+            # Extract each page once; the lines are reused for detection and output
+            pages: List[List[str]] = []
+            for page in pdf.pages:
+                lines: List[str] = []
+                for line in page.extract_text_lines() or []:
+                    text = line["text"].rstrip()
+                    if not text.strip():
+                        continue
+                    if remove_headers_footers and _is_margin_page_number(
+                        text, line["top"], page.height
+                    ):
+                        continue
+                    lines.append(text)
+                pages.append(lines)
+
+        if remove_headers_footers and len(pages) >= _MIN_PAGES_FOR_DETECTION:
+            sample = [lines for lines in pages[:_MAX_SAMPLE_PAGES] if lines]
+            header = _find_running_line([lines[0].strip() for lines in sample])
+            footer = _find_running_line([lines[-1].strip() for lines in sample])
+            for lines in pages:
+                # Only the first/last line of a page can be a running header/footer
+                if lines and header is not None and lines[0].strip() == header:
+                    del lines[0]
+                if lines and footer is not None and lines[-1].strip() == footer:
+                    del lines[-1]
+
+        pages_text = ["\n".join(lines).strip() for lines in pages]
         return DocumentConverterResult(
-            markdown=pdfminer.high_level.extract_text(file_stream),
+            markdown=_PAGE_SEPARATOR.join(text for text in pages_text if text),
         )