From 87705eca43f3c52c3ec6bd33aabbb8e2efbc2874 Mon Sep 17 00:00:00 2001
From: "William W." <willcodesinlife@gmail.com>
Date: Wed, 19 Nov 2025 08:54:42 -0800
Subject: [PATCH 1/3] Fix quotation marks in installation instructions

Replaced the single quotes in the install command with double quotes, because the original command failed to install when copied and pasted as written. Quality-of-life update.
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 652afc057..2f84c52c7 100644
--- a/README.md
+++ b/README.md
@@ -66,7 +66,7 @@ conda activate markitdown
 
 ## Installation
 
-To install MarkItDown, use pip: `pip install 'markitdown[all]'`. Alternatively, you can install it from the source:
+To install MarkItDown, use pip: `pip install "markitdown[all]"`. Alternatively, you can install it from the source:
 
 ```bash
 git clone git@github.com:microsoft/markitdown.git

From 61327b25420c8e52a64286c15336ee7d67d067fd Mon Sep 17 00:00:00 2001
From: "William W." <willcodesinlife@gmail.com>
Date: Wed, 19 Nov 2025 09:00:34 -0800
Subject: [PATCH 2/3] Fix quotation marks in source-install instructions

Use double quotes instead of single quotes in the README's editable-install command (`pip install -e "packages/markitdown[all]"`).
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 2f84c52c7..11a3b30b4 100644
--- a/README.md
+++ b/README.md
@@ -71,7 +71,7 @@ To install MarkItDown, use pip: `pip install "markitdown[all]"`. Alternatively,
 ```bash
 git clone git@github.com:microsoft/markitdown.git
 cd markitdown
-pip install -e 'packages/markitdown[all]'
+pip install -e "packages/markitdown[all]"
 ```
 
 ## Usage

From 24da8ef5c36b58c8d724321acfb93f2c4171ddc1 Mon Sep 17 00:00:00 2001
From: "William W." <willcodesinlife@gmail.com>
Date: Wed, 19 Nov 2025 09:49:22 -0800
Subject: [PATCH 3/3] Switch PDF converter to pdfplumber implementation

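Switch PDF conversion from pdfminer to pdfplumber for improved layout support, and add header/footer detection. The new `remove_headers_footers` option of `PdfConverter.convert` (on by default) drops the detected repeated header/footer line, common page-number lines, and any text in the top or bottom 8% of each page.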
---
 .../markitdown/converters/_pdf_converter.py   | 128 ++++++++++--------
 1 file changed, 73 insertions(+), 55 deletions(-)

diff --git a/packages/markitdown/src/markitdown/converters/_pdf_converter.py b/packages/markitdown/src/markitdown/converters/_pdf_converter.py
index 63162d523..a29553f34 100644
--- a/packages/markitdown/src/markitdown/converters/_pdf_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_pdf_converter.py
@@ -1,77 +1,95 @@
-import sys
 import io
-
+import re
 from typing import BinaryIO, Any
 
-
 from .._base_converter import DocumentConverter, DocumentConverterResult
 from .._stream_info import StreamInfo
-from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
+from .._exceptions import MissingDependencyException
 
-
-# Try loading optional (but in this case, required) dependencies
-# Save reporting of any exceptions for later
-_dependency_exc_info = None
 try:
-    import pdfminer
-    import pdfminer.high_level
-except ImportError:
-    # Preserve the error and stack trace for later
-    _dependency_exc_info = sys.exc_info()
-
-
-ACCEPTED_MIME_TYPE_PREFIXES = [
-    "application/pdf",
-    "application/x-pdf",
-]
+    import pdfplumber
+except ImportError as e:
+    raise MissingDependencyException(
+        "PDF conversion with layout support requires: pip install 'markitdown[pdf]'"
+    ) from e
 
+ACCEPTED_MIME_TYPE_PREFIXES = ["application/pdf", "application/x-pdf"]
 ACCEPTED_FILE_EXTENSIONS = [".pdf"]
 
 
 class PdfConverter(DocumentConverter):
-    """
-    Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text.
-    """
-
-    def accepts(
-        self,
-        file_stream: BinaryIO,
-        stream_info: StreamInfo,
-        **kwargs: Any,  # Options to pass to the converter
-    ) -> bool:
+    def accepts(self, file_stream: BinaryIO, stream_info: StreamInfo, **kwargs: Any) -> bool:
         mimetype = (stream_info.mimetype or "").lower()
         extension = (stream_info.extension or "").lower()
-
-        if extension in ACCEPTED_FILE_EXTENSIONS:
-            return True
-
-        for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
-            if mimetype.startswith(prefix):
-                return True
-
-        return False
+        return extension in ACCEPTED_FILE_EXTENSIONS or any(
+            mimetype.startswith(prefix) for prefix in ACCEPTED_MIME_TYPE_PREFIXES
+        )
 
     def convert(
         self,
         file_stream: BinaryIO,
         stream_info: StreamInfo,
-        **kwargs: Any,  # Options to pass to the converter
+        remove_headers_footers: bool = True,   # We turn this ON by default!
+        **kwargs: Any,
     ) -> DocumentConverterResult:
-        # Check the dependencies
-        if _dependency_exc_info is not None:
-            raise MissingDependencyException(
-                MISSING_DEPENDENCY_MESSAGE.format(
-                    converter=type(self).__name__,
-                    extension=".pdf",
-                    feature="pdf",
-                )
-            ) from _dependency_exc_info[
-                1
-            ].with_traceback(  # type: ignore[union-attr]
-                _dependency_exc_info[2]
+        assert isinstance(file_stream, io.IOBase)
+
+        file_stream.seek(0)  # Important: reset stream position
+        with pdfplumber.open(file_stream) as pdf:
+            pages_text = []
+
+            # === Smart header/footer detection (only on multi-page docs) ===
+            header = footer = None
+            if remove_headers_footers and len(pdf.pages) > 3:
+                top_lines = {}
+                bottom_lines = {}
+                sample_pages = pdf.pages[:min(20, len(pdf.pages))]
+                for page in sample_pages:
+                    lines = page.extract_text_lines() or []
+                    if not lines:
+                        continue
+                    top_text = lines[0]["text"].strip()
+                    bottom_text = lines[-1]["text"].strip()
+                    top_lines[top_text] = top_lines.get(top_text, 0) + 1
+                    bottom_lines[bottom_text] = bottom_lines.get(bottom_text, 0) + 1
+
+                if top_lines:
+                    header = max(top_lines, key=top_lines.get) if max(top_lines.values()) > 2 else None
+                if bottom_lines:
+                    footer = max(bottom_lines, key=bottom_lines.get) if max(bottom_lines.values()) > 2 else None
+
+            # Common page number patterns
+            page_number_re = re.compile(
+                r"^\s*\d+\s*$|^Page\s*\d+.*|^-\s*\d+\s*-$|^\d+\s+of\s+\d+$"
             )
 
-        assert isinstance(file_stream, io.IOBase)  # for mypy
-        return DocumentConverterResult(
-            markdown=pdfminer.high_level.extract_text(file_stream),
-        )
+            for page in pdf.pages:
+                lines = page.extract_text_lines() or []
+                clean = []
+
+                for line in lines:
+                    text = line["text"].rstrip()
+                    if not text.strip():
+                        continue
+
+                    skip = False
+                    if remove_headers_footers:
+                        # Remove detected header/footer
+                        if text.strip() == header or text.strip() == footer:
+                            skip = True
+                        # Remove obvious page numbers
+                        elif page_number_re.match(text.strip()):
+                            skip = True
+                        # Remove by position (top/bottom 8% of page)
+                        elif line["top"] < page.height * 0.08 or line["top"] > page.height * 0.92:
+                            skip = True
+
+                    if not skip:
+                        clean.append(text)
+
+                page_text = "\n".join(clean).strip()
+                if page_text:
+                    pages_text.append(page_text)
+
+            final_markdown = "\n\n---\n\n".join(pages_text) if pages_text else "No text extracted."
+            return DocumentConverterResult(markdown=final_markdown)