Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion packages/markitdown/src/markitdown/__about__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# SPDX-FileCopyrightText: 2024-present Adam Fourney <[email protected]>
#
# SPDX-License-Identifier: MIT
# NOTE(review): this is a rendered diff ("1 addition & 1 deletion"), so both
# sides of the change appear here; presumably "0.1.4" is the removed line and
# "0.1.5b1" the added one -- confirm against the actual file.
__version__ = "0.1.4"
__version__ = "0.1.5b1"
68 changes: 68 additions & 0 deletions packages/markitdown/src/markitdown/converters/_pdf_converter.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,62 @@
import sys
import io
import re
from typing import BinaryIO, Any

from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE

# Pattern for MasterFormat-style partial numbering (e.g., ".1", ".2", ".10")
PARTIAL_NUMBERING_PATTERN = re.compile(r"^\.\d+$")


def _merge_partial_numbering_lines(text: str) -> str:
"""
Post-process extracted text to merge MasterFormat-style partial numbering
with the following text line.

MasterFormat documents use partial numbering like:
.1 The intent of this Request for Proposal...
.2 Available information relative to...

Some PDF extractors split these into separate lines:
.1
The intent of this Request for Proposal...

This function merges them back together.
"""
lines = text.split("\n")
result_lines: list[str] = []
i = 0

while i < len(lines):
line = lines[i]
stripped = line.strip()

# Check if this line is ONLY a partial numbering
if PARTIAL_NUMBERING_PATTERN.match(stripped):
# Look for the next non-empty line to merge with
j = i + 1
while j < len(lines) and not lines[j].strip():
j += 1

if j < len(lines):
# Merge the partial numbering with the next line
next_line = lines[j].strip()
result_lines.append(f"{stripped} {next_line}")
i = j + 1 # Skip past the merged line
else:
# No next line to merge with, keep as is
result_lines.append(line)
i += 1
else:
result_lines.append(line)
i += 1

return "\n".join(result_lines)


# Load dependencies
_dependency_exc_info = None
try:
Expand Down Expand Up @@ -117,6 +168,14 @@ def _extract_form_content_from_words(page: Any) -> str | None:
# Determine row type
is_paragraph = line_width > page_width * 0.55 and len(combined_text) > 60

# Check for MasterFormat-style partial numbering (e.g., ".1", ".2")
# These should be treated as list items, not table rows
has_partial_numbering = False
if row_words:
first_word = row_words[0]["text"].strip()
if PARTIAL_NUMBERING_PATTERN.match(first_word):
has_partial_numbering = True

row_info.append(
{
"y_key": y_key,
Expand All @@ -125,6 +184,7 @@ def _extract_form_content_from_words(page: Any) -> str | None:
"x_groups": x_groups,
"is_paragraph": is_paragraph,
"num_columns": len(x_groups),
"has_partial_numbering": has_partial_numbering,
}
)

Expand Down Expand Up @@ -156,6 +216,11 @@ def _extract_form_content_from_words(page: Any) -> str | None:
info["is_table_row"] = False
continue

# Rows with partial numbering (e.g., ".1", ".2") are list items, not table rows
if info["has_partial_numbering"]:
info["is_table_row"] = False
continue

# Count how many global columns this row's words align with
aligned_columns: set[int] = set()
for word in info["words"]:
Expand Down Expand Up @@ -469,4 +534,7 @@ def convert(
pdf_bytes.seek(0)
markdown = pdfminer.high_level.extract_text(pdf_bytes)

# Post-process to merge MasterFormat-style partial numbering with following text
markdown = _merge_partial_numbering_lines(markdown)

return DocumentConverterResult(markdown=markdown)
Binary file not shown.
171 changes: 171 additions & 0 deletions packages/markitdown/tests/test_pdf_masterformat.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@
#!/usr/bin/env python3 -m pytest
"""Tests for MasterFormat-style partial numbering in PDF conversion."""

import os
import re
import pytest

from markitdown import MarkItDown
from markitdown.converters._pdf_converter import PARTIAL_NUMBERING_PATTERN

TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files")


class TestMasterFormatPartialNumbering:
    """Test handling of MasterFormat-style partial numbering (.1, .2, etc.)."""

    @staticmethod
    def _convert_sample_pdf() -> str:
        """Convert the MasterFormat sample PDF and return its extracted text."""
        pdf_path = os.path.join(TEST_FILES_DIR, "masterformat_partial_numbering.pdf")
        return MarkItDown().convert(pdf_path).text_content

    def test_partial_numbering_pattern_regex(self):
        """Test that the partial numbering regex pattern correctly matches."""

        # Should match partial numbering patterns
        assert PARTIAL_NUMBERING_PATTERN.match(".1") is not None
        assert PARTIAL_NUMBERING_PATTERN.match(".2") is not None
        assert PARTIAL_NUMBERING_PATTERN.match(".10") is not None
        assert PARTIAL_NUMBERING_PATTERN.match(".99") is not None

        # Should NOT match other patterns
        assert PARTIAL_NUMBERING_PATTERN.match("1.") is None
        assert PARTIAL_NUMBERING_PATTERN.match("1.2") is None
        assert PARTIAL_NUMBERING_PATTERN.match(".1.2") is None
        assert PARTIAL_NUMBERING_PATTERN.match("text") is None
        assert PARTIAL_NUMBERING_PATTERN.match(".a") is None
        assert PARTIAL_NUMBERING_PATTERN.match("") is None

    def test_masterformat_partial_numbering_not_split(self):
        """Test that MasterFormat partial numbering stays with associated text.

        MasterFormat documents use partial numbering like:
            .1 The intent of this Request for Proposal...
            .2 Available information relative to...

        These should NOT be split into separate table columns, but kept
        as coherent text lines with the number followed by its description.
        """
        text_content = self._convert_sample_pdf()

        # Partial numberings should NOT appear isolated on their own lines.
        # Table pipes are ignored so that a row like "| .1 |" also counts as
        # isolated. The shared pattern is used (rather than a fixed list) so
        # that any numbering, not just .1-.10, is caught.
        isolated_numberings = [
            line.strip()
            for line in text_content.split("\n")
            if PARTIAL_NUMBERING_PATTERN.match(line.replace("|", "").strip())
        ]

        assert len(isolated_numberings) == 0, (
            f"Partial numberings should not be isolated from their text. "
            f"Found isolated: {isolated_numberings}"
        )

        # Verify that partial numberings appear WITH following text on the same line
        # Look for patterns like ".1 The intent" or ".1 Some text"
        partial_with_text = re.findall(r"\.\d+\s+\w+", text_content)
        assert (
            len(partial_with_text) > 0
        ), "Expected to find partial numberings followed by text on the same line"

    def test_masterformat_content_preserved(self):
        """Test that MasterFormat document content is fully preserved."""
        text_content = self._convert_sample_pdf()

        # Verify key content from the MasterFormat document is preserved
        expected_content = [
            "RFP for Construction Management Services",
            "Section 00 00 43",
            "Instructions to Respondents",
            "Ken Sargent House",
            "INTENT",
            "Request for Proposal",
            "KEN SARGENT HOUSE",
            "GRANDE PRAIRIE, ALBERTA",
            "Section 00 00 45",
        ]

        for content in expected_content:
            assert (
                content in text_content
            ), f"Expected content '{content}' not found in extracted text"

        # Verify partial numbering is followed by text on the same line
        # .1 should be followed by "The intent" on the same line
        assert re.search(
            r"\.1\s+The intent", text_content
        ), "Partial numbering .1 should be followed by 'The intent' text"

        # .2 should be followed by "Available information" on the same line
        assert re.search(
            r"\.2\s+Available information", text_content
        ), "Partial numbering .2 should be followed by 'Available information' text"

        # Ensure text content is not empty and has reasonable length
        assert (
            len(text_content.strip()) > 100
        ), "MasterFormat document should have substantial text content"

    def test_merge_partial_numbering_with_empty_lines_between(self):
        """Test that partial numberings merge correctly even with empty lines between.

        When PDF extractors produce output like:
            .1

            The intent of this Request...

        The merge logic should still combine them properly.
        """
        text_content = self._convert_sample_pdf()

        # Check that we don't have patterns like ".1\n\nThe intent" (unmerged)
        lines = text_content.split("\n")

        for i, line in enumerate(lines):
            stripped = line.strip()
            if not PARTIAL_NUMBERING_PATTERN.match(stripped):
                continue

            # An isolated marker is only acceptable when no text follows it at
            # all. Scan every remaining line, not a fixed window, so any number
            # of intervening blank lines is still detected.
            following = next(
                (later.strip() for later in lines[i + 1 :] if later.strip()),
                None,
            )
            if following is not None:
                pytest.fail(
                    f"Partial numbering '{stripped}' on line {i} was not "
                    f"merged with following text '{following[:30]}...'"
                )

    def test_multiple_partial_numberings_all_merged(self):
        """Test that all partial numberings in a document are properly merged."""
        text_content = self._convert_sample_pdf()

        # Count occurrences of merged partial numberings (number followed by text)
        merged_count = len(re.findall(r"\.\d+\s+[A-Za-z]", text_content))

        # Count isolated partial numberings (number alone on a line), using the
        # converter's own pattern so the test and implementation cannot drift.
        isolated_count = sum(
            1
            for line in text_content.split("\n")
            if PARTIAL_NUMBERING_PATTERN.match(line.strip())
        )

        assert (
            merged_count >= 2
        ), f"Expected at least 2 merged partial numberings, found {merged_count}"
        assert (
            isolated_count == 0
        ), f"Found {isolated_count} isolated partial numberings that weren't merged"