Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 30 additions & 28 deletions Hunyuan-OCR-master/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,22 @@
import numpy as np
from PIL import Image, ImageDraw, ImageFont

_SPOTTING_COORD_PATTERN = re.compile(r'\((\d+),(\d+)\),\((\d+),(\d+)\)')


def _iter_spotting_items(response: str):
"""Yield text spans and coordinate groups from a spotting response."""
text_start = 0
for match in _SPOTTING_COORD_PATTERN.finditer(response):
text = response[text_start:match.start()].strip()
coord_matches = (
(match.group(1), match.group(2)),
(match.group(3), match.group(4)),
)
yield text, coord_matches, match
text_start = match.end()


def clean_repeated_substrings(text):
"""Clean repeated substrings in text"""
n = len(text)
Expand Down Expand Up @@ -69,18 +85,12 @@ def denormalize_coordinates(coord: Tuple[float, float], image_width: int, image_
def process_spotting_response(response: str, image_width: int, image_height: int) -> str:
"""Process spotting task response and denormalize coordinates"""
try:
# Find all text and coordinate pairs using regex
pattern = r'([^()]+)(\(\d+,\d+\),\(\d+,\d+\))'
matches = re.finditer(pattern, response)

new_response = response
for match in matches:
text = match.group(1).strip()
coords = match.group(2)

new_response_parts = []
last_end = 0
for _text, coord_matches, match in _iter_spotting_items(response):
new_response_parts.append(response[last_end:match.start()])

# Parse the two coordinate points
coord_pattern = r'\((\d+),(\d+)\)'
coord_matches = re.findall(coord_pattern, coords)
if len(coord_matches) == 2:
start_coord = (float(coord_matches[0][0]), float(coord_matches[0][1]))
end_coord = (float(coord_matches[1][0]), float(coord_matches[1][1]))
Expand All @@ -91,11 +101,13 @@ def process_spotting_response(response: str, image_width: int, image_height: int

# Build new coordinate string
new_coords = f"({denorm_start[0]},{denorm_start[1]}),({denorm_end[0]},{denorm_end[1]})"

# Replace coordinates in original response
new_response = new_response.replace(coords, new_coords)

return new_response
new_response_parts.append(new_coords)
else:
new_response_parts.append(match.group(0))
last_end = match.end()

new_response_parts.append(response[last_end:])
return ''.join(new_response_parts)

except Exception as e:
print(f"Error processing response: {str(e)}")
Expand All @@ -116,19 +128,9 @@ def draw_text_detection_boxes(image: Image, response: str) -> Image:
except IOError:
font = ImageFont.load_default()

# Extract text and coordinates using regex
pattern = r'([^()]+)(\(\d+,\d+\),\(\d+,\d+\))'
matches = re.finditer(pattern, response)

for match in matches:
# Extract text and coordinates using bbox delimiters, so text may contain parentheses.
for text, coord_matches, _match in _iter_spotting_items(response):
try:
text = match.group(1).strip()
coords = match.group(2)

# Parse coordinates
coord_pattern = r'\((\d+),(\d+)\)'
coord_matches = re.findall(coord_pattern, coords)

if len(coord_matches) == 2:
x1, y1 = int(coord_matches[0][0]), int(coord_matches[0][1])
x2, y2 = int(coord_matches[1][0]), int(coord_matches[1][1])
Expand Down
45 changes: 45 additions & 0 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
import importlib.util
import unittest
from pathlib import Path


# Load utils.py by file path under the module name "hyocr_utils": its
# directory name contains hyphens, so a plain `import` statement could not
# name it.
REPO_ROOT = Path(__file__).resolve().parents[1]
UTILS_PATH = REPO_ROOT.joinpath("Hunyuan-OCR-master", "utils.py")

spec = importlib.util.spec_from_file_location("hyocr_utils", UTILS_PATH)
utils = importlib.util.module_from_spec(spec)
# Run the module body so its functions become attributes of `utils`.
spec.loader.exec_module(utils)


class SpottingResponseParserTest(unittest.TestCase):
    """Spotting responses whose text spans contain parentheses."""

    def setUp(self):
        # Two items back to back; each text span has a parenthesised word
        # directly in front of, or inside, the text preceding its box.
        self.response = (
            "ATLANTA (AP) - Human(43,500),(325,512)"
            "they (detectives)(371,523),(653,534)"
        )

    def test_spotting_text_can_contain_parentheses(self):
        # Only the text component (index 0) of each yielded item is checked.
        texts = [
            item[0] for item in utils._iter_spotting_items(self.response)
        ]

        self.assertEqual(
            texts,
            ["ATLANTA (AP) - Human", "they (detectives)"],
        )

    def test_process_spotting_response_preserves_parenthesized_text(self):
        processed = utils.process_spotting_response(
            self.response,
            image_width=2000,
            image_height=1000,
        )

        # Text must be untouched while each box is rewritten in place.
        expected_fragments = (
            "ATLANTA (AP) - Human(86,500),(650,512)",
            "they (detectives)(742,523),(1306,534)",
        )
        for fragment in expected_fragments:
            self.assertIn(fragment, processed)


# Run the test cases in this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()