Patch summary (free text before the first "diff --git" header):
  * Hunyuan-OCR-master/utils.py: adds _SPOTTING_COORD_PATTERN and the
    _iter_spotting_items() generator, and switches process_spotting_response()
    and draw_text_detection_boxes() over to it so that the text in front of a
    "(x1,y1),(x2,y2)" coordinate pair may itself contain parentheses.
    process_spotting_response() now assembles its result from slices of the
    response instead of calling str.replace() on the whole string.
  * tests/test_utils.py: new unittest module exercising both behaviours.
NOTE(review): line breaks and indentation were reconstructed from the hunk
line counts; whitespace on blank lines could not be recovered, so confirm the
patch applies cleanly against the target tree.

diff --git a/Hunyuan-OCR-master/utils.py b/Hunyuan-OCR-master/utils.py
index c8c3779..45891f6 100644
--- a/Hunyuan-OCR-master/utils.py
+++ b/Hunyuan-OCR-master/utils.py
@@ -6,6 +6,22 @@
 import numpy as np
 from PIL import Image, ImageDraw, ImageFont
 
+_SPOTTING_COORD_PATTERN = re.compile(r'\((\d+),(\d+)\),\((\d+),(\d+)\)')
+
+
+def _iter_spotting_items(response: str):
+    """Yield text spans and coordinate groups from a spotting response."""
+    text_start = 0
+    for match in _SPOTTING_COORD_PATTERN.finditer(response):
+        text = response[text_start:match.start()].strip()
+        coord_matches = (
+            (match.group(1), match.group(2)),
+            (match.group(3), match.group(4)),
+        )
+        yield text, coord_matches, match
+        text_start = match.end()
+
+
 def clean_repeated_substrings(text):
     """Clean repeated substrings in text"""
     n = len(text)
@@ -69,18 +85,12 @@ def denormalize_coordinates(coord: Tuple[float, float], image_width: int, image_
 def process_spotting_response(response: str, image_width: int, image_height: int) -> str:
     """Process spotting task response and denormalize coordinates"""
     try:
-        # Find all text and coordinate pairs using regex
-        pattern = r'([^()]+)(\(\d+,\d+\),\(\d+,\d+\))'
-        matches = re.finditer(pattern, response)
-
-        new_response = response
-        for match in matches:
-            text = match.group(1).strip()
-            coords = match.group(2)
-
+        new_response_parts = []
+        last_end = 0
+        for _text, coord_matches, match in _iter_spotting_items(response):
+            new_response_parts.append(response[last_end:match.start()])
+
             # Parse the two coordinate points
-            coord_pattern = r'\((\d+),(\d+)\)'
-            coord_matches = re.findall(coord_pattern, coords)
             if len(coord_matches) == 2:
                 start_coord = (float(coord_matches[0][0]), float(coord_matches[0][1]))
                 end_coord = (float(coord_matches[1][0]), float(coord_matches[1][1]))
@@ -91,11 +101,13 @@ def process_spotting_response(response: str, image_width: int, image_height: int
 
                 # Build new coordinate string
                 new_coords = f"({denorm_start[0]},{denorm_start[1]}),({denorm_end[0]},{denorm_end[1]})"
-
-                # Replace coordinates in original response
-                new_response = new_response.replace(coords, new_coords)
-
-        return new_response
+                new_response_parts.append(new_coords)
+            else:
+                new_response_parts.append(match.group(0))
+            last_end = match.end()
+
+        new_response_parts.append(response[last_end:])
+        return ''.join(new_response_parts)
 
     except Exception as e:
         print(f"Error processing response: {str(e)}")
@@ -116,19 +128,9 @@ def draw_text_detection_boxes(image: Image, response: str) -> Image:
     except IOError:
         font = ImageFont.load_default()
 
-    # Extract text and coordinates using regex
-    pattern = r'([^()]+)(\(\d+,\d+\),\(\d+,\d+\))'
-    matches = re.finditer(pattern, response)
-
-    for match in matches:
+    # Extract text and coordinates using bbox delimiters, so text may contain parentheses.
+    for text, coord_matches, _match in _iter_spotting_items(response):
         try:
-            text = match.group(1).strip()
-            coords = match.group(2)
-
-            # Parse coordinates
-            coord_pattern = r'\((\d+),(\d+)\)'
-            coord_matches = re.findall(coord_pattern, coords)
-
             if len(coord_matches) == 2:
                 x1, y1 = int(coord_matches[0][0]), int(coord_matches[0][1])
                 x2, y2 = int(coord_matches[1][0]), int(coord_matches[1][1])
diff --git a/tests/test_utils.py b/tests/test_utils.py
new file mode 100644
index 0000000..04dd3d8
--- /dev/null
+++ b/tests/test_utils.py
@@ -0,0 +1,45 @@
+import importlib.util
+import unittest
+from pathlib import Path
+
+
+REPO_ROOT = Path(__file__).resolve().parents[1]
+UTILS_PATH = REPO_ROOT / "Hunyuan-OCR-master" / "utils.py"
+
+spec = importlib.util.spec_from_file_location("hyocr_utils", UTILS_PATH)
+utils = importlib.util.module_from_spec(spec)
+spec.loader.exec_module(utils)
+
+
+class SpottingResponseParserTest(unittest.TestCase):
+    def test_spotting_text_can_contain_parentheses(self):
+        response = (
+            "ATLANTA (AP) - Human(43,500),(325,512)"
+            "they (detectives)(371,523),(653,534)"
+        )
+
+        items = list(utils._iter_spotting_items(response))
+
+        self.assertEqual(
+            [item[0] for item in items],
+            ["ATLANTA (AP) - Human", "they (detectives)"],
+        )
+
+    def test_process_spotting_response_preserves_parenthesized_text(self):
+        response = (
+            "ATLANTA (AP) - Human(43,500),(325,512)"
+            "they (detectives)(371,523),(653,534)"
+        )
+
+        processed = utils.process_spotting_response(
+            response,
+            image_width=2000,
+            image_height=1000,
+        )
+
+        self.assertIn("ATLANTA (AP) - Human(86,500),(650,512)", processed)
+        self.assertIn("they (detectives)(742,523),(1306,534)", processed)
+
+
+if __name__ == "__main__":
+    unittest.main()