Team-SEBAF · YUDINDIN1005 · Feb 22, 2026 · Feb 22, 2026 · Feb 22, 2026 · Feb 22, 2026
diff --git a/src/ansimon_ai/ocr/from_ocr.py b/src/ansimon_ai/ocr/from_ocr.py
@@ -0,0 +1,53 @@
+import os
+import pytesseract
+from .types import OCRResult
+from .types import OCRSegment
+from PIL import Image
+
+from ansimon_ai.structuring.types import StructuringInput, StructuringSegment
+
+def preprocess_ocr_segments(segments):
+    processed = []
+    for seg in segments:
+        text = seg.text.strip()
+        if not text or all(c in "!@#$%^&*()_+=[]{}|;:'\",.<>?/\\ " for c in text):
+            continue
+        text = text.replace("\n", " ").replace("\r", " ").replace("  ", " ")
+        start = seg.start if seg.start is not None else 0.0
+        end = seg.end if seg.end is not None else 0.0
+        processed.append({**seg.model_dump(), "text": text, "start": start, "end": end})
+    return processed
+
+def build_structuring_input_from_ocr(ocr: OCRResult) -> StructuringInput:
+    segments = preprocess_ocr_segments(ocr.segments)
+    return StructuringInput(
+        modality="text",
+        source_type="ocr",
+        language=ocr.language,
+        full_text=ocr.full_text,
+        segments=[
+            StructuringSegment(
+                text=seg.get("text", ""),
+                start=seg.get("start") if seg.get("start") is not None else 0.0,
+                end=seg.get("end") if seg.get("end") is not None else 0.0,
+            )
+            for seg in segments
+        ],
+    )
+
+def ocr_image_to_result(image_path: str) -> OCRResult:
+    tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
+    if os.name == "nt" and os.path.exists(tesseract_cmd):
+        pytesseract.pytesseract.tesseract_cmd = tesseract_cmd
+    img = Image.open(image_path)
+    text = pytesseract.image_to_string(img, lang="kor")
+    segments = [
+        OCRSegment(text=line.strip())
+        for line in text.splitlines() if line.strip()
+    ]
+    return OCRResult(
+        full_text=text,
+        segments=segments,
+        language="ko",
+        engine="tesseract"
+    )
diff --git a/src/ansimon_ai/ocr/types.py b/src/ansimon_ai/ocr/types.py
@@ -0,0 +1,15 @@
+from pydantic import BaseModel
+from typing import List, Optional
+
+class OCRSegment(BaseModel):
+    text: str
+    page: Optional[int] = None
+    line: Optional[int] = None
+    start: Optional[float] = None
+    end: Optional[float] = None
+
+class OCRResult(BaseModel):
+    full_text: str
+    segments: List[OCRSegment]
+    language: Optional[str] = None
+    engine: str
diff --git a/src/ansimon_ai/structuring/types.py b/src/ansimon_ai/structuring/types.py
@@ -8,7 +8,7 @@ class StructuringSegment(BaseModel):
 
 class StructuringInput(BaseModel):
     modality: Literal["text"]
-    source_type: Literal["stt"]
+    source_type: Literal["stt", "ocr"]
     language: Optional[str]
     full_text: str
     segments: List[StructuringSegment]

diff --git a/tests/ocr/test_ocr.py b/tests/ocr/test_ocr.py
@@ -0,0 +1,73 @@
+import os
+import pytest
+from PIL import Image
+
+from ansimon_ai.ocr.types import OCRSegment, OCRResult
+from ansimon_ai.ocr.from_ocr import build_structuring_input_from_ocr, ocr_image_to_result
+
+def make_ocr_result(segments, full_text):
+    return OCRResult(
+        full_text=full_text,
+        segments=segments,
+        language="ko",
+        engine="mock"
+    )
+
+def test_threat_kakao():
+    segments = [
+        OCRSegment(text="2024.06.01 15:20\n김철수: 너 오늘 집에 안 들어오면 가만 안 둘 거야.", page=1, line=1),
+        OCRSegment(text="2024.06.01 15:21\n김철수: 경찰에 신고해봐야 소용없어.", page=1, line=2),
+        OCRSegment(text="2024.06.01 15:22\n김철수: 네가 한 일 다 알고 있어.", page=1, line=3),
+    ]
+    full_text = "2024.06.01 15:20 김철수: 너 오늘 집에 안 들어오면 가만 안 둘 거야. 2024.06.01 15:21 김철수: 경찰에 신고해봐야 소용없어. 2024.06.01 15:22 김철수: 네가 한 일 다 알고 있어."
+    struct_input = build_structuring_input_from_ocr(make_ocr_result(segments, full_text))
+    assert len(struct_input.segments) == 3
+    assert "협박" not in struct_input.full_text
+
+def test_missed_calls():
+    segments = [
+        OCRSegment(text="2024.06.01 13:10 부재중 전화 (홍길동)", page=1, line=1),
+        OCRSegment(text="2024.06.01 13:12 부재중 전화 (홍길동)", page=1, line=2),
+        OCRSegment(text="2024.06.01 14:00 부재중 전화 (홍길동)", page=1, line=3),
+    ]
+    full_text = "2024.06.01 13:10 부재중 전화 (홍길동) 2024.06.01 13:12 부재중 전화 (홍길동) 2024.06.01 14:00 부재중 전화 (홍길동)"
+    struct_input = build_structuring_input_from_ocr(make_ocr_result(segments, full_text))
+    assert len(struct_input.segments) == 3
+    assert "부재중" in struct_input.full_text
+
+def test_medical_record():
+    segments = [
+        OCRSegment(text="진단명: 외상성 스트레스 장애", page=1, line=1),
+        OCRSegment(text="진단일: 2024년 6월 2일", page=1, line=2),
+        OCRSegment(text="환자명: 홍길동", page=1, line=3),
+        OCRSegment(text="의사: 박의사", page=1, line=4),
+    ]
+    full_text = "진단명: 외상성 스트레스 장애 진단일: 2024년 6월 2일 환자명: 홍길동 의사: 박의사"
+    struct_input = build_structuring_input_from_ocr(make_ocr_result(segments, full_text))
+    assert "스트레스 장애" in struct_input.full_text
+    assert len(struct_input.segments) == 4
+
+def test_counseling_record():
+    segments = [
+        OCRSegment(text="2024.06.01 16:00 상담센터: 피해자가 심리적 불안 호소", page=1, line=1),
+        OCRSegment(text="2024.06.01 16:10 상담센터: 가족과의 갈등 언급", page=1, line=2),
+    ]
+    full_text = "2024.06.01 16:00 상담센터: 피해자가 심리적 불안 호소 2024.06.01 16:10 상담센터: 가족과의 갈등 언급"
+    struct_input = build_structuring_input_from_ocr(make_ocr_result(segments, full_text))
+    assert "불안" in struct_input.full_text
+    assert len(struct_input.segments) == 2
+
+@pytest.mark.skipif(
+    not os.path.exists("D:\\sample.png"),
+    reason="예시 이미지 파일이 존재하지 않음"
+)
+
+def test_ocr_image_integration():
+    image_path = "D:\\sample.png"
+    if not os.path.exists(image_path):
+        pytest.skip(f"테스트 이미지 없음: {image_path}")
+    result = ocr_image_to_result(image_path)
+    print(result)
+    assert hasattr(result, "full_text")
+    assert hasattr(result, "segments")
+    assert isinstance(result.segments, list)