Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 53 additions & 0 deletions src/ansimon_ai/ocr/from_ocr.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
import os
import pytesseract
from .types import OCRResult
from .types import OCRSegment
from PIL import Image

from ansimon_ai.structuring.types import StructuringInput, StructuringSegment

def preprocess_ocr_segments(segments):
    """Clean raw OCR segments before they are converted for structuring.

    Segments whose stripped text is empty, or made up solely of the
    punctuation/space characters listed below, are dropped. For every kept
    segment, each run of whitespace (newlines, carriage returns, repeated
    blanks) is collapsed into a single space, and a missing ``start`` or
    ``end`` is replaced by ``0.0``.

    Args:
        segments: Iterable of objects exposing ``text``, ``start``, ``end``
            and a ``model_dump()`` method.

    Returns:
        A list of dicts, one per kept segment, in input order: the segment's
        ``model_dump()`` output with ``text``, ``start`` and ``end``
        overridden by the cleaned values.
    """
    # Built once; membership is tested per character inside the loop.
    noise_chars = frozenset("!@#$%^&*()_+=[]{}|;:'\",.<>?/\\ ")

    processed = []
    for seg in segments:
        text = seg.text.strip()
        if not text or all(c in noise_chars for c in text):
            continue
        # Replacing "\r" and "\n" one by one turns "\r\n" into two spaces;
        # split/join collapses any whitespace run to exactly one space.
        text = " ".join(text.split())
        start = seg.start if seg.start is not None else 0.0
        end = seg.end if seg.end is not None else 0.0
        processed.append(
            {**seg.model_dump(), "text": text, "start": start, "end": end}
        )
    return processed

def build_structuring_input_from_ocr(ocr: OCRResult) -> StructuringInput:
    """Convert an ``OCRResult`` into a ``StructuringInput``.

    The segments are run through ``preprocess_ocr_segments`` first; the
    language and full text are copied from ``ocr`` unchanged. The result is
    always tagged ``modality="text"`` and ``source_type="ocr"``.
    """

    def _or_zero(value):
        # Structuring segments get 0.0 when no time/offset value is present.
        return 0.0 if value is None else value

    structured = []
    for item in preprocess_ocr_segments(ocr.segments):
        structured.append(
            StructuringSegment(
                text=item.get("text", ""),
                start=_or_zero(item.get("start")),
                end=_or_zero(item.get("end")),
            )
        )

    return StructuringInput(
        modality="text",
        source_type="ocr",
        language=ocr.language,
        full_text=ocr.full_text,
        segments=structured,
    )

def ocr_image_to_result(image_path: str) -> OCRResult:
    """Run Tesseract OCR on an image file and wrap the output in an ``OCRResult``.

    Args:
        image_path: Path of the image file to read.

    Returns:
        An ``OCRResult`` whose ``full_text`` is the raw Tesseract output and
        whose ``segments`` hold one ``OCRSegment`` per non-blank line
        (stripped). ``language`` is ``"ko"`` and ``engine`` is ``"tesseract"``.
    """
    # On Windows, point pytesseract at this install location when it exists;
    # otherwise pytesseract's own configuration is left untouched.
    tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
    if os.name == "nt" and os.path.exists(tesseract_cmd):
        pytesseract.pytesseract.tesseract_cmd = tesseract_cmd

    # The context manager closes the underlying file handle even if OCR raises.
    with Image.open(image_path) as img:
        text = pytesseract.image_to_string(img, lang="kor")

    segments = [
        OCRSegment(text=line.strip())
        for line in text.splitlines()
        if line.strip()
    ]
    return OCRResult(
        full_text=text,
        segments=segments,
        language="ko",
        engine="tesseract",
    )
15 changes: 15 additions & 0 deletions src/ansimon_ai/ocr/types.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from pydantic import BaseModel
from typing import List, Optional

class OCRSegment(BaseModel):
    """One piece of OCR-recognized text plus optional position metadata."""

    # Recognized text; the only required field.
    text: str
    # Optional page number. NOTE(review): numbering base is not defined here.
    page: Optional[int] = None
    # Optional line number. NOTE(review): numbering base is not defined here.
    line: Optional[int] = None
    # Optional start/end values; unit is not established in this file
    # (presumably mirrors the structuring segment fields — confirm).
    start: Optional[float] = None
    end: Optional[float] = None

class OCRResult(BaseModel):
    """Result of one OCR run: the full text and its per-segment breakdown."""

    # Complete recognized text.
    full_text: str
    # Individual segments that make up the result.
    segments: List[OCRSegment]
    # Optional language tag for the text.
    language: Optional[str] = None
    # Name of the OCR engine that produced the result (required).
    engine: str
2 changes: 1 addition & 1 deletion src/ansimon_ai/structuring/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ class StructuringSegment(BaseModel):

class StructuringInput(BaseModel):
modality: Literal["text"]
source_type: Literal["stt"]
source_type: Literal["stt", "ocr"]
language: Optional[str]
full_text: str
segments: List[StructuringSegment]
Expand Down
73 changes: 73 additions & 0 deletions tests/ocr/test_ocr.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
import os
import pytest
from PIL import Image

from ansimon_ai.ocr.types import OCRSegment, OCRResult
from ansimon_ai.ocr.from_ocr import build_structuring_input_from_ocr, ocr_image_to_result

def make_ocr_result(segments, full_text):
    """Build an ``OCRResult`` for tests, tagged as Korean text from a mock engine."""
    fields = {
        "full_text": full_text,
        "segments": segments,
        "language": "ko",
        "engine": "mock",
    }
    return OCRResult(**fields)

def test_threat_kakao():
    """Three multi-line messages must come through as three segments."""
    messages = [
        "2024.06.01 15:20\n김철수: 너 오늘 집에 안 들어오면 가만 안 둘 거야.",
        "2024.06.01 15:21\n김철수: 경찰에 신고해봐야 소용없어.",
        "2024.06.01 15:22\n김철수: 네가 한 일 다 알고 있어.",
    ]
    ocr_segments = [
        OCRSegment(text=body, page=1, line=line_no)
        for line_no, body in enumerate(messages, start=1)
    ]
    joined_text = "2024.06.01 15:20 김철수: 너 오늘 집에 안 들어오면 가만 안 둘 거야. 2024.06.01 15:21 김철수: 경찰에 신고해봐야 소용없어. 2024.06.01 15:22 김철수: 네가 한 일 다 알고 있어."

    converted = build_structuring_input_from_ocr(
        make_ocr_result(ocr_segments, joined_text)
    )

    assert len(converted.segments) == 3
    # The conversion must not inject text that was not in the OCR output.
    assert "협박" not in converted.full_text

def test_missed_calls():
    """Three missed-call log lines must yield three segments with text intact."""
    log_lines = [
        "2024.06.01 13:10 부재중 전화 (홍길동)",
        "2024.06.01 13:12 부재중 전화 (홍길동)",
        "2024.06.01 14:00 부재중 전화 (홍길동)",
    ]
    ocr_segments = [
        OCRSegment(text=entry, page=1, line=line_no)
        for line_no, entry in enumerate(log_lines, start=1)
    ]
    joined_text = "2024.06.01 13:10 부재중 전화 (홍길동) 2024.06.01 13:12 부재중 전화 (홍길동) 2024.06.01 14:00 부재중 전화 (홍길동)"

    converted = build_structuring_input_from_ocr(
        make_ocr_result(ocr_segments, joined_text)
    )

    assert len(converted.segments) == 3
    assert "부재중" in converted.full_text

def test_medical_record():
    """Four medical-record fields must yield four segments with text intact."""
    record_lines = [
        "진단명: 외상성 스트레스 장애",
        "진단일: 2024년 6월 2일",
        "환자명: 홍길동",
        "의사: 박의사",
    ]
    ocr_segments = [
        OCRSegment(text=entry, page=1, line=line_no)
        for line_no, entry in enumerate(record_lines, start=1)
    ]
    joined_text = "진단명: 외상성 스트레스 장애 진단일: 2024년 6월 2일 환자명: 홍길동 의사: 박의사"

    converted = build_structuring_input_from_ocr(
        make_ocr_result(ocr_segments, joined_text)
    )

    assert "스트레스 장애" in converted.full_text
    assert len(converted.segments) == 4

def test_counseling_record():
    """Two counseling-log lines must yield two segments with text intact."""
    notes = [
        "2024.06.01 16:00 상담센터: 피해자가 심리적 불안 호소",
        "2024.06.01 16:10 상담센터: 가족과의 갈등 언급",
    ]
    ocr_segments = [
        OCRSegment(text=entry, page=1, line=line_no)
        for line_no, entry in enumerate(notes, start=1)
    ]
    joined_text = "2024.06.01 16:00 상담센터: 피해자가 심리적 불안 호소 2024.06.01 16:10 상담센터: 가족과의 갈등 언급"

    converted = build_structuring_input_from_ocr(
        make_ocr_result(ocr_segments, joined_text)
    )

    assert "불안" in converted.full_text
    assert len(converted.segments) == 2

# Sample image for the integration test. Defaults to the original location;
# set ANSIMON_OCR_SAMPLE_IMAGE to run the test on machines without that file.
_SAMPLE_IMAGE_PATH = os.environ.get("ANSIMON_OCR_SAMPLE_IMAGE", "D:\\sample.png")


@pytest.mark.skipif(
    not os.path.exists(_SAMPLE_IMAGE_PATH),
    reason="예시 이미지 파일이 존재하지 않음",
)
def test_ocr_image_integration():
    """Run real OCR on the sample image and check the shape of the result.

    Skipped when the sample image is absent, so the suite still passes on
    machines without the image.
    """
    result = ocr_image_to_result(_SAMPLE_IMAGE_PATH)

    # Type checks instead of hasattr: a pydantic model always has its declared
    # attributes, so hasattr could never fail.
    assert isinstance(result, OCRResult)
    assert isinstance(result.full_text, str)
    assert isinstance(result.segments, list)
Loading