from .base import STTEngine
from .types import STTResult, STTSegment
import whisper


class WhisperSTT(STTEngine):
    """STT engine that transcribes audio files with a Whisper model.

    The model is loaded once at construction time and reused for every
    ``transcribe`` call.
    """

    def __init__(self, model_size: str = "base", language: str = "ko"):
        """Load the Whisper model and remember the transcription language.

        Args:
            model_size: Model name forwarded to ``whisper.load_model``; it is
                also embedded in the engine name reported on results.
            language: Language code forwarded to the model on every
                ``transcribe`` call. Defaults to ``"ko"``, the value that was
                previously hard-coded.
        """
        self.model = whisper.load_model(model_size)
        self.engine_name = f"whisper-{model_size}"
        self.language = language

    def transcribe(self, audio_path: str) -> STTResult:
        """Transcribe the audio file at ``audio_path``.

        Args:
            audio_path: Path of the audio file handed to the Whisper model.

        Returns:
            An ``STTResult`` holding the full text, one ``STTSegment`` per
            segment returned by the model, the language, and the engine name.
        """
        result = self.model.transcribe(audio_path, language=self.language)
        segments = [
            STTSegment(
                start=seg["start"],
                end=seg["end"],
                text=seg["text"],
            )
            for seg in result["segments"]
        ]
        return STTResult(
            full_text=result["text"],
            segments=segments,
            # Fall back to the requested language if the model's result
            # does not carry a "language" entry.
            language=result.get("language", self.language),
            engine=self.engine_name,
        )
import os
import pytest
from ansimon_ai.stt import WhisperSTT
from ansimon_ai.structuring.tag_patterns import extract_tags_from_structuring_input

# Path of the sample recording; override it with the WHISPER_TEST_AUDIO
# environment variable.
SAMPLE_AUDIO = os.getenv("WHISPER_TEST_AUDIO", "D:/Project/AnsimOn/call_sample2.m4a")


@pytest.mark.skipif(not os.path.exists(SAMPLE_AUDIO), reason="샘플 오디오 파일이 존재하지 않음")
def test_whisper_stt_and_tag_extraction():
    """Transcribe the sample audio and run tag extraction on the result.

    The test is skipped when the sample audio file does not exist. It checks
    that the transcription has text and at least one segment, and that tag
    extraction returns a list both for the whole result and for a copy of the
    result narrowed to each single segment.
    """
    transcript = WhisperSTT().transcribe(SAMPLE_AUDIO)
    assert transcript.full_text, "전체 텍스트가 비어 있지 않아야 함"
    assert transcript.segments, "segment가 1개 이상 있어야 함"

    whole_tags = extract_tags_from_structuring_input(transcript)
    assert isinstance(whole_tags, list)

    for segment in transcript.segments:
        # Copy the result with only this segment in its segment list.
        single_segment_result = transcript.model_copy(update={"segments": [segment]})
        assert isinstance(
            extract_tags_from_structuring_input(single_segment_result), list
        )