diff --git a/.github/workflows/unittests.yml b/.github/workflows/unittests.yml
index b4031bd4..2dba4266 100644
--- a/.github/workflows/unittests.yml
+++ b/.github/workflows/unittests.yml
@@ -46,16 +46,16 @@ jobs:
       - name: Install Python dependencies (Ubuntu, <=3.12)
         if: matrix.os == 'ubuntu-latest' && matrix.python-version != '3.13'
         run: |
-          python -m pip install .[dev,audio,pocketsphinx,google-cloud,whisper-local,faster-whisper,openai,groq,vosk]
+          python -m pip install .[dev,audio,pocketsphinx,google-cloud,whisper-local,faster-whisper,openai,groq,vosk,cohere-api]
       - name: Install Python dependencies (Ubuntu, 3.13)
         if: matrix.os == 'ubuntu-latest' && matrix.python-version == '3.13'
         run: |
           python -m pip install standard-aifc setuptools
-          python -m pip install .[dev,audio,pocketsphinx,google-cloud,openai,groq,vosk]
+          python -m pip install .[dev,audio,pocketsphinx,google-cloud,openai,groq,vosk,cohere-api]
       - name: Install Python dependencies (Windows)
         if: matrix.os == 'windows-latest'
         run: |
-          python -m pip install .[dev,whisper-local,faster-whisper,google-cloud,openai,groq,vosk]
+          python -m pip install .[dev,whisper-local,faster-whisper,google-cloud,openai,groq,vosk,cohere-api]
       - name: Set up vosk model
         run: python -m speech_recognition.cli download vosk
       - name: Test with unittest
diff --git a/README.rst b/README.rst
index 06fedc78..7ac407b5 100644
--- a/README.rst
+++ b/README.rst
@@ -65,6 +65,7 @@ Speech recognition engine/API support:
 * `OpenAI Whisper API `__
 * OpenAI compatible self-hosted endpoints (e.g. vLLM, Ollama)
 * `Groq Whisper API `__
+* `Cohere Transcribe API `__
 
 **Quickstart:** ``pip install SpeechRecognition``. See the "Installing" section for more details.
 
@@ -123,6 +124,7 @@ To use all of the functionality of the library, you should have:
 * **openai** (required only if you need to use OpenAI Whisper API speech recognition ``recognizer_instance.recognize_openai``)
   * includes OpenAI compatible self-hosted endpoints (e.g. vLLM, Ollama)
 * **groq** (required only if you need to use Groq Whisper API speech recognition ``recognizer_instance.recognize_groq``)
+* **cohere** (required only if you need to use Cohere Transcribe API speech recognition ``recognizer_instance.recognize_cohere_api``; install with ``pip install SpeechRecognition[cohere-api]``. Set ``CO_API_KEY`` as documented by the Cohere SDK.)
 
 The following requirements are optional, but can improve or extend functionality in some situations:
 
diff --git a/pyproject.toml b/pyproject.toml
index 2fd4f9b6..bb52f65a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -75,6 +75,9 @@ groq = [
     "groq",
     "httpx < 0.28",
 ]
+cohere-api = [
+    "cohere>=5.21.0",
+]
 assemblyai = ["requests"]
 vosk = ["vosk"]
 
diff --git a/reference/library-reference.rst b/reference/library-reference.rst
index e96fde87..ee772ef3 100644
--- a/reference/library-reference.rst
+++ b/reference/library-reference.rst
@@ -291,6 +291,11 @@ Raises a ``speech_recognition.UnknownValueError`` exception if the speech is uni
 
 .. autofunction:: speech_recognition.recognizers.whisper_api.groq.recognize
 
+``recognizer_instance.recognize_cohere_api(audio_data: AudioData, *, language: str, model = "cohere-transcribe-03-2026")``
+--------------------------------------------------------------------------------------------------------------------------
+
+.. autofunction:: speech_recognition.recognizers.cohere_api.recognize
+
 ``AudioSource``
 ---------------
 
diff --git a/speech_recognition/__init__.py b/speech_recognition/__init__.py
index 4b83b6c5..d0eb2f60 100644
--- a/speech_recognition/__init__.py
+++ b/speech_recognition/__init__.py
@@ -1281,7 +1281,7 @@ def flush(self, *args, **kwargs):
 # At this time, the dependencies are not yet installed, resulting in a ModuleNotFoundError.
 # This is a workaround to resolve this issue
 try:
-    from .recognizers import google, google_cloud, pocketsphinx, vosk
+    from .recognizers import cohere_api, google, google_cloud, pocketsphinx, vosk
     from .recognizers.whisper_api import groq, openai
     from .recognizers.whisper_local import faster_whisper, whisper
 except (ModuleNotFoundError, ImportError):
@@ -1293,6 +1293,7 @@ def flush(self, *args, **kwargs):
     Recognizer.recognize_faster_whisper = faster_whisper.recognize  # type: ignore[attr-defined]
     Recognizer.recognize_openai = openai.recognize  # type: ignore[attr-defined]
     Recognizer.recognize_groq = groq.recognize  # type: ignore[attr-defined]
+    Recognizer.recognize_cohere_api = cohere_api.recognize  # type: ignore[attr-defined]
     Recognizer.recognize_sphinx = pocketsphinx.recognize  # type: ignore[attr-defined]
     Recognizer.recognize_vosk = vosk.recognize  # type: ignore[attr-defined]
 
diff --git a/speech_recognition/recognizers/cohere_api.py b/speech_recognition/recognizers/cohere_api.py
new file mode 100644
index 00000000..32742753
--- /dev/null
+++ b/speech_recognition/recognizers/cohere_api.py
@@ -0,0 +1,61 @@
+from __future__ import annotations
+
+import logging
+from io import BytesIO
+
+from speech_recognition.audio import AudioData
+from speech_recognition.exceptions import SetupError
+
+logger = logging.getLogger(__name__)
+
+
+def recognize(
+    recognizer,
+    audio_data: AudioData,
+    *,
+    language: str,
+    model: str = "cohere-transcribe-03-2026",
+) -> str:
+    """Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the `Cohere Transcribe `__ API via the official Python SDK.
+
+    Requires the ``cohere`` package (install with ``pip install SpeechRecognition[cohere-api]``).
+    Set environment variable ``CO_API_KEY`` as documented by Cohere; this library does not read or override it in code.
+
+    ``language`` is required by the Cohere transcription API (e.g. ``"en"``, ``"ja"``).
+
+    ``recognizer`` is accepted so this function can be bound as a ``Recognizer`` method; it is not used here.
+
+    Returns the ``text`` attribute of the SDK response.
+
+    Raises ``SetupError`` if the ``cohere`` package cannot be imported, and ``ValueError`` if ``audio_data`` is not an ``AudioData`` instance.
+
+    Detail: https://docs.cohere.com/reference/create-audio-transcription
+    """
+    try:
+        import cohere
+    except ImportError as err:
+        # Chain the original error so the real import failure stays visible in the traceback.
+        raise SetupError(
+            "missing cohere module: ensure that cohere is set up correctly "
+            "(e.g. pip install SpeechRecognition[cohere-api])."
+        ) from err
+
+    if not isinstance(audio_data, AudioData):
+        raise ValueError("``audio_data`` must be an ``AudioData`` instance")
+
+    wav_data = BytesIO(audio_data.get_wav_data())
+    # NOTE(review): presumably the SDK uses ``name`` as the upload filename -- confirm.
+    wav_data.name = "SpeechRecognition_audio.wav"
+
+    client = cohere.ClientV2()
+    logger.debug(
+        "cohere audio.transcriptions.create: model=%r language=%r",
+        model,
+        language,
+    )
+    response = client.audio.transcriptions.create(
+        model=model,
+        file=wav_data,
+        language=language,
+    )
+    return response.text
diff --git a/tests/recognizers/test_cohere_api.py b/tests/recognizers/test_cohere_api.py
new file mode 100644
index 00000000..ce840c1d
--- /dev/null
+++ b/tests/recognizers/test_cohere_api.py
@@ -0,0 +1,50 @@
+from unittest.mock import MagicMock, patch
+
+from speech_recognition import AudioData, Recognizer
+from speech_recognition.recognizers import cohere_api
+
+
+@patch("cohere.ClientV2")
+def test_transcribe_default_model(mock_client_cls):
+    mock_response = MagicMock()
+    mock_response.text = "Transcription by Cohere"
+    mock_client = MagicMock()
+    mock_client.audio.transcriptions.create.return_value = mock_response
+    mock_client_cls.return_value = mock_client
+
+    audio_data = MagicMock(spec=AudioData)
+    audio_data.get_wav_data.return_value = b"fake_wav"
+
+    actual = cohere_api.recognize(
+        MagicMock(spec=Recognizer), audio_data, language="en"
+    )
+
+    assert actual == "Transcription by Cohere"
+    audio_data.get_wav_data.assert_called_once()
+    mock_client_cls.assert_called_once_with()
+    mock_client.audio.transcriptions.create.assert_called_once()
+    call_kw = mock_client.audio.transcriptions.create.call_args.kwargs
+    assert call_kw["model"] == "cohere-transcribe-03-2026"
+    assert call_kw["language"] == "en"
+    assert "file" in call_kw
+
+
+@patch("cohere.ClientV2")
+def test_transcribe_with_language(mock_client_cls):
+    mock_response = MagicMock()
+    mock_response.text = "Japanese transcription"
+    mock_client = MagicMock()
+    mock_client.audio.transcriptions.create.return_value = mock_response
+    mock_client_cls.return_value = mock_client
+
+    audio_data = MagicMock(spec=AudioData)
+    audio_data.get_wav_data.return_value = b"fake_wav"
+
+    actual = cohere_api.recognize(
+        MagicMock(spec=Recognizer), audio_data, language="ja"
+    )
+
+    assert actual == "Japanese transcription"
+    call_kw = mock_client.audio.transcriptions.create.call_args.kwargs
+    assert call_kw["model"] == "cohere-transcribe-03-2026"
+    assert call_kw["language"] == "ja"