diff --git a/README.rst b/README.rst
index f9bde14e..604bc1a4 100644
--- a/README.rst
+++ b/README.rst
@@ -39,6 +39,7 @@ Speech recognition engine/API support:
 * `Tensorflow `__
 * `Vosk API `__ (works offline)
 * `OpenAI whisper `__ (works offline)
+* `Speechmatics ASR API `__
 
 **Quickstart:** ``pip install SpeechRecognition``. See the "Installing" section for more details.
 
@@ -95,6 +96,7 @@ To use all of the functionality of the library, you should have:
 * **FLAC encoder** (required only if the system is not x86-based Windows/Linux/OS X)
 * **Vosk** (required only if you need to use Vosk API speech recognition ``recognizer_instance.recognize_vosk``)
 * **Whisper** (required only if you need to use Whisper ``recognizer_instance.recognize_whisper``)
+* **Speechmatics** (required only if you need to use Speechmatics ``recognizer_instance.recognize_speechmatics``)
 
 The following requirements are optional, but can improve or extend functionality in some situations:
 
@@ -169,6 +171,12 @@ Whisper is **required if and only if you want to use whisper** (``recognizer_ins
 
 You can install it with ``python3 -m pip install git+https://github.com/openai/whisper.git soundfile``.
 
+Speechmatics (for Speechmatics users)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Speechmatics is **required if and only if you want to use speechmatics** (``recognizer_instance.recognize_speechmatics``).
+
+You can install it with ``python3 -m pip install speechmatics-python``. You will also need an API key from the Speechmatics portal at https://portal.speechmatics.com/manage-access/.
+ Troubleshooting --------------- diff --git a/examples/audio_transcribe.py b/examples/audio_transcribe.py index 7806023f..0736b02a 100644 --- a/examples/audio_transcribe.py +++ b/examples/audio_transcribe.py @@ -13,6 +13,16 @@ with sr.AudioFile(AUDIO_FILE) as source: audio = r.record(source) # read the entire audio file +# recognize speech using Speechmatics +SPEECHMATICS_KEY = "INSERT SPEECHMATICS API KEY HERE" +try: + print("Speechmatics thinks you said " + r.recognize_speechmatics(audio, key=SPEECHMATICS_KEY)) +except sr.UnknownValueError: + print("Speechmatics could not understand audio") +except sr.RequestError as e: + print("Could not request results from the Speechmatics service; {0}".format(e)) + + # recognize speech using Sphinx try: print("Sphinx thinks you said " + r.recognize_sphinx(audio)) diff --git a/examples/extended_results.py b/examples/extended_results.py index 599c67f2..c848212b 100644 --- a/examples/extended_results.py +++ b/examples/extended_results.py @@ -16,6 +16,16 @@ with sr.AudioFile(AUDIO_FILE) as source: audio = r.record(source) # read the entire audio file +# recognize speech using Speechmatics +SPEECHMATICS_KEY = "INSERT SPEECHMATICS API KEY HERE" +try: + print("Speechmatics results:") + pprint(r.recognize_speechmatics(audio, key=SPEECHMATICS_KEY, transcript_format="json-v2")) +except sr.UnknownValueError: + print("Speechmatics could not understand audio") +except sr.RequestError as e: + print("Speechmatics error; {0}".format(e)) + # recognize speech using Sphinx try: print("Sphinx thinks you said " + r.recognize_sphinx(audio)) diff --git a/examples/microphone_recognition.py b/examples/microphone_recognition.py index 56168b29..863abe87 100644 --- a/examples/microphone_recognition.py +++ b/examples/microphone_recognition.py @@ -10,6 +10,15 @@ print("Say something!") audio = r.listen(source) +# recognize speech using Speechmatics +SPEECHMATICS_KEY = "INSERT SPEECHMATICS API KEY HERE" +try: + print("Speechmatics thinks you said " + 
r.recognize_speechmatics(audio, key=SPEECHMATICS_KEY))
+except sr.UnknownValueError:
+    print("Speechmatics could not understand audio")
+except sr.RequestError as e:
+    print("Could not request results from Speechmatics service; {0}".format(e))
+
 # recognize speech using Sphinx
 try:
     print("Sphinx thinks you said " + r.recognize_sphinx(audio))
diff --git a/reference/library-reference.rst b/reference/library-reference.rst
index 7323bd9b..c6bb2b43 100644
--- a/reference/library-reference.rst
+++ b/reference/library-reference.rst
@@ -314,6 +314,19 @@ You can translate the result to english with Whisper by passing translate=True
 
 Other values are passed directly to whisper. See https://github.com/openai/whisper/blob/main/whisper/transcribe.py for all options
 
+``recognize_speechmatics(self, audio_data, key=None, language="en", transcript_format="txt")``
+----------------------------------------------------------------------------------------------
+
+Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Speechmatics ASR.
+
+The key value is your speechmatics API key. You can get an API key by creating an account and signing into the portal at https://portal.speechmatics.com/manage-access/.
+
+The recognition language is determined by ``language``, an RFC5646 language tag like "en" or "es". The full list of supported languages can be found at https://docs.speechmatics.com/introduction/supported-languages.
+
+Returns a text representation of the transcript by default. You can also get a JSON representation of the transcript by setting transcript_format='json-v2', which comes with a range of meta-data about each word in the transcript. The full transcript schema is documented here: https://docs.speechmatics.com/features. You can also request an SRT format by setting `transcript_format='srt'`
+
+Raises errors directly from the speechmatics-python package. Read more at https://speechmatics.github.io/speechmatics-python/exceptions.html.
+
 ``AudioSource``
 ---------------
diff --git a/speech_recognition/__init__.py b/speech_recognition/__init__.py
index 66ebc04c..2f3e62bb 100644
--- a/speech_recognition/__init__.py
+++ b/speech_recognition/__init__.py
@@ -1702,6 +1702,45 @@ def recognize_vosk(self, audio_data, language='en'):
 
         return finalRecognition
 
+    def recognize_speechmatics(self, audio_data, key=None, language="en", transcript_format="txt"):
+        """
+        Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Speechmatics ASR.
+
+        The key value is your speechmatics API key. You can get an API key by creating an account and signing into the portal at https://portal.speechmatics.com/manage-access/.
+
+        The recognition language is determined by ``language``, an RFC5646 language tag like "en" or "es". The full list of supported languages can be found at https://docs.speechmatics.com/introduction/supported-languages.
+
+        Returns a text representation of the transcript by default. You can also get a JSON representation of the transcript by setting transcript_format='json-v2', which comes with a range of meta-data about each word in the transcript. The full transcript schema is documented here: https://docs.speechmatics.com/features. You can also request an SRT format by setting `transcript_format='srt'`
+
+        Raises errors directly from the speechmatics-python package. Read more at https://speechmatics.github.io/speechmatics-python/exceptions.html.
+ """ + assert isinstance(audio_data, AudioData), "Data must be audio data" + assert isinstance(key, str), "``key`` must be a string" + + try: + from speechmatics.models import ConnectionSettings, BatchTranscriptionConfig + from speechmatics.batch_client import BatchClient + from speechmatics.constants import BATCH_SELF_SERVICE_URL + except: + raise RequestError("missing speechmatics python module: install using `pip install speechmatics-python`") + + wav_data = audio_data.get_wav_data() + audio_input = ("audio_file.wav", wav_data) + settings = ConnectionSettings( + url=BATCH_SELF_SERVICE_URL, + auth_token=key, + ) + conf = BatchTranscriptionConfig( + language=language, + ) + with BatchClient(settings) as client: + job_id = client.submit_job( + audio=audio_input, + transcription_config=conf, + ) + transcript = client.wait_for_completion(job_id, transcription_format=transcript_format) + return transcript + def get_flac_converter(): """Returns the absolute path of a FLAC converter executable, or raises an OSError if none can be found.""" flac_converter = shutil_which("flac") # check for installed version first diff --git a/tests/test_recognition.py b/tests/test_recognition.py index 5759d657..96fade84 100644 --- a/tests/test_recognition.py +++ b/tests/test_recognition.py @@ -34,6 +34,24 @@ def test_google_chinese(self): with sr.AudioFile(self.AUDIO_FILE_ZH) as source: audio = r.record(source) self.assertEqual(r.recognize_google(audio, language="zh-CN"), u"砸自己的脚") + @unittest.skipUnless("SPEECHMATICS_KEY" in os.environ, "requires Speechmatics key to be specified in SPEECHMATICS_KEY environment variable") + def test_speechmatics_english(self): + r = sr.Recognizer() + with sr.AudioFile(self.AUDIO_FILE_EN) as source: audio = r.record(source) + self.assertEqual(r.recognize_speechmatics(audio, key=os.environ["SPEECHMATICS_KEY"]), "One, two, three.") + + @unittest.skipUnless("SPEECHMATICS_KEY" in os.environ, "requires Speechmatics key to be specified in SPEECHMATICS_KEY 
environment variable") + def test_speechmatics_french(self): + r = sr.Recognizer() + with sr.AudioFile(self.AUDIO_FILE_FR) as source: audio = r.record(source) + self.assertEqual(r.recognize_speechmatics(audio, key=os.environ["SPEECHMATICS_KEY"], language="fr"), u"C'est la dictée numéro un.") + + @unittest.skipUnless("SPEECHMATICS_KEY" in os.environ, "requires Speechmatics key to be specified in SPEECHMATICS_KEY environment variable") + def test_speechmatics_mandarin(self): + r = sr.Recognizer() + with sr.AudioFile(self.AUDIO_FILE_ZH) as source: audio = r.record(source) + self.assertEqual(r.recognize_speechmatics(audio, key=os.environ["SPEECHMATICS_KEY"], language="cmn"), u"砸自己的脚。") + @unittest.skipUnless("WIT_AI_KEY" in os.environ, "requires Wit.ai key to be specified in WIT_AI_KEY environment variable") def test_wit_english(self): r = sr.Recognizer()