EducationalTestingService · mulhod · Dec 12, 2021 · Dec 13, 2021 · Dec 17, 2021 · Dec 17, 2021
diff --git a/README.md b/README.md
@@ -1,10 +1,10 @@
 # Vosk Speech Recognition Toolkit
 
 Vosk is an offline open source speech recognition toolkit. It enables
-speech recognition models for 18 languages and dialects - English, Indian
+speech recognition for 20+ languages and dialects - English, Indian
 English, German, French, Spanish, Portuguese, Chinese, Russian, Turkish,
 Vietnamese, Italian, Dutch, Catalan, Arabic, Greek, Farsi, Filipino,
-Ukrainian.
+Ukrainian, Kazakh, Swedish, Japanese, Esperanto. More to come.
 
 Vosk models are small (50 Mb) but provide continuous large vocabulary
 transcription, zero-latency response with streaming API, reconfigurable

diff --git a/android/lib/build-vosk.sh b/android/lib/build-vosk.sh
@@ -123,7 +123,13 @@ make -j 8 online2 lm rnnlm
 # Vosk-api
 cd $WORKDIR
 mkdir -p $WORKDIR/vosk
-make -j 8 -C ${WORKDIR_BASE}/../../../src OUTDIR=$WORKDIR/vosk KALDI_ROOT=${WORKDIR}/kaldi OPENFST_ROOT=${WORKDIR}/local OPENBLAS_ROOT=${WORKDIR}/local CXX=$CXX EXTRA_LDFLAGS="-llog -static-libstdc++"
+make -j 8 -C ${WORKDIR_BASE}/../../../src \
+    OUTDIR=$WORKDIR/vosk \
+    KALDI_ROOT=${WORKDIR}/kaldi \
+    OPENFST_ROOT=${WORKDIR}/local \
+    OPENBLAS_ROOT=${WORKDIR}/local \
+    CXX=$CXX \
+    EXTRA_LDFLAGS="-llog -static-libstdc++ -Wl,-soname,libvosk.so" # -Wl,-soname asks the linker to record libvosk.so as the library's SONAME
 cp $WORKDIR/vosk/libvosk.so $WORKDIR/../../src/main/jniLibs/$arch/libvosk.so
 
 done
diff --git a/nodejs/README.md b/nodejs/README.md
@@ -2,18 +2,18 @@ This is an FFI-NAPI wrapper for the Vosk library.
 
 ## Usage
 
-It mostly follows Vosk interface, some methods are not yet fully implemented.
+The bindings mostly follow the Vosk interface; some methods are not yet fully implemented.
 
-To use it you need to compile libvosk library, see Python module build
-instructions for details. You can find prebuilt library inside python
-wheel.
+See the [demo folder](https://github.com/alphacep/vosk-api/tree/master/nodejs/demo) for
+details.
 
 ## About
 
 Vosk is an offline open source speech recognition toolkit. It enables
-speech recognition models for 17 languages and dialects - English, Indian
+speech recognition for 20+ languages and dialects - English, Indian
 English, German, French, Spanish, Portuguese, Chinese, Russian, Turkish,
-Vietnamese, Italian, Dutch, Catalan, Arabic, Greek, Farsi, Filipino.
+Vietnamese, Italian, Dutch, Catalan, Arabic, Greek, Farsi, Filipino,
+Ukrainian, Kazakh, Swedish, Japanese, Esperanto. More to come.
 
 Vosk models are small (50 Mb) but provide continuous large vocabulary
 transcription, zero-latency response with streaming API, reconfigurable

diff --git a/python/README.md b/python/README.md
@@ -1,9 +1,10 @@
 This is a Python module for Vosk.
 
 Vosk is an offline open source speech recognition toolkit. It enables
-speech recognition models for 17 languages and dialects - English, Indian
+speech recognition for 20+ languages and dialects - English, Indian
 English, German, French, Spanish, Portuguese, Chinese, Russian, Turkish,
-Vietnamese, Italian, Dutch, Catalan, Arabic, Greek, Farsi, Filipino.
+Vietnamese, Italian, Dutch, Catalan, Arabic, Greek, Farsi, Filipino,
+Ukrainian, Kazakh, Swedish, Japanese, Esperanto. More to come.
 
 Vosk models are small (50 Mb) but provide continuous large vocabulary
 transcription, zero-latency response with streaming API, reconfigurable

diff --git a/python/example/test_gpu_batch.py b/python/example/test_gpu_batch.py
@@ -0,0 +1,80 @@
+#!/usr/bin/env python3
+"""Transcribe a list of audio files with Vosk's GPU BatchRecognizer.
+
+Usage: test_gpu_batch.py <file-list>
+
+<file-list> is a text file with one audio file path per line. The i-th listed
+file is streamed to the recognizer as stream i in 16000-byte chunks. Each
+transcript is printed to stdout as "<utterance-id> <text>", and a throughput
+summary is printed to stderr.
+"""
+
+import sys
+import os
+import wave
+from time import sleep
+import json
+from contextlib import ExitStack
+from timeit import default_timer as timer
+
+
+from vosk import Model, BatchRecognizer, GpuInit
+
+GpuInit()
+
+rec = BatchRecognizer()
+
+# Read the list of audio files, ignoring blank lines
+with open(sys.argv[1]) as flist:
+    fnames = [line.strip() for line in flist if line.strip()]
+
+# Utterance id: file name without its directory and extension (any length)
+uids = [os.path.splitext(os.path.basename(fname))[0] for fname in fnames]
+results = [""] * len(fnames)
+ended = set()
+tot_samples = 0  # bytes fed so far; converted to seconds in the summary
+
+# ExitStack closes every audio file when the block exits, even on error
+with ExitStack() as stack:
+    fds = [stack.enter_context(open(fname, "rb")) for fname in fnames]
+
+    start_time = timer()
+
+    while True:
+
+        # Feed in the data
+        # NOTE(review): files are read raw, so a container header (e.g. WAV)
+        # is sent to the recognizer along with the samples - confirm intended
+        for i, fd in enumerate(fds):
+            if i in ended:
+                continue
+            data = fd.read(16000)
+            if len(data) == 0:
+                rec.FinishStream(i)
+                ended.add(i)
+                continue
+            rec.AcceptWaveform(i, data)
+            tot_samples += len(data)
+
+        # Wait for results from CUDA
+        rec.Wait()
+
+        # Retrieve and add results
+        for i in range(len(fds)):
+            res = rec.Result(i)
+            if len(res) != 0:
+                results[i] = results[i] + " " + json.loads(res)['text']
+
+        if len(ended) == len(fds):
+            break
+
+    end_time = timer()
+
+for uid, text in zip(uids, results):
+    print(uid, text.strip())
+
+# The summary assumes 16 kHz, 16-bit (2 bytes per sample) mono audio
+audio_seconds = tot_samples / 16000.0 / 2
+elapsed = end_time - start_time
+print("Processed %d seconds of audio in %d seconds (%f xRT)" %
+      (audio_seconds, elapsed, audio_seconds / elapsed), file=sys.stderr)
diff --git a/python/example/test_nlsml.py b/python/example/test_nlsml.py
@@ -0,0 +1,40 @@
+#!/usr/bin/env python3
+"""Recognize a mono PCM WAV file and print the results with NLSML enabled.
+
+Usage: test_nlsml.py <audio.wav>
+
+Expects an unpacked model in a folder named 'model' in the current directory.
+"""
+
+from vosk import Model, KaldiRecognizer, SetLogLevel
+import sys
+import os
+import wave
+
+SetLogLevel(0)
+
+if not os.path.exists("model"):
+    print ("Please download the model from https://alphacephei.com/vosk/models and unpack as 'model' in the current folder.")
+    sys.exit(1)
+
+# The context manager closes the WAV file on every exit path
+with wave.open(sys.argv[1], "rb") as wf:
+    if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
+        print ("Audio file must be WAV format mono PCM.")
+        sys.exit(1)
+
+    model = Model("model")
+    rec = KaldiRecognizer(model, wf.getframerate())
+    rec.SetMaxAlternatives(10)
+    rec.SetNLSML(True)
+
+    # Feed the audio in 4000-frame chunks; print a result whenever
+    # AcceptWaveform returns a truthy value
+    while True:
+        data = wf.readframes(4000)
+        if len(data) == 0:
+            break
+        if rec.AcceptWaveform(data):
+            print(rec.Result())
+
+    print(rec.FinalResult())
diff --git a/python/setup.py b/python/setup.py
@@ -44,7 +44,7 @@ def get_tag(self):
 
 setuptools.setup(
     name="vosk",
-    version="0.3.41",
+    version="0.3.42",
     author="Educational Testing Service",
     author_email="rubale@ets.org",
     description="Offline open source speech recognition API based on Kaldi and Vosk with additional features from ETS",

diff --git a/python/vosk/__init__.py b/python/vosk/__init__.py
@@ -72,6 +72,9 @@ def SetResultOptions(self, result_opts):
     def SetWords(self, enable_words):
         _c.vosk_recognizer_set_words(self._handle, 1 if enable_words else 0)
 
+    def SetNLSML(self, enable_nlsml):  # truthy -> 1, falsy -> 0, forwarded to vosk_recognizer_set_nlsml
+        _c.vosk_recognizer_set_nlsml(self._handle, 1 if enable_nlsml else 0)
+
     def SetSpkModel(self, spk_model):
         _c.vosk_recognizer_set_spk_model(self._handle, spk_model._handle)
 
@@ -104,3 +107,51 @@ def GpuInit():
 
 def GpuThreadInit():
     _c.vosk_gpu_thread_init()
+
+class BatchRecognizer(object):
+    """Python wrapper for the vosk_batch_recognizer_* functions of the C library.
+
+    Methods taking ``uid`` forward it unchanged to the C API, presumably to
+    select the stream the call applies to.
+    """
+
+    def __init__(self, *args):
+        """Create the native recognizer; positional arguments are ignored.
+
+        Raises:
+            Exception: if the C library returns a NULL handle.
+        """
+        # Set first so __del__ finds the attribute even if creation raises.
+        self._handle = _ffi.NULL
+        self._handle = _c.vosk_batch_recognizer_new()
+
+        if self._handle == _ffi.NULL:
+            raise Exception("Failed to create a recognizer")
+
+    def __del__(self):
+        """Free the native recognizer, if one was created."""
+        if self._handle != _ffi.NULL:
+            _c.vosk_batch_recognizer_free(self._handle)
+
+    def AcceptWaveform(self, uid, data):
+        """Send ``data`` and its length to the C API for stream ``uid``."""
+        _c.vosk_batch_recognizer_accept_waveform(self._handle, uid, data, len(data))
+
+    def Result(self, uid):
+        """Return the front result for ``uid`` decoded as UTF-8, then pop it."""
+        ptr = _c.vosk_batch_recognizer_front_result(self._handle, uid)
+        res = _ffi.string(ptr).decode('utf-8')
+        _c.vosk_batch_recognizer_pop(self._handle, uid)
+        return res
+
+    def FinishStream(self, uid):
+        """Call vosk_batch_recognizer_finish_stream for ``uid``."""
+        _c.vosk_batch_recognizer_finish_stream(self._handle, uid)
+
+    def Wait(self):
+        """Call vosk_batch_recognizer_wait on this recognizer."""
+        _c.vosk_batch_recognizer_wait(self._handle)
+
+    def GetPendingChunks(self, uid):
+        """Return the value of vosk_batch_recognizer_get_pending_chunks for ``uid``."""
+        return _c.vosk_batch_recognizer_get_pending_chunks(self._handle, uid)
diff --git a/src/Makefile b/src/Makefile
@@ -18,14 +18,14 @@ EXTRA_LDFLAGS?=
 OUTDIR?=.
 
 VOSK_SOURCES= \
-	kaldi_recognizer.cc \
+	recognizer.cc \
 	language_model.cc \
 	model.cc \
 	spk_model.cc \
 	vosk_api.cc
 
 VOSK_HEADERS= \
-	kaldi_recognizer.h \
+	recognizer.h \
 	language_model.h \
 	model.h \
 	spk_model.h \
@@ -39,13 +39,13 @@ LIBS= \
 	$(KALDI_ROOT)/src/decoder/kaldi-decoder.a \
 	$(KALDI_ROOT)/src/ivector/kaldi-ivector.a \
 	$(KALDI_ROOT)/src/gmm/kaldi-gmm.a \
-	$(KALDI_ROOT)/src/nnet3/kaldi-nnet3.a \
 	$(KALDI_ROOT)/src/tree/kaldi-tree.a \
 	$(KALDI_ROOT)/src/feat/kaldi-feat.a \
 	$(KALDI_ROOT)/src/lat/kaldi-lat.a \
 	$(KALDI_ROOT)/src/lm/kaldi-lm.a \
 	$(KALDI_ROOT)/src/rnnlm/kaldi-rnnlm.a \
 	$(KALDI_ROOT)/src/hmm/kaldi-hmm.a \
+	$(KALDI_ROOT)/src/nnet3/kaldi-nnet3.a \
 	$(KALDI_ROOT)/src/transform/kaldi-transform.a \
 	$(KALDI_ROOT)/src/cudamatrix/kaldi-cudamatrix.a \
 	$(KALDI_ROOT)/src/matrix/kaldi-matrix.a \
@@ -55,6 +55,8 @@ LIBS= \
 	$(OPENFST_ROOT)/lib/libfst.a \
 	$(OPENFST_ROOT)/lib/libfstngram.a
 
+LDFLAGS =
+
 
 ifeq ($(HAVE_OPENBLAS_CLAPACK), 1)
     CFLAGS += -I$(OPENBLAS_ROOT)/include
@@ -66,23 +68,32 @@ ifeq ($(HAVE_OPENBLAS_CLAPACK), 1)
 endif
 
 ifeq ($(HAVE_MKL), 1)
-    CFLAGS += -I$(MKL_ROOT)/include
-    LIBS += -L$(MKL_ROOT)/lib/intel64 -Wl,-rpath=$(MKL_ROOT)/lib/intel64 -lmkl_rt -lmkl_intel_lp64 -lmkl_core -lmkl_sequential
+    CFLAGS += -DHAVE_MKL=1 -I$(MKL_ROOT)/include
+    LDFLAGS += -L$(MKL_ROOT)/lib/intel64 -Wl,-rpath=$(MKL_ROOT)/lib/intel64 -lmkl_rt -lmkl_intel_lp64 -lmkl_core -lmkl_sequential
 endif
 
 ifeq ($(HAVE_ACCELERATE), 1)
-    LIBS += -framework Accelerate
+    LDFLAGS += -framework Accelerate
 endif
 
 ifeq ($(HAVE_CUDA), 1)
+    VOSK_SOURCES += batch_recognizer.cc
+    VOSK_HEADERS += batch_recognizer.h
+
     CFLAGS+=-DHAVE_CUDA=1 -I$(CUDA_ROOT)/include
-    LIBS+=-L$(CUDA_ROOT)/lib64 -lcuda -lcublas -lcusparse -lcudart -lcurand -lcufft -lcusolver -lnvToolsExt
+
+    LIBS := \
+        $(KALDI_ROOT)/src/cudadecoder/kaldi-cudadecoder.a \
+        $(KALDI_ROOT)/src/cudafeat/kaldi-cudafeat.a \
+        $(LIBS)
+
+    LDFLAGS += -L$(CUDA_ROOT)/lib64 -lcuda -lcublas -lcusparse -lcudart -lcurand -lcufft -lcusolver -lnvToolsExt
 endif
 
 all: $(OUTDIR)/libvosk.$(EXT)
 
-$(OUTDIR)/libvosk.$(EXT): $(VOSK_SOURCES:%.cc=$(OUTDIR)/%.o)
-	$(CXX) --shared -s -o $@ $^ $(LIBS) -lm -latomic $(EXTRA_LDFLAGS)
+$(OUTDIR)/libvosk.$(EXT): $(VOSK_SOURCES:%.cc=$(OUTDIR)/%.o) $(LIBS) # $(LIBS) entries are prerequisites: a changed archive triggers a relink, and $^ passes them to the linker
+	$(CXX) --shared -s -o $@ $^ $(LDFLAGS) -lm -latomic $(EXTRA_LDFLAGS)
 
 $(OUTDIR)/%.o: %.cc $(VOSK_HEADERS)
 	$(CXX) $(CFLAGS) -c -o $@ $<