From 6977be7fb78a82dbae7c80eeab360f46c2035c5e Mon Sep 17 00:00:00 2001
From: Nickolay Shmyrev <nshmyrev@gmail.com>
Date: Sun, 12 Dec 2021 21:37:44 +0100
Subject: [PATCH 01/25] Batch recognizer draft

---
 python/example/batch/test_batch.py         |  26 +++++
 python/vosk/__init__.py                    |  20 ++++
 src/Makefile                               |  11 ++-
 src/batch_recognizer.cc                    | 107 +++++++++++++++++++++
 src/batch_recognizer.h                     |  67 +++++++++++++
 src/model.h                                |   6 +-
 src/{kaldi_recognizer.cc => recognizer.cc} |  52 +++++-----
 src/{kaldi_recognizer.h => recognizer.h}   |  14 +--
 src/spk_model.h                            |   4 +-
 src/vosk_api.cc                            |  57 ++++++++---
 src/vosk_api.h                             |  24 +++++
 11 files changed, 333 insertions(+), 55 deletions(-)
 create mode 100755 python/example/batch/test_batch.py
 create mode 100644 src/batch_recognizer.cc
 create mode 100644 src/batch_recognizer.h
 rename src/{kaldi_recognizer.cc => recognizer.cc} (93%)
 rename src/{kaldi_recognizer.h => recognizer.h} (91%)

diff --git a/python/example/batch/test_batch.py b/python/example/batch/test_batch.py
new file mode 100755
index 00000000..fb1bb7e9
--- /dev/null
+++ b/python/example/batch/test_batch.py
@@ -0,0 +1,26 @@
+#!/usr/bin/env python3
+
+from vosk import Model, BatchRecognizer
+import sys
+import os
+import wave
+
+model = Model("model")
+rec = BatchRecognizer(model, 16000.0)
+
+fnames = open("tedlium.list").readlines()
+fds = [open(x) for x in fnames]
+ended = set()
+while True:
+    for i, fd in fds:
+        if i in ended():
+            continue
+        data = fd.read(4000)
+        if len(data) == 0:
+            rec.FinishStream(i)
+            ended.add(i)
+        else:
+            rec.AcceptWaveform(i, data)
+    rec.Results()
+    if len(ended) == len(fds):
+        break
diff --git a/python/vosk/__init__.py b/python/vosk/__init__.py
index cf39a472..02e1df97 100644
--- a/python/vosk/__init__.py
+++ b/python/vosk/__init__.py
@@ -101,3 +101,23 @@ def GpuInit():
 
 def GpuThreadInit():
     _c.vosk_gpu_thread_init()
+
+class BatchRecognizer(object):
+
+    def __init__(self, *args):
+        self._handle = _c.vosk_batch_recognizer_new(args[0]._handle, args[1])
+
+        if self._handle == _ffi.NULL:
+            raise Exception("Failed to create a recognizer")
+
+    def __del__(self):
+        _c.vosk_batch_recognizer_free(self._handle)
+
+    def AcceptWaveform(self, uid, data):
+        res = _c.vosk_batch_recognizer_accept_waveform(self._handle, uid, data, len(data))
+
+    def Results(self):
+        return _ffi.string(_c.vosk_batch_recognizer_result(self._handle)).decode('utf-8')
+
+    def FinishStream(self, uid):
+        _c.vosk_recognizer_final_result(self._handle, uid)
diff --git a/src/Makefile b/src/Makefile
index 54e96ca7..96c21949 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -18,16 +18,18 @@ EXTRA_LDFLAGS?=
 OUTDIR?=.
 
 VOSK_SOURCES= \
-	kaldi_recognizer.cc \
+	recognizer.cc \
 	language_model.cc \
 	model.cc \
 	spk_model.cc \
+	batch_recognizer.cc \
 	vosk_api.cc
 
 VOSK_HEADERS= \
-	kaldi_recognizer.h \
+	recognizer.h \
 	language_model.h \
 	model.h \
+	batch_recognizer.h \
 	spk_model.h \
 	vosk_api.h
 
@@ -76,7 +78,10 @@ endif
 
 ifeq ($(HAVE_CUDA), 1)
     CFLAGS+=-DHAVE_CUDA=1 -I$(CUDA_ROOT)/include
-    LIBS+=-L$(CUDA_ROOT)/lib64 -lcuda -lcublas -lcusparse -lcudart -lcurand -lcufft -lcusolver -lnvToolsExt
+    LIBS+=\
+        $(KALDI_ROOT)/src/cudadecoder/kaldi-cudadecoder.a \
+        $(KALDI_ROOT)/src/cudafeat/kaldi-cudafeat.a \
+        -L$(CUDA_ROOT)/lib64 -lcuda -lcublas -lcusparse -lcudart -lcurand -lcufft -lcusolver -lnvToolsExt
 endif
 
 all: $(OUTDIR)/libvosk.$(EXT)
diff --git a/src/batch_recognizer.cc b/src/batch_recognizer.cc
new file mode 100644
index 00000000..112f1fe9
--- /dev/null
+++ b/src/batch_recognizer.cc
@@ -0,0 +1,107 @@
+// Copyright 2019-2020 Alpha Cephei Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//       http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "batch_recognizer.h"
+
+#include "fstext/fstext-utils.h"
+#include "lat/sausages.h"
+
+using namespace fst;
+using namespace kaldi::nnet3;
+using CorrelationID = CudaOnlinePipelineDynamicBatcher::CorrelationID;
+
+BatchRecognizer::BatchRecognizer(Model *model, float sample_frequency) : model_(model), sample_frequency_(sample_frequency) {
+    model_->Ref();
+
+    BatchedThreadedNnet3CudaOnlinePipelineConfig batched_decoder_config;
+
+    cuda_pipeline_ = new BatchedThreadedNnet3CudaOnlinePipeline 
+         (batched_decoder_config, *model_->hclg_fst_, *model_->nnet_, *model_->trans_model_);
+
+    CudaOnlinePipelineDynamicBatcherConfig dynamic_batcher_config;
+    dynamic_batcher_ = new CudaOnlinePipelineDynamicBatcher(dynamic_batcher_config,
+                                                            *cuda_pipeline_);
+
+    InitRescoring();
+}
+
+BatchRecognizer::~BatchRecognizer() {  // Frees the rescoring FSTs, the batcher, the pipeline, then drops the model ref.
+    delete lm_to_subtract_;
+    delete carpa_to_add_;
+    delete carpa_to_add_scale_;
+
+    delete dynamic_batcher_;  // constructed on *cuda_pipeline_, so it must be destroyed before the pipeline
+    delete cuda_pipeline_;
+
+    model_->Unref();
+}
+
+void BatchRecognizer::InitRescoring()
+{
+    if (model_->graph_lm_fst_) {
+        fst::CacheOptions cache_opts(true, -1);
+        fst::ArcMapFstOptions mapfst_opts(cache_opts);
+        fst::StdToLatticeMapper<BaseFloat> mapper;
+        lm_to_subtract_ = new fst::ArcMapFst<fst::StdArc, LatticeArc, fst::StdToLatticeMapper<BaseFloat> >(*model_->graph_lm_fst_, mapper, mapfst_opts);
+        carpa_to_add_ = new ConstArpaLmDeterministicFst(model_->const_arpa_);
+    }
+}
+
+void BatchRecognizer::FinishStream(uint64_t id)
+{
+    streams_.erase(id);
+}
+
+void BatchRecognizer::AcceptWaveform(uint64_t id, const char *data, int len)
+{   // Converts `len` bytes of 16-bit samples to floats and pushes them to the batcher for stream `id`.
+    bool first = false;
+
+    if (streams_.find(id) == streams_.end()) {
+        first = true;  // unseen id: register it and install its result callback
+        streams_.insert(id);
+
+        // Define the callback for results.
+        cuda_pipeline_->SetBestPathCallback(
+          id,
+          [&, id](const std::string &str, bool partial,
+                       bool endpoint_detected) {
+              if (partial) {
+                  KALDI_LOG << "id #" << id << " [partial] : " << str;
+              }
+
+              if (endpoint_detected) {
+                  KALDI_LOG << "id #" << id << " [endpoint detected]";
+              }
+
+              if (!partial) {
+                  KALDI_LOG << "id #" << id << " : " << str;
+              }
+            });
+    }
+
+    Vector<BaseFloat> wave;
+    wave.Resize(len / 2, kUndefined);  // two bytes per sample; an odd trailing byte is ignored
+    for (int i = 0; i < len / 2; i++)
+        wave(i) = *(((short *)data) + i);
+    SubVector<BaseFloat> chunk(wave.Data(), wave.Dim());  // span every sample; a length of 0 pushed no audio
+
+    dynamic_batcher_->Push(id, first, false, chunk);
+}
+
+const char* BatchRecognizer::PullResults()
+{
+    dynamic_batcher_->WaitForCompletion();
+    cudaDeviceSynchronize();
+    return "";
+}
diff --git a/src/batch_recognizer.h b/src/batch_recognizer.h
new file mode 100644
index 00000000..00f4a0db
--- /dev/null
+++ b/src/batch_recognizer.h
@@ -0,0 +1,67 @@
+// Copyright 2019 Alpha Cephei Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//       http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef VOSK_GPU_RECOGNIZER_H
+#define VOSK_GPU_RECOGNIZER_H
+
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "fstext/fstext-lib.h"
+#include "fstext/fstext-utils.h"
+#include "decoder/lattice-faster-decoder.h"
+#include "feat/feature-mfcc.h"
+#include "lat/kaldi-lattice.h"
+#include "lat/word-align-lattice.h"
+#include "lat/compose-lattice-pruned.h"
+#include "nnet3/am-nnet-simple.h"
+#include "nnet3/nnet-am-decodable-simple.h"
+#include "nnet3/nnet-utils.h"
+
+#include "cudadecoder/cuda-online-pipeline-dynamic-batcher.h"
+#include "cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.h"
+#include "cudadecoder/batched-threaded-nnet3-cuda-pipeline2.h"
+#include "cudadecoder/cuda-pipeline-common.h"
+
+#include "model.h"
+
+using namespace kaldi;
+using namespace kaldi::cuda_decoder;
+
+class BatchRecognizer {  // Feeds several id-tagged audio streams through one batched CUDA decoding pipeline.
+    public:
+        BatchRecognizer(Model *model, float sample_frequency);
+        ~BatchRecognizer();
+
+        void FinishStream(uint64_t id);
+        void AcceptWaveform(uint64_t id, const char *data, int len);
+        const char* PullResults();
+
+    private:
+        void InitRescoring();
+
+        Model *model_ = nullptr;
+        BatchedThreadedNnet3CudaOnlinePipeline *cuda_pipeline_ = nullptr;
+        CudaOnlinePipelineDynamicBatcher *dynamic_batcher_ = nullptr;
+
+        std::set<uint64_t> streams_;  // ids already seen; same type the methods take, so ids are not narrowed
+
+        // Rescoring
+        fst::ArcMapFst<fst::StdArc, LatticeArc, fst::StdToLatticeMapper<BaseFloat> > *lm_to_subtract_ = nullptr;
+        kaldi::ConstArpaLmDeterministicFst *carpa_to_add_ = nullptr;
+        fst::ScaleDeterministicOnDemandFst *carpa_to_add_scale_ = nullptr;
+
+        float sample_frequency_;
+};
+
+#endif /* VOSK_GPU_RECOGNIZER_H */
diff --git a/src/model.h b/src/model.h
index d5feedd0..c36a96aa 100644
--- a/src/model.h
+++ b/src/model.h
@@ -36,7 +36,8 @@
 using namespace kaldi;
 using namespace std;
 
-class KaldiRecognizer;
+class Recognizer;
+class BatchRecognizer;
 
 class Model {
 
@@ -52,7 +53,8 @@ class Model {
     void ConfigureV2();
     void ReadDataFiles();
 
-    friend class KaldiRecognizer;
+    friend class Recognizer;
+    friend class BatchRecognizer;
 
     string model_path_str_;
     string nnet3_rxfilename_;
diff --git a/src/kaldi_recognizer.cc b/src/recognizer.cc
similarity index 93%
rename from src/kaldi_recognizer.cc
rename to src/recognizer.cc
index 86cf9bdd..f25ff0ee 100644
--- a/src/kaldi_recognizer.cc
+++ b/src/recognizer.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "kaldi_recognizer.h"
+#include "recognizer.h"
 #include "json.h"
 #include "fstext/fstext-utils.h"
 #include "lat/sausages.h"
@@ -21,7 +21,7 @@
 using namespace fst;
 using namespace kaldi::nnet3;
 
-KaldiRecognizer::KaldiRecognizer(Model *model, float sample_frequency) : model_(model), spk_model_(0), sample_frequency_(sample_frequency) {
+Recognizer::Recognizer(Model *model, float sample_frequency) : model_(model), spk_model_(0), sample_frequency_(sample_frequency) {
 
     model_->Ref();
 
@@ -46,7 +46,7 @@ KaldiRecognizer::KaldiRecognizer(Model *model, float sample_frequency) : model_(
     InitRescoring();
 }
 
-KaldiRecognizer::KaldiRecognizer(Model *model, float sample_frequency, char const *grammar) : model_(model), spk_model_(0), sample_frequency_(sample_frequency)
+Recognizer::Recognizer(Model *model, float sample_frequency, char const *grammar) : model_(model), spk_model_(0), sample_frequency_(sample_frequency)
 {
     model_->Ref();
 
@@ -107,7 +107,7 @@ KaldiRecognizer::KaldiRecognizer(Model *model, float sample_frequency, char cons
     InitRescoring();
 }
 
-KaldiRecognizer::KaldiRecognizer(Model *model, float sample_frequency, SpkModel *spk_model) : model_(model), spk_model_(spk_model), sample_frequency_(sample_frequency) {
+Recognizer::Recognizer(Model *model, float sample_frequency, SpkModel *spk_model) : model_(model), spk_model_(spk_model), sample_frequency_(sample_frequency) {
 
     model_->Ref();
     spk_model->Ref();
@@ -135,7 +135,7 @@ KaldiRecognizer::KaldiRecognizer(Model *model, float sample_frequency, SpkModel
     InitRescoring();
 }
 
-KaldiRecognizer::~KaldiRecognizer() {
+Recognizer::~Recognizer() {
     delete decoder_;
     delete feature_pipeline_;
     delete silence_weighting_;
@@ -155,7 +155,7 @@ KaldiRecognizer::~KaldiRecognizer() {
          spk_model_->Unref();
 }
 
-void KaldiRecognizer::InitState()
+void Recognizer::InitState()
 {
     frame_offset_ = 0;
     samples_processed_ = 0;
@@ -164,7 +164,7 @@ void KaldiRecognizer::InitState()
     state_ = RECOGNIZER_INITIALIZED;
 }
 
-void KaldiRecognizer::InitRescoring()
+void Recognizer::InitRescoring()
 {
     if (model_->graph_lm_fst_) {
 
@@ -185,7 +185,7 @@ void KaldiRecognizer::InitRescoring()
     }
 }
 
-void KaldiRecognizer::CleanUp()
+void Recognizer::CleanUp()
 {
     delete silence_weighting_;
     silence_weighting_ = new kaldi::OnlineSilenceWeighting(*model_->trans_model_, model_->feature_info_.silence_weighting_config, 3);
@@ -223,7 +223,7 @@ void KaldiRecognizer::CleanUp()
     }
 }
 
-void KaldiRecognizer::UpdateSilenceWeights()
+void Recognizer::UpdateSilenceWeights()
 {
     if (silence_weighting_->Active() && feature_pipeline_->NumFramesReady() > 0 &&
         feature_pipeline_->IvectorFeature() != nullptr) {
@@ -236,17 +236,17 @@ void KaldiRecognizer::UpdateSilenceWeights()
     }
 }
 
-void KaldiRecognizer::SetMaxAlternatives(int max_alternatives)
+void Recognizer::SetMaxAlternatives(int max_alternatives)
 {
     max_alternatives_ = max_alternatives;
 }
 
-void KaldiRecognizer::SetWords(bool words)
+void Recognizer::SetWords(bool words)
 {
     words_ = words;
 }
 
-void KaldiRecognizer::SetSpkModel(SpkModel *spk_model)
+void Recognizer::SetSpkModel(SpkModel *spk_model)
 {
     if (state_ == RECOGNIZER_RUNNING) {
         KALDI_ERR << "Can't add speaker model to already running recognizer";
@@ -257,7 +257,7 @@ void KaldiRecognizer::SetSpkModel(SpkModel *spk_model)
     spk_feature_ = new OnlineMfcc(spk_model_->spkvector_mfcc_opts);
 }
 
-bool KaldiRecognizer::AcceptWaveform(const char *data, int len)
+bool Recognizer::AcceptWaveform(const char *data, int len)
 {
     Vector<BaseFloat> wave;
     wave.Resize(len / 2, kUndefined);
@@ -266,7 +266,7 @@ bool KaldiRecognizer::AcceptWaveform(const char *data, int len)
     return AcceptWaveform(wave);
 }
 
-bool KaldiRecognizer::AcceptWaveform(const short *sdata, int len)
+bool Recognizer::AcceptWaveform(const short *sdata, int len)
 {
     Vector<BaseFloat> wave;
     wave.Resize(len, kUndefined);
@@ -275,7 +275,7 @@ bool KaldiRecognizer::AcceptWaveform(const short *sdata, int len)
     return AcceptWaveform(wave);
 }
 
-bool KaldiRecognizer::AcceptWaveform(const float *fdata, int len)
+bool Recognizer::AcceptWaveform(const float *fdata, int len)
 {
     Vector<BaseFloat> wave;
     wave.Resize(len, kUndefined);
@@ -284,7 +284,7 @@ bool KaldiRecognizer::AcceptWaveform(const float *fdata, int len)
     return AcceptWaveform(wave);
 }
 
-bool KaldiRecognizer::AcceptWaveform(Vector<BaseFloat> &wdata)
+bool Recognizer::AcceptWaveform(Vector<BaseFloat> &wdata)
 {
     // Cleanup if we finalized previous utterance or the whole feature pipeline
     if (!(state_ == RECOGNIZER_RUNNING || state_ == RECOGNIZER_INITIALIZED)) {
@@ -343,7 +343,7 @@ static void RunNnetComputation(const MatrixBase<BaseFloat> &features,
 
 #define MIN_SPK_FEATS 50
 
-bool KaldiRecognizer::GetSpkVector(Vector<BaseFloat> &out_xvector, int *num_spk_frames)
+bool Recognizer::GetSpkVector(Vector<BaseFloat> &out_xvector, int *num_spk_frames)
 {
     vector<int32> nonsilence_frames;
     if (silence_weighting_->Active() && feature_pipeline_->NumFramesReady() > 0) {
@@ -409,7 +409,7 @@ bool KaldiRecognizer::GetSpkVector(Vector<BaseFloat> &out_xvector, int *num_spk_
 }
 
 
-const char *KaldiRecognizer::MbrResult(CompactLattice &rlat)
+const char *Recognizer::MbrResult(CompactLattice &rlat)
 {
     CompactLattice aligned_lat;
     if (model_->winfo_) {
@@ -523,7 +523,7 @@ static bool CompactLatticeToWordAlignmentWeight(const CompactLattice &clat,
 }
 
 
-const char *KaldiRecognizer::NbestResult(CompactLattice &clat)
+const char *Recognizer::NbestResult(CompactLattice &clat)
 {
     Lattice lat;
     Lattice nbest_lat;
@@ -584,7 +584,7 @@ const char *KaldiRecognizer::NbestResult(CompactLattice &clat)
     return StoreReturn(obj.dump());
 }
 
-const char* KaldiRecognizer::GetResult()
+const char* Recognizer::GetResult()
 {
     if (decoder_->NumFramesDecoded() == 0) {
         return StoreEmptyReturn();
@@ -645,7 +645,7 @@ const char* KaldiRecognizer::GetResult()
 }
 
 
-const char* KaldiRecognizer::PartialResult()
+const char* Recognizer::PartialResult()
 {
     if (state_ != RECOGNIZER_RUNNING) {
         return StoreEmptyReturn();
@@ -676,7 +676,7 @@ const char* KaldiRecognizer::PartialResult()
     return StoreReturn(res.dump());
 }
 
-const char* KaldiRecognizer::Result()
+const char* Recognizer::Result()
 {
     if (state_ != RECOGNIZER_RUNNING) {
         return StoreEmptyReturn();
@@ -686,7 +686,7 @@ const char* KaldiRecognizer::Result()
     return GetResult();
 }
 
-const char* KaldiRecognizer::FinalResult()
+const char* Recognizer::FinalResult()
 {
     if (state_ != RECOGNIZER_RUNNING) {
         return StoreEmptyReturn();
@@ -714,7 +714,7 @@ const char* KaldiRecognizer::FinalResult()
     return last_result_.c_str();
 }
 
-void KaldiRecognizer::Reset()
+void Recognizer::Reset()
 {
     if (state_ == RECOGNIZER_RUNNING) {
         decoder_->FinalizeDecoding();
@@ -723,7 +723,7 @@ void KaldiRecognizer::Reset()
     state_ = RECOGNIZER_ENDPOINT;
 }
 
-const char *KaldiRecognizer::StoreEmptyReturn()
+const char *Recognizer::StoreEmptyReturn()
 {
     if (!max_alternatives_) {
         return StoreReturn("{\"text\": \"\"}");
@@ -733,7 +733,7 @@ const char *KaldiRecognizer::StoreEmptyReturn()
 }
 
 // Store result in recognizer and return as const string
-const char *KaldiRecognizer::StoreReturn(const string &res)
+const char *Recognizer::StoreReturn(const string &res)
 {
     last_result_ = res;
     return last_result_.c_str();
diff --git a/src/kaldi_recognizer.h b/src/recognizer.h
similarity index 91%
rename from src/kaldi_recognizer.h
rename to src/recognizer.h
index 934e237e..e5a733d1 100644
--- a/src/kaldi_recognizer.h
+++ b/src/recognizer.h
@@ -33,19 +33,19 @@
 
 using namespace kaldi;
 
-enum KaldiRecognizerState {
+enum RecognizerState {
     RECOGNIZER_INITIALIZED,
     RECOGNIZER_RUNNING,
     RECOGNIZER_ENDPOINT,
     RECOGNIZER_FINALIZED
 };
 
-class KaldiRecognizer {
+class Recognizer {
     public:
-        KaldiRecognizer(Model *model, float sample_frequency);
-        KaldiRecognizer(Model *model, float sample_frequency, SpkModel *spk_model);
-        KaldiRecognizer(Model *model, float sample_frequency, char const *grammar);
-        ~KaldiRecognizer();
+        Recognizer(Model *model, float sample_frequency);
+        Recognizer(Model *model, float sample_frequency, SpkModel *spk_model);
+        Recognizer(Model *model, float sample_frequency, char const *grammar);
+        ~Recognizer();
         void SetMaxAlternatives(int max_alternatives);
         void SetSpkModel(SpkModel *spk_model);
         void SetWords(bool words);
@@ -101,7 +101,7 @@ class KaldiRecognizer {
         int64 samples_processed_;
         int64 samples_round_start_;
 
-        KaldiRecognizerState state_;
+        RecognizerState state_;
         string last_result_;
 };
 
diff --git a/src/spk_model.h b/src/spk_model.h
index 07cbd4b0..9a76c62a 100644
--- a/src/spk_model.h
+++ b/src/spk_model.h
@@ -22,7 +22,7 @@
 
 using namespace kaldi;
 
-class KaldiRecognizer;
+class Recognizer;
 
 class SpkModel {
 
@@ -32,7 +32,7 @@ class SpkModel {
     void Unref();
 
 protected:
-    friend class KaldiRecognizer;
+    friend class Recognizer;
     ~SpkModel() {};
 
     kaldi::nnet3::Nnet speaker_nnet;
diff --git a/src/vosk_api.cc b/src/vosk_api.cc
index ba76a73b..2c5b3b82 100644
--- a/src/vosk_api.cc
+++ b/src/vosk_api.cc
@@ -13,7 +13,9 @@
 // limitations under the License.
 
 #include "vosk_api.h"
-#include "kaldi_recognizer.h"
+
+#include "recognizer.h"
+#include "batch_recognizer.h"
 #include "model.h"
 #include "spk_model.h"
 
@@ -67,7 +69,7 @@ void vosk_spk_model_free(VoskSpkModel *model)
 VoskRecognizer *vosk_recognizer_new(VoskModel *model, float sample_rate)
 {
     try {
-        return (VoskRecognizer *)new KaldiRecognizer((Model *)model, sample_rate);
+        return (VoskRecognizer *)new Recognizer((Model *)model, sample_rate);
     } catch (...) {
         return nullptr;
     }
@@ -76,7 +78,7 @@ VoskRecognizer *vosk_recognizer_new(VoskModel *model, float sample_rate)
 VoskRecognizer *vosk_recognizer_new_spk(VoskModel *model, float sample_rate, VoskSpkModel *spk_model)
 {
     try {
-        return (VoskRecognizer *)new KaldiRecognizer((Model *)model, sample_rate, (SpkModel *)spk_model);
+        return (VoskRecognizer *)new Recognizer((Model *)model, sample_rate, (SpkModel *)spk_model);
     } catch (...) {
         return nullptr;
     }
@@ -85,7 +87,7 @@ VoskRecognizer *vosk_recognizer_new_spk(VoskModel *model, float sample_rate, Vos
 VoskRecognizer *vosk_recognizer_new_grm(VoskModel *model, float sample_rate, const char *grammar)
 {
     try {
-        return (VoskRecognizer *)new KaldiRecognizer((Model *)model, sample_rate, grammar);
+        return (VoskRecognizer *)new Recognizer((Model *)model, sample_rate, grammar);
     } catch (...) {
         return nullptr;
     }
@@ -93,12 +95,12 @@ VoskRecognizer *vosk_recognizer_new_grm(VoskModel *model, float sample_rate, con
 
 void vosk_recognizer_set_max_alternatives(VoskRecognizer *recognizer, int max_alternatives)
 {
-    ((KaldiRecognizer *)recognizer)->SetMaxAlternatives(max_alternatives);
+    ((Recognizer *)recognizer)->SetMaxAlternatives(max_alternatives);
 }
 
 void vosk_recognizer_set_words(VoskRecognizer *recognizer, int words)
 {
-    ((KaldiRecognizer *)recognizer)->SetWords((bool)words);
+    ((Recognizer *)recognizer)->SetWords((bool)words);
 }
 
 void vosk_recognizer_set_spk_model(VoskRecognizer *recognizer, VoskSpkModel *spk_model)
@@ -106,13 +108,13 @@ void vosk_recognizer_set_spk_model(VoskRecognizer *recognizer, VoskSpkModel *spk
     if (recognizer == nullptr || spk_model == nullptr) {
        return;
     }
-    ((KaldiRecognizer *)recognizer)->SetSpkModel((SpkModel *)spk_model);
+    ((Recognizer *)recognizer)->SetSpkModel((SpkModel *)spk_model);
 }
 
 int vosk_recognizer_accept_waveform(VoskRecognizer *recognizer, const char *data, int length)
 {
     try {
-        return ((KaldiRecognizer *)(recognizer))->AcceptWaveform(data, length);
+        return ((Recognizer *)(recognizer))->AcceptWaveform(data, length);
     } catch (...) {
         return -1;
     }
@@ -121,7 +123,7 @@ int vosk_recognizer_accept_waveform(VoskRecognizer *recognizer, const char *data
 int vosk_recognizer_accept_waveform_s(VoskRecognizer *recognizer, const short *data, int length)
 {
     try {
-        return ((KaldiRecognizer *)(recognizer))->AcceptWaveform(data, length);
+        return ((Recognizer *)(recognizer))->AcceptWaveform(data, length);
     } catch (...) {
         return -1;
     }
@@ -130,7 +132,7 @@ int vosk_recognizer_accept_waveform_s(VoskRecognizer *recognizer, const short *d
 int vosk_recognizer_accept_waveform_f(VoskRecognizer *recognizer, const float *data, int length)
 {
     try {
-        return ((KaldiRecognizer *)(recognizer))->AcceptWaveform(data, length);
+        return ((Recognizer *)(recognizer))->AcceptWaveform(data, length);
     } catch (...) {
         return -1;
     }
@@ -138,27 +140,27 @@ int vosk_recognizer_accept_waveform_f(VoskRecognizer *recognizer, const float *d
 
 const char *vosk_recognizer_result(VoskRecognizer *recognizer)
 {
-    return ((KaldiRecognizer *)recognizer)->Result();
+    return ((Recognizer *)recognizer)->Result();
 }
 
 const char *vosk_recognizer_partial_result(VoskRecognizer *recognizer)
 {
-    return ((KaldiRecognizer *)recognizer)->PartialResult();
+    return ((Recognizer *)recognizer)->PartialResult();
 }
 
 const char *vosk_recognizer_final_result(VoskRecognizer *recognizer)
 {
-    return ((KaldiRecognizer *)recognizer)->FinalResult();
+    return ((Recognizer *)recognizer)->FinalResult();
 }
 
 void vosk_recognizer_reset(VoskRecognizer *recognizer)
 {
-    ((KaldiRecognizer *)recognizer)->Reset();
+    ((Recognizer *)recognizer)->Reset();
 }
 
 void vosk_recognizer_free(VoskRecognizer *recognizer)
 {
-    delete (KaldiRecognizer *)(recognizer);
+    delete (Recognizer *)(recognizer);
 }
 
 void vosk_set_log_level(int log_level)
@@ -180,3 +182,28 @@ void vosk_gpu_thread_init()
     kaldi::CuDevice::Instantiate();
 #endif
 }
+
+VoskBatchRecognizer *vosk_batch_recognizer_new(VoskModel *model, float sample_frequency)
+try {  // function-try-block: no exception may cross the C API; failure is reported as NULL like vosk_recognizer_new
+    return (VoskBatchRecognizer *)(new BatchRecognizer((Model *)model, sample_frequency));
+} catch (...) { return nullptr; }
+
+void vosk_batch_recognizer_free(VoskBatchRecognizer *recognizer)
+{
+    delete ((BatchRecognizer *)recognizer);
+}
+
+void vosk_batch_recognizer_accept_waveform(VoskRecognizer *recognizer, int id, const char *data, int length)
+{
+    ((BatchRecognizer *)recognizer)->AcceptWaveform(id, data, length);
+}
+
+void vosk_batch_recognizer_finish_stream(VoskRecognizer *recognizer, int id)
+{
+    ((BatchRecognizer *)recognizer)->FinishStream(id);
+}
+
+const char *vosk_batch_recognizer_results(VoskBatchRecognizer *recognizer)
+{
+    return ((BatchRecognizer *)recognizer)->PullResults();
+}
diff --git a/src/vosk_api.h b/src/vosk_api.h
index 7636caa6..df951858 100644
--- a/src/vosk_api.h
+++ b/src/vosk_api.h
@@ -39,6 +39,10 @@ typedef struct VoskSpkModel VoskSpkModel;
  *  speaker information and so on */
 typedef struct VoskRecognizer VoskRecognizer;
 
+/**
+ * Batch recognizer object
+ */
+typedef struct VoskBatchRecognizer VoskBatchRecognizer;
 
 /** Loads model data from the file and returns the model object
  *
@@ -285,6 +289,26 @@ void vosk_gpu_init();
  */
 void vosk_gpu_thread_init();
 
+/** Creates the batch recognizer object
+ *  The recognizers process the speech and return text using shared model data
+ *  @param model       VoskModel containing static data for recognizer. Model can be
+ *                     shared across recognizers, even running in different threads.
+ *  @returns recognizer object or NULL if problem occurred */
+VoskBatchRecognizer *vosk_batch_recognizer_new(VoskModel *model, float sample_frequency);
+
+/** Releases batch recognizer object
+ *  Underlying model is also unreferenced and if needed released */
+void vosk_batch_recognizer_free(VoskBatchRecognizer *recognizer);
+
+/** Accept batch voice data */
+void vosk_batch_recognizer_accept_waveform(VoskRecognizer *recognizer, int id, const char *data, int length);
+
+/** Closes the stream */
+void vosk_batch_recognizer_finish_stream(VoskRecognizer *recognizer, int id);
+
+/** Return results */
+const char *vosk_batch_recognizer_results(VoskBatchRecognizer *recognizer);
+
 #ifdef __cplusplus
 }
 #endif

From 344e137a61f81887afc974027a93500c0c986436 Mon Sep 17 00:00:00 2001
From: Nickolay Shmyrev <nshmyrev@gmail.com>
Date: Mon, 13 Dec 2021 01:21:59 +0100
Subject: [PATCH 02/25] Decoding works, results are empty yet

---
 python/example/batch/test_batch.py | 13 ++++++++-----
 python/vosk/__init__.py            |  4 ++--
 src/Makefile                       |  8 ++++----
 src/batch_recognizer.cc            | 18 +++++++++++++++++-
 src/model.cc                       |  6 +++---
 src/vosk_api.cc                    |  4 ++--
 src/vosk_api.h                     |  4 ++--
 7 files changed, 38 insertions(+), 19 deletions(-)

diff --git a/python/example/batch/test_batch.py b/python/example/batch/test_batch.py
index fb1bb7e9..3fadab6a 100755
--- a/python/example/batch/test_batch.py
+++ b/python/example/batch/test_batch.py
@@ -1,21 +1,24 @@
 #!/usr/bin/env python3
 
-from vosk import Model, BatchRecognizer
+from vosk import Model, BatchRecognizer, GpuInit, GpuThreadInit
 import sys
 import os
 import wave
 
+GpuInit()
+GpuThreadInit()
+
 model = Model("model")
 rec = BatchRecognizer(model, 16000.0)
 
 fnames = open("tedlium.list").readlines()
-fds = [open(x) for x in fnames]
+fds = [open(x.strip(), "rb") for x in fnames]
 ended = set()
 while True:
-    for i, fd in fds:
-        if i in ended():
+    for i, fd in enumerate(fds):
+        if i in ended:
             continue
-        data = fd.read(4000)
+        data = fd.read(16000)
         if len(data) == 0:
             rec.FinishStream(i)
             ended.add(i)
diff --git a/python/vosk/__init__.py b/python/vosk/__init__.py
index 02e1df97..9e25229c 100644
--- a/python/vosk/__init__.py
+++ b/python/vosk/__init__.py
@@ -117,7 +117,7 @@ def AcceptWaveform(self, uid, data):
         res = _c.vosk_batch_recognizer_accept_waveform(self._handle, uid, data, len(data))
 
     def Results(self):
-        return _ffi.string(_c.vosk_batch_recognizer_result(self._handle)).decode('utf-8')
+        return _ffi.string(_c.vosk_batch_recognizer_results(self._handle)).decode('utf-8')
 
     def FinishStream(self, uid):
-        _c.vosk_recognizer_final_result(self._handle, uid)
+        _c.vosk_batch_recognizer_finish_stream(self._handle, uid)
diff --git a/src/Makefile b/src/Makefile
index 96c21949..823a4aaf 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -37,17 +37,19 @@ CFLAGS=-g -O3 -std=c++17 -Wno-deprecated-declarations -fPIC -DFST_NO_DYNAMIC_LIN
 	-I. -I$(KALDI_ROOT)/src -I$(OPENFST_ROOT)/include $(EXTRA_CFLAGS) 
 
 LIBS= \
+        $(KALDI_ROOT)/src/cudadecoder/kaldi-cudadecoder.a \
+        $(KALDI_ROOT)/src/cudafeat/kaldi-cudafeat.a \
 	$(KALDI_ROOT)/src/online2/kaldi-online2.a \
 	$(KALDI_ROOT)/src/decoder/kaldi-decoder.a \
 	$(KALDI_ROOT)/src/ivector/kaldi-ivector.a \
 	$(KALDI_ROOT)/src/gmm/kaldi-gmm.a \
-	$(KALDI_ROOT)/src/nnet3/kaldi-nnet3.a \
 	$(KALDI_ROOT)/src/tree/kaldi-tree.a \
 	$(KALDI_ROOT)/src/feat/kaldi-feat.a \
 	$(KALDI_ROOT)/src/lat/kaldi-lat.a \
 	$(KALDI_ROOT)/src/lm/kaldi-lm.a \
 	$(KALDI_ROOT)/src/rnnlm/kaldi-rnnlm.a \
 	$(KALDI_ROOT)/src/hmm/kaldi-hmm.a \
+	$(KALDI_ROOT)/src/nnet3/kaldi-nnet3.a \
 	$(KALDI_ROOT)/src/transform/kaldi-transform.a \
 	$(KALDI_ROOT)/src/cudamatrix/kaldi-cudamatrix.a \
 	$(KALDI_ROOT)/src/matrix/kaldi-matrix.a \
@@ -68,7 +70,7 @@ ifeq ($(HAVE_OPENBLAS_CLAPACK), 1)
 endif
 
 ifeq ($(HAVE_MKL), 1)
-    CFLAGS += -I$(MKL_ROOT)/include
+    CFLAGS += -DHAVE_MKL=1 -I$(MKL_ROOT)/include
     LIBS += -L$(MKL_ROOT)/lib/intel64 -Wl,-rpath=$(MKL_ROOT)/lib/intel64 -lmkl_rt -lmkl_intel_lp64 -lmkl_core -lmkl_sequential
 endif
 
@@ -79,8 +81,6 @@ endif
 ifeq ($(HAVE_CUDA), 1)
     CFLAGS+=-DHAVE_CUDA=1 -I$(CUDA_ROOT)/include
     LIBS+=\
-        $(KALDI_ROOT)/src/cudadecoder/kaldi-cudadecoder.a \
-        $(KALDI_ROOT)/src/cudafeat/kaldi-cudafeat.a \
         -L$(CUDA_ROOT)/lib64 -lcuda -lcublas -lcusparse -lcudart -lcurand -lcufft -lcusolver -lnvToolsExt
 endif
 
diff --git a/src/batch_recognizer.cc b/src/batch_recognizer.cc
index 112f1fe9..969a62aa 100644
--- a/src/batch_recognizer.cc
+++ b/src/batch_recognizer.cc
@@ -25,9 +25,22 @@ BatchRecognizer::BatchRecognizer(Model *model, float sample_frequency) : model_(
     model_->Ref();
 
     BatchedThreadedNnet3CudaOnlinePipelineConfig batched_decoder_config;
+    batched_decoder_config.num_worker_threads = 4;
+    batched_decoder_config.max_batch_size = 100;
+
+    batched_decoder_config.feature_opts.feature_type = "mfcc";
+    batched_decoder_config.feature_opts.mfcc_config = "model/conf/mfcc.conf";
+    batched_decoder_config.feature_opts.ivector_extraction_config = "model/conf/ivector.conf";
+    batched_decoder_config.decoder_opts.max_active = 7000;
+    batched_decoder_config.decoder_opts.default_beam = 13.0;
+    batched_decoder_config.decoder_opts.lattice_beam = 8.0;
+    batched_decoder_config.compute_opts.acoustic_scale = 1.0;
+    batched_decoder_config.compute_opts.frame_subsampling_factor = 3;
+    batched_decoder_config.compute_opts.frames_per_chunk = 312;
 
     cuda_pipeline_ = new BatchedThreadedNnet3CudaOnlinePipeline 
          (batched_decoder_config, *model_->hclg_fst_, *model_->nnet_, *model_->trans_model_);
+    cuda_pipeline_->SetSymbolTable(*model_->word_syms_);
 
     CudaOnlinePipelineDynamicBatcherConfig dynamic_batcher_config;
     dynamic_batcher_ = new CudaOnlinePipelineDynamicBatcher(dynamic_batcher_config,
@@ -60,6 +73,9 @@ void BatchRecognizer::InitRescoring()
 
 void BatchRecognizer::FinishStream(uint64_t id)
 {
+    Vector<BaseFloat> wave;
+    SubVector<BaseFloat> chunk(wave.Data(), 0);
+    dynamic_batcher_->Push(id, false, true, chunk);
     streams_.erase(id);
 }
 
@@ -77,7 +93,7 @@ void BatchRecognizer::AcceptWaveform(uint64_t id, const char *data, int len)
           [&, id](const std::string &str, bool partial,
                        bool endpoint_detected) {
               if (partial) {
-                  KALDI_LOG << "id #" << id << " [partial] : " << str;
+                  KALDI_LOG << "id #" << id << " [partial] : " << str << ":";
               }
 
               if (endpoint_detected) {
diff --git a/src/model.cc b/src/model.cc
index 8b5e12cc..eecaed97 100644
--- a/src/model.cc
+++ b/src/model.cc
@@ -241,9 +241,9 @@ void Model::ReadDataFiles()
         SetDropoutTestMode(true, &(nnet_->GetNnet()));
         nnet3::CollapseModel(nnet3::CollapseModelConfig(), &(nnet_->GetNnet()));
     }
-    decodable_info_ = new nnet3::DecodableNnetSimpleLoopedInfo(decodable_opts_,
-                                                               nnet_);
 
+/*    decodable_info_ = new nnet3::DecodableNnetSimpleLoopedInfo(decodable_opts_,
+                                                               nnet_);
     if (stat(final_ie_rxfilename_.c_str(), &buffer) == 0) {
         KALDI_LOG << "Loading i-vector extractor from " << final_ie_rxfilename_;
 
@@ -261,7 +261,7 @@ void Model::ReadDataFiles()
     } else {
         feature_info_.use_ivectors = false;
     }
-
+*/
     if (stat(global_cmvn_stats_rxfilename_.c_str(), &buffer) == 0) {
         KALDI_LOG << "Reading CMVN stats from " << global_cmvn_stats_rxfilename_;
         feature_info_.use_cmvn = true;
diff --git a/src/vosk_api.cc b/src/vosk_api.cc
index 2c5b3b82..f95adc07 100644
--- a/src/vosk_api.cc
+++ b/src/vosk_api.cc
@@ -193,12 +193,12 @@ void vosk_batch_recognizer_free(VoskBatchRecognizer *recognizer)
     delete ((BatchRecognizer *)recognizer);
 }
 
-void vosk_batch_recognizer_accept_waveform(VoskRecognizer *recognizer, int id, const char *data, int length)
+void vosk_batch_recognizer_accept_waveform(VoskBatchRecognizer *recognizer, int id, const char *data, int length)
 {
     ((BatchRecognizer *)recognizer)->AcceptWaveform(id, data, length);
 }
 
-void vosk_batch_recognizer_finish_stream(VoskRecognizer *recognizer, int id)
+void vosk_batch_recognizer_finish_stream(VoskBatchRecognizer *recognizer, int id)
 {
     ((BatchRecognizer *)recognizer)->FinishStream(id);
 }
diff --git a/src/vosk_api.h b/src/vosk_api.h
index df951858..e085afe7 100644
--- a/src/vosk_api.h
+++ b/src/vosk_api.h
@@ -301,10 +301,10 @@ VoskBatchRecognizer *vosk_batch_recognizer_new(VoskModel *model, float sample_fr
 void vosk_batch_recognizer_free(VoskBatchRecognizer *recognizer);
 
 /** Accept batch voice data */
-void vosk_batch_recognizer_accept_waveform(VoskRecognizer *recognizer, int id, const char *data, int length);
+void vosk_batch_recognizer_accept_waveform(VoskBatchRecognizer *recognizer, int id, const char *data, int length);
 
 /** Closes the stream */
-void vosk_batch_recognizer_finish_stream(VoskRecognizer *recognizer, int id);
+void vosk_batch_recognizer_finish_stream(VoskBatchRecognizer *recognizer, int id);
 
 /** Return results */
 const char *vosk_batch_recognizer_results(VoskBatchRecognizer *recognizer);

From 60f0396fe0647d57b73ff59e51f09bba69c54ad5 Mon Sep 17 00:00:00 2001
From: Nickolay Shmyrev <nshmyrev@gmail.com>
Date: Fri, 17 Dec 2021 01:13:09 +0100
Subject: [PATCH 03/25] Reset lattice on endpoint

---
 python/example/batch/test_batch.py |  5 +-
 python/vosk/__init__.py            |  2 +-
 src/batch_recognizer.cc            | 95 +++++++++++++++++++++++-------
 src/batch_recognizer.h             | 15 ++++-
 src/model.cc                       |  4 +-
 src/vosk_api.cc                    |  4 +-
 src/vosk_api.h                     |  6 +-
 7 files changed, 94 insertions(+), 37 deletions(-)

diff --git a/python/example/batch/test_batch.py b/python/example/batch/test_batch.py
index 3fadab6a..f93eb6ea 100755
--- a/python/example/batch/test_batch.py
+++ b/python/example/batch/test_batch.py
@@ -8,8 +8,7 @@
 GpuInit()
 GpuThreadInit()
 
-model = Model("model")
-rec = BatchRecognizer(model, 16000.0)
+rec = BatchRecognizer()
 
 fnames = open("tedlium.list").readlines()
 fds = [open(x.strip(), "rb") for x in fnames]
@@ -18,7 +17,7 @@
     for i, fd in enumerate(fds):
         if i in ended:
             continue
-        data = fd.read(16000)
+        data = fd.read(8000)
         if len(data) == 0:
             rec.FinishStream(i)
             ended.add(i)
diff --git a/python/vosk/__init__.py b/python/vosk/__init__.py
index 9e25229c..964a0ac2 100644
--- a/python/vosk/__init__.py
+++ b/python/vosk/__init__.py
@@ -105,7 +105,7 @@ def GpuThreadInit():
 class BatchRecognizer(object):
 
     def __init__(self, *args):
-        self._handle = _c.vosk_batch_recognizer_new(args[0]._handle, args[1])
+        self._handle = _c.vosk_batch_recognizer_new()
 
         if self._handle == _ffi.NULL:
             raise Exception("Failed to create a recognizer")
diff --git a/src/batch_recognizer.cc b/src/batch_recognizer.cc
index 969a62aa..184fb8a2 100644
--- a/src/batch_recognizer.cc
+++ b/src/batch_recognizer.cc
@@ -17,16 +17,22 @@
 #include "fstext/fstext-utils.h"
 #include "lat/sausages.h"
 
+#include <sys/stat.h>
+
 using namespace fst;
 using namespace kaldi::nnet3;
 using CorrelationID = CudaOnlinePipelineDynamicBatcher::CorrelationID;
 
-BatchRecognizer::BatchRecognizer(Model *model, float sample_frequency) : model_(model), sample_frequency_(sample_frequency) {
-    model_->Ref();
-
+BatchRecognizer::BatchRecognizer() {
     BatchedThreadedNnet3CudaOnlinePipelineConfig batched_decoder_config;
+
+    kaldi::ParseOptions po("something");
+    batched_decoder_config.Register(&po);
+    po.ReadConfigFile("model/conf/model.conf");
+
     batched_decoder_config.num_worker_threads = 4;
     batched_decoder_config.max_batch_size = 100;
+    batched_decoder_config.reset_on_endpoint = true;
 
     batched_decoder_config.feature_opts.feature_type = "mfcc";
     batched_decoder_config.feature_opts.mfcc_config = "model/conf/mfcc.conf";
@@ -38,37 +44,78 @@ BatchRecognizer::BatchRecognizer(Model *model, float sample_frequency) : model_(
     batched_decoder_config.compute_opts.frame_subsampling_factor = 3;
     batched_decoder_config.compute_opts.frames_per_chunk = 312;
 
+    struct stat buffer;
+
+    string nnet3_rxfilename_ = "model/am/final.mdl";
+    string hclg_fst_rxfilename_ = "model/graph/HCLG.fst";
+    string word_syms_rxfilename_ = "model/graph/words.txt";
+    string winfo_rxfilename_ = "model/graph/phones/word_boundary.int";
+    string std_fst_rxfilename_ = "model/rescore/G.fst";
+    string carpa_rxfilename_ = "model/rescore/G.carpa";
+
+    trans_model_ = new kaldi::TransitionModel();
+    nnet_ = new kaldi::nnet3::AmNnetSimple();
+    {
+        bool binary;
+        kaldi::Input ki(nnet3_rxfilename_, &binary);
+        trans_model_->Read(ki.Stream(), binary);
+        nnet_->Read(ki.Stream(), binary);
+        SetBatchnormTestMode(true, &(nnet_->GetNnet()));
+        SetDropoutTestMode(true, &(nnet_->GetNnet()));
+        nnet3::CollapseModel(nnet3::CollapseModelConfig(), &(nnet_->GetNnet()));
+    }
+
+    if (stat(hclg_fst_rxfilename_.c_str(), &buffer) == 0) {
+        KALDI_LOG << "Loading HCLG from " << hclg_fst_rxfilename_;
+        hclg_fst_ = fst::ReadFstKaldiGeneric(hclg_fst_rxfilename_);
+    }
+
+    KALDI_LOG << "Loading words from " << word_syms_rxfilename_;
+    if (!(word_syms_ = fst::SymbolTable::ReadText(word_syms_rxfilename_))) {
+        KALDI_ERR << "Could not read symbol table from file "
+                  << word_syms_rxfilename_;
+    }
+    KALDI_ASSERT(word_syms_);
+
+    if (stat(winfo_rxfilename_.c_str(), &buffer) == 0) {
+        KALDI_LOG << "Loading winfo " << winfo_rxfilename_;
+        kaldi::WordBoundaryInfoNewOpts opts;
+        winfo_ = new kaldi::WordBoundaryInfo(opts, winfo_rxfilename_);
+    }
+
+    if (stat(carpa_rxfilename_.c_str(), &buffer) == 0) {
+        KALDI_LOG << "Loading subtract G.fst model from " << std_fst_rxfilename_;
+        graph_lm_fst_ = fst::ReadAndPrepareLmFst(std_fst_rxfilename_);
+        KALDI_LOG << "Loading CARPA model from " << carpa_rxfilename_;
+        ReadKaldiObject(carpa_rxfilename_, &const_arpa_);
+    }
+
+
+
     cuda_pipeline_ = new BatchedThreadedNnet3CudaOnlinePipeline 
-         (batched_decoder_config, *model_->hclg_fst_, *model_->nnet_, *model_->trans_model_);
-    cuda_pipeline_->SetSymbolTable(*model_->word_syms_);
+         (batched_decoder_config, *hclg_fst_, *nnet_, *trans_model_);
+    cuda_pipeline_->SetSymbolTable(*word_syms_);
 
     CudaOnlinePipelineDynamicBatcherConfig dynamic_batcher_config;
     dynamic_batcher_ = new CudaOnlinePipelineDynamicBatcher(dynamic_batcher_config,
                                                             *cuda_pipeline_);
-
-    InitRescoring();
 }
 
 BatchRecognizer::~BatchRecognizer() {
+
+    delete trans_model_;
+    delete nnet_;
+    delete word_syms_;
+    delete winfo_;
+    delete hclg_fst_;
+    delete graph_lm_fst_;
+
     delete lm_to_subtract_;
     delete carpa_to_add_;
     delete carpa_to_add_scale_;
 
     delete cuda_pipeline_;
     delete dynamic_batcher_;
-
-    model_->Unref();
-}
-
-void BatchRecognizer::InitRescoring()
-{
-    if (model_->graph_lm_fst_) {
-        fst::CacheOptions cache_opts(true, -1);
-        fst::ArcMapFstOptions mapfst_opts(cache_opts);
-        fst::StdToLatticeMapper<BaseFloat> mapper;
-        lm_to_subtract_ = new fst::ArcMapFst<fst::StdArc, LatticeArc, fst::StdToLatticeMapper<BaseFloat> >(*model_->graph_lm_fst_, mapper, mapfst_opts);
-        carpa_to_add_ = new ConstArpaLmDeterministicFst(model_->const_arpa_);
-    }
 }
 
 void BatchRecognizer::FinishStream(uint64_t id)
@@ -104,13 +151,18 @@ void BatchRecognizer::AcceptWaveform(uint64_t id, const char *data, int len)
                   KALDI_LOG << "id #" << id << " : " << str;
               }
             });
+        cuda_pipeline_->SetLatticeCallback(
+          id,
+          [&, id](CompactLattice &clat) {
+              KALDI_LOG << "Got lattice from the stream " << id;
+          });
     }
 
     Vector<BaseFloat> wave;
     wave.Resize(len / 2, kUndefined);
     for (int i = 0; i < len / 2; i++)
         wave(i) = *(((short *)data) + i);
-    SubVector<BaseFloat> chunk(wave.Data(), 0);
+    SubVector<BaseFloat> chunk(wave.Data(), wave.Dim());
 
     dynamic_batcher_->Push(id, first, false, chunk);
 }
@@ -118,6 +170,5 @@ void BatchRecognizer::AcceptWaveform(uint64_t id, const char *data, int len)
 const char* BatchRecognizer::PullResults()
 {
     dynamic_batcher_->WaitForCompletion();
-    cudaDeviceSynchronize();
     return "";
 }
diff --git a/src/batch_recognizer.h b/src/batch_recognizer.h
index 00f4a0db..c8045d53 100644
--- a/src/batch_recognizer.h
+++ b/src/batch_recognizer.h
@@ -40,7 +40,7 @@ using namespace kaldi::cuda_decoder;
 
 class BatchRecognizer {
     public:
-        BatchRecognizer(Model *model, float sample_frequency);
+        BatchRecognizer();
         ~BatchRecognizer();
 
         void FinishStream(uint64_t id);
@@ -48,12 +48,21 @@ class BatchRecognizer {
         const char* PullResults();
 
     private:
-        void InitRescoring();
 
-        Model *model_ = nullptr;
+        kaldi::TransitionModel *trans_model_ = nullptr;
+        kaldi::nnet3::AmNnetSimple *nnet_ = nullptr;
+        const fst::SymbolTable *word_syms_ = nullptr;
+
+        fst::Fst<fst::StdArc> *hclg_fst_ = nullptr;
+        kaldi::WordBoundaryInfo *winfo_ = nullptr;
+
+        fst::VectorFst<fst::StdArc> *graph_lm_fst_ = nullptr;
+        kaldi::ConstArpaLm const_arpa_;
+
         BatchedThreadedNnet3CudaOnlinePipeline *cuda_pipeline_ = nullptr;
         CudaOnlinePipelineDynamicBatcher *dynamic_batcher_ = nullptr;
 
+
         std::set<int> streams_;
 
         // Rescoring
diff --git a/src/model.cc b/src/model.cc
index eecaed97..c83d07a8 100644
--- a/src/model.cc
+++ b/src/model.cc
@@ -242,7 +242,7 @@ void Model::ReadDataFiles()
         nnet3::CollapseModel(nnet3::CollapseModelConfig(), &(nnet_->GetNnet()));
     }
 
-/*    decodable_info_ = new nnet3::DecodableNnetSimpleLoopedInfo(decodable_opts_,
+    decodable_info_ = new nnet3::DecodableNnetSimpleLoopedInfo(decodable_opts_,
                                                                nnet_);
     if (stat(final_ie_rxfilename_.c_str(), &buffer) == 0) {
         KALDI_LOG << "Loading i-vector extractor from " << final_ie_rxfilename_;
@@ -261,7 +261,7 @@ void Model::ReadDataFiles()
     } else {
         feature_info_.use_ivectors = false;
     }
-*/
+
     if (stat(global_cmvn_stats_rxfilename_.c_str(), &buffer) == 0) {
         KALDI_LOG << "Reading CMVN stats from " << global_cmvn_stats_rxfilename_;
         feature_info_.use_cmvn = true;
diff --git a/src/vosk_api.cc b/src/vosk_api.cc
index f95adc07..a53dbf87 100644
--- a/src/vosk_api.cc
+++ b/src/vosk_api.cc
@@ -183,9 +183,9 @@ void vosk_gpu_thread_init()
 #endif
 }
 
-VoskBatchRecognizer *vosk_batch_recognizer_new(VoskModel *model, float sample_frequency)
+VoskBatchRecognizer *vosk_batch_recognizer_new()
 {
-    return (VoskBatchRecognizer *)(new BatchRecognizer((Model *)model, sample_frequency));
+    return (VoskBatchRecognizer *)(new BatchRecognizer());
 }
 
 void vosk_batch_recognizer_free(VoskBatchRecognizer *recognizer)
diff --git a/src/vosk_api.h b/src/vosk_api.h
index e085afe7..c5b92f1c 100644
--- a/src/vosk_api.h
+++ b/src/vosk_api.h
@@ -290,11 +290,9 @@ void vosk_gpu_init();
 void vosk_gpu_thread_init();
 
 /** Creates the batch recognizer object
- *  The recognizers process the speech and return text using shared model data
- *  @param model       VoskModel containing static data for recognizer. Model can be
- *                     shared across recognizers, even running in different threads.
+ *
  *  @returns recognizer object or NULL if problem occured */
-VoskBatchRecognizer *vosk_batch_recognizer_new(VoskModel *model, float sample_frequency);
+VoskBatchRecognizer *vosk_batch_recognizer_new();
 
 /** Releases batch recognizer object
  *  Underlying model is also unreferenced and if needed released */

From 848b2dc753a823c2a3f1ca6e2bb4fd4f1d7eab31 Mon Sep 17 00:00:00 2001
From: Nickolay Shmyrev <nshmyrev@gmail.com>
Date: Fri, 17 Dec 2021 22:22:30 +0100
Subject: [PATCH 04/25] Expose results in Python

---
 python/example/batch/asr_server_gpu.py | 85 +++++++++++++++++++++++++
 python/example/batch/test_batch.py     | 24 +++++--
 python/vosk/__init__.py                | 10 ++-
 src/batch_recognizer.cc                | 87 +++++++++++++++++++++++---
 src/batch_recognizer.h                 |  6 +-
 src/json.h                             |  8 +--
 src/vosk_api.cc                        | 14 ++++-
 src/vosk_api.h                         |  8 ++-
 8 files changed, 217 insertions(+), 25 deletions(-)
 create mode 100755 python/example/batch/asr_server_gpu.py

diff --git a/python/example/batch/asr_server_gpu.py b/python/example/batch/asr_server_gpu.py
new file mode 100755
index 00000000..f58587c9
--- /dev/null
+++ b/python/example/batch/asr_server_gpu.py
@@ -0,0 +1,85 @@
+#!/usr/bin/env python3
+
+import json
+import os
+import sys
+import asyncio
+import pathlib
+import websockets
+import logging
+
+from vosk import BatchRecognizer, GpuInit
+
+
+async def recognize(websocket, path):
+    global args
+    global loop
+    global pool
+    global rec
+    global client_cnt
+
+    uid = client_cnt
+    client_cnt += 1
+
+    logging.info('Connection %d from %s', uid, websocket.remote_address);
+
+    while True:
+
+        message = await websocket.recv()
+
+        if message == '{"eof" : 1}':
+            rec.FinishStream(uid)
+            break
+
+        if isinstance(message, str) and 'config' in message:
+            continue
+
+        rec.AcceptWaveform(uid, message)
+        await asyncio.sleep(len(message) / 16000.0 / 2)
+        res = rec.Result(uid)
+        if len(res) == 0:
+            await websocket.send('{ "partial" : "" }')
+        else:
+            await websocket.send(res)
+
+    rec.Wait()
+    res = rec.Result(uid)
+    await websocket.send(res)
+
+def start():
+
+    global rec
+    global args
+    global loop
+    global client_cnt
+
+    # Enable loging if needed
+    #
+    # logger = logging.getLogger('websockets')
+    # logger.setLevel(logging.INFO)
+    # logger.addHandler(logging.StreamHandler())
+    logging.basicConfig(level=logging.INFO)
+
+    args = type('', (), {})()
+
+    args.interface = os.environ.get('VOSK_SERVER_INTERFACE', '0.0.0.0')
+    args.port = int(os.environ.get('VOSK_SERVER_PORT', 2700))
+
+    GpuInit()
+
+    rec = BatchRecognizer()
+
+    client_cnt = 0
+
+    loop = asyncio.get_event_loop()
+
+    start_server = websockets.serve(
+        recognize, args.interface, args.port)
+
+    logging.info("Listening on %s:%d", args.interface, args.port)
+    loop.run_until_complete(start_server)
+    loop.run_forever()
+
+
+if __name__ == '__main__':
+    start()
diff --git a/python/example/batch/test_batch.py b/python/example/batch/test_batch.py
index f93eb6ea..32aa021e 100755
--- a/python/example/batch/test_batch.py
+++ b/python/example/batch/test_batch.py
@@ -1,12 +1,13 @@
 #!/usr/bin/env python3
 
-from vosk import Model, BatchRecognizer, GpuInit, GpuThreadInit
 import sys
 import os
 import wave
+from time import sleep
+
+from vosk import Model, BatchRecognizer, GpuInit
 
 GpuInit()
-GpuThreadInit()
 
 rec = BatchRecognizer()
 
@@ -14,6 +15,7 @@
 fds = [open(x.strip(), "rb") for x in fnames]
 ended = set()
 while True:
+
     for i, fd in enumerate(fds):
         if i in ended:
             continue
@@ -21,8 +23,20 @@
         if len(data) == 0:
             rec.FinishStream(i)
             ended.add(i)
-        else:
-            rec.AcceptWaveform(i, data)
-    rec.Results()
+            continue
+        rec.AcceptWaveform(i, data)
+
+    sleep(0.3)
+    for i, fd in enumerate(fds):
+       res = rec.Result(i)
+       print (i, res)
+
     if len(ended) == len(fds):
         break
+
+sleep(20)
+print ("Done")
+for i, fd in enumerate(fds):
+   res = rec.Result(i)
+   print (i, res)
+rec.Wait()
diff --git a/python/vosk/__init__.py b/python/vosk/__init__.py
index 964a0ac2..c83a7e34 100644
--- a/python/vosk/__init__.py
+++ b/python/vosk/__init__.py
@@ -116,8 +116,14 @@ def __del__(self):
     def AcceptWaveform(self, uid, data):
         res = _c.vosk_batch_recognizer_accept_waveform(self._handle, uid, data, len(data))
 
-    def Results(self):
-        return _ffi.string(_c.vosk_batch_recognizer_results(self._handle)).decode('utf-8')
+    def Result(self, uid):
+        ptr = _c.vosk_batch_recognizer_front_result(self._handle, uid)
+        res = _ffi.string(ptr).decode('utf-8')
+        _c.vosk_batch_recognizer_pop(self._handle, uid)
+        return res
 
     def FinishStream(self, uid):
         _c.vosk_batch_recognizer_finish_stream(self._handle, uid)
+
+    def Wait(self):
+        _c.vosk_batch_recognizer_wait(self._handle)
diff --git a/src/batch_recognizer.cc b/src/batch_recognizer.cc
index 184fb8a2..1773fc0e 100644
--- a/src/batch_recognizer.cc
+++ b/src/batch_recognizer.cc
@@ -16,6 +16,7 @@
 
 #include "fstext/fstext-utils.h"
 #include "lat/sausages.h"
+#include "json.h"
 
 #include <sys/stat.h>
 
@@ -37,12 +38,12 @@ BatchRecognizer::BatchRecognizer() {
     batched_decoder_config.feature_opts.feature_type = "mfcc";
     batched_decoder_config.feature_opts.mfcc_config = "model/conf/mfcc.conf";
     batched_decoder_config.feature_opts.ivector_extraction_config = "model/conf/ivector.conf";
-    batched_decoder_config.decoder_opts.max_active = 7000;
-    batched_decoder_config.decoder_opts.default_beam = 13.0;
-    batched_decoder_config.decoder_opts.lattice_beam = 8.0;
+    batched_decoder_config.decoder_opts.max_active = 5000;
+    batched_decoder_config.decoder_opts.default_beam = 10.0;
+    batched_decoder_config.decoder_opts.lattice_beam = 4.0;
     batched_decoder_config.compute_opts.acoustic_scale = 1.0;
     batched_decoder_config.compute_opts.frame_subsampling_factor = 3;
-    batched_decoder_config.compute_opts.frames_per_chunk = 312;
+    batched_decoder_config.compute_opts.frames_per_chunk = 51;
 
     struct stat buffer;
 
@@ -126,6 +127,47 @@ void BatchRecognizer::FinishStream(uint64_t id)
     streams_.erase(id);
 }
 
+
+void BatchRecognizer::PushLattice(uint64_t id, CompactLattice &clat, BaseFloat offset)
+{
+    fst::ScaleLattice(fst::GraphLatticeScale(0.9), &clat);
+
+    CompactLattice aligned_lat;
+    WordAlignLattice(clat, *trans_model_, *winfo_, 0, &aligned_lat);
+
+    MinimumBayesRisk mbr(aligned_lat);
+    const vector<BaseFloat> &conf = mbr.GetOneBestConfidences();
+    const vector<int32> &words = mbr.GetOneBest();
+    const vector<pair<BaseFloat, BaseFloat> > &times =
+          mbr.GetOneBestTimes();
+
+    int size = words.size();
+
+    json::JSON obj;
+    stringstream text;
+
+    // Create JSON object
+    for (int i = 0; i < size; i++) {
+        json::JSON word;
+
+        word["word"] = word_syms_->Find(words[i]);
+        word["start"] = times[i].first * 0.03 + offset;
+        word["end"] = times[i].second * 0.03 + offset;
+        word["conf"] = conf[i];
+        obj["result"].append(word);
+
+        if (i) {
+            text << " ";
+        }
+        text << word_syms_->Find(words[i]);
+    }
+    obj["text"] = text.str();
+
+//    KALDI_LOG << "Result " << id << " " << obj.dump();
+
+    results_[id].push(obj.dump());
+}
+
 void BatchRecognizer::AcceptWaveform(uint64_t id, const char *data, int len)
 {
     bool first = false;
@@ -135,7 +177,8 @@ void BatchRecognizer::AcceptWaveform(uint64_t id, const char *data, int len)
         streams_.insert(id);
 
         // Define the callback for results.
-        cuda_pipeline_->SetBestPathCallback(
+#if 0
+         cuda_pipeline_->SetBestPathCallback(
           id,
           [&, id](const std::string &str, bool partial,
                        bool endpoint_detected) {
@@ -151,11 +194,19 @@ void BatchRecognizer::AcceptWaveform(uint64_t id, const char *data, int len)
                   KALDI_LOG << "id #" << id << " : " << str;
               }
             });
+#endif
         cuda_pipeline_->SetLatticeCallback(
           id,
-          [&, id](CompactLattice &clat) {
-              KALDI_LOG << "Got lattice from the stream " << id;
-          });
+          [&, id](SegmentedLatticeCallbackParams& params) {
+              if (params.results.empty()) {
+                  KALDI_WARN << "Empty result for callback";
+                  return;
+              }
+              CompactLattice *clat = params.results[0].GetLatticeResult();
+              BaseFloat offset = params.results[0].GetTimeOffsetSeconds();
+              PushLattice(id, *clat, offset);
+          },
+          CudaPipelineResult::RESULT_TYPE_LATTICE);
     }
 
     Vector<BaseFloat> wave;
@@ -167,8 +218,24 @@ void BatchRecognizer::AcceptWaveform(uint64_t id, const char *data, int len)
     dynamic_batcher_->Push(id, first, false, chunk);
 }
 
-const char* BatchRecognizer::PullResults()
+const char* BatchRecognizer::FrontResult(uint64_t id)
+{
+    if (results_[id].empty()) {
+        return "";
+    }
+    return results_[id].front().c_str();
+}
+
+void BatchRecognizer::Pop(uint64_t id)
+{
+    if (results_[id].empty()) {
+        return;
+    }
+    results_[id].pop();
+}
+
+void BatchRecognizer::WaitForCompletion()
 {
     dynamic_batcher_->WaitForCompletion();
-    return "";
 }
+
diff --git a/src/batch_recognizer.h b/src/batch_recognizer.h
index c8045d53..0082a364 100644
--- a/src/batch_recognizer.h
+++ b/src/batch_recognizer.h
@@ -45,9 +45,12 @@ class BatchRecognizer {
 
         void FinishStream(uint64_t id);
         void AcceptWaveform(uint64_t id, const char *data, int len);
-        const char* PullResults();
+        const char *FrontResult(uint64_t id);
+        void Pop(uint64_t id);
+        void WaitForCompletion();
 
     private:
+        void PushLattice(uint64_t id, CompactLattice &clat, BaseFloat offset);
 
         kaldi::TransitionModel *trans_model_ = nullptr;
         kaldi::nnet3::AmNnetSimple *nnet_ = nullptr;
@@ -64,6 +67,7 @@ class BatchRecognizer {
 
 
         std::set<int> streams_;
+        std::map<int, std::queue<std::string> > results_;
 
         // Rescoring
         fst::ArcMapFst<fst::StdArc, LatticeArc, fst::StdToLatticeMapper<BaseFloat> > *lm_to_subtract_ = nullptr;
diff --git a/src/json.h b/src/json.h
index 463912ec..2159392b 100644
--- a/src/json.h
+++ b/src/json.h
@@ -424,7 +424,7 @@ class JSON
         Class Type = Class::Null;
 };
 
-JSON Array() {
+inline JSON Array() {
     return JSON::Make( JSON::Class::Array );
 }
 
@@ -435,11 +435,11 @@ JSON Array( T... args ) {
     return arr;
 }
 
-JSON Object() {
+inline JSON Object() {
     return JSON::Make( JSON::Class::Object );
 }
 
-std::ostream& operator<<( std::ostream &os, const JSON &json ) {
+inline std::ostream& operator<<( std::ostream &os, const JSON &json ) {
     os << json.dump();
     return os;
 }
@@ -647,7 +647,7 @@ namespace {
     }
 }
 
-JSON JSON::Load( const string &str ) {
+inline JSON JSON::Load( const string &str ) {
     size_t offset = 0;
     return parse_next( str, offset );
 }
diff --git a/src/vosk_api.cc b/src/vosk_api.cc
index a53dbf87..b2a7a6a4 100644
--- a/src/vosk_api.cc
+++ b/src/vosk_api.cc
@@ -203,7 +203,17 @@ void vosk_batch_recognizer_finish_stream(VoskBatchRecognizer *recognizer, int id
     ((BatchRecognizer *)recognizer)->FinishStream(id);
 }
 
-const char *vosk_batch_recognizer_results(VoskBatchRecognizer *recognizer)
+const char *vosk_batch_recognizer_front_result(VoskBatchRecognizer *recognizer, int id)
 {
-    return ((BatchRecognizer *)recognizer)->PullResults();
+    return ((BatchRecognizer *)recognizer)->FrontResult(id);
+}
+
+void vosk_batch_recognizer_pop(VoskBatchRecognizer *recognizer, int id)
+{
+    return ((BatchRecognizer *)recognizer)->Pop(id);
+}
+
+void vosk_batch_recognizer_wait(VoskBatchRecognizer *recognizer)
+{
+    ((BatchRecognizer *)recognizer)->WaitForCompletion();
 }
diff --git a/src/vosk_api.h b/src/vosk_api.h
index c5b92f1c..7177009c 100644
--- a/src/vosk_api.h
+++ b/src/vosk_api.h
@@ -305,7 +305,13 @@ void vosk_batch_recognizer_accept_waveform(VoskBatchRecognizer *recognizer, int
 void vosk_batch_recognizer_finish_stream(VoskBatchRecognizer *recognizer, int id);
 
 /** Return results */
-const char *vosk_batch_recognizer_results(VoskBatchRecognizer *recognizer);
+const char *vosk_batch_recognizer_front_result(VoskBatchRecognizer *recognizer, int id);
+
+/** Release and free first retrieved result */
+void vosk_batch_recognizer_pop(VoskBatchRecognizer *recognizer, int id);
+
+/** Wait for the processing */
+void vosk_batch_recognizer_wait(VoskBatchRecognizer *recognizer);
 
 #ifdef __cplusplus
 }

From cb0f8e64110ad502f8660a2e3066e490bedfcddc Mon Sep 17 00:00:00 2001
From: Nickolay Shmyrev <nshmyrev@gmail.com>
Date: Thu, 23 Dec 2021 22:34:47 +0100
Subject: [PATCH 05/25] Per-stream wait API

---
 python/example/batch/asr_server_gpu.py |  9 ++++++--
 python/example/batch/test_batch.py     | 32 +++++++++++++++++++-------
 python/vosk/__init__.py                |  3 +++
 src/batch_recognizer.cc                | 10 +++++---
 src/batch_recognizer.h                 |  1 +
 src/vosk_api.cc                        |  7 +++++-
 src/vosk_api.h                         |  3 +++
 7 files changed, 51 insertions(+), 14 deletions(-)

diff --git a/python/example/batch/asr_server_gpu.py b/python/example/batch/asr_server_gpu.py
index f58587c9..11885e9f 100755
--- a/python/example/batch/asr_server_gpu.py
+++ b/python/example/batch/asr_server_gpu.py
@@ -35,14 +35,19 @@ async def recognize(websocket, path):
             continue
 
         rec.AcceptWaveform(uid, message)
-        await asyncio.sleep(len(message) / 16000.0 / 2)
+
+        while rec.GetPendingChunks(uid) > 0:
+            await asyncio.sleep(0.1)
+
         res = rec.Result(uid)
         if len(res) == 0:
             await websocket.send('{ "partial" : "" }')
         else:
             await websocket.send(res)
 
-    rec.Wait()
+    while rec.GetPendingChunks(uid) > 0:
+        await asyncio.sleep(0.1)
+
     res = rec.Result(uid)
     await websocket.send(res)
 
diff --git a/python/example/batch/test_batch.py b/python/example/batch/test_batch.py
index 32aa021e..8737a746 100755
--- a/python/example/batch/test_batch.py
+++ b/python/example/batch/test_batch.py
@@ -4,6 +4,9 @@
 import os
 import wave
 from time import sleep
+import json
+from timeit import default_timer as timer
+
 
 from vosk import Model, BatchRecognizer, GpuInit
 
@@ -13,9 +16,16 @@
 
 fnames = open("tedlium.list").readlines()
 fds = [open(x.strip(), "rb") for x in fnames]
+uids = [fname.strip().split('/')[-1][:-4] for fname in fnames]
+results = [""] * len(fnames)
 ended = set()
+tot_samples = 0
+
+start_time = timer()
+
 while True:
 
+    # Feed in the data
     for i, fd in enumerate(fds):
         if i in ended:
             continue
@@ -25,18 +35,24 @@
             ended.add(i)
             continue
         rec.AcceptWaveform(i, data)
+        tot_samples += len(data)
 
-    sleep(0.3)
+    # Wait for results from CUDA
+    rec.Wait()
+
+    # Retrieve and add results
     for i, fd in enumerate(fds):
        res = rec.Result(i)
-       print (i, res)
+       if len(res) != 0:
+           results[i] = results[i] + " " + json.loads(res)['text']
 
     if len(ended) == len(fds):
         break
 
-sleep(20)
-print ("Done")
-for i, fd in enumerate(fds):
-   res = rec.Result(i)
-   print (i, res)
-rec.Wait()
+end_time = timer()
+
+for i in range(len(results)):
+    print (uids[i], results[i].strip())
+
+print ("Processed %d seconds of audio in %d seconds (%f xRT)" % (tot_samples / 16000.0 / 2, end_time - start_time, 
+    (tot_samples / 16000.0 / 2 / (end_time - start_time))), file=sys.stderr)
diff --git a/python/vosk/__init__.py b/python/vosk/__init__.py
index c83a7e34..0e60c2ba 100644
--- a/python/vosk/__init__.py
+++ b/python/vosk/__init__.py
@@ -127,3 +127,6 @@ def FinishStream(self, uid):
 
     def Wait(self):
         _c.vosk_batch_recognizer_wait(self._handle)
+
+    def GetPendingChunks(self, uid):
+        return _c.vosk_batch_recognizer_get_pending_chunks(self._handle, uid)
diff --git a/src/batch_recognizer.cc b/src/batch_recognizer.cc
index 1773fc0e..972e31dc 100644
--- a/src/batch_recognizer.cc
+++ b/src/batch_recognizer.cc
@@ -38,9 +38,9 @@ BatchRecognizer::BatchRecognizer() {
     batched_decoder_config.feature_opts.feature_type = "mfcc";
     batched_decoder_config.feature_opts.mfcc_config = "model/conf/mfcc.conf";
     batched_decoder_config.feature_opts.ivector_extraction_config = "model/conf/ivector.conf";
-    batched_decoder_config.decoder_opts.max_active = 5000;
-    batched_decoder_config.decoder_opts.default_beam = 10.0;
-    batched_decoder_config.decoder_opts.lattice_beam = 4.0;
+    batched_decoder_config.decoder_opts.max_active = 7000;
+    batched_decoder_config.decoder_opts.default_beam = 13.0;
+    batched_decoder_config.decoder_opts.lattice_beam = 6.0;
     batched_decoder_config.compute_opts.acoustic_scale = 1.0;
     batched_decoder_config.compute_opts.frame_subsampling_factor = 3;
     batched_decoder_config.compute_opts.frames_per_chunk = 51;
@@ -239,3 +239,7 @@ void BatchRecognizer::WaitForCompletion()
     dynamic_batcher_->WaitForCompletion();
 }
 
+int BatchRecognizer::GetPendingChunks(uint64_t id)
+{
+    return dynamic_batcher_->GetPendingChunks(id);
+}
diff --git a/src/batch_recognizer.h b/src/batch_recognizer.h
index 0082a364..f26dd54b 100644
--- a/src/batch_recognizer.h
+++ b/src/batch_recognizer.h
@@ -48,6 +48,7 @@ class BatchRecognizer {
         const char *FrontResult(uint64_t id);
         void Pop(uint64_t id);
         void WaitForCompletion();
+        int GetPendingChunks(uint64_t id);
 
     private:
         void PushLattice(uint64_t id, CompactLattice &clat, BaseFloat offset);
diff --git a/src/vosk_api.cc b/src/vosk_api.cc
index b2a7a6a4..1f77eb6c 100644
--- a/src/vosk_api.cc
+++ b/src/vosk_api.cc
@@ -210,10 +210,15 @@ const char *vosk_batch_recognizer_front_result(VoskBatchRecognizer *recognizer,
 
 void vosk_batch_recognizer_pop(VoskBatchRecognizer *recognizer, int id)
 {
-    return ((BatchRecognizer *)recognizer)->Pop(id);
+    ((BatchRecognizer *)recognizer)->Pop(id);
 }
 
 void vosk_batch_recognizer_wait(VoskBatchRecognizer *recognizer)
 {
     ((BatchRecognizer *)recognizer)->WaitForCompletion();
 }
+
+int vosk_batch_recognizer_get_pending_chunks(VoskBatchRecognizer *recognizer, int id)
+{
+    return ((BatchRecognizer *)recognizer)->GetPendingChunks(id);
+}
diff --git a/src/vosk_api.h b/src/vosk_api.h
index 7177009c..f6a981cb 100644
--- a/src/vosk_api.h
+++ b/src/vosk_api.h
@@ -313,6 +313,9 @@ void vosk_batch_recognizer_pop(VoskBatchRecognizer *recognizer, int id);
 /** Wait for the processing */
 void vosk_batch_recognizer_wait(VoskBatchRecognizer *recognizer);
 
+/** Get amount of pending chunks for more intelligent waiting */
+int vosk_batch_recognizer_get_pending_chunks(VoskBatchRecognizer *recognizer, int id);
+
 #ifdef __cplusplus
 }
 #endif

From 93e81c3bc8ed3960754b4eb6962b6dcc1fa26541 Mon Sep 17 00:00:00 2001
From: Nickolay Shmyrev <nshmyrev@gmail.com>
Date: Fri, 24 Dec 2021 00:22:42 +0100
Subject: [PATCH 06/25] Bigger frames per chunk for our big models

---
 src/batch_recognizer.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/batch_recognizer.cc b/src/batch_recognizer.cc
index 972e31dc..78cfc6f2 100644
--- a/src/batch_recognizer.cc
+++ b/src/batch_recognizer.cc
@@ -43,7 +43,7 @@ BatchRecognizer::BatchRecognizer() {
     batched_decoder_config.decoder_opts.lattice_beam = 6.0;
     batched_decoder_config.compute_opts.acoustic_scale = 1.0;
     batched_decoder_config.compute_opts.frame_subsampling_factor = 3;
-    batched_decoder_config.compute_opts.frames_per_chunk = 51;
+    batched_decoder_config.compute_opts.frames_per_chunk = 180;
 
     struct stat buffer;
 

From 72bf210164ed6f347abce642025751f285b8284c Mon Sep 17 00:00:00 2001
From: Nickolay Shmyrev <nshmyrev@gmail.com>
Date: Fri, 24 Dec 2021 01:07:38 +0100
Subject: [PATCH 07/25] Put the demo into main folder

---
 python/example/batch/asr_server_gpu.py        | 90 -------------------
 .../test_batch.py => test_gpu_batch.py}       |  2 +-
 src/batch_recognizer.cc                       |  5 +-
 src/vosk_api.cc                               |  2 +
 4 files changed, 6 insertions(+), 93 deletions(-)
 delete mode 100755 python/example/batch/asr_server_gpu.py
 rename python/example/{batch/test_batch.py => test_gpu_batch.py} (97%)

diff --git a/python/example/batch/asr_server_gpu.py b/python/example/batch/asr_server_gpu.py
deleted file mode 100755
index 11885e9f..00000000
--- a/python/example/batch/asr_server_gpu.py
+++ /dev/null
@@ -1,90 +0,0 @@
-#!/usr/bin/env python3
-
-import json
-import os
-import sys
-import asyncio
-import pathlib
-import websockets
-import logging
-
-from vosk import BatchRecognizer, GpuInit
-
-
-async def recognize(websocket, path):
-    global args
-    global loop
-    global pool
-    global rec
-    global client_cnt
-
-    uid = client_cnt
-    client_cnt += 1
-
-    logging.info('Connection %d from %s', uid, websocket.remote_address);
-
-    while True:
-
-        message = await websocket.recv()
-
-        if message == '{"eof" : 1}':
-            rec.FinishStream(uid)
-            break
-
-        if isinstance(message, str) and 'config' in message:
-            continue
-
-        rec.AcceptWaveform(uid, message)
-
-        while rec.GetPendingChunks(uid) > 0:
-            await asyncio.sleep(0.1)
-
-        res = rec.Result(uid)
-        if len(res) == 0:
-            await websocket.send('{ "partial" : "" }')
-        else:
-            await websocket.send(res)
-
-    while rec.GetPendingChunks(uid) > 0:
-        await asyncio.sleep(0.1)
-
-    res = rec.Result(uid)
-    await websocket.send(res)
-
-def start():
-
-    global rec
-    global args
-    global loop
-    global client_cnt
-
-    # Enable loging if needed
-    #
-    # logger = logging.getLogger('websockets')
-    # logger.setLevel(logging.INFO)
-    # logger.addHandler(logging.StreamHandler())
-    logging.basicConfig(level=logging.INFO)
-
-    args = type('', (), {})()
-
-    args.interface = os.environ.get('VOSK_SERVER_INTERFACE', '0.0.0.0')
-    args.port = int(os.environ.get('VOSK_SERVER_PORT', 2700))
-
-    GpuInit()
-
-    rec = BatchRecognizer()
-
-    client_cnt = 0
-
-    loop = asyncio.get_event_loop()
-
-    start_server = websockets.serve(
-        recognize, args.interface, args.port)
-
-    logging.info("Listening on %s:%d", args.interface, args.port)
-    loop.run_until_complete(start_server)
-    loop.run_forever()
-
-
-if __name__ == '__main__':
-    start()
diff --git a/python/example/batch/test_batch.py b/python/example/test_gpu_batch.py
similarity index 97%
rename from python/example/batch/test_batch.py
rename to python/example/test_gpu_batch.py
index 8737a746..3a65bda8 100755
--- a/python/example/batch/test_batch.py
+++ b/python/example/test_gpu_batch.py
@@ -29,7 +29,7 @@
     for i, fd in enumerate(fds):
         if i in ended:
             continue
-        data = fd.read(8000)
+        data = fd.read(16000)
         if len(data) == 0:
             rec.FinishStream(i)
             ended.add(i)
diff --git a/src/batch_recognizer.cc b/src/batch_recognizer.cc
index 78cfc6f2..3337ee10 100644
--- a/src/batch_recognizer.cc
+++ b/src/batch_recognizer.cc
@@ -31,9 +31,10 @@ BatchRecognizer::BatchRecognizer() {
     batched_decoder_config.Register(&po);
     po.ReadConfigFile("model/conf/model.conf");
 
-    batched_decoder_config.num_worker_threads = 4;
-    batched_decoder_config.max_batch_size = 100;
+    batched_decoder_config.num_worker_threads = -1;
+    batched_decoder_config.max_batch_size = 200;
     batched_decoder_config.reset_on_endpoint = true;
+    batched_decoder_config.use_gpu_feature_extraction = true;
 
     batched_decoder_config.feature_opts.feature_type = "mfcc";
     batched_decoder_config.feature_opts.mfcc_config = "model/conf/mfcc.conf";
diff --git a/src/vosk_api.cc b/src/vosk_api.cc
index 1f77eb6c..3f740d7b 100644
--- a/src/vosk_api.cc
+++ b/src/vosk_api.cc
@@ -171,6 +171,8 @@ void vosk_set_log_level(int log_level)
 void vosk_gpu_init()
 {
 #if HAVE_CUDA
+//    kaldi::CuDevice::EnableTensorCores(true);
+//    kaldi::CuDevice::EnableTf32Compute(true);
     kaldi::CuDevice::Instantiate().SelectGpuId("yes");
     kaldi::CuDevice::Instantiate().AllowMultithreading();
 #endif

From 525b722c44e6b152926178ea226e9ce1c7ba3154 Mon Sep 17 00:00:00 2001
From: Nickolay Shmyrev <nshmyrev@gmail.com>
Date: Fri, 24 Dec 2021 01:35:06 +0100
Subject: [PATCH 08/25] Compile without CUDA too

---
 src/Makefile    | 13 ++++++++-----
 src/vosk_api.cc | 24 +++++++++++++++++++++++-
 2 files changed, 31 insertions(+), 6 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index 823a4aaf..9965db65 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -22,14 +22,12 @@ VOSK_SOURCES= \
 	language_model.cc \
 	model.cc \
 	spk_model.cc \
-	batch_recognizer.cc \
 	vosk_api.cc
 
 VOSK_HEADERS= \
 	recognizer.h \
 	language_model.h \
 	model.h \
-	batch_recognizer.h \
 	spk_model.h \
 	vosk_api.h
 
@@ -37,8 +35,6 @@ CFLAGS=-g -O3 -std=c++17 -Wno-deprecated-declarations -fPIC -DFST_NO_DYNAMIC_LIN
 	-I. -I$(KALDI_ROOT)/src -I$(OPENFST_ROOT)/include $(EXTRA_CFLAGS) 
 
 LIBS= \
-        $(KALDI_ROOT)/src/cudadecoder/kaldi-cudadecoder.a \
-        $(KALDI_ROOT)/src/cudafeat/kaldi-cudafeat.a \
 	$(KALDI_ROOT)/src/online2/kaldi-online2.a \
 	$(KALDI_ROOT)/src/decoder/kaldi-decoder.a \
 	$(KALDI_ROOT)/src/ivector/kaldi-ivector.a \
@@ -79,8 +75,15 @@ ifeq ($(HAVE_ACCELERATE), 1)
 endif
 
 ifeq ($(HAVE_CUDA), 1)
+    VOSK_SOURCES += batch_recognizer.cc
+    VOSK_HEADERS += batch_recognizer.h
+
     CFLAGS+=-DHAVE_CUDA=1 -I$(CUDA_ROOT)/include
-    LIBS+=\
+
+    LIBS := \
+        $(KALDI_ROOT)/src/cudadecoder/kaldi-cudadecoder.a \
+        $(KALDI_ROOT)/src/cudafeat/kaldi-cudafeat.a \
+        $(LIBS) \
         -L$(CUDA_ROOT)/lib64 -lcuda -lcublas -lcusparse -lcudart -lcurand -lcufft -lcusolver -lnvToolsExt
 endif
 
diff --git a/src/vosk_api.cc b/src/vosk_api.cc
index 3f740d7b..65356038 100644
--- a/src/vosk_api.cc
+++ b/src/vosk_api.cc
@@ -15,12 +15,12 @@
 #include "vosk_api.h"
 
 #include "recognizer.h"
-#include "batch_recognizer.h"
 #include "model.h"
 #include "spk_model.h"
 
 #if HAVE_CUDA
 #include "cudamatrix/cu-device.h"
+#include "batch_recognizer.h"
 #endif
 
 #include <string.h>
@@ -187,40 +187,62 @@ void vosk_gpu_thread_init()
 
 VoskBatchRecognizer *vosk_batch_recognizer_new()
 {
+#if HAVE_CUDA
     return (VoskBatchRecognizer *)(new BatchRecognizer());
+#else
+    return NULL;
+#endif
 }
 
 void vosk_batch_recognizer_free(VoskBatchRecognizer *recognizer)
 {
+#if HAVE_CUDA
     delete ((BatchRecognizer *)recognizer);
+#endif
 }
 
 void vosk_batch_recognizer_accept_waveform(VoskBatchRecognizer *recognizer, int id, const char *data, int length)
 {
+#if HAVE_CUDA
     ((BatchRecognizer *)recognizer)->AcceptWaveform(id, data, length);
+#endif
 }
 
 void vosk_batch_recognizer_finish_stream(VoskBatchRecognizer *recognizer, int id)
 {
+#if HAVE_CUDA
     ((BatchRecognizer *)recognizer)->FinishStream(id);
+#endif
 }
 
 const char *vosk_batch_recognizer_front_result(VoskBatchRecognizer *recognizer, int id)
 {
+#if HAVE_CUDA
     return ((BatchRecognizer *)recognizer)->FrontResult(id);
+#else
+    return NULL;
+#endif
 }
 
 void vosk_batch_recognizer_pop(VoskBatchRecognizer *recognizer, int id)
 {
+#if HAVE_CUDA
     ((BatchRecognizer *)recognizer)->Pop(id);
+#endif
 }
 
 void vosk_batch_recognizer_wait(VoskBatchRecognizer *recognizer)
 {
+#if HAVE_CUDA
     ((BatchRecognizer *)recognizer)->WaitForCompletion();
+#endif
 }
 
 int vosk_batch_recognizer_get_pending_chunks(VoskBatchRecognizer *recognizer, int id)
 {
+#if HAVE_CUDA
     return ((BatchRecognizer *)recognizer)->GetPendingChunks(id);
+#else
+    return 0;
+#endif
 }

From 5428d36d1657b5d2339288affb8c3512c72896fd Mon Sep 17 00:00:00 2001
From: Nickolay Shmyrev <nshmyrev@gmail.com>
Date: Sun, 26 Dec 2021 01:12:18 +0100
Subject: [PATCH 09/25] Round times

---
 src/batch_recognizer.cc | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/batch_recognizer.cc b/src/batch_recognizer.cc
index 3337ee10..28818692 100644
--- a/src/batch_recognizer.cc
+++ b/src/batch_recognizer.cc
@@ -32,7 +32,8 @@ BatchRecognizer::BatchRecognizer() {
     po.ReadConfigFile("model/conf/model.conf");
 
     batched_decoder_config.num_worker_threads = -1;
-    batched_decoder_config.max_batch_size = 200;
+    batched_decoder_config.max_batch_size = 32;
+    batched_decoder_config.num_channels = 600;
     batched_decoder_config.reset_on_endpoint = true;
     batched_decoder_config.use_gpu_feature_extraction = true;
 
@@ -44,7 +45,7 @@ BatchRecognizer::BatchRecognizer() {
     batched_decoder_config.decoder_opts.lattice_beam = 6.0;
     batched_decoder_config.compute_opts.acoustic_scale = 1.0;
     batched_decoder_config.compute_opts.frame_subsampling_factor = 3;
-    batched_decoder_config.compute_opts.frames_per_chunk = 180;
+    batched_decoder_config.compute_opts.frames_per_chunk = 51;
 
     struct stat buffer;
 
@@ -152,8 +153,8 @@ void BatchRecognizer::PushLattice(uint64_t id, CompactLattice &clat, BaseFloat o
         json::JSON word;
 
         word["word"] = word_syms_->Find(words[i]);
-        word["start"] = times[i].first * 0.03 + offset;
-        word["end"] = times[i].second * 0.03 + offset;
+        word["start"] = round(times[i].first) * 0.03 + offset;
+        word["end"] = round(times[i].second) * 0.03 + offset;
         word["conf"] = conf[i];
         obj["result"].append(word);
 

From 70d5cbd0e0cfe02bcd81f7d4b18554aed97804ba Mon Sep 17 00:00:00 2001
From: Nickolay Shmyrev <nshmyrev@gmail.com>
Date: Wed, 5 Jan 2022 20:32:08 +0100
Subject: [PATCH 10/25] Update README with Japanese

---
 README.md        |  4 ++--
 nodejs/README.md | 12 ++++++------
 python/README.md |  5 +++--
 3 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index d2427071..91958486 100644
--- a/README.md
+++ b/README.md
@@ -1,10 +1,10 @@
 # Vosk Speech Recognition Toolkit
 
 Vosk is an offline open source speech recognition toolkit. It enables
-speech recognition models for 18 languages and dialects - English, Indian
+speech recognition for 20+ languages and dialects - English, Indian
 English, German, French, Spanish, Portuguese, Chinese, Russian, Turkish,
 Vietnamese, Italian, Dutch, Catalan, Arabic, Greek, Farsi, Filipino,
-Ukrainian.
+Ukrainian, Kazakh, Swedish, Japanese. More to come.
 
 Vosk models are small (50 Mb) but provide continuous large vocabulary
 transcription, zero-latency response with streaming API, reconfigurable
diff --git a/nodejs/README.md b/nodejs/README.md
index 0ac9d753..748e9e6c 100644
--- a/nodejs/README.md
+++ b/nodejs/README.md
@@ -2,18 +2,18 @@ This is an FFI-NAPI wrapper for the Vosk library.
 
 ## Usage
 
-It mostly follows Vosk interface, some methods are not yet fully implemented.
+Bindings mostly follow Vosk interface, some methods are not yet fully implemented.
 
-To use it you need to compile libvosk library, see Python module build
-instructions for details. You can find prebuilt library inside python
-wheel.
+See [demo folder](https://github.com/alphacep/vosk-api/tree/master/nodejs/demo) for
+details.
 
 ## About
 
 Vosk is an offline open source speech recognition toolkit. It enables
-speech recognition models for 17 languages and dialects - English, Indian
+speech recognition for 20+ languages and dialects - English, Indian
 English, German, French, Spanish, Portuguese, Chinese, Russian, Turkish,
-Vietnamese, Italian, Dutch, Catalan, Arabic, Greek, Farsi, Filipino.
+Vietnamese, Italian, Dutch, Catalan, Arabic, Greek, Farsi, Filipino,
+Ukrainian, Kazakh, Swedish, Japanese. More to come.
 
 Vosk models are small (50 Mb) but provide continuous large vocabulary
 transcription, zero-latency response with streaming API, reconfigurable
diff --git a/python/README.md b/python/README.md
index 300eaa6d..0a40ee79 100644
--- a/python/README.md
+++ b/python/README.md
@@ -1,9 +1,10 @@
 This is a Python module for Vosk.
 
 Vosk is an offline open source speech recognition toolkit. It enables
-speech recognition models for 17 languages and dialects - English, Indian
+speech recognition for 20+ languages and dialects - English, Indian
 English, German, French, Spanish, Portuguese, Chinese, Russian, Turkish,
-Vietnamese, Italian, Dutch, Catalan, Arabic, Greek, Farsi, Filipino.
+Vietnamese, Italian, Dutch, Catalan, Arabic, Greek, Farsi, Filipino,
+Ukrainian, Kazakh, Swedish, Japanese. More to come.
 
 Vosk models are small (50 Mb) but provide continuous large vocabulary
 transcription, zero-latency response with streaming API, reconfigurable

From a1eac015dc8b530c26fdb0ac05be24ea3bcf57c3 Mon Sep 17 00:00:00 2001
From: Nickolay Shmyrev <nshmyrev@gmail.com>
Date: Fri, 7 Jan 2022 16:27:57 +0100
Subject: [PATCH 11/25] Add Esperanto

---
 README.md        | 2 +-
 nodejs/README.md | 2 +-
 python/README.md | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 91958486..42af9932 100644
--- a/README.md
+++ b/README.md
@@ -4,7 +4,7 @@ Vosk is an offline open source speech recognition toolkit. It enables
 speech recognition for 20+ languages and dialects - English, Indian
 English, German, French, Spanish, Portuguese, Chinese, Russian, Turkish,
 Vietnamese, Italian, Dutch, Catalan, Arabic, Greek, Farsi, Filipino,
-Ukrainian, Kazakh, Swedish, Japanese. More to come.
+Ukrainian, Kazakh, Swedish, Japanese, Esperanto. More to come.
 
 Vosk models are small (50 Mb) but provide continuous large vocabulary
 transcription, zero-latency response with streaming API, reconfigurable
diff --git a/nodejs/README.md b/nodejs/README.md
index 748e9e6c..5603fae6 100644
--- a/nodejs/README.md
+++ b/nodejs/README.md
@@ -13,7 +13,7 @@ Vosk is an offline open source speech recognition toolkit. It enables
 speech recognition for 20+ languages and dialects - English, Indian
 English, German, French, Spanish, Portuguese, Chinese, Russian, Turkish,
 Vietnamese, Italian, Dutch, Catalan, Arabic, Greek, Farsi, Filipino,
-Ukrainian, Kazakh, Swedish, Japanese. More to come.
+Ukrainian, Kazakh, Swedish, Japanese, Esperanto. More to come.
 
 Vosk models are small (50 Mb) but provide continuous large vocabulary
 transcription, zero-latency response with streaming API, reconfigurable
diff --git a/python/README.md b/python/README.md
index 0a40ee79..b121e9a3 100644
--- a/python/README.md
+++ b/python/README.md
@@ -4,7 +4,7 @@ Vosk is an offline open source speech recognition toolkit. It enables
 speech recognition for 20+ languages and dialects - English, Indian
 English, German, French, Spanish, Portuguese, Chinese, Russian, Turkish,
 Vietnamese, Italian, Dutch, Catalan, Arabic, Greek, Farsi, Filipino,
-Ukrainian, Kazakh, Swedish, Japanese. More to come.
+Ukrainian, Kazakh, Swedish, Japanese, Esperanto. More to come.
 
 Vosk models are small (50 Mb) but provide continuous large vocabulary
 transcription, zero-latency response with streaming API, reconfigurable

From c32099705f66fb632dfae6a20ca2e185bce542ed Mon Sep 17 00:00:00 2001
From: Nickolay Shmyrev <nshmyrev@gmail.com>
Date: Fri, 7 Jan 2022 17:33:47 +0100
Subject: [PATCH 12/25] Fix branch name and add implib dump

---
 travis/Dockerfile.win        | 2 +-
 travis/Dockerfile.win32      | 2 +-
 travis/build-wheels-win.sh   | 4 ++--
 travis/build-wheels-win32.sh | 4 ++--
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/travis/Dockerfile.win b/travis/Dockerfile.win
index 89081c2b..4f00bfcc 100644
--- a/travis/Dockerfile.win
+++ b/travis/Dockerfile.win
@@ -55,7 +55,7 @@ RUN cd /opt/kaldi \
     && find . -name *.a -exec cp {} /opt/kaldi/local/lib \;
 
 RUN cd /opt/kaldi \
-    && git clone -b android-mix --single-branch https://github.com/alphacep/kaldi \
+    && git clone -b vosk-android --single-branch https://github.com/alphacep/kaldi \
     && cd kaldi/src \
     && CXX=x86_64-w64-mingw32-g++-posix CXXFLAGS="-O3 -ftree-vectorize -DFST_NO_DYNAMIC_LINKING" ./configure --shared --mingw=yes --use-cuda=no \
         --mathlib=OPENBLAS_CLAPACK \
diff --git a/travis/Dockerfile.win32 b/travis/Dockerfile.win32
index 59198d9a..5a478e52 100644
--- a/travis/Dockerfile.win32
+++ b/travis/Dockerfile.win32
@@ -54,7 +54,7 @@ RUN cd /opt/kaldi \
     && find . -name *.a -exec cp {} /opt/kaldi/local/lib \;
 
 RUN cd /opt/kaldi \
-    && git clone -b android-mix --single-branch https://github.com/alphacep/kaldi \
+    && git clone -b vosk-android --single-branch https://github.com/alphacep/kaldi \
     && cd kaldi/src \
     && CXX=i686-w64-mingw32-g++-posix CXXFLAGS="-O3 -ftree-vectorize -DFST_NO_DYNAMIC_LINKING" ./configure --shared --mingw=yes --use-cuda=no \
         --mathlib=OPENBLAS_CLAPACK \
diff --git a/travis/build-wheels-win.sh b/travis/build-wheels-win.sh
index 750b6dd7..02bf6efc 100755
--- a/travis/build-wheels-win.sh
+++ b/travis/build-wheels-win.sh
@@ -5,7 +5,7 @@ set -e -x
 cd /opt
 git clone https://github.com/alphacep/vosk-api
 cd vosk-api/src
-CXX=x86_64-w64-mingw32-g++-posix EXT=dll KALDI_ROOT=/opt/kaldi/kaldi OPENFST_ROOT=/opt/kaldi/local OPENBLAS_ROOT=/opt/kaldi/local make -j $(nproc)
+EXTRA_LDFLAGS=-Wl,--out-implib,libvosk.lib CXX=x86_64-w64-mingw32-g++-posix EXT=dll KALDI_ROOT=/opt/kaldi/kaldi OPENFST_ROOT=/opt/kaldi/local OPENBLAS_ROOT=/opt/kaldi/local make -j $(nproc)
 
 # Collect dependencies
 cp /usr/lib/gcc/x86_64-w64-mingw32/*-posix/libstdc++-6.dll /opt/vosk-api/src
@@ -14,7 +14,7 @@ cp /usr/x86_64-w64-mingw32/lib/libwinpthread-1.dll /opt/vosk-api/src
 
 # Copy dlls to output folder
 mkdir -p /io/wheelhouse/win64
-cp /opt/vosk-api/src/*.dll /io/wheelhouse/win64
+cp /opt/vosk-api/src/*.{dll,lib} /io/wheelhouse/win64
 
 # Build wheel and put to the output folder
 export VOSK_SOURCE=/opt/vosk-api
diff --git a/travis/build-wheels-win32.sh b/travis/build-wheels-win32.sh
index 2b934bd3..82af745e 100755
--- a/travis/build-wheels-win32.sh
+++ b/travis/build-wheels-win32.sh
@@ -5,7 +5,7 @@ set -e -x
 cd /opt
 git clone https://github.com/alphacep/vosk-api
 cd vosk-api/src
-CXX=i686-w64-mingw32-g++-posix EXT=dll KALDI_ROOT=/opt/kaldi/kaldi OPENFST_ROOT=/opt/kaldi/local OPENBLAS_ROOT=/opt/kaldi/local make -j $(nproc)
+EXTRA_LDFLAGS=-Wl,--out-implib,libvosk.lib CXX=i686-w64-mingw32-g++-posix EXT=dll KALDI_ROOT=/opt/kaldi/kaldi OPENFST_ROOT=/opt/kaldi/local OPENBLAS_ROOT=/opt/kaldi/local make -j $(nproc)
 
 # Copy dependencies
 cp /usr/lib/gcc/i686-w64-mingw32/*-posix/libstdc++-6.dll /opt/vosk-api/src
@@ -14,7 +14,7 @@ cp /usr/i686-w64-mingw32/lib/libwinpthread-1.dll /opt/vosk-api/src
 
 # Copy dlls to output folder
 mkdir -p /io/wheelhouse/win32
-cp /opt/vosk-api/src/*.dll /io/wheelhouse/win32
+cp /opt/vosk-api/src/*.{dll,lib} /io/wheelhouse/win32
 
 # Build wheel and put to the output folder
 export VOSK_SOURCE=/opt/vosk-api

From c6fab363e60943d3e8ec784ea9170b022f200880 Mon Sep 17 00:00:00 2001
From: Nickolay Shmyrev <nshmyrev@gmail.com>
Date: Sun, 9 Jan 2022 15:15:20 +0100
Subject: [PATCH 13/25] Don't close a channel which has not yet started

---
 src/batch_recognizer.cc | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/src/batch_recognizer.cc b/src/batch_recognizer.cc
index 28818692..ebc5a1bd 100644
--- a/src/batch_recognizer.cc
+++ b/src/batch_recognizer.cc
@@ -123,13 +123,14 @@ BatchRecognizer::~BatchRecognizer() {
 
 void BatchRecognizer::FinishStream(uint64_t id)
 {
-    Vector<BaseFloat> wave;
-    SubVector<BaseFloat> chunk(wave.Data(), 0);
-    dynamic_batcher_->Push(id, false, true, chunk);
-    streams_.erase(id);
+    if (streams_.find(id) != streams_.end()) {
+       Vector<BaseFloat> wave;
+       SubVector<BaseFloat> chunk(wave.Data(), 0);
+       dynamic_batcher_->Push(id, false, true, chunk);
+       streams_.erase(id);
+    }
 }
 
-
 void BatchRecognizer::PushLattice(uint64_t id, CompactLattice &clat, BaseFloat offset)
 {
     fst::ScaleLattice(fst::GraphLatticeScale(0.9), &clat);

From 9861be27876b4ded67806df8ec88474246a61e61 Mon Sep 17 00:00:00 2001
From: Nickolay Shmyrev <nshmyrev@gmail.com>
Date: Mon, 10 Jan 2022 20:02:17 +0100
Subject: [PATCH 14/25] Add libs as dependencies in Makefile

---
 src/Makefile | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index 9965db65..6ee41b69 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -55,6 +55,8 @@ LIBS= \
 	$(OPENFST_ROOT)/lib/libfst.a \
 	$(OPENFST_ROOT)/lib/libfstngram.a
 
+LDFLAGS =
+
 
 ifeq ($(HAVE_OPENBLAS_CLAPACK), 1)
     CFLAGS += -I$(OPENBLAS_ROOT)/include
@@ -67,11 +69,11 @@ endif
 
 ifeq ($(HAVE_MKL), 1)
     CFLAGS += -DHAVE_MKL=1 -I$(MKL_ROOT)/include
-    LIBS += -L$(MKL_ROOT)/lib/intel64 -Wl,-rpath=$(MKL_ROOT)/lib/intel64 -lmkl_rt -lmkl_intel_lp64 -lmkl_core -lmkl_sequential
+    LDFLAGS += -L$(MKL_ROOT)/lib/intel64 -Wl,-rpath=$(MKL_ROOT)/lib/intel64 -lmkl_rt -lmkl_intel_lp64 -lmkl_core -lmkl_sequential
 endif
 
 ifeq ($(HAVE_ACCELERATE), 1)
-    LIBS += -framework Accelerate
+    LDFLAGS += -framework Accelerate
 endif
 
 ifeq ($(HAVE_CUDA), 1)
@@ -83,14 +85,15 @@ ifeq ($(HAVE_CUDA), 1)
     LIBS := \
         $(KALDI_ROOT)/src/cudadecoder/kaldi-cudadecoder.a \
         $(KALDI_ROOT)/src/cudafeat/kaldi-cudafeat.a \
-        $(LIBS) \
-        -L$(CUDA_ROOT)/lib64 -lcuda -lcublas -lcusparse -lcudart -lcurand -lcufft -lcusolver -lnvToolsExt
+        $(LIBS)
+
+    LDFLAGS += -L$(CUDA_ROOT)/lib64 -lcuda -lcublas -lcusparse -lcudart -lcurand -lcufft -lcusolver -lnvToolsExt
 endif
 
 all: $(OUTDIR)/libvosk.$(EXT)
 
-$(OUTDIR)/libvosk.$(EXT): $(VOSK_SOURCES:%.cc=$(OUTDIR)/%.o)
-	$(CXX) --shared -s -o $@ $^ $(LIBS) -lm -latomic $(EXTRA_LDFLAGS)
+$(OUTDIR)/libvosk.$(EXT): $(VOSK_SOURCES:%.cc=$(OUTDIR)/%.o) $(LIBS)
+	$(CXX) --shared -s -o $@ $^ $(LDFLAGS) -lm -latomic $(EXTRA_LDFLAGS)
 
 $(OUTDIR)/%.o: %.cc $(VOSK_HEADERS)
 	$(CXX) $(CFLAGS) -c -o $@ $<

From 6f86944a06aef6289ad9f29e2502f80bc69f55af Mon Sep 17 00:00:00 2001
From: Nickolay Shmyrev <nshmyrev@gmail.com>
Date: Wed, 12 Jan 2022 01:10:04 +0100
Subject: [PATCH 15/25] Implement wave chunking for cuda decoder

---
 src/batch_recognizer.cc | 86 +++++++++++++++++++++++++++--------------
 src/batch_recognizer.h  |  4 ++
 2 files changed, 62 insertions(+), 28 deletions(-)

diff --git a/src/batch_recognizer.cc b/src/batch_recognizer.cc
index ebc5a1bd..f2d93b05 100644
--- a/src/batch_recognizer.cc
+++ b/src/batch_recognizer.cc
@@ -31,22 +31,6 @@ BatchRecognizer::BatchRecognizer() {
     batched_decoder_config.Register(&po);
     po.ReadConfigFile("model/conf/model.conf");
 
-    batched_decoder_config.num_worker_threads = -1;
-    batched_decoder_config.max_batch_size = 32;
-    batched_decoder_config.num_channels = 600;
-    batched_decoder_config.reset_on_endpoint = true;
-    batched_decoder_config.use_gpu_feature_extraction = true;
-
-    batched_decoder_config.feature_opts.feature_type = "mfcc";
-    batched_decoder_config.feature_opts.mfcc_config = "model/conf/mfcc.conf";
-    batched_decoder_config.feature_opts.ivector_extraction_config = "model/conf/ivector.conf";
-    batched_decoder_config.decoder_opts.max_active = 7000;
-    batched_decoder_config.decoder_opts.default_beam = 13.0;
-    batched_decoder_config.decoder_opts.lattice_beam = 6.0;
-    batched_decoder_config.compute_opts.acoustic_scale = 1.0;
-    batched_decoder_config.compute_opts.frame_subsampling_factor = 3;
-    batched_decoder_config.compute_opts.frames_per_chunk = 51;
-
     struct stat buffer;
 
     string nnet3_rxfilename_ = "model/am/final.mdl";
@@ -93,7 +77,26 @@ BatchRecognizer::BatchRecognizer() {
         ReadKaldiObject(carpa_rxfilename_, &const_arpa_);
     }
 
+    batched_decoder_config.num_worker_threads = -1;
+    batched_decoder_config.max_batch_size = 32;
+    batched_decoder_config.num_channels = 600;
+    batched_decoder_config.reset_on_endpoint = true;
+    batched_decoder_config.use_gpu_feature_extraction = true;
 
+    batched_decoder_config.feature_opts.feature_type = "mfcc";
+    batched_decoder_config.feature_opts.mfcc_config = "model/conf/mfcc.conf";
+    batched_decoder_config.feature_opts.ivector_extraction_config = "model/conf/ivector.conf";
+    batched_decoder_config.decoder_opts.max_active = 7000;
+    batched_decoder_config.decoder_opts.default_beam = 13.0;
+    batched_decoder_config.decoder_opts.lattice_beam = 6.0;
+    batched_decoder_config.compute_opts.acoustic_scale = 1.0;
+    batched_decoder_config.compute_opts.frame_subsampling_factor = 3;
+
+    int32 nnet_left_context, nnet_right_context;
+    nnet3::ComputeSimpleNnetContext(nnet_->GetNnet(), &nnet_left_context,
+                                    &nnet_right_context);
+
+    batched_decoder_config.compute_opts.frames_per_chunk = std::max(51, (nnet_right_context + 3 - nnet_right_context % 3));
 
     cuda_pipeline_ = new BatchedThreadedNnet3CudaOnlinePipeline 
          (batched_decoder_config, *hclg_fst_, *nnet_, *trans_model_);
@@ -102,6 +105,8 @@ BatchRecognizer::BatchRecognizer() {
     CudaOnlinePipelineDynamicBatcherConfig dynamic_batcher_config;
     dynamic_batcher_ = new CudaOnlinePipelineDynamicBatcher(dynamic_batcher_config,
                                                             *cuda_pipeline_);
+
+    samples_per_chunk_ = batched_decoder_config.compute_opts.frames_per_chunk * 160;
 }
 
 BatchRecognizer::~BatchRecognizer() {
@@ -123,11 +128,16 @@ BatchRecognizer::~BatchRecognizer() {
 
 void BatchRecognizer::FinishStream(uint64_t id)
 {
-    if (streams_.find(id) != streams_.end()) {
-       Vector<BaseFloat> wave;
-       SubVector<BaseFloat> chunk(wave.Data(), 0);
-       dynamic_batcher_->Push(id, false, true, chunk);
+    if (streams_.find(id) != streams_.end()) {;
+       SubVector<BaseFloat> chunk = buffers_[id].Range(0, buffers_[id].Dim());
+
+       bool first = false;
+       if (initialized_.find(id) == initialized_.end())
+           first = true;
+       dynamic_batcher_->Push(id, first, true, chunk);
        streams_.erase(id);
+       buffers_.erase(id);
+       initialized_.erase(id);
     }
 }
 
@@ -173,11 +183,9 @@ void BatchRecognizer::PushLattice(uint64_t id, CompactLattice &clat, BaseFloat o
 
 void BatchRecognizer::AcceptWaveform(uint64_t id, const char *data, int len)
 {
-    bool first = false;
-
     if (streams_.find(id) == streams_.end()) {
-        first = true;
         streams_.insert(id);
+        buffers_[id] = Vector<BaseFloat>();
 
         // Define the callback for results.
 #if 0
@@ -212,13 +220,35 @@ void BatchRecognizer::AcceptWaveform(uint64_t id, const char *data, int len)
           CudaPipelineResult::RESULT_TYPE_LATTICE);
     }
 
-    Vector<BaseFloat> wave;
-    wave.Resize(len / 2, kUndefined);
+    // Collect data so we process exactly samples_per_chunk_
+    Vector<BaseFloat> &buf = buffers_[id];
+    int32 orig_size = buf.Dim();
+    buf.Resize(buf.Dim() + len / 2, kCopyData);
     for (int i = 0; i < len / 2; i++)
-        wave(i) = *(((short *)data) + i);
-    SubVector<BaseFloat> chunk(wave.Data(), wave.Dim());
+        buf(i + orig_size) = *(((short *)data) + i);
 
-    dynamic_batcher_->Push(id, first, false, chunk);
+    // Pick chunks
+    int32 i = 0;
+    while (i + samples_per_chunk_ <= buf.Dim()) {
+        SubVector<BaseFloat> chunk = buf.Range(i, samples_per_chunk_);
+
+        bool first = false;
+        if (initialized_.find(id) == initialized_.end()) {
+           first = true;
+           initialized_.insert(id);
+        }
+        dynamic_batcher_->Push(id, first, false, chunk);
+        i += samples_per_chunk_;
+    }
+
+    // Keep remaining data
+    if (i > 0) {
+        int32 remaining = buf.Dim() - i;
+        for (int j = 0; j < remaining; j++) {
+            buf(j) = buf(i + j);
+        }
+        buf.Resize(remaining, kCopyData);
+    }
 }
 
 const char* BatchRecognizer::FrontResult(uint64_t id)
diff --git a/src/batch_recognizer.h b/src/batch_recognizer.h
index f26dd54b..3d634e01 100644
--- a/src/batch_recognizer.h
+++ b/src/batch_recognizer.h
@@ -66,9 +66,13 @@ class BatchRecognizer {
         BatchedThreadedNnet3CudaOnlinePipeline *cuda_pipeline_ = nullptr;
         CudaOnlinePipelineDynamicBatcher *dynamic_batcher_ = nullptr;
 
+        // Input and output queues
+        int32 samples_per_chunk_;
 
         std::set<int> streams_;
+        std::set<int> initialized_;
         std::map<int, std::queue<std::string> > results_;
+        std::map<int, kaldi::Vector<BaseFloat> > buffers_;
 
         // Rescoring
         fst::ArcMapFst<fst::StdArc, LatticeArc, fst::StdToLatticeMapper<BaseFloat> > *lm_to_subtract_ = nullptr;

From 2135223490aac2b80dc5b4ad607edb40b97b429d Mon Sep 17 00:00:00 2001
From: Nickolay Shmyrev <nshmyrev@gmail.com>
Date: Wed, 12 Jan 2022 14:40:45 +0100
Subject: [PATCH 16/25] Put stream information in a single structure

---
 src/batch_recognizer.cc | 73 ++++++++++++++++++++---------------------
 src/batch_recognizer.h  | 13 +++++---
 2 files changed, 43 insertions(+), 43 deletions(-)

diff --git a/src/batch_recognizer.cc b/src/batch_recognizer.cc
index f2d93b05..b1215a07 100644
--- a/src/batch_recognizer.cc
+++ b/src/batch_recognizer.cc
@@ -106,7 +106,7 @@ BatchRecognizer::BatchRecognizer() {
     dynamic_batcher_ = new CudaOnlinePipelineDynamicBatcher(dynamic_batcher_config,
                                                             *cuda_pipeline_);
 
-    samples_per_chunk_ = batched_decoder_config.compute_opts.frames_per_chunk * 160;
+    samples_per_chunk_ = cuda_pipeline_->GetNSampsPerChunk();
 }
 
 BatchRecognizer::~BatchRecognizer() {
@@ -128,17 +128,14 @@ BatchRecognizer::~BatchRecognizer() {
 
 void BatchRecognizer::FinishStream(uint64_t id)
 {
-    if (streams_.find(id) != streams_.end()) {;
-       SubVector<BaseFloat> chunk = buffers_[id].Range(0, buffers_[id].Dim());
-
-       bool first = false;
-       if (initialized_.find(id) == initialized_.end())
-           first = true;
-       dynamic_batcher_->Push(id, first, true, chunk);
-       streams_.erase(id);
-       buffers_.erase(id);
-       initialized_.erase(id);
+    auto it = streams_.find(id);
+    if (it == streams_.end()) {
+        return;
     }
+
+    SubVector<BaseFloat> chunk = it->second.buffer.Range(0, it->second.buffer.Dim());
+    dynamic_batcher_->Push(id, !(it->second.initialized), true, chunk);
+    streams_.erase(it);
 }
 
 void BatchRecognizer::PushLattice(uint64_t id, CompactLattice &clat, BaseFloat offset)
@@ -178,15 +175,12 @@ void BatchRecognizer::PushLattice(uint64_t id, CompactLattice &clat, BaseFloat o
 
 //    KALDI_LOG << "Result " << id << " " << obj.dump();
 
-    results_[id].push(obj.dump());
+    streams_[id].results.push(obj.dump());
 }
 
 void BatchRecognizer::AcceptWaveform(uint64_t id, const char *data, int len)
 {
     if (streams_.find(id) == streams_.end()) {
-        streams_.insert(id);
-        buffers_[id] = Vector<BaseFloat>();
-
         // Define the callback for results.
 #if 0
          cuda_pipeline_->SetBestPathCallback(
@@ -219,52 +213,55 @@ void BatchRecognizer::AcceptWaveform(uint64_t id, const char *data, int len)
           },
           CudaPipelineResult::RESULT_TYPE_LATTICE);
     }
-
     // Collect data so we process exactly samples_per_chunk_
-    Vector<BaseFloat> &buf = buffers_[id];
-    int32 orig_size = buf.Dim();
-    buf.Resize(buf.Dim() + len / 2, kCopyData);
+    Vector<BaseFloat> &buffer = streams_[id].buffer;
+    int32 end = buffer.Dim();
+    buffer.Resize(end + len / 2, kCopyData);
     for (int i = 0; i < len / 2; i++)
-        buf(i + orig_size) = *(((short *)data) + i);
+        buffer(i + end) = *(((short *)data) + i);
+    end = buffer.Dim();
 
-    // Pick chunks
+    // Pick chunks and submit them to the batcher
     int32 i = 0;
-    while (i + samples_per_chunk_ <= buf.Dim()) {
-        SubVector<BaseFloat> chunk = buf.Range(i, samples_per_chunk_);
-
-        bool first = false;
-        if (initialized_.find(id) == initialized_.end()) {
-           first = true;
-           initialized_.insert(id);
-        }
-        dynamic_batcher_->Push(id, first, false, chunk);
+    while (i + samples_per_chunk_ <= end) {
+        dynamic_batcher_->Push(id, (!streams_[id].initialized), false,
+                                    buffer.Range(i, samples_per_chunk_));
+        streams_[id].initialized = true;
         i += samples_per_chunk_;
     }
 
     // Keep remaining data
     if (i > 0) {
-        int32 remaining = buf.Dim() - i;
-        for (int j = 0; j < remaining; j++) {
-            buf(j) = buf(i + j);
+        int32 tail = end - i;
+        for (int j = 0; j < tail; j++) {
+            buffer(j) = buffer(i + j);
         }
-        buf.Resize(remaining, kCopyData);
+        buffer.Resize(tail, kCopyData);
     }
 }
 
 const char* BatchRecognizer::FrontResult(uint64_t id)
 {
-    if (results_[id].empty()) {
+    auto it = streams_.find(id);
+    if (it == streams_.end()) {
+        return "";
+    }
+    if (it->second.results.empty()) {
         return "";
     }
-    return results_[id].front().c_str();
+    return it->second.results.front().c_str();
 }
 
 void BatchRecognizer::Pop(uint64_t id)
 {
-    if (results_[id].empty()) {
+    auto it = streams_.find(id);
+    if (it == streams_.end()) {
+        return;
+    }
+    if (it->second.results.empty()) {
         return;
     }
-    results_[id].pop();
+    it->second.results.pop();
 }
 
 void BatchRecognizer::WaitForCompletion()
diff --git a/src/batch_recognizer.h b/src/batch_recognizer.h
index 3d634e01..342bd860 100644
--- a/src/batch_recognizer.h
+++ b/src/batch_recognizer.h
@@ -51,6 +51,12 @@ class BatchRecognizer {
         int GetPendingChunks(uint64_t id);
 
     private:
+        struct Stream {
+            bool initialized = false;
+            std::queue<std::string> results;
+            kaldi::Vector<BaseFloat> buffer;
+        };
+
         void PushLattice(uint64_t id, CompactLattice &clat, BaseFloat offset);
 
         kaldi::TransitionModel *trans_model_ = nullptr;
@@ -66,13 +72,10 @@ class BatchRecognizer {
         BatchedThreadedNnet3CudaOnlinePipeline *cuda_pipeline_ = nullptr;
         CudaOnlinePipelineDynamicBatcher *dynamic_batcher_ = nullptr;
 
-        // Input and output queues
         int32 samples_per_chunk_;
 
-        std::set<int> streams_;
-        std::set<int> initialized_;
-        std::map<int, std::queue<std::string> > results_;
-        std::map<int, kaldi::Vector<BaseFloat> > buffers_;
+        // Per-stream state (input buffer, result queue, initialized flag) keyed by stream id
+        std::map<int, Stream> streams_;
 
         // Rescoring
         fst::ArcMapFst<fst::StdArc, LatticeArc, fst::StdToLatticeMapper<BaseFloat> > *lm_to_subtract_ = nullptr;

From b0903413b109e2b9690ed384a17d483f14affe09 Mon Sep 17 00:00:00 2001
From: Nickolay Shmyrev <nshmyrev@gmail.com>
Date: Fri, 21 Jan 2022 13:19:38 +0100
Subject: [PATCH 17/25] Set soname for Android library

---
 android/lib/build-vosk.sh | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/android/lib/build-vosk.sh b/android/lib/build-vosk.sh
index e455fa46..7be5f58c 100755
--- a/android/lib/build-vosk.sh
+++ b/android/lib/build-vosk.sh
@@ -123,7 +123,13 @@ make -j 8 online2 lm rnnlm
 # Vosk-api
 cd $WORKDIR
 mkdir -p $WORKDIR/vosk
-make -j 8 -C ${WORKDIR_BASE}/../../../src OUTDIR=$WORKDIR/vosk KALDI_ROOT=${WORKDIR}/kaldi OPENFST_ROOT=${WORKDIR}/local OPENBLAS_ROOT=${WORKDIR}/local CXX=$CXX EXTRA_LDFLAGS="-llog -static-libstdc++"
+make -j 8 -C ${WORKDIR_BASE}/../../../src \
+    OUTDIR=$WORKDIR/vosk \
+    KALDI_ROOT=${WORKDIR}/kaldi \
+    OPENFST_ROOT=${WORKDIR}/local \
+    OPENBLAS_ROOT=${WORKDIR}/local \
+    CXX=$CXX \
+    EXTRA_LDFLAGS="-llog -static-libstdc++ -Wl,-soname,libvosk.so"
 cp $WORKDIR/vosk/libvosk.so $WORKDIR/../../src/main/jniLibs/$arch/libvosk.so
 
 done

From d2c11a611f50639641719a0c0b85838abeaa5109 Mon Sep 17 00:00:00 2001
From: Nickolay Shmyrev <nshmyrev@gmail.com>
Date: Sun, 30 Jan 2022 22:57:36 +0100
Subject: [PATCH 18/25] Read list of files from arguments

---
 python/example/test_gpu_batch.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/example/test_gpu_batch.py b/python/example/test_gpu_batch.py
index 3a65bda8..0ad9e288 100755
--- a/python/example/test_gpu_batch.py
+++ b/python/example/test_gpu_batch.py
@@ -14,7 +14,8 @@
 
 rec = BatchRecognizer()
 
-fnames = open("tedlium.list").readlines()
+# Read the list of input file paths (one per line) from the file named by the first argument
+fnames = open(sys.argv[1]).readlines()
 fds = [open(x.strip(), "rb") for x in fnames]
 uids = [fname.strip().split('/')[-1][:-4] for fname in fnames]
 results = [""] * len(fnames)

From 79b8395be055a9398fbd8f2105b0321fb186ebff Mon Sep 17 00:00:00 2001
From: Nickolay Shmyrev <nshmyrev@gmail.com>
Date: Thu, 3 Feb 2022 23:08:09 +0100
Subject: [PATCH 19/25] Add NLSML output

---
 python/example/test_nlsml.py | 31 +++++++++++++++++
 python/vosk/__init__.py      |  3 ++
 src/recognizer.cc            | 65 +++++++++++++++++++++++++++++++++++-
 src/recognizer.h             |  3 ++
 src/vosk_api.cc              |  5 +++
 src/vosk_api.h               |  6 ++++
 6 files changed, 112 insertions(+), 1 deletion(-)
 create mode 100755 python/example/test_nlsml.py

diff --git a/python/example/test_nlsml.py b/python/example/test_nlsml.py
new file mode 100755
index 00000000..18132093
--- /dev/null
+++ b/python/example/test_nlsml.py
@@ -0,0 +1,31 @@
+#!/usr/bin/env python3
+
+from vosk import Model, KaldiRecognizer, SetLogLevel
+import sys
+import os
+import wave
+
+SetLogLevel(0)
+
+if not os.path.exists("model"):
+    print ("Please download the model from https://alphacephei.com/vosk/models and unpack as 'model' in the current folder.")
+    exit (1)
+
+wf = wave.open(sys.argv[1], "rb")
+if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
+    print ("Audio file must be WAV format mono PCM.")
+    exit (1)
+
+model = Model("model")
+rec = KaldiRecognizer(model, wf.getframerate())
+rec.SetMaxAlternatives(10)
+rec.SetNLSML(True)
+
+while True:
+    data = wf.readframes(4000)
+    if len(data) == 0:
+        break
+    if rec.AcceptWaveform(data):
+        print(rec.Result())
+
+print(rec.FinalResult())
diff --git a/python/vosk/__init__.py b/python/vosk/__init__.py
index 0e60c2ba..d8e384b9 100644
--- a/python/vosk/__init__.py
+++ b/python/vosk/__init__.py
@@ -69,6 +69,9 @@ def SetMaxAlternatives(self, max_alternatives):
     def SetWords(self, enable_words):
         _c.vosk_recognizer_set_words(self._handle, 1 if enable_words else 0)
 
+    def SetNLSML(self, enable_nlsml):
+        _c.vosk_recognizer_set_nlsml(self._handle, 1 if enable_nlsml else 0)
+
     def SetSpkModel(self, spk_model):
         _c.vosk_recognizer_set_spk_model(self._handle, spk_model._handle)
 
diff --git a/src/recognizer.cc b/src/recognizer.cc
index f25ff0ee..cfcf638a 100644
--- a/src/recognizer.cc
+++ b/src/recognizer.cc
@@ -246,6 +246,11 @@ void Recognizer::SetWords(bool words)
     words_ = words;
 }
 
+void Recognizer::SetNLSML(bool nlsml)
+{
+    nlsml_ = nlsml;
+}
+
 void Recognizer::SetSpkModel(SpkModel *spk_model)
 {
     if (state_ == RECOGNIZER_RUNNING) {
@@ -534,7 +539,6 @@ const char *Recognizer::NbestResult(CompactLattice &clat)
     fst::ConvertNbestToVector(nbest_lat, &nbest_lats);
 
     json::JSON obj;
-    std::stringstream ss;
     for (int k = 0; k < nbest_lats.size(); k++) {
 
       Lattice nlat = nbest_lats[k];
@@ -584,6 +588,63 @@ const char *Recognizer::NbestResult(CompactLattice &clat)
     return StoreReturn(obj.dump());
 }
 
+const char *Recognizer::NlsmlResult(CompactLattice &clat)
+{
+    Lattice lat;
+    Lattice nbest_lat;
+    std::vector<Lattice> nbest_lats;
+
+    ConvertLattice (clat, &lat);
+    fst::ShortestPath(lat, &nbest_lat, max_alternatives_);
+    fst::ConvertNbestToVector(nbest_lat, &nbest_lats);
+
+    std::stringstream ss;
+    ss << "<?xml version=\"1.0\"?>\n";
+    ss << "<result grammar=\"default\">\n";
+
+    for (int k = 0; k < nbest_lats.size(); k++) {
+
+      Lattice nlat = nbest_lats[k];
+
+      CompactLattice nclat;
+      fst::Invert(&nlat);
+      DeterminizeLattice(nlat, &nclat);
+
+      CompactLattice aligned_nclat;
+      if (model_->winfo_) {
+          WordAlignLattice(nclat, *model_->trans_model_, *model_->winfo_, 0, &aligned_nclat);
+      } else {
+          aligned_nclat = nclat;
+      }
+
+      std::vector<int32> words;
+      std::vector<int32> begin_times;
+      std::vector<int32> lengths;
+      CompactLattice::Weight weight;
+
+      CompactLatticeToWordAlignmentWeight(aligned_nclat, &words, &begin_times, &lengths, &weight);
+      float likelihood = -(weight.Weight().Value1() + weight.Weight().Value2());
+
+      stringstream text;
+      for (int i = 0; i < words.size(); i++) {
+        json::JSON word;
+        if (words[i] == 0)
+            continue;
+        if (i)
+          text << " ";
+        text << model_->word_syms_->Find(words[i]);
+      }
+
+      ss << "<interpretation grammar=\"default\" confidence=\"" << likelihood << "\">\n";
+      ss << "<input mode=\"speech\">" << text.str() << "</input>\n";
+      ss << "<instance>" << text.str() << "</instance>\n";
+      ss << "</interpretation>\n";
+    }
+    ss << "</result>\n";
+
+    return StoreReturn(ss.str());
+}
+
 const char* Recognizer::GetResult()
 {
     if (decoder_->NumFramesDecoded() == 0) {
@@ -638,6 +699,8 @@ const char* Recognizer::GetResult()
 
     if (max_alternatives_ == 0) {
         return MbrResult(rlat);
+    } else if (nlsml_) {
+        return NlsmlResult(rlat);
     } else {
         return NbestResult(rlat);
     }
diff --git a/src/recognizer.h b/src/recognizer.h
index e5a733d1..b0338a01 100644
--- a/src/recognizer.h
+++ b/src/recognizer.h
@@ -49,6 +49,7 @@ class Recognizer {
         void SetMaxAlternatives(int max_alternatives);
         void SetSpkModel(SpkModel *spk_model);
         void SetWords(bool words);
+        void SetNLSML(bool nlsml);
         bool AcceptWaveform(const char *data, int len);
         bool AcceptWaveform(const short *sdata, int len);
         bool AcceptWaveform(const float *fdata, int len);
@@ -69,6 +70,7 @@ class Recognizer {
         const char *StoreReturn(const string &res);
         const char *MbrResult(CompactLattice &clat);
         const char *NbestResult(CompactLattice &clat);
+        const char *NlsmlResult(CompactLattice &clat);
 
         Model *model_ = nullptr;
         SingleUtteranceNnet3Decoder *decoder_ = nullptr;
@@ -94,6 +96,7 @@ class Recognizer {
         // Other
         int max_alternatives_ = 0; // Disable alternatives by default
         bool words_ = false;
+        bool nlsml_ = false;
 
         float sample_frequency_;
         int32 frame_offset_;
diff --git a/src/vosk_api.cc b/src/vosk_api.cc
index 65356038..5df70715 100644
--- a/src/vosk_api.cc
+++ b/src/vosk_api.cc
@@ -103,6 +103,11 @@ void vosk_recognizer_set_words(VoskRecognizer *recognizer, int words)
     ((Recognizer *)recognizer)->SetWords((bool)words);
 }
 
+void vosk_recognizer_set_nlsml(VoskRecognizer *recognizer, int nlsml)
+{
+    ((Recognizer *)recognizer)->SetNLSML((bool)nlsml);
+}
+
 void vosk_recognizer_set_spk_model(VoskRecognizer *recognizer, VoskSpkModel *spk_model)
 {
     if (recognizer == nullptr || spk_model == nullptr) {
diff --git a/src/vosk_api.h b/src/vosk_api.h
index f6a981cb..c448087f 100644
--- a/src/vosk_api.h
+++ b/src/vosk_api.h
@@ -191,6 +191,12 @@ void vosk_recognizer_set_max_alternatives(VoskRecognizer *recognizer, int max_al
 void vosk_recognizer_set_words(VoskRecognizer *recognizer, int words);
 
 
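+/** Enable or disable NLSML (XML) result output; used only when max alternatives is non-zero
+ * @param nlsml - nonzero to enable NLSML output, zero to disable it
+ */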
+void vosk_recognizer_set_nlsml(VoskRecognizer *recognizer, int nlsml);
+
+
 /** Accept voice data
  *
  *  accept and process new chunk of voice data

From a561c2d6d4e0023fc06a6c9a4da1265f56690754 Mon Sep 17 00:00:00 2001
From: Nickolay Shmyrev <nshmyrev@gmail.com>
Date: Thu, 3 Feb 2022 23:26:53 +0100
Subject: [PATCH 20/25] Don't add space before string

---
 src/recognizer.cc | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/src/recognizer.cc b/src/recognizer.cc
index cfcf638a..bd671b43 100644
--- a/src/recognizer.cc
+++ b/src/recognizer.cc
@@ -565,7 +565,7 @@ const char *Recognizer::NbestResult(CompactLattice &clat)
       stringstream text;
       json::JSON entry;
 
-      for (int i = 0; i < words.size(); i++) {
+      for (int i = 0, first = 1; i < words.size(); i++) {
         json::JSON word;
         if (words[i] == 0)
             continue;
@@ -575,8 +575,12 @@ const char *Recognizer::NbestResult(CompactLattice &clat)
             word["end"] = samples_round_start_ / sample_frequency_ + (frame_offset_ + begin_times[i] + lengths[i]) * 0.03;
             entry["result"].append(word);
         }
-        if (i)
+
+        if (first)
+          first = 0;
+        else
           text << " ";
+
         text << model_->word_syms_->Find(words[i]);
       }
 
@@ -626,12 +630,15 @@ const char *Recognizer::NlsmlResult(CompactLattice &clat)
       float likelihood = -(weight.Weight().Value1() + weight.Weight().Value2());
 
       stringstream text;
-      for (int i = 0; i < words.size(); i++) {
-        json::JSON word;
+      for (int i = 0, first = 1; i < words.size(); i++) {
         if (words[i] == 0)
             continue;
-        if (i)
+
+        if (first)
+          first = 0;
+        else
           text << " ";
+
         text << model_->word_syms_->Find(words[i]);
       }
 

From f574d896e9346b610f6928d4f372f76269681018 Mon Sep 17 00:00:00 2001
From: Nickolay Shmyrev <nshmyrev@gmail.com>
Date: Thu, 3 Feb 2022 23:43:00 +0100
Subject: [PATCH 21/25] Empty result should also be XML

---
 src/recognizer.cc | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/src/recognizer.cc b/src/recognizer.cc
index bd671b43..d49b5748 100644
--- a/src/recognizer.cc
+++ b/src/recognizer.cc
@@ -797,6 +797,14 @@ const char *Recognizer::StoreEmptyReturn()
 {
     if (!max_alternatives_) {
         return StoreReturn("{\"text\": \"\"}");
+    } else if (nlsml_) {
+        return StoreReturn("<?xml version=\"1.0\"?>\n"
+                           "<result grammar=\"default\">\n"
+                           "<interpretation confidence=\"1.0\">\n"
+                           "<instance/>\n"
+                           "<input><noinput/></input>\n"
+                           "</interpretation>\n"
+                           "</result>\n");
     } else {
         return StoreReturn("{\"alternatives\" : [{\"text\": \"\", \"confidence\" : 1.0}] }");
     }

From 1f447a8dfc0e371211d404892861631b41cf1630 Mon Sep 17 00:00:00 2001
From: Nickolay Shmyrev <nshmyrev@gmail.com>
Date: Thu, 10 Feb 2022 20:52:55 +0100
Subject: [PATCH 22/25] Rename according to Kaldi changes

---
 src/batch_recognizer.cc | 4 ++--
 src/batch_recognizer.h  | 2 +-
 src/vosk_api.cc         | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/batch_recognizer.cc b/src/batch_recognizer.cc
index b1215a07..46a0c097 100644
--- a/src/batch_recognizer.cc
+++ b/src/batch_recognizer.cc
@@ -269,7 +269,7 @@ void BatchRecognizer::WaitForCompletion()
     dynamic_batcher_->WaitForCompletion();
 }
 
-int BatchRecognizer::GetPendingChunks(uint64_t id)
+int BatchRecognizer::GetNumPendingChunks(uint64_t id)
 {
-    return dynamic_batcher_->GetPendingChunks(id);
+    return dynamic_batcher_->GetNumPendingChunks(id);
 }
diff --git a/src/batch_recognizer.h b/src/batch_recognizer.h
index 342bd860..c6d90ae0 100644
--- a/src/batch_recognizer.h
+++ b/src/batch_recognizer.h
@@ -48,7 +48,7 @@ class BatchRecognizer {
         const char *FrontResult(uint64_t id);
         void Pop(uint64_t id);
         void WaitForCompletion();
-        int GetPendingChunks(uint64_t id);
+        int GetNumPendingChunks(uint64_t id);
 
     private:
         struct Stream {
diff --git a/src/vosk_api.cc b/src/vosk_api.cc
index 5df70715..c7e75403 100644
--- a/src/vosk_api.cc
+++ b/src/vosk_api.cc
@@ -246,7 +246,7 @@ void vosk_batch_recognizer_wait(VoskBatchRecognizer *recognizer)
 int vosk_batch_recognizer_get_pending_chunks(VoskBatchRecognizer *recognizer, int id)
 {
 #if HAVE_CUDA
-    return ((BatchRecognizer *)recognizer)->GetPendingChunks(id);
+    return ((BatchRecognizer *)recognizer)->GetNumPendingChunks(id);
 #else
     return 0;
 #endif

From b63df75c300855fff891f92ef197809ceb79d6c9 Mon Sep 17 00:00:00 2001
From: mulhod <mulhodm@gmail.com>
Date: Fri, 11 Feb 2022 23:46:06 -0500
Subject: [PATCH 23/25] Change KaldiRecognizer reference to Recognizer in
 src/recognizer.cc

---
 src/recognizer.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/recognizer.cc b/src/recognizer.cc
index b14392bd..50d84d0e 100644
--- a/src/recognizer.cc
+++ b/src/recognizer.cc
@@ -593,7 +593,7 @@ void ComputePhoneInfo(const TransitionModel &tmodel, const CompactLattice &clat,
     
 }
 
-const char *KaldiRecognizer::WordandPhoneResult(CompactLattice &rlat)
+const char *Recognizer::WordandPhoneResult(CompactLattice &rlat)
 {
     //Computes aligned word and phone-level results without MBR decoding
     CompactLattice aligned_lat;

From de0bec8ca2e8e6da87ac924982f47fbd9782a487 Mon Sep 17 00:00:00 2001
From: mulhod <mulhodm@gmail.com>
Date: Fri, 11 Feb 2022 23:49:36 -0500
Subject: [PATCH 24/25] Change KaldiRecognizer reference to Recognizer in
 src/vosk_api.cc

---
 src/vosk_api.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/vosk_api.cc b/src/vosk_api.cc
index c8b6f15b..3166cc0f 100644
--- a/src/vosk_api.cc
+++ b/src/vosk_api.cc
@@ -100,7 +100,7 @@ void vosk_recognizer_set_max_alternatives(VoskRecognizer *recognizer, int max_al
 
 void vosk_recognizer_set_result_options(VoskRecognizer *recognizer, const char *result_opts)
 {
-    ((KaldiRecognizer *)recognizer)->SetResultOptions(result_opts);
+    ((Recognizer *)recognizer)->SetResultOptions(result_opts);
 }
 
 void vosk_recognizer_set_words(VoskRecognizer *recognizer, int words)

From 9732d1da5c153c10c3684ee75d4fc320a850cabb Mon Sep 17 00:00:00 2001
From: mulhod <mulhodm@gmail.com>
Date: Sat, 12 Feb 2022 00:30:56 -0500
Subject: [PATCH 25/25] Increment version to 0.3.42

---
 python/setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/setup.py b/python/setup.py
index 502be6c8..39c8082e 100644
--- a/python/setup.py
+++ b/python/setup.py
@@ -44,7 +44,7 @@ def get_tag(self):
 
 setuptools.setup(
     name="vosk",
-    version="0.3.41",
+    version="0.3.42",
     author="Educational Testing Service",
     author_email="rubale@ets.org",
     description="Offline open source speech recognition API based on Kaldi and Vosk with additional features from ETS",