From 6977be7fb78a82dbae7c80eeab360f46c2035c5e Mon Sep 17 00:00:00 2001 From: Nickolay Shmyrev Date: Sun, 12 Dec 2021 21:37:44 +0100 Subject: [PATCH 01/25] Batch recognizer draft --- python/example/batch/test_batch.py | 26 +++++ python/vosk/__init__.py | 20 ++++ src/Makefile | 11 ++- src/batch_recognizer.cc | 107 +++++++++++++++++++++ src/batch_recognizer.h | 67 +++++++++++++ src/model.h | 6 +- src/{kaldi_recognizer.cc => recognizer.cc} | 52 +++++----- src/{kaldi_recognizer.h => recognizer.h} | 14 +-- src/spk_model.h | 4 +- src/vosk_api.cc | 57 ++++++++--- src/vosk_api.h | 24 +++++ 11 files changed, 333 insertions(+), 55 deletions(-) create mode 100755 python/example/batch/test_batch.py create mode 100644 src/batch_recognizer.cc create mode 100644 src/batch_recognizer.h rename src/{kaldi_recognizer.cc => recognizer.cc} (93%) rename src/{kaldi_recognizer.h => recognizer.h} (91%) diff --git a/python/example/batch/test_batch.py b/python/example/batch/test_batch.py new file mode 100755 index 00000000..fb1bb7e9 --- /dev/null +++ b/python/example/batch/test_batch.py @@ -0,0 +1,26 @@ +#!/usr/bin/env python3 + +from vosk import Model, BatchRecognizer +import sys +import os +import wave + +model = Model("model") +rec = BatchRecognizer(model, 16000.0) + +fnames = open("tedlium.list").readlines() +fds = [open(x) for x in fnames] +ended = set() +while True: + for i, fd in fds: + if i in ended(): + continue + data = fd.read(4000) + if len(data) == 0: + rec.FinishStream(i) + ended.add(i) + else: + rec.AcceptWaveform(i, data) + rec.Results() + if len(ended) == len(fds): + break diff --git a/python/vosk/__init__.py b/python/vosk/__init__.py index cf39a472..02e1df97 100644 --- a/python/vosk/__init__.py +++ b/python/vosk/__init__.py @@ -101,3 +101,23 @@ def GpuInit(): def GpuThreadInit(): _c.vosk_gpu_thread_init() + +class BatchRecognizer(object): + + def __init__(self, *args): + self._handle = _c.vosk_batch_recognizer_new(args[0]._handle, args[1]) + + if self._handle == _ffi.NULL: + raise Exception("Failed to create a recognizer") + + def __del__(self): + _c.vosk_batch_recognizer_free(self._handle) + + def AcceptWaveform(self, uid, data): + res = _c.vosk_batch_recognizer_accept_waveform(self._handle, uid, data, len(data)) + + def Results(self): + return _ffi.string(_c.vosk_batch_recognizer_result(self._handle)).decode('utf-8') + + def FinishStream(self, uid): + _c.vosk_recognizer_final_result(self._handle, uid) diff --git a/src/Makefile b/src/Makefile index 54e96ca7..96c21949 100644 --- a/src/Makefile +++ b/src/Makefile @@ -18,16 +18,18 @@ EXTRA_LDFLAGS?= OUTDIR?=. VOSK_SOURCES= \ - kaldi_recognizer.cc \ + recognizer.cc \ language_model.cc \ model.cc \ spk_model.cc \ + batch_recognizer.cc \ vosk_api.cc VOSK_HEADERS= \ - kaldi_recognizer.h \ + recognizer.h \ language_model.h \ model.h \ + batch_recognizer.h \ spk_model.h \ vosk_api.h @@ -76,7 +78,10 @@ endif ifeq ($(HAVE_CUDA), 1) CFLAGS+=-DHAVE_CUDA=1 -I$(CUDA_ROOT)/include - LIBS+=-L$(CUDA_ROOT)/lib64 -lcuda -lcublas -lcusparse -lcudart -lcurand -lcufft -lcusolver -lnvToolsExt + LIBS+=\ + $(KALDI_ROOT)/src/cudadecoder/kaldi-cudadecoder.a \ + $(KALDI_ROOT)/src/cudafeat/kaldi-cudafeat.a \ + -L$(CUDA_ROOT)/lib64 -lcuda -lcublas -lcusparse -lcudart -lcurand -lcufft -lcusolver -lnvToolsExt endif all: $(OUTDIR)/libvosk.$(EXT) diff --git a/src/batch_recognizer.cc b/src/batch_recognizer.cc new file mode 100644 index 00000000..112f1fe9 --- /dev/null +++ b/src/batch_recognizer.cc @@ -0,0 +1,107 @@ +// Copyright 2019-2020 Alpha Cephei Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "batch_recognizer.h" + +#include "fstext/fstext-utils.h" +#include "lat/sausages.h" + +using namespace fst; +using namespace kaldi::nnet3; +using CorrelationID = CudaOnlinePipelineDynamicBatcher::CorrelationID; + +BatchRecognizer::BatchRecognizer(Model *model, float sample_frequency) : model_(model), sample_frequency_(sample_frequency) { + model_->Ref(); + + BatchedThreadedNnet3CudaOnlinePipelineConfig batched_decoder_config; + + cuda_pipeline_ = new BatchedThreadedNnet3CudaOnlinePipeline + (batched_decoder_config, *model_->hclg_fst_, *model_->nnet_, *model_->trans_model_); + + CudaOnlinePipelineDynamicBatcherConfig dynamic_batcher_config; + dynamic_batcher_ = new CudaOnlinePipelineDynamicBatcher(dynamic_batcher_config, + *cuda_pipeline_); + + InitRescoring(); +} + +BatchRecognizer::~BatchRecognizer() { + delete lm_to_subtract_; + delete carpa_to_add_; + delete carpa_to_add_scale_; + + delete cuda_pipeline_; + delete dynamic_batcher_; + + model_->Unref(); +} + +void BatchRecognizer::InitRescoring() +{ + if (model_->graph_lm_fst_) { + fst::CacheOptions cache_opts(true, -1); + fst::ArcMapFstOptions mapfst_opts(cache_opts); + fst::StdToLatticeMapper mapper; + lm_to_subtract_ = new fst::ArcMapFst >(*model_->graph_lm_fst_, mapper, mapfst_opts); + carpa_to_add_ = new ConstArpaLmDeterministicFst(model_->const_arpa_); + } +} + +void BatchRecognizer::FinishStream(uint64_t id) +{ + streams_.erase(id); +} + +void BatchRecognizer::AcceptWaveform(uint64_t id, const char *data, int len) +{ + bool first = false; + + if (streams_.find(id) == streams_.end()) { + first = true; + streams_.insert(id); + + // Define the callback for results. + cuda_pipeline_->SetBestPathCallback( + id, + [&, id](const std::string &str, bool partial, + bool endpoint_detected) { + if (partial) { + KALDI_LOG << "id #" << id << " [partial] : " << str; + } + + if (endpoint_detected) { + KALDI_LOG << "id #" << id << " [endpoint detected]"; + } + + if (!partial) { + KALDI_LOG << "id #" << id << " : " << str; + } + }); + } + + Vector wave; + wave.Resize(len / 2, kUndefined); + for (int i = 0; i < len / 2; i++) + wave(i) = *(((short *)data) + i); + SubVector chunk(wave.Data(), 0); + + dynamic_batcher_->Push(id, first, false, chunk); +} + +const char* BatchRecognizer::PullResults() +{ + dynamic_batcher_->WaitForCompletion(); + cudaDeviceSynchronize(); + return ""; +} diff --git a/src/batch_recognizer.h b/src/batch_recognizer.h new file mode 100644 index 00000000..00f4a0db --- /dev/null +++ b/src/batch_recognizer.h @@ -0,0 +1,67 @@ +// Copyright 2019 Alpha Cephei Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef VOSK_GPU_RECOGNIZER_H +#define VOSK_GPU_RECOGNIZER_H + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "fstext/fstext-lib.h" +#include "fstext/fstext-utils.h" +#include "decoder/lattice-faster-decoder.h" +#include "feat/feature-mfcc.h" +#include "lat/kaldi-lattice.h" +#include "lat/word-align-lattice.h" +#include "lat/compose-lattice-pruned.h" +#include "nnet3/am-nnet-simple.h" +#include "nnet3/nnet-am-decodable-simple.h" +#include "nnet3/nnet-utils.h" + +#include "cudadecoder/cuda-online-pipeline-dynamic-batcher.h" +#include "cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.h" +#include "cudadecoder/batched-threaded-nnet3-cuda-pipeline2.h" +#include "cudadecoder/cuda-pipeline-common.h" + +#include "model.h" + +using namespace kaldi; +using namespace kaldi::cuda_decoder; + +class BatchRecognizer { + public: + BatchRecognizer(Model *model, float sample_frequency); + ~BatchRecognizer(); + + void FinishStream(uint64_t id); + void AcceptWaveform(uint64_t id, const char *data, int len); + const char* PullResults(); + + private: + void InitRescoring(); + + Model *model_ = nullptr; + BatchedThreadedNnet3CudaOnlinePipeline *cuda_pipeline_ = nullptr; + CudaOnlinePipelineDynamicBatcher *dynamic_batcher_ = nullptr; + + std::set streams_; + + // Rescoring + fst::ArcMapFst > *lm_to_subtract_ = nullptr; + kaldi::ConstArpaLmDeterministicFst *carpa_to_add_ = nullptr; + fst::ScaleDeterministicOnDemandFst *carpa_to_add_scale_ = nullptr; + + float sample_frequency_; +}; + +#endif /* VOSK_GPU_RECOGNIZER_H */ diff --git a/src/model.h b/src/model.h index d5feedd0..c36a96aa 100644 --- a/src/model.h +++ b/src/model.h @@ -36,7 +36,8 @@ using namespace kaldi; using namespace std; -class KaldiRecognizer; +class Recognizer; +class BatchRecognizer; class Model { @@ -52,7 +53,8 @@ class Model { void ConfigureV2(); void ReadDataFiles(); - friend class KaldiRecognizer; + friend class Recognizer; + friend class BatchRecognizer; string model_path_str_; string nnet3_rxfilename_; diff --git a/src/kaldi_recognizer.cc b/src/recognizer.cc similarity index 93% rename from src/kaldi_recognizer.cc rename to src/recognizer.cc index 86cf9bdd..f25ff0ee 100644 --- a/src/kaldi_recognizer.cc +++ b/src/recognizer.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "kaldi_recognizer.h" +#include "recognizer.h" #include "json.h" #include "fstext/fstext-utils.h" #include "lat/sausages.h" @@ -21,7 +21,7 @@ using namespace fst; using namespace kaldi::nnet3; -KaldiRecognizer::KaldiRecognizer(Model *model, float sample_frequency) : model_(model), spk_model_(0), sample_frequency_(sample_frequency) { +Recognizer::Recognizer(Model *model, float sample_frequency) : model_(model), spk_model_(0), sample_frequency_(sample_frequency) { model_->Ref(); @@ -46,7 +46,7 @@ KaldiRecognizer::KaldiRecognizer(Model *model, float sample_frequency) : model_( InitRescoring(); } -KaldiRecognizer::KaldiRecognizer(Model *model, float sample_frequency, char const *grammar) : model_(model), spk_model_(0), sample_frequency_(sample_frequency) +Recognizer::Recognizer(Model *model, float sample_frequency, char const *grammar) : model_(model), spk_model_(0), sample_frequency_(sample_frequency) { model_->Ref(); @@ -107,7 +107,7 @@ KaldiRecognizer::KaldiRecognizer(Model *model, float sample_frequency, char cons InitRescoring(); } -KaldiRecognizer::KaldiRecognizer(Model *model, float sample_frequency, SpkModel *spk_model) : model_(model), spk_model_(spk_model), sample_frequency_(sample_frequency) { +Recognizer::Recognizer(Model *model, float sample_frequency, SpkModel *spk_model) : model_(model), spk_model_(spk_model), sample_frequency_(sample_frequency) { model_->Ref(); spk_model->Ref(); @@ -135,7 +135,7 @@ KaldiRecognizer::KaldiRecognizer(Model *model, float sample_frequency, SpkModel InitRescoring(); } -KaldiRecognizer::~KaldiRecognizer() { +Recognizer::~Recognizer() { delete decoder_; delete feature_pipeline_; delete silence_weighting_; @@ -155,7 +155,7 @@ KaldiRecognizer::~KaldiRecognizer() { spk_model_->Unref(); } -void KaldiRecognizer::InitState() +void Recognizer::InitState() { frame_offset_ = 0; samples_processed_ = 0; @@ -164,7 +164,7 @@ void KaldiRecognizer::InitState() state_ = RECOGNIZER_INITIALIZED; } -void KaldiRecognizer::InitRescoring() +void Recognizer::InitRescoring() { if (model_->graph_lm_fst_) { @@ -185,7 +185,7 @@ void KaldiRecognizer::InitRescoring() } } -void KaldiRecognizer::CleanUp() +void Recognizer::CleanUp() { delete silence_weighting_; silence_weighting_ = new kaldi::OnlineSilenceWeighting(*model_->trans_model_, model_->feature_info_.silence_weighting_config, 3); @@ -223,7 +223,7 @@ void KaldiRecognizer::CleanUp() } } -void KaldiRecognizer::UpdateSilenceWeights() +void Recognizer::UpdateSilenceWeights() { if (silence_weighting_->Active() && feature_pipeline_->NumFramesReady() > 0 && feature_pipeline_->IvectorFeature() != nullptr) { @@ -236,17 +236,17 @@ void KaldiRecognizer::UpdateSilenceWeights() } } -void KaldiRecognizer::SetMaxAlternatives(int max_alternatives) +void Recognizer::SetMaxAlternatives(int max_alternatives) { max_alternatives_ = max_alternatives; } -void KaldiRecognizer::SetWords(bool words) +void Recognizer::SetWords(bool words) { words_ = words; } -void KaldiRecognizer::SetSpkModel(SpkModel *spk_model) +void Recognizer::SetSpkModel(SpkModel *spk_model) { if (state_ == RECOGNIZER_RUNNING) { KALDI_ERR << "Can't add speaker model to already running recognizer"; @@ -257,7 +257,7 @@ void KaldiRecognizer::SetSpkModel(SpkModel *spk_model) spk_feature_ = new OnlineMfcc(spk_model_->spkvector_mfcc_opts); } -bool KaldiRecognizer::AcceptWaveform(const char *data, int len) +bool Recognizer::AcceptWaveform(const char *data, int len) { Vector wave; wave.Resize(len / 2, kUndefined); @@ -266,7 +266,7 @@ bool KaldiRecognizer::AcceptWaveform(const char *data, int len) return AcceptWaveform(wave); } -bool KaldiRecognizer::AcceptWaveform(const short *sdata, int len) +bool Recognizer::AcceptWaveform(const short *sdata, int len) { Vector wave; wave.Resize(len, kUndefined); @@ -275,7 +275,7 @@ bool KaldiRecognizer::AcceptWaveform(const short *sdata, int len) return AcceptWaveform(wave); } -bool KaldiRecognizer::AcceptWaveform(const float *fdata, int len) +bool Recognizer::AcceptWaveform(const float *fdata, int len) { Vector wave; wave.Resize(len, kUndefined); @@ -284,7 +284,7 @@ bool KaldiRecognizer::AcceptWaveform(const float *fdata, int len) return AcceptWaveform(wave); } -bool KaldiRecognizer::AcceptWaveform(Vector &wdata) +bool Recognizer::AcceptWaveform(Vector &wdata) { // Cleanup if we finalized previous utterance or the whole feature pipeline if (!(state_ == RECOGNIZER_RUNNING || state_ == RECOGNIZER_INITIALIZED)) { @@ -343,7 +343,7 @@ static void RunNnetComputation(const MatrixBase &features, #define MIN_SPK_FEATS 50 -bool KaldiRecognizer::GetSpkVector(Vector &out_xvector, int *num_spk_frames) +bool Recognizer::GetSpkVector(Vector &out_xvector, int *num_spk_frames) { vector nonsilence_frames; if (silence_weighting_->Active() && feature_pipeline_->NumFramesReady() > 0) { @@ -409,7 +409,7 @@ bool KaldiRecognizer::GetSpkVector(Vector &out_xvector, int *num_spk_ } -const char *KaldiRecognizer::MbrResult(CompactLattice &rlat) +const char *Recognizer::MbrResult(CompactLattice &rlat) { CompactLattice aligned_lat; if (model_->winfo_) { @@ -523,7 +523,7 @@ static bool CompactLatticeToWordAlignmentWeight(const CompactLattice &clat, } -const char *KaldiRecognizer::NbestResult(CompactLattice &clat) +const char *Recognizer::NbestResult(CompactLattice &clat) { Lattice lat; Lattice nbest_lat; @@ -584,7 +584,7 @@ const char *KaldiRecognizer::NbestResult(CompactLattice &clat) return StoreReturn(obj.dump()); } -const char* KaldiRecognizer::GetResult() +const char* Recognizer::GetResult() { if (decoder_->NumFramesDecoded() == 0) { return StoreEmptyReturn(); @@ -645,7 +645,7 @@ const char* KaldiRecognizer::GetResult() } -const char* KaldiRecognizer::PartialResult() +const char* Recognizer::PartialResult() { if (state_ != RECOGNIZER_RUNNING) { return StoreEmptyReturn(); @@ -676,7 +676,7 @@ const char* KaldiRecognizer::PartialResult() return StoreReturn(res.dump()); } -const char* KaldiRecognizer::Result() +const char* Recognizer::Result() { if (state_ != RECOGNIZER_RUNNING) { return StoreEmptyReturn(); @@ -686,7 +686,7 @@ const char* KaldiRecognizer::Result() return GetResult(); } -const char* KaldiRecognizer::FinalResult() +const char* Recognizer::FinalResult() { if (state_ != RECOGNIZER_RUNNING) { return StoreEmptyReturn(); @@ -714,7 +714,7 @@ const char* KaldiRecognizer::FinalResult() return last_result_.c_str(); } -void KaldiRecognizer::Reset() +void Recognizer::Reset() { if (state_ == RECOGNIZER_RUNNING) { decoder_->FinalizeDecoding(); @@ -723,7 +723,7 @@ void KaldiRecognizer::Reset() state_ = RECOGNIZER_ENDPOINT; } -const char *KaldiRecognizer::StoreEmptyReturn() +const char *Recognizer::StoreEmptyReturn() { if (!max_alternatives_) { return StoreReturn("{\"text\": \"\"}"); @@ -733,7 +733,7 @@ const char *KaldiRecognizer::StoreEmptyReturn() } // Store result in recognizer and return as const string -const char *KaldiRecognizer::StoreReturn(const string &res) +const char *Recognizer::StoreReturn(const string &res) { last_result_ = res; return last_result_.c_str(); diff --git a/src/kaldi_recognizer.h b/src/recognizer.h similarity index 91% rename from src/kaldi_recognizer.h rename to src/recognizer.h index 934e237e..e5a733d1 100644 --- a/src/kaldi_recognizer.h +++ b/src/recognizer.h @@ -33,19 +33,19 @@ using namespace kaldi; -enum KaldiRecognizerState { +enum RecognizerState { RECOGNIZER_INITIALIZED, RECOGNIZER_RUNNING, RECOGNIZER_ENDPOINT, RECOGNIZER_FINALIZED }; -class KaldiRecognizer { +class Recognizer { public: - KaldiRecognizer(Model *model, float sample_frequency); - KaldiRecognizer(Model *model, float sample_frequency, SpkModel *spk_model); - KaldiRecognizer(Model *model, float sample_frequency, char const *grammar); - ~KaldiRecognizer(); + Recognizer(Model *model, float sample_frequency); + Recognizer(Model *model, float sample_frequency, SpkModel *spk_model); + Recognizer(Model *model, float sample_frequency, char const *grammar); + ~Recognizer(); void SetMaxAlternatives(int max_alternatives); void SetSpkModel(SpkModel *spk_model); void SetWords(bool words); @@ -101,7 +101,7 @@ class KaldiRecognizer { int64 samples_processed_; int64 samples_round_start_; - KaldiRecognizerState state_; + RecognizerState state_; string last_result_; }; diff --git a/src/spk_model.h b/src/spk_model.h index 07cbd4b0..9a76c62a 100644 --- a/src/spk_model.h +++ b/src/spk_model.h @@ -22,7 +22,7 @@ using namespace kaldi; -class KaldiRecognizer; +class Recognizer; class SpkModel { @@ -32,7 +32,7 @@ class SpkModel { void Unref(); protected: - friend class KaldiRecognizer; + friend class Recognizer; ~SpkModel() {}; kaldi::nnet3::Nnet speaker_nnet; diff --git a/src/vosk_api.cc b/src/vosk_api.cc index ba76a73b..2c5b3b82 100644 --- a/src/vosk_api.cc +++ b/src/vosk_api.cc @@ -13,7 +13,9 @@ // limitations under the License. #include "vosk_api.h" -#include "kaldi_recognizer.h" + +#include "recognizer.h" +#include "batch_recognizer.h" #include "model.h" #include "spk_model.h" @@ -67,7 +69,7 @@ void vosk_spk_model_free(VoskSpkModel *model) VoskRecognizer *vosk_recognizer_new(VoskModel *model, float sample_rate) { try { - return (VoskRecognizer *)new KaldiRecognizer((Model *)model, sample_rate); + return (VoskRecognizer *)new Recognizer((Model *)model, sample_rate); } catch (...) { return nullptr; } @@ -76,7 +78,7 @@ VoskRecognizer *vosk_recognizer_new(VoskModel *model, float sample_rate) VoskRecognizer *vosk_recognizer_new_spk(VoskModel *model, float sample_rate, VoskSpkModel *spk_model) { try { - return (VoskRecognizer *)new KaldiRecognizer((Model *)model, sample_rate, (SpkModel *)spk_model); + return (VoskRecognizer *)new Recognizer((Model *)model, sample_rate, (SpkModel *)spk_model); } catch (...) { return nullptr; } @@ -85,7 +87,7 @@ VoskRecognizer *vosk_recognizer_new_spk(VoskModel *model, float sample_rate, Vos VoskRecognizer *vosk_recognizer_new_grm(VoskModel *model, float sample_rate, const char *grammar) { try { - return (VoskRecognizer *)new KaldiRecognizer((Model *)model, sample_rate, grammar); + return (VoskRecognizer *)new Recognizer((Model *)model, sample_rate, grammar); } catch (...) { return nullptr; } @@ -93,12 +95,12 @@ VoskRecognizer *vosk_recognizer_new_grm(VoskModel *model, float sample_rate, con void vosk_recognizer_set_max_alternatives(VoskRecognizer *recognizer, int max_alternatives) { - ((KaldiRecognizer *)recognizer)->SetMaxAlternatives(max_alternatives); + ((Recognizer *)recognizer)->SetMaxAlternatives(max_alternatives); } void vosk_recognizer_set_words(VoskRecognizer *recognizer, int words) { - ((KaldiRecognizer *)recognizer)->SetWords((bool)words); + ((Recognizer *)recognizer)->SetWords((bool)words); } void vosk_recognizer_set_spk_model(VoskRecognizer *recognizer, VoskSpkModel *spk_model) @@ -106,13 +108,13 @@ void vosk_recognizer_set_spk_model(VoskRecognizer *recognizer, VoskSpkModel *spk if (recognizer == nullptr || spk_model == nullptr) { return; } - ((KaldiRecognizer *)recognizer)->SetSpkModel((SpkModel *)spk_model); + ((Recognizer *)recognizer)->SetSpkModel((SpkModel *)spk_model); } int vosk_recognizer_accept_waveform(VoskRecognizer *recognizer, const char *data, int length) { try { - return ((KaldiRecognizer *)(recognizer))->AcceptWaveform(data, length); + return ((Recognizer *)(recognizer))->AcceptWaveform(data, length); } catch (...) { return -1; } @@ -121,7 +123,7 @@ int vosk_recognizer_accept_waveform(VoskRecognizer *recognizer, const char *data int vosk_recognizer_accept_waveform_s(VoskRecognizer *recognizer, const short *data, int length) { try { - return ((KaldiRecognizer *)(recognizer))->AcceptWaveform(data, length); + return ((Recognizer *)(recognizer))->AcceptWaveform(data, length); } catch (...) { return -1; } @@ -130,7 +132,7 @@ int vosk_recognizer_accept_waveform_s(VoskRecognizer *recognizer, const short *d int vosk_recognizer_accept_waveform_f(VoskRecognizer *recognizer, const float *data, int length) { try { - return ((KaldiRecognizer *)(recognizer))->AcceptWaveform(data, length); + return ((Recognizer *)(recognizer))->AcceptWaveform(data, length); } catch (...) { return -1; } @@ -138,27 +140,27 @@ int vosk_recognizer_accept_waveform_f(VoskRecognizer *recognizer, const float *d const char *vosk_recognizer_result(VoskRecognizer *recognizer) { - return ((KaldiRecognizer *)recognizer)->Result(); + return ((Recognizer *)recognizer)->Result(); } const char *vosk_recognizer_partial_result(VoskRecognizer *recognizer) { - return ((KaldiRecognizer *)recognizer)->PartialResult(); + return ((Recognizer *)recognizer)->PartialResult(); } const char *vosk_recognizer_final_result(VoskRecognizer *recognizer) { - return ((KaldiRecognizer *)recognizer)->FinalResult(); + return ((Recognizer *)recognizer)->FinalResult(); } void vosk_recognizer_reset(VoskRecognizer *recognizer) { - ((KaldiRecognizer *)recognizer)->Reset(); + ((Recognizer *)recognizer)->Reset(); } void vosk_recognizer_free(VoskRecognizer *recognizer) { - delete (KaldiRecognizer *)(recognizer); + delete (Recognizer *)(recognizer); } void vosk_set_log_level(int log_level) @@ -180,3 +182,28 @@ void vosk_gpu_thread_init() kaldi::CuDevice::Instantiate(); #endif } + +VoskBatchRecognizer *vosk_batch_recognizer_new(VoskModel *model, float sample_frequency) +{ + return (VoskBatchRecognizer *)(new BatchRecognizer((Model *)model, sample_frequency)); +} + +void vosk_batch_recognizer_free(VoskBatchRecognizer *recognizer) +{ + delete ((BatchRecognizer *)recognizer); +} + +void vosk_batch_recognizer_accept_waveform(VoskRecognizer *recognizer, int id, const char *data, int length) +{ + ((BatchRecognizer *)recognizer)->AcceptWaveform(id, data, length); +} + +void vosk_batch_recognizer_finish_stream(VoskRecognizer *recognizer, int id) +{ + ((BatchRecognizer *)recognizer)->FinishStream(id); +} + +const char *vosk_batch_recognizer_results(VoskBatchRecognizer *recognizer) +{ + return ((BatchRecognizer *)recognizer)->PullResults(); +} diff --git a/src/vosk_api.h b/src/vosk_api.h index 7636caa6..df951858 100644 --- a/src/vosk_api.h +++ b/src/vosk_api.h @@ -39,6 +39,10 @@ typedef struct VoskSpkModel VoskSpkModel; * speaker information and so on */ typedef struct VoskRecognizer VoskRecognizer; +/** + * Batch recognizer object + */ +typedef struct VoskBatchRecognizer VoskBatchRecognizer; /** Loads model data from the file and returns the model object * @@ -285,6 +289,26 @@ void vosk_gpu_init(); */ void vosk_gpu_thread_init(); +/** Creates the batch recognizer object + * The recognizers process the speech and return text using shared model data + * @param model VoskModel containing static data for recognizer. Model can be + * shared across recognizers, even running in different threads. + * @returns recognizer object or NULL if problem occured */ +VoskBatchRecognizer *vosk_batch_recognizer_new(VoskModel *model, float sample_frequency); + +/** Releases batch recognizer object + * Underlying model is also unreferenced and if needed released */ +void vosk_batch_recognizer_free(VoskBatchRecognizer *recognizer); + +/** Accept batch voice data */ +void vosk_batch_recognizer_accept_waveform(VoskRecognizer *recognizer, int id, const char *data, int length); + +/** Closes the stream */ +void vosk_batch_recognizer_finish_stream(VoskRecognizer *recognizer, int id); + +/** Return results */ +const char *vosk_batch_recognizer_results(VoskBatchRecognizer *recognizer); + #ifdef __cplusplus } #endif From 344e137a61f81887afc974027a93500c0c986436 Mon Sep 17 00:00:00 2001 From: Nickolay Shmyrev Date: Mon, 13 Dec 2021 01:21:59 +0100 Subject: [PATCH 02/25] Decoding works, results are empty yet --- python/example/batch/test_batch.py | 13 ++++++++----- python/vosk/__init__.py | 4 ++-- src/Makefile | 8 ++++---- src/batch_recognizer.cc | 18 +++++++++++++++++- src/model.cc | 6 +++--- src/vosk_api.cc | 4 ++-- src/vosk_api.h | 4 ++-- 7 files changed, 38 insertions(+), 19 deletions(-) diff --git a/python/example/batch/test_batch.py b/python/example/batch/test_batch.py index fb1bb7e9..3fadab6a 100755 --- a/python/example/batch/test_batch.py +++ b/python/example/batch/test_batch.py @@ -1,21 +1,24 @@ #!/usr/bin/env python3 -from vosk import Model, BatchRecognizer +from vosk import Model, BatchRecognizer, GpuInit, GpuThreadInit import sys import os import wave +GpuInit() +GpuThreadInit() + model = Model("model") rec = BatchRecognizer(model, 16000.0) fnames = open("tedlium.list").readlines() -fds = [open(x) for x in fnames] +fds = [open(x.strip(), "rb") for x in fnames] ended = set() while True: - for i, fd in fds: - if i in ended(): + for i, fd in enumerate(fds): + if i in ended: continue - data = fd.read(4000) + data = fd.read(16000) if len(data) == 0: rec.FinishStream(i) ended.add(i) diff --git a/python/vosk/__init__.py b/python/vosk/__init__.py index 02e1df97..9e25229c 100644 --- a/python/vosk/__init__.py +++ b/python/vosk/__init__.py @@ -117,7 +117,7 @@ def AcceptWaveform(self, uid, data): res = _c.vosk_batch_recognizer_accept_waveform(self._handle, uid, data, len(data)) def Results(self): - return _ffi.string(_c.vosk_batch_recognizer_result(self._handle)).decode('utf-8') + return _ffi.string(_c.vosk_batch_recognizer_results(self._handle)).decode('utf-8') def FinishStream(self, uid): - _c.vosk_recognizer_final_result(self._handle, uid) + _c.vosk_batch_recognizer_finish_stream(self._handle, uid) diff --git a/src/Makefile b/src/Makefile index 96c21949..823a4aaf 100644 --- a/src/Makefile +++ b/src/Makefile @@ -37,17 +37,19 @@ CFLAGS=-g -O3 -std=c++17 -Wno-deprecated-declarations -fPIC -DFST_NO_DYNAMIC_LIN -I. -I$(KALDI_ROOT)/src -I$(OPENFST_ROOT)/include $(EXTRA_CFLAGS) LIBS= \ + $(KALDI_ROOT)/src/cudadecoder/kaldi-cudadecoder.a \ + $(KALDI_ROOT)/src/cudafeat/kaldi-cudafeat.a \ $(KALDI_ROOT)/src/online2/kaldi-online2.a \ $(KALDI_ROOT)/src/decoder/kaldi-decoder.a \ $(KALDI_ROOT)/src/ivector/kaldi-ivector.a \ $(KALDI_ROOT)/src/gmm/kaldi-gmm.a \ - $(KALDI_ROOT)/src/nnet3/kaldi-nnet3.a \ $(KALDI_ROOT)/src/tree/kaldi-tree.a \ $(KALDI_ROOT)/src/feat/kaldi-feat.a \ $(KALDI_ROOT)/src/lat/kaldi-lat.a \ $(KALDI_ROOT)/src/lm/kaldi-lm.a \ $(KALDI_ROOT)/src/rnnlm/kaldi-rnnlm.a \ $(KALDI_ROOT)/src/hmm/kaldi-hmm.a \ + $(KALDI_ROOT)/src/nnet3/kaldi-nnet3.a \ $(KALDI_ROOT)/src/transform/kaldi-transform.a \ $(KALDI_ROOT)/src/cudamatrix/kaldi-cudamatrix.a \ $(KALDI_ROOT)/src/matrix/kaldi-matrix.a \ @@ -68,7 +70,7 @@ ifeq ($(HAVE_OPENBLAS_CLAPACK), 1) endif ifeq ($(HAVE_MKL), 1) - CFLAGS += -I$(MKL_ROOT)/include + CFLAGS += -DHAVE_MKL=1 -I$(MKL_ROOT)/include LIBS += -L$(MKL_ROOT)/lib/intel64 -Wl,-rpath=$(MKL_ROOT)/lib/intel64 -lmkl_rt -lmkl_intel_lp64 -lmkl_core -lmkl_sequential endif @@ -79,8 +81,6 @@ endif ifeq ($(HAVE_CUDA), 1) CFLAGS+=-DHAVE_CUDA=1 -I$(CUDA_ROOT)/include LIBS+=\ - $(KALDI_ROOT)/src/cudadecoder/kaldi-cudadecoder.a \ - $(KALDI_ROOT)/src/cudafeat/kaldi-cudafeat.a \ -L$(CUDA_ROOT)/lib64 -lcuda -lcublas -lcusparse -lcudart -lcurand -lcufft -lcusolver -lnvToolsExt endif diff --git a/src/batch_recognizer.cc b/src/batch_recognizer.cc index 112f1fe9..969a62aa 100644 --- a/src/batch_recognizer.cc +++ b/src/batch_recognizer.cc @@ -25,9 +25,22 @@ BatchRecognizer::BatchRecognizer(Model *model, float sample_frequency) : model_( model_->Ref(); BatchedThreadedNnet3CudaOnlinePipelineConfig batched_decoder_config; + batched_decoder_config.num_worker_threads = 4; + batched_decoder_config.max_batch_size = 100; + + batched_decoder_config.feature_opts.feature_type = "mfcc"; + batched_decoder_config.feature_opts.mfcc_config = "model/conf/mfcc.conf"; + batched_decoder_config.feature_opts.ivector_extraction_config = "model/conf/ivector.conf"; + batched_decoder_config.decoder_opts.max_active = 7000; + batched_decoder_config.decoder_opts.default_beam = 13.0; + batched_decoder_config.decoder_opts.lattice_beam = 8.0; + batched_decoder_config.compute_opts.acoustic_scale = 1.0; + batched_decoder_config.compute_opts.frame_subsampling_factor = 3; + batched_decoder_config.compute_opts.frames_per_chunk = 312; cuda_pipeline_ = new BatchedThreadedNnet3CudaOnlinePipeline (batched_decoder_config, *model_->hclg_fst_, *model_->nnet_, *model_->trans_model_); + cuda_pipeline_->SetSymbolTable(*model_->word_syms_); CudaOnlinePipelineDynamicBatcherConfig dynamic_batcher_config; dynamic_batcher_ = new CudaOnlinePipelineDynamicBatcher(dynamic_batcher_config, @@ -60,6 +73,9 @@ void BatchRecognizer::InitRescoring() void BatchRecognizer::FinishStream(uint64_t id) { + Vector wave; + SubVector chunk(wave.Data(), 0); + dynamic_batcher_->Push(id, false, true, chunk); streams_.erase(id); } @@ -77,7 +93,7 @@ void BatchRecognizer::AcceptWaveform(uint64_t id, const char *data, int len) [&, id](const std::string &str, bool partial, bool endpoint_detected) { if (partial) { - KALDI_LOG << "id #" << id << " [partial] : " << str; + KALDI_LOG << "id #" << id << " [partial] : " << str << ":"; } if (endpoint_detected) { diff --git a/src/model.cc b/src/model.cc index 8b5e12cc..eecaed97 100644 --- a/src/model.cc +++ b/src/model.cc @@ -241,9 +241,9 @@ void Model::ReadDataFiles() SetDropoutTestMode(true, &(nnet_->GetNnet())); nnet3::CollapseModel(nnet3::CollapseModelConfig(), &(nnet_->GetNnet())); } - decodable_info_ = new nnet3::DecodableNnetSimpleLoopedInfo(decodable_opts_, - nnet_); +/* decodable_info_ = new nnet3::DecodableNnetSimpleLoopedInfo(decodable_opts_, + nnet_); if (stat(final_ie_rxfilename_.c_str(), &buffer) == 0) { KALDI_LOG << "Loading i-vector extractor from " << final_ie_rxfilename_; @@ -261,7 +261,7 @@ void Model::ReadDataFiles() } else { feature_info_.use_ivectors = false; } - +*/ if (stat(global_cmvn_stats_rxfilename_.c_str(), &buffer) == 0) { KALDI_LOG << "Reading CMVN stats from " << global_cmvn_stats_rxfilename_; feature_info_.use_cmvn = true; diff --git a/src/vosk_api.cc b/src/vosk_api.cc index 2c5b3b82..f95adc07 100644 --- a/src/vosk_api.cc +++ b/src/vosk_api.cc @@ -193,12 +193,12 @@ void vosk_batch_recognizer_free(VoskBatchRecognizer *recognizer) delete ((BatchRecognizer *)recognizer); } -void vosk_batch_recognizer_accept_waveform(VoskRecognizer *recognizer, int id, const char *data, int length) +void vosk_batch_recognizer_accept_waveform(VoskBatchRecognizer *recognizer, int id, const char *data, int length) { ((BatchRecognizer *)recognizer)->AcceptWaveform(id, data, length); } -void vosk_batch_recognizer_finish_stream(VoskRecognizer *recognizer, int id) +void vosk_batch_recognizer_finish_stream(VoskBatchRecognizer *recognizer, int id) { ((BatchRecognizer *)recognizer)->FinishStream(id); } diff --git a/src/vosk_api.h b/src/vosk_api.h index df951858..e085afe7 100644 --- a/src/vosk_api.h +++ b/src/vosk_api.h @@ -301,10 +301,10 @@ VoskBatchRecognizer *vosk_batch_recognizer_new(VoskModel *model, float sample_fr void vosk_batch_recognizer_free(VoskBatchRecognizer *recognizer); /** Accept batch voice data */ -void vosk_batch_recognizer_accept_waveform(VoskRecognizer *recognizer, int id, const char *data, int length); +void vosk_batch_recognizer_accept_waveform(VoskBatchRecognizer *recognizer, int id, const char *data, int length); /** Closes the stream */ -void vosk_batch_recognizer_finish_stream(VoskRecognizer *recognizer, int id); +void vosk_batch_recognizer_finish_stream(VoskBatchRecognizer *recognizer, int id); /** Return results */ const char *vosk_batch_recognizer_results(VoskBatchRecognizer *recognizer); From 60f0396fe0647d57b73ff59e51f09bba69c54ad5 Mon Sep 17 00:00:00 2001 From: Nickolay Shmyrev Date: Fri, 17 Dec 2021 01:13:09 +0100 Subject: [PATCH 03/25] Reset lattice on endpoint --- python/example/batch/test_batch.py | 5 +- python/vosk/__init__.py | 2 +- src/batch_recognizer.cc | 95 +++++++++++++++++++++++------- src/batch_recognizer.h | 15 ++++- src/model.cc | 4 +- src/vosk_api.cc | 4 +- src/vosk_api.h | 6 +- 7 files changed, 94 insertions(+), 37 deletions(-) diff --git a/python/example/batch/test_batch.py b/python/example/batch/test_batch.py index 3fadab6a..f93eb6ea 100755 --- a/python/example/batch/test_batch.py +++ b/python/example/batch/test_batch.py @@ -8,8 +8,7 @@ GpuInit() GpuThreadInit() -model = Model("model") -rec = BatchRecognizer(model, 16000.0) +rec = BatchRecognizer() fnames = open("tedlium.list").readlines() fds = [open(x.strip(), "rb") for x in fnames] @@ -18,7 +17,7 @@ for i, fd in enumerate(fds): if i in ended: continue - data = fd.read(16000) + data = fd.read(8000) if len(data) == 0: rec.FinishStream(i) ended.add(i) diff --git a/python/vosk/__init__.py b/python/vosk/__init__.py index 9e25229c..964a0ac2 100644 --- a/python/vosk/__init__.py +++ b/python/vosk/__init__.py @@ -105,7 +105,7 @@ def GpuThreadInit(): class BatchRecognizer(object): def __init__(self, *args): - self._handle = _c.vosk_batch_recognizer_new(args[0]._handle, args[1]) + self._handle = _c.vosk_batch_recognizer_new() if self._handle == _ffi.NULL: raise Exception("Failed to create a recognizer") diff --git a/src/batch_recognizer.cc b/src/batch_recognizer.cc index 969a62aa..184fb8a2 100644 --- a/src/batch_recognizer.cc +++ b/src/batch_recognizer.cc @@ -17,16 +17,22 @@ #include "fstext/fstext-utils.h" #include "lat/sausages.h" +#include + using namespace fst; using namespace kaldi::nnet3; using CorrelationID = CudaOnlinePipelineDynamicBatcher::CorrelationID; -BatchRecognizer::BatchRecognizer(Model *model, float sample_frequency) : model_(model), sample_frequency_(sample_frequency) { - model_->Ref(); - +BatchRecognizer::BatchRecognizer() { BatchedThreadedNnet3CudaOnlinePipelineConfig batched_decoder_config; + + kaldi::ParseOptions po("something"); + batched_decoder_config.Register(&po); + po.ReadConfigFile("model/conf/model.conf"); + batched_decoder_config.num_worker_threads = 4; batched_decoder_config.max_batch_size = 100; + batched_decoder_config.reset_on_endpoint = true; batched_decoder_config.feature_opts.feature_type = "mfcc"; batched_decoder_config.feature_opts.mfcc_config = "model/conf/mfcc.conf"; @@ -38,37 +44,78 @@ BatchRecognizer::BatchRecognizer(Model *model, float sample_frequency) : model_( batched_decoder_config.compute_opts.frame_subsampling_factor = 3; batched_decoder_config.compute_opts.frames_per_chunk = 312; + struct stat buffer; + + string nnet3_rxfilename_ = "model/am/final.mdl"; + string hclg_fst_rxfilename_ = "model/graph/HCLG.fst"; + string word_syms_rxfilename_ = "model/graph/words.txt"; + string winfo_rxfilename_ = "model/graph/phones/word_boundary.int"; + string std_fst_rxfilename_ = "model/rescore/G.fst"; + string carpa_rxfilename_ = "model/rescore/G.carpa"; + + trans_model_ = new kaldi::TransitionModel(); + nnet_ = new kaldi::nnet3::AmNnetSimple(); + { + bool binary; + kaldi::Input ki(nnet3_rxfilename_, &binary); + trans_model_->Read(ki.Stream(), binary); + nnet_->Read(ki.Stream(), binary); + SetBatchnormTestMode(true, &(nnet_->GetNnet())); + SetDropoutTestMode(true, &(nnet_->GetNnet())); + nnet3::CollapseModel(nnet3::CollapseModelConfig(), &(nnet_->GetNnet())); + } + + if (stat(hclg_fst_rxfilename_.c_str(), &buffer) == 0) { + KALDI_LOG << "Loading HCLG from " << hclg_fst_rxfilename_; + hclg_fst_ = fst::ReadFstKaldiGeneric(hclg_fst_rxfilename_); + } + + KALDI_LOG << "Loading words from " << word_syms_rxfilename_; + if (!(word_syms_ = fst::SymbolTable::ReadText(word_syms_rxfilename_))) { + KALDI_ERR << "Could not read symbol table from file " + << word_syms_rxfilename_; + } + KALDI_ASSERT(word_syms_); + + if (stat(winfo_rxfilename_.c_str(), &buffer) == 0) { + KALDI_LOG << "Loading winfo " << winfo_rxfilename_; + kaldi::WordBoundaryInfoNewOpts opts; + winfo_ = new kaldi::WordBoundaryInfo(opts, winfo_rxfilename_); + } + + if (stat(carpa_rxfilename_.c_str(), &buffer) == 0) { + KALDI_LOG << "Loading subtract G.fst model from " << std_fst_rxfilename_; + graph_lm_fst_ = fst::ReadAndPrepareLmFst(std_fst_rxfilename_); + KALDI_LOG << "Loading CARPA model from " << carpa_rxfilename_; + ReadKaldiObject(carpa_rxfilename_, &const_arpa_); + } + + + cuda_pipeline_ = new BatchedThreadedNnet3CudaOnlinePipeline - (batched_decoder_config, *model_->hclg_fst_, *model_->nnet_, *model_->trans_model_); - cuda_pipeline_->SetSymbolTable(*model_->word_syms_); + (batched_decoder_config, *hclg_fst_, *nnet_, *trans_model_); + cuda_pipeline_->SetSymbolTable(*word_syms_); CudaOnlinePipelineDynamicBatcherConfig dynamic_batcher_config; dynamic_batcher_ = new CudaOnlinePipelineDynamicBatcher(dynamic_batcher_config, *cuda_pipeline_); - - InitRescoring(); } BatchRecognizer::~BatchRecognizer() { + + delete trans_model_; + delete nnet_; + delete word_syms_; + delete winfo_; + delete hclg_fst_; + delete graph_lm_fst_; + delete lm_to_subtract_; delete carpa_to_add_; delete carpa_to_add_scale_; delete cuda_pipeline_; delete dynamic_batcher_; - - model_->Unref(); -} - -void BatchRecognizer::InitRescoring() -{ - if (model_->graph_lm_fst_) { - fst::CacheOptions cache_opts(true, -1); - fst::ArcMapFstOptions mapfst_opts(cache_opts); - fst::StdToLatticeMapper mapper; - lm_to_subtract_ = new fst::ArcMapFst >(*model_->graph_lm_fst_, mapper, mapfst_opts); - carpa_to_add_ = new ConstArpaLmDeterministicFst(model_->const_arpa_); - } } void BatchRecognizer::FinishStream(uint64_t id) @@ -104,13 +151,18 @@ void BatchRecognizer::AcceptWaveform(uint64_t id, const char *data, int len) KALDI_LOG << "id #" << id << " : " << str; } }); + cuda_pipeline_->SetLatticeCallback( + id, + [&, id](CompactLattice &clat) { + KALDI_LOG << "Got lattice from the stream " << id; + }); } Vector wave; wave.Resize(len / 2, kUndefined); for (int i = 0; i < len / 2; i++) wave(i) = *(((short *)data) + i); - SubVector chunk(wave.Data(), 0); + SubVector chunk(wave.Data(), wave.Dim()); dynamic_batcher_->Push(id, first, false, chunk); } @@ -118,6 +170,5 @@ void BatchRecognizer::AcceptWaveform(uint64_t id, const char *data, int len) const char* BatchRecognizer::PullResults() { dynamic_batcher_->WaitForCompletion(); - cudaDeviceSynchronize(); return ""; } diff --git a/src/batch_recognizer.h b/src/batch_recognizer.h index 00f4a0db..c8045d53 100644 --- a/src/batch_recognizer.h +++ b/src/batch_recognizer.h @@ -40,7 +40,7 @@ using namespace kaldi::cuda_decoder; class BatchRecognizer { public: - BatchRecognizer(Model *model, float sample_frequency); + BatchRecognizer(); ~BatchRecognizer(); void FinishStream(uint64_t id); @@ -48,12 +48,21 @@ class BatchRecognizer { const char* PullResults(); private: - void InitRescoring(); - Model *model_ = nullptr; + kaldi::TransitionModel *trans_model_ = nullptr; + kaldi::nnet3::AmNnetSimple *nnet_ = nullptr; + const fst::SymbolTable *word_syms_ = nullptr; + + fst::Fst *hclg_fst_ = nullptr; + kaldi::WordBoundaryInfo *winfo_ = nullptr; + + fst::VectorFst *graph_lm_fst_ = nullptr; + kaldi::ConstArpaLm const_arpa_; + BatchedThreadedNnet3CudaOnlinePipeline *cuda_pipeline_ = nullptr; CudaOnlinePipelineDynamicBatcher *dynamic_batcher_ = nullptr; + std::set streams_; // Rescoring diff --git a/src/model.cc b/src/model.cc index eecaed97..c83d07a8 100644 --- a/src/model.cc +++ b/src/model.cc @@ -242,7 +242,7 @@ void Model::ReadDataFiles() nnet3::CollapseModel(nnet3::CollapseModelConfig(), &(nnet_->GetNnet())); } -/* decodable_info_ = new nnet3::DecodableNnetSimpleLoopedInfo(decodable_opts_, + decodable_info_ = new nnet3::DecodableNnetSimpleLoopedInfo(decodable_opts_, nnet_); if (stat(final_ie_rxfilename_.c_str(), &buffer) == 0) { KALDI_LOG << "Loading i-vector extractor from " << final_ie_rxfilename_; @@ -261,7 +261,7 @@ void Model::ReadDataFiles() } else { feature_info_.use_ivectors = false; } -*/ + if (stat(global_cmvn_stats_rxfilename_.c_str(), &buffer) == 0) { KALDI_LOG << "Reading CMVN stats from " << global_cmvn_stats_rxfilename_; feature_info_.use_cmvn = true; diff --git a/src/vosk_api.cc b/src/vosk_api.cc index f95adc07..a53dbf87 100644 --- a/src/vosk_api.cc +++ b/src/vosk_api.cc @@ -183,9 +183,9 @@ void vosk_gpu_thread_init() #endif } -VoskBatchRecognizer *vosk_batch_recognizer_new(VoskModel *model, float sample_frequency) +VoskBatchRecognizer *vosk_batch_recognizer_new() { - return (VoskBatchRecognizer *)(new BatchRecognizer((Model *)model, sample_frequency)); + return (VoskBatchRecognizer *)(new BatchRecognizer()); } void vosk_batch_recognizer_free(VoskBatchRecognizer *recognizer) diff --git a/src/vosk_api.h b/src/vosk_api.h index e085afe7..c5b92f1c 100644 --- a/src/vosk_api.h +++ b/src/vosk_api.h @@ -290,11 +290,9 @@ void vosk_gpu_init(); void vosk_gpu_thread_init(); /** Creates the batch recognizer object - * The recognizers process the speech and return text using shared model data - * @param model VoskModel containing static data for recognizer. Model can be - * shared across recognizers, even running in different threads. + * * @returns recognizer object or NULL if problem occured */ -VoskBatchRecognizer *vosk_batch_recognizer_new(VoskModel *model, float sample_frequency); +VoskBatchRecognizer *vosk_batch_recognizer_new(); /** Releases batch recognizer object * Underlying model is also unreferenced and if needed released */ From 848b2dc753a823c2a3f1ca6e2bb4fd4f1d7eab31 Mon Sep 17 00:00:00 2001 From: Nickolay Shmyrev Date: Fri, 17 Dec 2021 22:22:30 +0100 Subject: [PATCH 04/25] Expose results in Python --- python/example/batch/asr_server_gpu.py | 85 +++++++++++++++++++++++++ python/example/batch/test_batch.py | 24 +++++-- python/vosk/__init__.py | 10 ++- src/batch_recognizer.cc | 87 +++++++++++++++++++++++--- src/batch_recognizer.h | 6 +- src/json.h | 8 +-- src/vosk_api.cc | 14 ++++- src/vosk_api.h | 8 ++- 8 files changed, 217 insertions(+), 25 deletions(-) create mode 100755 python/example/batch/asr_server_gpu.py diff --git a/python/example/batch/asr_server_gpu.py b/python/example/batch/asr_server_gpu.py new file mode 100755 index 00000000..f58587c9 --- /dev/null +++ b/python/example/batch/asr_server_gpu.py @@ -0,0 +1,85 @@ +#!/usr/bin/env python3 + +import json +import os +import sys +import asyncio +import pathlib +import websockets +import logging + +from vosk import BatchRecognizer, GpuInit + + +async def recognize(websocket, path): + global args + global loop + global pool + global rec + global client_cnt + + uid = client_cnt + client_cnt += 1 + + logging.info('Connection %d from %s', uid, websocket.remote_address); + + while True: + + message = await websocket.recv() + + if message == '{"eof" : 1}': + rec.FinishStream(uid) + break + + if isinstance(message, str) and 'config' in message: + continue + + rec.AcceptWaveform(uid, message) + await asyncio.sleep(len(message) / 16000.0 / 2) + res = rec.Result(uid) + if len(res) == 0: + await websocket.send('{ "partial" : "" }') + else: + await websocket.send(res) + + rec.Wait() + res = rec.Result(uid) + await websocket.send(res) + +def start(): + + global rec + global args + global loop + global client_cnt + + # Enable loging if needed + # + # logger = logging.getLogger('websockets') + # logger.setLevel(logging.INFO) + # logger.addHandler(logging.StreamHandler()) + logging.basicConfig(level=logging.INFO) + + args = type('', (), {})() + + args.interface = os.environ.get('VOSK_SERVER_INTERFACE', '0.0.0.0') + args.port = int(os.environ.get('VOSK_SERVER_PORT', 2700)) + + GpuInit() + + rec = BatchRecognizer() + + client_cnt = 0 + + loop = asyncio.get_event_loop() + + start_server = websockets.serve( + recognize, args.interface, args.port) + + logging.info("Listening on %s:%d", args.interface, args.port) + loop.run_until_complete(start_server) + loop.run_forever() + + +if __name__ == '__main__': + start() diff --git a/python/example/batch/test_batch.py b/python/example/batch/test_batch.py index f93eb6ea..32aa021e 100755 --- a/python/example/batch/test_batch.py +++ b/python/example/batch/test_batch.py @@ -1,12 +1,13 @@ #!/usr/bin/env python3 -from vosk import Model, BatchRecognizer, GpuInit, GpuThreadInit import sys import os import wave +from time import sleep + +from vosk import Model, BatchRecognizer, GpuInit GpuInit() -GpuThreadInit() rec = BatchRecognizer() @@ -14,6 +15,7 @@ fds = [open(x.strip(), "rb") for x in fnames] ended = set() while True: + for i, fd in enumerate(fds): if i in ended: continue @@ -21,8 +23,20 @@ if len(data) == 0: rec.FinishStream(i) ended.add(i) - else: - rec.AcceptWaveform(i, data) - rec.Results() + continue + rec.AcceptWaveform(i, data) + + sleep(0.3) + for i, fd in enumerate(fds): + res = rec.Result(i) + print (i, res) + if len(ended) == len(fds): break + +sleep(20) +print ("Done") +for i, fd in enumerate(fds): + res = rec.Result(i) + print (i, res) +rec.Wait() diff --git a/python/vosk/__init__.py b/python/vosk/__init__.py index 964a0ac2..c83a7e34 100644 --- a/python/vosk/__init__.py +++ b/python/vosk/__init__.py @@ -116,8 +116,14 @@ def __del__(self): def AcceptWaveform(self, uid, data): res = _c.vosk_batch_recognizer_accept_waveform(self._handle, uid, data, len(data)) - def Results(self): - return _ffi.string(_c.vosk_batch_recognizer_results(self._handle)).decode('utf-8') + def Result(self, uid): + ptr = _c.vosk_batch_recognizer_front_result(self._handle, uid) + res = _ffi.string(ptr).decode('utf-8') + _c.vosk_batch_recognizer_pop(self._handle, uid) + return res def FinishStream(self, uid): _c.vosk_batch_recognizer_finish_stream(self._handle, uid) + + def Wait(self): + _c.vosk_batch_recognizer_wait(self._handle) diff --git a/src/batch_recognizer.cc b/src/batch_recognizer.cc index 184fb8a2..1773fc0e 100644 --- a/src/batch_recognizer.cc +++ b/src/batch_recognizer.cc @@ -16,6 +16,7 @@ #include "fstext/fstext-utils.h" #include "lat/sausages.h" +#include "json.h" #include @@ -37,12 +38,12 @@ BatchRecognizer::BatchRecognizer() { batched_decoder_config.feature_opts.feature_type = "mfcc"; batched_decoder_config.feature_opts.mfcc_config = "model/conf/mfcc.conf"; batched_decoder_config.feature_opts.ivector_extraction_config = "model/conf/ivector.conf"; - batched_decoder_config.decoder_opts.max_active = 7000; - batched_decoder_config.decoder_opts.default_beam = 13.0; - batched_decoder_config.decoder_opts.lattice_beam = 8.0; + batched_decoder_config.decoder_opts.max_active = 5000; + batched_decoder_config.decoder_opts.default_beam = 10.0; + batched_decoder_config.decoder_opts.lattice_beam = 4.0; batched_decoder_config.compute_opts.acoustic_scale = 1.0; batched_decoder_config.compute_opts.frame_subsampling_factor = 3; - batched_decoder_config.compute_opts.frames_per_chunk = 312; + batched_decoder_config.compute_opts.frames_per_chunk = 51; struct stat buffer; @@ -126,6 +127,47 @@ void BatchRecognizer::FinishStream(uint64_t id) streams_.erase(id); } + +void BatchRecognizer::PushLattice(uint64_t id, CompactLattice &clat, BaseFloat offset) +{ + fst::ScaleLattice(fst::GraphLatticeScale(0.9), &clat); + + CompactLattice aligned_lat; + WordAlignLattice(clat, *trans_model_, *winfo_, 0, &aligned_lat); + + MinimumBayesRisk mbr(aligned_lat); + const vector &conf = mbr.GetOneBestConfidences(); + const vector &words = mbr.GetOneBest(); + const vector > × = + mbr.GetOneBestTimes(); + + int size = words.size(); + + json::JSON obj; + stringstream text; + + // Create JSON object + for (int i = 0; i < size; i++) { + json::JSON word; + + word["word"] = word_syms_->Find(words[i]); + word["start"] = times[i].first * 0.03 + offset; + word["end"] = times[i].second * 0.03 + offset; + word["conf"] = conf[i]; + obj["result"].append(word); + + if (i) { + text << " "; + } + text << word_syms_->Find(words[i]); + } + obj["text"] = text.str(); + +// KALDI_LOG << "Result " << id << " " << obj.dump(); + + results_[id].push(obj.dump()); +} + void BatchRecognizer::AcceptWaveform(uint64_t id, const char *data, int len) { bool first = false; @@ -135,7 +177,8 @@ void BatchRecognizer::AcceptWaveform(uint64_t id, const char *data, int len) streams_.insert(id); // Define the callback for results. - cuda_pipeline_->SetBestPathCallback( +#if 0 + cuda_pipeline_->SetBestPathCallback( id, [&, id](const std::string &str, bool partial, bool endpoint_detected) { @@ -151,11 +194,19 @@ void BatchRecognizer::AcceptWaveform(uint64_t id, const char *data, int len) KALDI_LOG << "id #" << id << " : " << str; } }); +#endif cuda_pipeline_->SetLatticeCallback( id, - [&, id](CompactLattice &clat) { - KALDI_LOG << "Got lattice from the stream " << id; - }); + [&, id](SegmentedLatticeCallbackParams& params) { + if (params.results.empty()) { + KALDI_WARN << "Empty result for callback"; + return; + } + CompactLattice *clat = params.results[0].GetLatticeResult(); + BaseFloat offset = params.results[0].GetTimeOffsetSeconds(); + PushLattice(id, *clat, offset); + }, + CudaPipelineResult::RESULT_TYPE_LATTICE); } Vector wave; @@ -167,8 +218,24 @@ void BatchRecognizer::AcceptWaveform(uint64_t id, const char *data, int len) dynamic_batcher_->Push(id, first, false, chunk); } -const char* BatchRecognizer::PullResults() +const char* BatchRecognizer::FrontResult(uint64_t id) +{ + if (results_[id].empty()) { + return ""; + } + return results_[id].front().c_str(); +} + +void BatchRecognizer::Pop(uint64_t id) +{ + if (results_[id].empty()) { + return; + } + results_[id].pop(); +} + +void BatchRecognizer::WaitForCompletion() { dynamic_batcher_->WaitForCompletion(); - return ""; } + diff --git a/src/batch_recognizer.h b/src/batch_recognizer.h index c8045d53..0082a364 100644 --- a/src/batch_recognizer.h +++ b/src/batch_recognizer.h @@ -45,9 +45,12 @@ class BatchRecognizer { void FinishStream(uint64_t id); void AcceptWaveform(uint64_t id, const char *data, int len); - const char* PullResults(); + const char *FrontResult(uint64_t id); + void Pop(uint64_t id); + void WaitForCompletion(); private: + void PushLattice(uint64_t id, CompactLattice &clat, BaseFloat offset); kaldi::TransitionModel *trans_model_ = nullptr; kaldi::nnet3::AmNnetSimple *nnet_ = nullptr; @@ -64,6 +67,7 @@ class BatchRecognizer { std::set streams_; + std::map > results_; // Rescoring fst::ArcMapFst > *lm_to_subtract_ = nullptr; diff --git a/src/json.h b/src/json.h index 463912ec..2159392b 100644 --- a/src/json.h +++ b/src/json.h @@ -424,7 +424,7 @@ class JSON Class Type = Class::Null; }; -JSON Array() { +inline JSON Array() { return JSON::Make( JSON::Class::Array ); } @@ -435,11 +435,11 @@ JSON Array( T... args ) { return arr; } -JSON Object() { +inline JSON Object() { return JSON::Make( JSON::Class::Object ); } -std::ostream& operator<<( std::ostream &os, const JSON &json ) { +inline std::ostream& operator<<( std::ostream &os, const JSON &json ) { os << json.dump(); return os; } @@ -647,7 +647,7 @@ namespace { } } -JSON JSON::Load( const string &str ) { +inline JSON JSON::Load( const string &str ) { size_t offset = 0; return parse_next( str, offset ); } diff --git a/src/vosk_api.cc b/src/vosk_api.cc index a53dbf87..b2a7a6a4 100644 --- a/src/vosk_api.cc +++ b/src/vosk_api.cc @@ -203,7 +203,17 @@ void vosk_batch_recognizer_finish_stream(VoskBatchRecognizer *recognizer, int id ((BatchRecognizer *)recognizer)->FinishStream(id); } -const char *vosk_batch_recognizer_results(VoskBatchRecognizer *recognizer) +const char *vosk_batch_recognizer_front_result(VoskBatchRecognizer *recognizer, int id) { - return ((BatchRecognizer *)recognizer)->PullResults(); + return ((BatchRecognizer *)recognizer)->FrontResult(id); +} + +void vosk_batch_recognizer_pop(VoskBatchRecognizer *recognizer, int id) +{ + return ((BatchRecognizer *)recognizer)->Pop(id); +} + +void vosk_batch_recognizer_wait(VoskBatchRecognizer *recognizer) +{ + ((BatchRecognizer *)recognizer)->WaitForCompletion(); } diff --git a/src/vosk_api.h b/src/vosk_api.h index c5b92f1c..7177009c 100644 --- a/src/vosk_api.h +++ b/src/vosk_api.h @@ -305,7 +305,13 @@ void vosk_batch_recognizer_accept_waveform(VoskBatchRecognizer *recognizer, int void vosk_batch_recognizer_finish_stream(VoskBatchRecognizer *recognizer, int id); /** Return results */ -const char *vosk_batch_recognizer_results(VoskBatchRecognizer *recognizer); +const char *vosk_batch_recognizer_front_result(VoskBatchRecognizer *recognizer, int id); + +/** Release and free first retrieved result */ +void vosk_batch_recognizer_pop(VoskBatchRecognizer *recognizer, int id); + +/** Wait for the processing */ +void vosk_batch_recognizer_wait(VoskBatchRecognizer *recognizer); #ifdef __cplusplus } From cb0f8e64110ad502f8660a2e3066e490bedfcddc Mon Sep 17 00:00:00 2001 From: Nickolay Shmyrev Date: Thu, 23 Dec 2021 22:34:47 +0100 Subject: [PATCH 05/25] Per-stream wait API --- python/example/batch/asr_server_gpu.py | 9 ++++++-- python/example/batch/test_batch.py | 32 +++++++++++++++++++------- python/vosk/__init__.py | 3 +++ src/batch_recognizer.cc | 10 +++++--- src/batch_recognizer.h | 1 + src/vosk_api.cc | 7 +++++- src/vosk_api.h | 3 +++ 7 files changed, 51 insertions(+), 14 deletions(-) diff --git a/python/example/batch/asr_server_gpu.py b/python/example/batch/asr_server_gpu.py index f58587c9..11885e9f 100755 --- a/python/example/batch/asr_server_gpu.py +++ b/python/example/batch/asr_server_gpu.py @@ -35,14 +35,19 @@ async def recognize(websocket, path): continue rec.AcceptWaveform(uid, message) - await asyncio.sleep(len(message) / 16000.0 / 2) + + while rec.GetPendingChunks(uid) > 0: + await asyncio.sleep(0.1) + res = rec.Result(uid) if len(res) == 0: await websocket.send('{ "partial" : "" }') else: await websocket.send(res) - rec.Wait() + while rec.GetPendingChunks(uid) > 0: + await asyncio.sleep(0.1) + res = rec.Result(uid) await websocket.send(res) diff --git a/python/example/batch/test_batch.py b/python/example/batch/test_batch.py index 32aa021e..8737a746 100755 --- a/python/example/batch/test_batch.py +++ b/python/example/batch/test_batch.py @@ -4,6 +4,9 @@ import os import wave from time import sleep +import json +from timeit import default_timer as timer + from vosk import Model, BatchRecognizer, GpuInit @@ -13,9 +16,16 @@ fnames = open("tedlium.list").readlines() fds = [open(x.strip(), "rb") for x in fnames] +uids = [fname.strip().split('/')[-1][:-4] for fname in fnames] +results = [""] * len(fnames) ended = set() +tot_samples = 0 + +start_time = timer() + while True: + # Feed in the data for i, fd in enumerate(fds): if i in ended: continue @@ -25,18 +35,24 @@ ended.add(i) continue rec.AcceptWaveform(i, data) + tot_samples += len(data) - sleep(0.3) + # Wait for results from CUDA + rec.Wait() + + # Retrieve and add results for i, fd in enumerate(fds): res = rec.Result(i) - print (i, res) + if len(res) != 0: + results[i] = results[i] + " " + json.loads(res)['text'] if len(ended) == len(fds): break -sleep(20) -print ("Done") -for i, fd in enumerate(fds): - res = rec.Result(i) - print (i, res) -rec.Wait() +end_time = timer() + +for i in range(len(results)): + print (uids[i], results[i].strip()) + +print ("Processed %d seconds of audio in %d seconds (%f xRT)" % (tot_samples / 16000.0 / 2, end_time - start_time, + (tot_samples / 16000.0 / 2 / (end_time - start_time))), file=sys.stderr) diff --git a/python/vosk/__init__.py b/python/vosk/__init__.py index c83a7e34..0e60c2ba 100644 --- a/python/vosk/__init__.py +++ b/python/vosk/__init__.py @@ -127,3 +127,6 @@ def FinishStream(self, uid): def Wait(self): _c.vosk_batch_recognizer_wait(self._handle) + + def GetPendingChunks(self, uid): + return _c.vosk_batch_recognizer_get_pending_chunks(self._handle, uid) diff --git a/src/batch_recognizer.cc b/src/batch_recognizer.cc index 1773fc0e..972e31dc 100644 --- a/src/batch_recognizer.cc +++ b/src/batch_recognizer.cc @@ -38,9 +38,9 @@ BatchRecognizer::BatchRecognizer() { batched_decoder_config.feature_opts.feature_type = "mfcc"; batched_decoder_config.feature_opts.mfcc_config = "model/conf/mfcc.conf"; batched_decoder_config.feature_opts.ivector_extraction_config = "model/conf/ivector.conf"; - batched_decoder_config.decoder_opts.max_active = 5000; - batched_decoder_config.decoder_opts.default_beam = 10.0; - batched_decoder_config.decoder_opts.lattice_beam = 4.0; + batched_decoder_config.decoder_opts.max_active = 7000; + batched_decoder_config.decoder_opts.default_beam = 13.0; + batched_decoder_config.decoder_opts.lattice_beam = 6.0; batched_decoder_config.compute_opts.acoustic_scale = 1.0; batched_decoder_config.compute_opts.frame_subsampling_factor = 3; batched_decoder_config.compute_opts.frames_per_chunk = 51; @@ -239,3 +239,7 @@ void BatchRecognizer::WaitForCompletion() dynamic_batcher_->WaitForCompletion(); } +int BatchRecognizer::GetPendingChunks(uint64_t id) +{ + return dynamic_batcher_->GetPendingChunks(id); +} diff --git a/src/batch_recognizer.h b/src/batch_recognizer.h index 0082a364..f26dd54b 100644 --- a/src/batch_recognizer.h +++ b/src/batch_recognizer.h @@ -48,6 +48,7 @@ class BatchRecognizer { const char *FrontResult(uint64_t id); void Pop(uint64_t id); void WaitForCompletion(); + int GetPendingChunks(uint64_t id); private: void PushLattice(uint64_t id, CompactLattice &clat, BaseFloat offset); diff --git a/src/vosk_api.cc b/src/vosk_api.cc index b2a7a6a4..1f77eb6c 100644 --- a/src/vosk_api.cc +++ b/src/vosk_api.cc @@ -210,10 +210,15 @@ const char *vosk_batch_recognizer_front_result(VoskBatchRecognizer *recognizer, void vosk_batch_recognizer_pop(VoskBatchRecognizer *recognizer, int id) { - return ((BatchRecognizer *)recognizer)->Pop(id); + ((BatchRecognizer *)recognizer)->Pop(id); } void vosk_batch_recognizer_wait(VoskBatchRecognizer *recognizer) { ((BatchRecognizer *)recognizer)->WaitForCompletion(); } + +int vosk_batch_recognizer_get_pending_chunks(VoskBatchRecognizer *recognizer, int id) +{ + return ((BatchRecognizer *)recognizer)->GetPendingChunks(id); +} diff --git a/src/vosk_api.h b/src/vosk_api.h index 7177009c..f6a981cb 100644 --- a/src/vosk_api.h +++ b/src/vosk_api.h @@ -313,6 +313,9 @@ void vosk_batch_recognizer_pop(VoskBatchRecognizer *recognizer, int id); /** Wait for the processing */ void vosk_batch_recognizer_wait(VoskBatchRecognizer *recognizer); +/** Get amount of pending chunks for more intelligent waiting */ +int vosk_batch_recognizer_get_pending_chunks(VoskBatchRecognizer *recognizer, int id); + #ifdef __cplusplus } #endif From 93e81c3bc8ed3960754b4eb6962b6dcc1fa26541 Mon Sep 17 00:00:00 2001 From: Nickolay Shmyrev Date: Fri, 24 Dec 2021 00:22:42 +0100 Subject: [PATCH 06/25] Bigger frames per chunk for our big models --- src/batch_recognizer.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/batch_recognizer.cc b/src/batch_recognizer.cc index 972e31dc..78cfc6f2 100644 --- a/src/batch_recognizer.cc +++ b/src/batch_recognizer.cc @@ -43,7 +43,7 @@ BatchRecognizer::BatchRecognizer() { batched_decoder_config.decoder_opts.lattice_beam = 6.0; batched_decoder_config.compute_opts.acoustic_scale = 1.0; batched_decoder_config.compute_opts.frame_subsampling_factor = 3; - batched_decoder_config.compute_opts.frames_per_chunk = 51; + batched_decoder_config.compute_opts.frames_per_chunk = 180; struct stat buffer; From 72bf210164ed6f347abce642025751f285b8284c Mon Sep 17 00:00:00 2001 From: Nickolay Shmyrev Date: Fri, 24 Dec 2021 01:07:38 +0100 Subject: [PATCH 07/25] Put the demo into main folder --- python/example/batch/asr_server_gpu.py | 90 ------------------- .../test_batch.py => test_gpu_batch.py} | 2 +- src/batch_recognizer.cc | 5 +- src/vosk_api.cc | 2 + 4 files changed, 6 insertions(+), 93 deletions(-) delete mode 100755 python/example/batch/asr_server_gpu.py rename python/example/{batch/test_batch.py => test_gpu_batch.py} (97%) diff --git a/python/example/batch/asr_server_gpu.py b/python/example/batch/asr_server_gpu.py deleted file mode 100755 index 11885e9f..00000000 --- a/python/example/batch/asr_server_gpu.py +++ /dev/null @@ -1,90 +0,0 @@ -#!/usr/bin/env python3 - -import json -import os -import sys -import asyncio -import pathlib -import websockets -import logging - -from vosk import BatchRecognizer, GpuInit - - -async def recognize(websocket, path): - global args - global loop - global pool - global rec - global client_cnt - - uid = client_cnt - client_cnt += 1 - - logging.info('Connection %d from %s', uid, websocket.remote_address); - - while True: - - message = await websocket.recv() - - if message == '{"eof" : 1}': - rec.FinishStream(uid) - break - - if isinstance(message, str) and 'config' in message: - continue - - rec.AcceptWaveform(uid, message) - - while rec.GetPendingChunks(uid) > 0: - await asyncio.sleep(0.1) - - res = rec.Result(uid) - if len(res) == 0: - await websocket.send('{ "partial" : "" }') - else: - await websocket.send(res) - - while rec.GetPendingChunks(uid) > 0: - await asyncio.sleep(0.1) - - res = rec.Result(uid) - await websocket.send(res) - -def start(): - - global rec - global args - global loop - global client_cnt - - # Enable loging if needed - # - # logger = logging.getLogger('websockets') - # logger.setLevel(logging.INFO) - # logger.addHandler(logging.StreamHandler()) - logging.basicConfig(level=logging.INFO) - - args = type('', (), {})() - - args.interface = os.environ.get('VOSK_SERVER_INTERFACE', '0.0.0.0') - args.port = int(os.environ.get('VOSK_SERVER_PORT', 2700)) - - GpuInit() - - rec = BatchRecognizer() - - client_cnt = 0 - - loop = asyncio.get_event_loop() - - start_server = websockets.serve( - recognize, args.interface, args.port) - - logging.info("Listening on %s:%d", args.interface, args.port) - loop.run_until_complete(start_server) - loop.run_forever() - - -if __name__ == '__main__': - start() diff --git a/python/example/batch/test_batch.py b/python/example/test_gpu_batch.py similarity index 97% rename from python/example/batch/test_batch.py rename to python/example/test_gpu_batch.py index 8737a746..3a65bda8 100755 --- a/python/example/batch/test_batch.py +++ b/python/example/test_gpu_batch.py @@ -29,7 +29,7 @@ for i, fd in enumerate(fds): if i in ended: continue - data = fd.read(8000) + data = fd.read(16000) if len(data) == 0: rec.FinishStream(i) ended.add(i) diff --git a/src/batch_recognizer.cc b/src/batch_recognizer.cc index 78cfc6f2..3337ee10 100644 --- a/src/batch_recognizer.cc +++ b/src/batch_recognizer.cc @@ -31,9 +31,10 @@ BatchRecognizer::BatchRecognizer() { batched_decoder_config.Register(&po); po.ReadConfigFile("model/conf/model.conf"); - batched_decoder_config.num_worker_threads = 4; - batched_decoder_config.max_batch_size = 100; + batched_decoder_config.num_worker_threads = -1; + batched_decoder_config.max_batch_size = 200; batched_decoder_config.reset_on_endpoint = true; + batched_decoder_config.use_gpu_feature_extraction = true; batched_decoder_config.feature_opts.feature_type = "mfcc"; batched_decoder_config.feature_opts.mfcc_config = "model/conf/mfcc.conf"; diff --git a/src/vosk_api.cc b/src/vosk_api.cc index 1f77eb6c..3f740d7b 100644 --- a/src/vosk_api.cc +++ b/src/vosk_api.cc @@ -171,6 +171,8 @@ void vosk_set_log_level(int log_level) void vosk_gpu_init() { #if HAVE_CUDA +// kaldi::CuDevice::EnableTensorCores(true); +// kaldi::CuDevice::EnableTf32Compute(true); kaldi::CuDevice::Instantiate().SelectGpuId("yes"); kaldi::CuDevice::Instantiate().AllowMultithreading(); #endif From 525b722c44e6b152926178ea226e9ce1c7ba3154 Mon Sep 17 00:00:00 2001 From: Nickolay Shmyrev Date: Fri, 24 Dec 2021 01:35:06 +0100 Subject: [PATCH 08/25] Compile without CUDA too --- src/Makefile | 13 ++++++++----- src/vosk_api.cc | 24 +++++++++++++++++++++++- 2 files changed, 31 insertions(+), 6 deletions(-) diff --git a/src/Makefile b/src/Makefile index 823a4aaf..9965db65 100644 --- a/src/Makefile +++ b/src/Makefile @@ -22,14 +22,12 @@ VOSK_SOURCES= \ language_model.cc \ model.cc \ spk_model.cc \ - batch_recognizer.cc \ vosk_api.cc VOSK_HEADERS= \ recognizer.h \ language_model.h \ model.h \ - batch_recognizer.h \ spk_model.h \ vosk_api.h @@ -37,8 +35,6 @@ CFLAGS=-g -O3 -std=c++17 -Wno-deprecated-declarations -fPIC -DFST_NO_DYNAMIC_LIN -I. -I$(KALDI_ROOT)/src -I$(OPENFST_ROOT)/include $(EXTRA_CFLAGS) LIBS= \ - $(KALDI_ROOT)/src/cudadecoder/kaldi-cudadecoder.a \ - $(KALDI_ROOT)/src/cudafeat/kaldi-cudafeat.a \ $(KALDI_ROOT)/src/online2/kaldi-online2.a \ $(KALDI_ROOT)/src/decoder/kaldi-decoder.a \ $(KALDI_ROOT)/src/ivector/kaldi-ivector.a \ @@ -79,8 +75,15 @@ ifeq ($(HAVE_ACCELERATE), 1) endif ifeq ($(HAVE_CUDA), 1) + VOSK_SOURCES += batch_recognizer.cc + VOSK_HEADERS += batch_recognizer.h + CFLAGS+=-DHAVE_CUDA=1 -I$(CUDA_ROOT)/include - LIBS+=\ + + LIBS := \ + $(KALDI_ROOT)/src/cudadecoder/kaldi-cudadecoder.a \ + $(KALDI_ROOT)/src/cudafeat/kaldi-cudafeat.a \ + $(LIBS) \ -L$(CUDA_ROOT)/lib64 -lcuda -lcublas -lcusparse -lcudart -lcurand -lcufft -lcusolver -lnvToolsExt endif diff --git a/src/vosk_api.cc b/src/vosk_api.cc index 3f740d7b..65356038 100644 --- a/src/vosk_api.cc +++ b/src/vosk_api.cc @@ -15,12 +15,12 @@ #include "vosk_api.h" #include "recognizer.h" -#include "batch_recognizer.h" #include "model.h" #include "spk_model.h" #if HAVE_CUDA #include "cudamatrix/cu-device.h" +#include "batch_recognizer.h" #endif #include @@ -187,40 +187,62 @@ void vosk_gpu_thread_init() VoskBatchRecognizer *vosk_batch_recognizer_new() { +#if HAVE_CUDA return (VoskBatchRecognizer *)(new BatchRecognizer()); +#else + return NULL; +#endif } void vosk_batch_recognizer_free(VoskBatchRecognizer *recognizer) { +#if HAVE_CUDA delete ((BatchRecognizer *)recognizer); +#endif } void vosk_batch_recognizer_accept_waveform(VoskBatchRecognizer *recognizer, int id, const char *data, int length) { +#if HAVE_CUDA ((BatchRecognizer *)recognizer)->AcceptWaveform(id, data, length); +#endif } void vosk_batch_recognizer_finish_stream(VoskBatchRecognizer *recognizer, int id) { +#if HAVE_CUDA ((BatchRecognizer *)recognizer)->FinishStream(id); +#endif } const char *vosk_batch_recognizer_front_result(VoskBatchRecognizer *recognizer, int id) { +#if HAVE_CUDA return ((BatchRecognizer *)recognizer)->FrontResult(id); +#else + return NULL; +#endif } void vosk_batch_recognizer_pop(VoskBatchRecognizer *recognizer, int id) { +#if HAVE_CUDA ((BatchRecognizer *)recognizer)->Pop(id); +#endif } void vosk_batch_recognizer_wait(VoskBatchRecognizer *recognizer) { +#if HAVE_CUDA ((BatchRecognizer *)recognizer)->WaitForCompletion(); +#endif } int vosk_batch_recognizer_get_pending_chunks(VoskBatchRecognizer *recognizer, int id) { +#if HAVE_CUDA return ((BatchRecognizer *)recognizer)->GetPendingChunks(id); +#else + return 0; +#endif } From 5428d36d1657b5d2339288affb8c3512c72896fd Mon Sep 17 00:00:00 2001 From: Nickolay Shmyrev Date: Sun, 26 Dec 2021 01:12:18 +0100 Subject: [PATCH 09/25] Round times --- src/batch_recognizer.cc | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/batch_recognizer.cc b/src/batch_recognizer.cc index 3337ee10..28818692 100644 --- a/src/batch_recognizer.cc +++ b/src/batch_recognizer.cc @@ -32,7 +32,8 @@ BatchRecognizer::BatchRecognizer() { po.ReadConfigFile("model/conf/model.conf"); batched_decoder_config.num_worker_threads = -1; - batched_decoder_config.max_batch_size = 200; + batched_decoder_config.max_batch_size = 32; + batched_decoder_config.num_channels = 600; batched_decoder_config.reset_on_endpoint = true; batched_decoder_config.use_gpu_feature_extraction = true; @@ -44,7 +45,7 @@ BatchRecognizer::BatchRecognizer() { batched_decoder_config.decoder_opts.lattice_beam = 6.0; batched_decoder_config.compute_opts.acoustic_scale = 1.0; batched_decoder_config.compute_opts.frame_subsampling_factor = 3; - batched_decoder_config.compute_opts.frames_per_chunk = 180; + batched_decoder_config.compute_opts.frames_per_chunk = 51; struct stat buffer; @@ -152,8 +153,8 @@ void BatchRecognizer::PushLattice(uint64_t id, CompactLattice &clat, BaseFloat o json::JSON word; word["word"] = word_syms_->Find(words[i]); - word["start"] = times[i].first * 0.03 + offset; - word["end"] = times[i].second * 0.03 + offset; + word["start"] = round(times[i].first) * 0.03 + offset; + word["end"] = round(times[i].second) * 0.03 + offset; word["conf"] = conf[i]; obj["result"].append(word); From 70d5cbd0e0cfe02bcd81f7d4b18554aed97804ba Mon Sep 17 00:00:00 2001 From: Nickolay Shmyrev Date: Wed, 5 Jan 2022 20:32:08 +0100 Subject: [PATCH 10/25] Update README with Japanese --- README.md | 4 ++-- nodejs/README.md | 12 ++++++------ python/README.md | 5 +++-- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index d2427071..91958486 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,10 @@ # Vosk Speech Recognition Toolkit Vosk is an offline open source speech recognition toolkit. It enables -speech recognition models for 18 languages and dialects - English, Indian +speech recognition for 20+ languages and dialects - English, Indian English, German, French, Spanish, Portuguese, Chinese, Russian, Turkish, Vietnamese, Italian, Dutch, Catalan, Arabic, Greek, Farsi, Filipino, -Ukrainian. +Ukrainian, Kazakh, Swedish, Japanese. More to come. Vosk models are small (50 Mb) but provide continuous large vocabulary transcription, zero-latency response with streaming API, reconfigurable diff --git a/nodejs/README.md b/nodejs/README.md index 0ac9d753..748e9e6c 100644 --- a/nodejs/README.md +++ b/nodejs/README.md @@ -2,18 +2,18 @@ This is an FFI-NAPI wrapper for the Vosk library. ## Usage -It mostly follows Vosk interface, some methods are not yet fully implemented. +Bindings mostly follow Vosk interface, some methods are not yet fully implemented. -To use it you need to compile libvosk library, see Python module build -instructions for details. You can find prebuilt library inside python -wheel. +See [demo folder](https://github.com/alphacep/vosk-api/tree/master/nodejs/demo) for +details. ## About Vosk is an offline open source speech recognition toolkit. It enables -speech recognition models for 17 languages and dialects - English, Indian +speech recognition for 20+ languages and dialects - English, Indian English, German, French, Spanish, Portuguese, Chinese, Russian, Turkish, -Vietnamese, Italian, Dutch, Catalan, Arabic, Greek, Farsi, Filipino. +Vietnamese, Italian, Dutch, Catalan, Arabic, Greek, Farsi, Filipino, +Ukrainian, Kazakh, Swedish, Japanese. More to come. Vosk models are small (50 Mb) but provide continuous large vocabulary transcription, zero-latency response with streaming API, reconfigurable diff --git a/python/README.md b/python/README.md index 300eaa6d..0a40ee79 100644 --- a/python/README.md +++ b/python/README.md @@ -1,9 +1,10 @@ This is a Python module for Vosk. Vosk is an offline open source speech recognition toolkit. It enables -speech recognition models for 17 languages and dialects - English, Indian +speech recognition for 20+ languages and dialects - English, Indian English, German, French, Spanish, Portuguese, Chinese, Russian, Turkish, -Vietnamese, Italian, Dutch, Catalan, Arabic, Greek, Farsi, Filipino. +Vietnamese, Italian, Dutch, Catalan, Arabic, Greek, Farsi, Filipino, +Ukrainian, Kazakh, Swedish, Japanese. More to come. Vosk models are small (50 Mb) but provide continuous large vocabulary transcription, zero-latency response with streaming API, reconfigurable From a1eac015dc8b530c26fdb0ac05be24ea3bcf57c3 Mon Sep 17 00:00:00 2001 From: Nickolay Shmyrev Date: Fri, 7 Jan 2022 16:27:57 +0100 Subject: [PATCH 11/25] Add Esperanto --- README.md | 2 +- nodejs/README.md | 2 +- python/README.md | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 91958486..42af9932 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ Vosk is an offline open source speech recognition toolkit. It enables speech recognition for 20+ languages and dialects - English, Indian English, German, French, Spanish, Portuguese, Chinese, Russian, Turkish, Vietnamese, Italian, Dutch, Catalan, Arabic, Greek, Farsi, Filipino, -Ukrainian, Kazakh, Swedish, Japanese. More to come. +Ukrainian, Kazakh, Swedish, Japanese, Esperanto. More to come. Vosk models are small (50 Mb) but provide continuous large vocabulary transcription, zero-latency response with streaming API, reconfigurable diff --git a/nodejs/README.md b/nodejs/README.md index 748e9e6c..5603fae6 100644 --- a/nodejs/README.md +++ b/nodejs/README.md @@ -13,7 +13,7 @@ Vosk is an offline open source speech recognition toolkit. It enables speech recognition for 20+ languages and dialects - English, Indian English, German, French, Spanish, Portuguese, Chinese, Russian, Turkish, Vietnamese, Italian, Dutch, Catalan, Arabic, Greek, Farsi, Filipino, -Ukrainian, Kazakh, Swedish, Japanese. More to come. +Ukrainian, Kazakh, Swedish, Japanese, Esperanto. More to come. Vosk models are small (50 Mb) but provide continuous large vocabulary transcription, zero-latency response with streaming API, reconfigurable diff --git a/python/README.md b/python/README.md index 0a40ee79..b121e9a3 100644 --- a/python/README.md +++ b/python/README.md @@ -4,7 +4,7 @@ Vosk is an offline open source speech recognition toolkit. It enables speech recognition for 20+ languages and dialects - English, Indian English, German, French, Spanish, Portuguese, Chinese, Russian, Turkish, Vietnamese, Italian, Dutch, Catalan, Arabic, Greek, Farsi, Filipino, -Ukrainian, Kazakh, Swedish, Japanese. More to come. +Ukrainian, Kazakh, Swedish, Japanese, Esperanto. More to come. Vosk models are small (50 Mb) but provide continuous large vocabulary transcription, zero-latency response with streaming API, reconfigurable From c32099705f66fb632dfae6a20ca2e185bce542ed Mon Sep 17 00:00:00 2001 From: Nickolay Shmyrev Date: Fri, 7 Jan 2022 17:33:47 +0100 Subject: [PATCH 12/25] Fix branch name and add implib dump --- travis/Dockerfile.win | 2 +- travis/Dockerfile.win32 | 2 +- travis/build-wheels-win.sh | 4 ++-- travis/build-wheels-win32.sh | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/travis/Dockerfile.win b/travis/Dockerfile.win index 89081c2b..4f00bfcc 100644 --- a/travis/Dockerfile.win +++ b/travis/Dockerfile.win @@ -55,7 +55,7 @@ RUN cd /opt/kaldi \ && find . -name *.a -exec cp {} /opt/kaldi/local/lib \; RUN cd /opt/kaldi \ - && git clone -b android-mix --single-branch https://github.com/alphacep/kaldi \ + && git clone -b vosk-android --single-branch https://github.com/alphacep/kaldi \ && cd kaldi/src \ && CXX=x86_64-w64-mingw32-g++-posix CXXFLAGS="-O3 -ftree-vectorize -DFST_NO_DYNAMIC_LINKING" ./configure --shared --mingw=yes --use-cuda=no \ --mathlib=OPENBLAS_CLAPACK \ diff --git a/travis/Dockerfile.win32 b/travis/Dockerfile.win32 index 59198d9a..5a478e52 100644 --- a/travis/Dockerfile.win32 +++ b/travis/Dockerfile.win32 @@ -54,7 +54,7 @@ RUN cd /opt/kaldi \ && find . -name *.a -exec cp {} /opt/kaldi/local/lib \; RUN cd /opt/kaldi \ - && git clone -b android-mix --single-branch https://github.com/alphacep/kaldi \ + && git clone -b vosk-android --single-branch https://github.com/alphacep/kaldi \ && cd kaldi/src \ && CXX=i686-w64-mingw32-g++-posix CXXFLAGS="-O3 -ftree-vectorize -DFST_NO_DYNAMIC_LINKING" ./configure --shared --mingw=yes --use-cuda=no \ --mathlib=OPENBLAS_CLAPACK \ diff --git a/travis/build-wheels-win.sh b/travis/build-wheels-win.sh index 750b6dd7..02bf6efc 100755 --- a/travis/build-wheels-win.sh +++ b/travis/build-wheels-win.sh @@ -5,7 +5,7 @@ set -e -x cd /opt git clone https://github.com/alphacep/vosk-api cd vosk-api/src -CXX=x86_64-w64-mingw32-g++-posix EXT=dll KALDI_ROOT=/opt/kaldi/kaldi OPENFST_ROOT=/opt/kaldi/local OPENBLAS_ROOT=/opt/kaldi/local make -j $(nproc) +EXTRA_LDFLAGS=-Wl,--out-implib,libvosk.lib CXX=x86_64-w64-mingw32-g++-posix EXT=dll KALDI_ROOT=/opt/kaldi/kaldi OPENFST_ROOT=/opt/kaldi/local OPENBLAS_ROOT=/opt/kaldi/local make -j $(nproc) # Collect dependencies cp /usr/lib/gcc/x86_64-w64-mingw32/*-posix/libstdc++-6.dll /opt/vosk-api/src @@ -14,7 +14,7 @@ cp /usr/x86_64-w64-mingw32/lib/libwinpthread-1.dll /opt/vosk-api/src # Copy dlls to output folder mkdir -p /io/wheelhouse/win64 -cp /opt/vosk-api/src/*.dll /io/wheelhouse/win64 +cp /opt/vosk-api/src/*.{dll,lib} /io/wheelhouse/win64 # Build wheel and put to the output folder export VOSK_SOURCE=/opt/vosk-api diff --git a/travis/build-wheels-win32.sh b/travis/build-wheels-win32.sh index 2b934bd3..82af745e 100755 --- a/travis/build-wheels-win32.sh +++ b/travis/build-wheels-win32.sh @@ -5,7 +5,7 @@ set -e -x cd /opt git clone https://github.com/alphacep/vosk-api cd vosk-api/src -CXX=i686-w64-mingw32-g++-posix EXT=dll KALDI_ROOT=/opt/kaldi/kaldi OPENFST_ROOT=/opt/kaldi/local OPENBLAS_ROOT=/opt/kaldi/local make -j $(nproc) +EXTRA_LDFLAGS=-Wl,--out-implib,libvosk.lib CXX=i686-w64-mingw32-g++-posix EXT=dll KALDI_ROOT=/opt/kaldi/kaldi OPENFST_ROOT=/opt/kaldi/local OPENBLAS_ROOT=/opt/kaldi/local make -j $(nproc) # Copy dependencies cp /usr/lib/gcc/i686-w64-mingw32/*-posix/libstdc++-6.dll /opt/vosk-api/src @@ -14,7 +14,7 @@ cp /usr/i686-w64-mingw32/lib/libwinpthread-1.dll /opt/vosk-api/src # Copy dlls to output folder mkdir -p /io/wheelhouse/win32 -cp /opt/vosk-api/src/*.dll /io/wheelhouse/win32 +cp /opt/vosk-api/src/*.{dll,lib} /io/wheelhouse/win32 # Build wheel and put to the output folder export VOSK_SOURCE=/opt/vosk-api From c6fab363e60943d3e8ec784ea9170b022f200880 Mon Sep 17 00:00:00 2001 From: Nickolay Shmyrev Date: Sun, 9 Jan 2022 15:15:20 +0100 Subject: [PATCH 13/25] Don't close channel which not yet started --- src/batch_recognizer.cc | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/batch_recognizer.cc b/src/batch_recognizer.cc index 28818692..ebc5a1bd 100644 --- a/src/batch_recognizer.cc +++ b/src/batch_recognizer.cc @@ -123,13 +123,14 @@ BatchRecognizer::~BatchRecognizer() { void BatchRecognizer::FinishStream(uint64_t id) { - Vector wave; - SubVector chunk(wave.Data(), 0); - dynamic_batcher_->Push(id, false, true, chunk); - streams_.erase(id); + if (streams_.find(id) != streams_.end()) { + Vector wave; + SubVector chunk(wave.Data(), 0); + dynamic_batcher_->Push(id, false, true, chunk); + streams_.erase(id); + } } - void BatchRecognizer::PushLattice(uint64_t id, CompactLattice &clat, BaseFloat offset) { fst::ScaleLattice(fst::GraphLatticeScale(0.9), &clat); From 9861be27876b4ded67806df8ec88474246a61e61 Mon Sep 17 00:00:00 2001 From: Nickolay Shmyrev Date: Mon, 10 Jan 2022 20:02:17 +0100 Subject: [PATCH 14/25] Add libs as dependencies in Makefile --- src/Makefile | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/src/Makefile b/src/Makefile index 9965db65..6ee41b69 100644 --- a/src/Makefile +++ b/src/Makefile @@ -55,6 +55,8 @@ LIBS= \ $(OPENFST_ROOT)/lib/libfst.a \ $(OPENFST_ROOT)/lib/libfstngram.a +LDFLAGS = + ifeq ($(HAVE_OPENBLAS_CLAPACK), 1) CFLAGS += -I$(OPENBLAS_ROOT)/include @@ -67,11 +69,11 @@ endif ifeq ($(HAVE_MKL), 1) CFLAGS += -DHAVE_MKL=1 -I$(MKL_ROOT)/include - LIBS += -L$(MKL_ROOT)/lib/intel64 -Wl,-rpath=$(MKL_ROOT)/lib/intel64 -lmkl_rt -lmkl_intel_lp64 -lmkl_core -lmkl_sequential + LDFLAGS += -L$(MKL_ROOT)/lib/intel64 -Wl,-rpath=$(MKL_ROOT)/lib/intel64 -lmkl_rt -lmkl_intel_lp64 -lmkl_core -lmkl_sequential endif ifeq ($(HAVE_ACCELERATE), 1) - LIBS += -framework Accelerate + LDFLAGS += -framework Accelerate endif ifeq ($(HAVE_CUDA), 1) @@ -83,14 +85,15 @@ ifeq ($(HAVE_CUDA), 1) LIBS := \ $(KALDI_ROOT)/src/cudadecoder/kaldi-cudadecoder.a \ $(KALDI_ROOT)/src/cudafeat/kaldi-cudafeat.a \ - $(LIBS) \ - -L$(CUDA_ROOT)/lib64 -lcuda -lcublas -lcusparse -lcudart -lcurand -lcufft -lcusolver -lnvToolsExt + $(LIBS) + + LDFLAGS += -L$(CUDA_ROOT)/lib64 -lcuda -lcublas -lcusparse -lcudart -lcurand -lcufft -lcusolver -lnvToolsExt endif all: $(OUTDIR)/libvosk.$(EXT) -$(OUTDIR)/libvosk.$(EXT): $(VOSK_SOURCES:%.cc=$(OUTDIR)/%.o) - $(CXX) --shared -s -o $@ $^ $(LIBS) -lm -latomic $(EXTRA_LDFLAGS) +$(OUTDIR)/libvosk.$(EXT): $(VOSK_SOURCES:%.cc=$(OUTDIR)/%.o) $(LIBS) + $(CXX) --shared -s -o $@ $^ $(LDFLAGS) -lm -latomic $(EXTRA_LDFLAGS) $(OUTDIR)/%.o: %.cc $(VOSK_HEADERS) $(CXX) $(CFLAGS) -c -o $@ $< From 6f86944a06aef6289ad9f29e2502f80bc69f55af Mon Sep 17 00:00:00 2001 From: Nickolay Shmyrev Date: Wed, 12 Jan 2022 01:10:04 +0100 Subject: [PATCH 15/25] Implement wave chunking for cuda decoder --- src/batch_recognizer.cc | 86 +++++++++++++++++++++++++++-------------- src/batch_recognizer.h | 4 ++ 2 files changed, 62 insertions(+), 28 deletions(-) diff --git a/src/batch_recognizer.cc b/src/batch_recognizer.cc index ebc5a1bd..f2d93b05 100644 --- a/src/batch_recognizer.cc +++ b/src/batch_recognizer.cc @@ -31,22 +31,6 @@ BatchRecognizer::BatchRecognizer() { batched_decoder_config.Register(&po); po.ReadConfigFile("model/conf/model.conf"); - batched_decoder_config.num_worker_threads = -1; - batched_decoder_config.max_batch_size = 32; - batched_decoder_config.num_channels = 600; - batched_decoder_config.reset_on_endpoint = true; - batched_decoder_config.use_gpu_feature_extraction = true; - - batched_decoder_config.feature_opts.feature_type = "mfcc"; - batched_decoder_config.feature_opts.mfcc_config = "model/conf/mfcc.conf"; - batched_decoder_config.feature_opts.ivector_extraction_config = "model/conf/ivector.conf"; - batched_decoder_config.decoder_opts.max_active = 7000; - batched_decoder_config.decoder_opts.default_beam = 13.0; - batched_decoder_config.decoder_opts.lattice_beam = 6.0; - batched_decoder_config.compute_opts.acoustic_scale = 1.0; - batched_decoder_config.compute_opts.frame_subsampling_factor = 3; - batched_decoder_config.compute_opts.frames_per_chunk = 51; - struct stat buffer; string nnet3_rxfilename_ = "model/am/final.mdl"; @@ -93,7 +77,26 @@ BatchRecognizer::BatchRecognizer() { ReadKaldiObject(carpa_rxfilename_, &const_arpa_); } + batched_decoder_config.num_worker_threads = -1; + batched_decoder_config.max_batch_size = 32; + batched_decoder_config.num_channels = 600; + batched_decoder_config.reset_on_endpoint = true; + batched_decoder_config.use_gpu_feature_extraction = true; + batched_decoder_config.feature_opts.feature_type = "mfcc"; + batched_decoder_config.feature_opts.mfcc_config = "model/conf/mfcc.conf"; + batched_decoder_config.feature_opts.ivector_extraction_config = "model/conf/ivector.conf"; + batched_decoder_config.decoder_opts.max_active = 7000; + batched_decoder_config.decoder_opts.default_beam = 13.0; + batched_decoder_config.decoder_opts.lattice_beam = 6.0; + batched_decoder_config.compute_opts.acoustic_scale = 1.0; + batched_decoder_config.compute_opts.frame_subsampling_factor = 3; + + int32 nnet_left_context, nnet_right_context; + nnet3::ComputeSimpleNnetContext(nnet_->GetNnet(), &nnet_left_context, + &nnet_right_context); + + batched_decoder_config.compute_opts.frames_per_chunk = std::max(51, (nnet_right_context + 3 - nnet_right_context % 3)); cuda_pipeline_ = new BatchedThreadedNnet3CudaOnlinePipeline (batched_decoder_config, *hclg_fst_, *nnet_, *trans_model_); @@ -102,6 +105,8 @@ BatchRecognizer::BatchRecognizer() { CudaOnlinePipelineDynamicBatcherConfig dynamic_batcher_config; dynamic_batcher_ = new CudaOnlinePipelineDynamicBatcher(dynamic_batcher_config, *cuda_pipeline_); + + samples_per_chunk_ = batched_decoder_config.compute_opts.frames_per_chunk * 160; } BatchRecognizer::~BatchRecognizer() { @@ -123,11 +128,16 @@ BatchRecognizer::~BatchRecognizer() { void BatchRecognizer::FinishStream(uint64_t id) { - if (streams_.find(id) != streams_.end()) { - Vector wave; - SubVector chunk(wave.Data(), 0); - dynamic_batcher_->Push(id, false, true, chunk); + if (streams_.find(id) != streams_.end()) {; + SubVector chunk = buffers_[id].Range(0, buffers_[id].Dim()); + + bool first = false; + if (initialized_.find(id) == initialized_.end()) + first = true; + dynamic_batcher_->Push(id, first, true, chunk); streams_.erase(id); + buffers_.erase(id); + initialized_.erase(id); } } @@ -173,11 +183,9 @@ void BatchRecognizer::PushLattice(uint64_t id, CompactLattice &clat, BaseFloat o void BatchRecognizer::AcceptWaveform(uint64_t id, const char *data, int len) { - bool first = false; - if (streams_.find(id) == streams_.end()) { - first = true; streams_.insert(id); + buffers_[id] = Vector(); // Define the callback for results. #if 0 @@ -212,13 +220,35 @@ void BatchRecognizer::AcceptWaveform(uint64_t id, const char *data, int len) CudaPipelineResult::RESULT_TYPE_LATTICE); } - Vector wave; - wave.Resize(len / 2, kUndefined); + // Collect data so we process exactly samples_per_chunk_ + Vector &buf = buffers_[id]; + int32 orig_size = buf.Dim(); + buf.Resize(buf.Dim() + len / 2, kCopyData); for (int i = 0; i < len / 2; i++) - wave(i) = *(((short *)data) + i); - SubVector chunk(wave.Data(), wave.Dim()); + buf(i + orig_size) = *(((short *)data) + i); - dynamic_batcher_->Push(id, first, false, chunk); + // Pick chunks + int32 i = 0; + while (i + samples_per_chunk_ <= buf.Dim()) { + SubVector chunk = buf.Range(i, samples_per_chunk_); + + bool first = false; + if (initialized_.find(id) == initialized_.end()) { + first = true; + initialized_.insert(id); + } + dynamic_batcher_->Push(id, first, false, chunk); + i += samples_per_chunk_; + } + + // Keep remaining data + if (i > 0) { + int32 remaining = buf.Dim() - i; + for (int j = 0; j < remaining; j++) { + buf(j) = buf(i + j); + } + buf.Resize(remaining, kCopyData); + } } const char* BatchRecognizer::FrontResult(uint64_t id) diff --git a/src/batch_recognizer.h b/src/batch_recognizer.h index f26dd54b..3d634e01 100644 --- a/src/batch_recognizer.h +++ b/src/batch_recognizer.h @@ -66,9 +66,13 @@ class BatchRecognizer { BatchedThreadedNnet3CudaOnlinePipeline *cuda_pipeline_ = nullptr; CudaOnlinePipelineDynamicBatcher *dynamic_batcher_ = nullptr; + // Input and output queues + int32 samples_per_chunk_; std::set streams_; + std::set initialized_; std::map > results_; + std::map > buffers_; // Rescoring fst::ArcMapFst > *lm_to_subtract_ = nullptr; From 2135223490aac2b80dc5b4ad607edb40b97b429d Mon Sep 17 00:00:00 2001 From: Nickolay Shmyrev Date: Wed, 12 Jan 2022 14:40:45 +0100 Subject: [PATCH 16/25] Put stream information in a single structure --- src/batch_recognizer.cc | 73 ++++++++++++++++++++--------------------- src/batch_recognizer.h | 13 +++++--- 2 files changed, 43 insertions(+), 43 deletions(-) diff --git a/src/batch_recognizer.cc b/src/batch_recognizer.cc index f2d93b05..b1215a07 100644 --- a/src/batch_recognizer.cc +++ b/src/batch_recognizer.cc @@ -106,7 +106,7 @@ BatchRecognizer::BatchRecognizer() { dynamic_batcher_ = new CudaOnlinePipelineDynamicBatcher(dynamic_batcher_config, *cuda_pipeline_); - samples_per_chunk_ = batched_decoder_config.compute_opts.frames_per_chunk * 160; + samples_per_chunk_ = cuda_pipeline_->GetNSampsPerChunk(); } BatchRecognizer::~BatchRecognizer() { @@ -128,17 +128,14 @@ BatchRecognizer::~BatchRecognizer() { void BatchRecognizer::FinishStream(uint64_t id) { - if (streams_.find(id) != streams_.end()) {; - SubVector chunk = buffers_[id].Range(0, buffers_[id].Dim()); - - bool first = false; - if (initialized_.find(id) == initialized_.end()) - first = true; - dynamic_batcher_->Push(id, first, true, chunk); - streams_.erase(id); - buffers_.erase(id); - initialized_.erase(id); + auto it = streams_.find(id); + if (it == streams_.end()) { + return; } + + SubVector chunk = it->second.buffer.Range(0, it->second.buffer.Dim()); + dynamic_batcher_->Push(id, !(it->second.initialized), true, chunk); + streams_.erase(it); } void BatchRecognizer::PushLattice(uint64_t id, CompactLattice &clat, BaseFloat offset) @@ -178,15 +175,12 @@ void BatchRecognizer::PushLattice(uint64_t id, CompactLattice &clat, BaseFloat o // KALDI_LOG << "Result " << id << " " << obj.dump(); - results_[id].push(obj.dump()); + streams_[id].results.push(obj.dump()); } void BatchRecognizer::AcceptWaveform(uint64_t id, const char *data, int len) { if (streams_.find(id) == streams_.end()) { - streams_.insert(id); - buffers_[id] = Vector(); - // Define the callback for results. #if 0 cuda_pipeline_->SetBestPathCallback( @@ -219,52 +213,55 @@ void BatchRecognizer::AcceptWaveform(uint64_t id, const char *data, int len) }, CudaPipelineResult::RESULT_TYPE_LATTICE); } - // Collect data so we process exactly samples_per_chunk_ - Vector &buf = buffers_[id]; - int32 orig_size = buf.Dim(); - buf.Resize(buf.Dim() + len / 2, kCopyData); + Vector &buffer = streams_[id].buffer; + int32 end = buffer.Dim(); + buffer.Resize(end + len / 2, kCopyData); for (int i = 0; i < len / 2; i++) - buf(i + orig_size) = *(((short *)data) + i); + buffer(i + end) = *(((short *)data) + i); + end = buffer.Dim(); - // Pick chunks + // Pick chunks and submit them to the batcher int32 i = 0; - while (i + samples_per_chunk_ <= buf.Dim()) { - SubVector chunk = buf.Range(i, samples_per_chunk_); - - bool first = false; - if (initialized_.find(id) == initialized_.end()) { - first = true; - initialized_.insert(id); - } - dynamic_batcher_->Push(id, first, false, chunk); + while (i + samples_per_chunk_ <= end) { + dynamic_batcher_->Push(id, (!streams_[id].initialized), false, + buffer.Range(i, samples_per_chunk_)); + streams_[id].initialized = true; i += samples_per_chunk_; } // Keep remaining data if (i > 0) { - int32 remaining = buf.Dim() - i; - for (int j = 0; j < remaining; j++) { - buf(j) = buf(i + j); + int32 tail = end - i; + for (int j = 0; j < tail; j++) { + buffer(j) = buffer(i + j); } - buf.Resize(remaining, kCopyData); + buffer.Resize(tail, kCopyData); } } const char* BatchRecognizer::FrontResult(uint64_t id) { - if (results_[id].empty()) { + auto it = streams_.find(id); + if (it == streams_.end()) { + return ""; + } + if (it->second.results.empty()) { return ""; } - return results_[id].front().c_str(); + return it->second.results.front().c_str(); } void BatchRecognizer::Pop(uint64_t id) { - if (results_[id].empty()) { + auto it = streams_.find(id); + if (it == streams_.end()) { + return; + } + if (it->second.results.empty()) { return; } - results_[id].pop(); + it->second.results.pop(); } void BatchRecognizer::WaitForCompletion() diff --git a/src/batch_recognizer.h b/src/batch_recognizer.h index 3d634e01..342bd860 100644 --- a/src/batch_recognizer.h +++ b/src/batch_recognizer.h @@ -51,6 +51,12 @@ class BatchRecognizer { int GetPendingChunks(uint64_t id); private: + struct Stream { + bool initialized = false; + std::queue results; + kaldi::Vector buffer; + }; + void PushLattice(uint64_t id, CompactLattice &clat, BaseFloat offset); kaldi::TransitionModel *trans_model_ = nullptr; @@ -66,13 +72,10 @@ class BatchRecognizer { BatchedThreadedNnet3CudaOnlinePipeline *cuda_pipeline_ = nullptr; CudaOnlinePipelineDynamicBatcher *dynamic_batcher_ = nullptr; - // Input and output queues int32 samples_per_chunk_; - std::set streams_; - std::set initialized_; - std::map > results_; - std::map > buffers_; + // Input and output queues + std::map streams_; // Rescoring fst::ArcMapFst > *lm_to_subtract_ = nullptr; From b0903413b109e2b9690ed384a17d483f14affe09 Mon Sep 17 00:00:00 2001 From: Nickolay Shmyrev Date: Fri, 21 Jan 2022 13:19:38 +0100 Subject: [PATCH 17/25] Set soname for Android library --- android/lib/build-vosk.sh | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/android/lib/build-vosk.sh b/android/lib/build-vosk.sh index e455fa46..7be5f58c 100755 --- a/android/lib/build-vosk.sh +++ b/android/lib/build-vosk.sh @@ -123,7 +123,13 @@ make -j 8 online2 lm rnnlm # Vosk-api cd $WORKDIR mkdir -p $WORKDIR/vosk -make -j 8 -C ${WORKDIR_BASE}/../../../src OUTDIR=$WORKDIR/vosk KALDI_ROOT=${WORKDIR}/kaldi OPENFST_ROOT=${WORKDIR}/local OPENBLAS_ROOT=${WORKDIR}/local CXX=$CXX EXTRA_LDFLAGS="-llog -static-libstdc++" +make -j 8 -C ${WORKDIR_BASE}/../../../src \ + OUTDIR=$WORKDIR/vosk \ + KALDI_ROOT=${WORKDIR}/kaldi \ + OPENFST_ROOT=${WORKDIR}/local \ + OPENBLAS_ROOT=${WORKDIR}/local \ + CXX=$CXX \ + EXTRA_LDFLAGS="-llog -static-libstdc++ -Wl,-soname,libvosk.so" cp $WORKDIR/vosk/libvosk.so $WORKDIR/../../src/main/jniLibs/$arch/libvosk.so done From d2c11a611f50639641719a0c0b85838abeaa5109 Mon Sep 17 00:00:00 2001 From: Nickolay Shmyrev Date: Sun, 30 Jan 2022 22:57:36 +0100 Subject: [PATCH 18/25] Read list of files from arguments --- python/example/test_gpu_batch.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/example/test_gpu_batch.py b/python/example/test_gpu_batch.py index 3a65bda8..0ad9e288 100755 --- a/python/example/test_gpu_batch.py +++ b/python/example/test_gpu_batch.py @@ -14,7 +14,8 @@ rec = BatchRecognizer() -fnames = open("tedlium.list").readlines() +# Read list of files from the file +fnames = open(sys.argv[1]).readlines() fds = [open(x.strip(), "rb") for x in fnames] uids = [fname.strip().split('/')[-1][:-4] for fname in fnames] results = [""] * len(fnames) From 79b8395be055a9398fbd8f2105b0321fb186ebff Mon Sep 17 00:00:00 2001 From: Nickolay Shmyrev Date: Thu, 3 Feb 2022 23:08:09 +0100 Subject: [PATCH 19/25] Add NLSML output --- python/example/test_nlsml.py | 31 +++++++++++++++++ python/vosk/__init__.py | 3 ++ src/recognizer.cc | 65 +++++++++++++++++++++++++++++++++++- src/recognizer.h | 3 ++ src/vosk_api.cc | 5 +++ src/vosk_api.h | 6 ++++ 6 files changed, 112 insertions(+), 1 deletion(-) create mode 100755 python/example/test_nlsml.py diff --git a/python/example/test_nlsml.py b/python/example/test_nlsml.py new file mode 100755 index 00000000..18132093 --- /dev/null +++ b/python/example/test_nlsml.py @@ -0,0 +1,31 @@ +#!/usr/bin/env python3 + +from vosk import Model, KaldiRecognizer, SetLogLevel +import sys +import os +import wave + +SetLogLevel(0) + +if not os.path.exists("model"): + print ("Please download the model from https://alphacephei.com/vosk/models and unpack as 'model' in the current folder.") + exit (1) + +wf = wave.open(sys.argv[1], "rb") +if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE": + print ("Audio file must be WAV format mono PCM.") + exit (1) + +model = Model("model") +rec = KaldiRecognizer(model, wf.getframerate()) +rec.SetMaxAlternatives(10) +rec.SetNLSML(True) + +while True: + data = wf.readframes(4000) + if len(data) == 0: + break + if rec.AcceptWaveform(data): + print(rec.Result()) + +print(rec.FinalResult()) diff --git a/python/vosk/__init__.py b/python/vosk/__init__.py index 0e60c2ba..d8e384b9 100644 --- a/python/vosk/__init__.py +++ b/python/vosk/__init__.py @@ -69,6 +69,9 @@ def SetMaxAlternatives(self, max_alternatives): def SetWords(self, enable_words): _c.vosk_recognizer_set_words(self._handle, 1 if enable_words else 0) + def SetNLSML(self, enable_nlsml): + _c.vosk_recognizer_set_nlsml(self._handle, 1 if enable_nlsml else 0) + def SetSpkModel(self, spk_model): _c.vosk_recognizer_set_spk_model(self._handle, spk_model._handle) diff --git a/src/recognizer.cc b/src/recognizer.cc index f25ff0ee..cfcf638a 100644 --- a/src/recognizer.cc +++ b/src/recognizer.cc @@ -246,6 +246,11 @@ void Recognizer::SetWords(bool words) words_ = words; } +void Recognizer::SetNLSML(bool nlsml) +{ + nlsml_ = nlsml; +} + void Recognizer::SetSpkModel(SpkModel *spk_model) { if (state_ == RECOGNIZER_RUNNING) { @@ -534,7 +539,6 @@ const char *Recognizer::NbestResult(CompactLattice &clat) fst::ConvertNbestToVector(nbest_lat, &nbest_lats); json::JSON obj; - std::stringstream ss; for (int k = 0; k < nbest_lats.size(); k++) { Lattice nlat = nbest_lats[k]; @@ -584,6 +588,63 @@ const char *Recognizer::NbestResult(CompactLattice &clat) return StoreReturn(obj.dump()); } +const char *Recognizer::NlsmlResult(CompactLattice &clat) +{ + Lattice lat; + Lattice nbest_lat; + std::vector nbest_lats; + + ConvertLattice (clat, &lat); + fst::ShortestPath(lat, &nbest_lat, max_alternatives_); + fst::ConvertNbestToVector(nbest_lat, &nbest_lats); + + std::stringstream ss; + ss << "\n"; + ss << "\n"; + + for (int k = 0; k < nbest_lats.size(); k++) { + + Lattice nlat = nbest_lats[k]; + + CompactLattice nclat; + fst::Invert(&nlat); + DeterminizeLattice(nlat, &nclat); + + CompactLattice aligned_nclat; + if (model_->winfo_) { + WordAlignLattice(nclat, *model_->trans_model_, *model_->winfo_, 0, &aligned_nclat); + } else { + aligned_nclat = nclat; + } + + std::vector words; + std::vector begin_times; + std::vector lengths; + CompactLattice::Weight weight; + + CompactLatticeToWordAlignmentWeight(aligned_nclat, &words, &begin_times, &lengths, &weight); + float likelihood = -(weight.Weight().Value1() + weight.Weight().Value2()); + + stringstream text; + for (int i = 0; i < words.size(); i++) { + json::JSON word; + if (words[i] == 0) + continue; + if (i) + text << " "; + text << model_->word_syms_->Find(words[i]); + } + + ss << "\n"; + ss << "" << text.str() << "\n"; + ss << "" << text.str() << "\n"; + ss << "\n"; + } + ss << "\n"; + + return StoreReturn(ss.str()); +} + const char* Recognizer::GetResult() { if (decoder_->NumFramesDecoded() == 0) { @@ -638,6 +699,8 @@ const char* Recognizer::GetResult() if (max_alternatives_ == 0) { return MbrResult(rlat); + } else if (nlsml_) { + return NlsmlResult(rlat); } else { return NbestResult(rlat); } diff --git a/src/recognizer.h b/src/recognizer.h index e5a733d1..b0338a01 100644 --- a/src/recognizer.h +++ b/src/recognizer.h @@ -49,6 +49,7 @@ class Recognizer { void SetMaxAlternatives(int max_alternatives); void SetSpkModel(SpkModel *spk_model); void SetWords(bool words); + void SetNLSML(bool nlsml); bool AcceptWaveform(const char *data, int len); bool AcceptWaveform(const short *sdata, int len); bool AcceptWaveform(const float *fdata, int len); @@ -69,6 +70,7 @@ class Recognizer { const char *StoreReturn(const string &res); const char *MbrResult(CompactLattice &clat); const char *NbestResult(CompactLattice &clat); + const char *NlsmlResult(CompactLattice &clat); Model *model_ = nullptr; SingleUtteranceNnet3Decoder *decoder_ = nullptr; @@ -94,6 +96,7 @@ class Recognizer { // Other int max_alternatives_ = 0; // Disable alternatives by default bool words_ = false; + bool nlsml_ = false; float sample_frequency_; int32 frame_offset_; diff --git a/src/vosk_api.cc b/src/vosk_api.cc index 65356038..5df70715 100644 --- a/src/vosk_api.cc +++ b/src/vosk_api.cc @@ -103,6 +103,11 @@ void vosk_recognizer_set_words(VoskRecognizer *recognizer, int words) ((Recognizer *)recognizer)->SetWords((bool)words); } +void vosk_recognizer_set_nlsml(VoskRecognizer *recognizer, int nlsml) +{ + ((Recognizer *)recognizer)->SetNLSML((bool)nlsml); +} + void vosk_recognizer_set_spk_model(VoskRecognizer *recognizer, VoskSpkModel *spk_model) { if (recognizer == nullptr || spk_model == nullptr) { diff --git a/src/vosk_api.h b/src/vosk_api.h index f6a981cb..c448087f 100644 --- a/src/vosk_api.h +++ b/src/vosk_api.h @@ -191,6 +191,12 @@ void vosk_recognizer_set_max_alternatives(VoskRecognizer *recognizer, int max_al void vosk_recognizer_set_words(VoskRecognizer *recognizer, int words); +/** Set NLSML output + * @param nlsml - boolean value + */ +void vosk_recognizer_set_nlsml(VoskRecognizer *recognizer, int nlsml); + + /** Accept voice data * * accept and process new chunk of voice data From a561c2d6d4e0023fc06a6c9a4da1265f56690754 Mon Sep 17 00:00:00 2001 From: Nickolay Shmyrev Date: Thu, 3 Feb 2022 23:26:53 +0100 Subject: [PATCH 20/25] Don't add space before string --- src/recognizer.cc | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/src/recognizer.cc b/src/recognizer.cc index cfcf638a..bd671b43 100644 --- a/src/recognizer.cc +++ b/src/recognizer.cc @@ -565,7 +565,7 @@ const char *Recognizer::NbestResult(CompactLattice &clat) stringstream text; json::JSON entry; - for (int i = 0; i < words.size(); i++) { + for (int i = 0, first = 1; i < words.size(); i++) { json::JSON word; if (words[i] == 0) continue; @@ -575,8 +575,12 @@ const char *Recognizer::NbestResult(CompactLattice &clat) word["end"] = samples_round_start_ / sample_frequency_ + (frame_offset_ + begin_times[i] + lengths[i]) * 0.03; entry["result"].append(word); } - if (i) + + if (first) + first = 0; + else text << " "; + text << model_->word_syms_->Find(words[i]); } @@ -626,12 +630,15 @@ const char *Recognizer::NlsmlResult(CompactLattice &clat) float likelihood = -(weight.Weight().Value1() + weight.Weight().Value2()); stringstream text; - for (int i = 0; i < words.size(); i++) { - json::JSON word; + for (int i = 0, first = 1; i < words.size(); i++) { if (words[i] == 0) continue; - if (i) + + if (first) + first = 0; + else text << " "; + text << model_->word_syms_->Find(words[i]); } From f574d896e9346b610f6928d4f372f76269681018 Mon Sep 17 00:00:00 2001 From: Nickolay Shmyrev Date: Thu, 3 Feb 2022 23:43:00 +0100 Subject: [PATCH 21/25] Emtpy result should be also xml --- src/recognizer.cc | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/recognizer.cc b/src/recognizer.cc index bd671b43..d49b5748 100644 --- a/src/recognizer.cc +++ b/src/recognizer.cc @@ -797,6 +797,14 @@ const char *Recognizer::StoreEmptyReturn() { if (!max_alternatives_) { return StoreReturn("{\"text\": \"\"}"); + } else if (nlsml_) { + return StoreReturn("\n" + "\n" + "\n" + "\n" + "\n" + "\n" + "\n"); } else { return StoreReturn("{\"alternatives\" : [{\"text\": \"\", \"confidence\" : 1.0}] }"); } From 1f447a8dfc0e371211d404892861631b41cf1630 Mon Sep 17 00:00:00 2001 From: Nickolay Shmyrev Date: Thu, 10 Feb 2022 20:52:55 +0100 Subject: [PATCH 22/25] Rename according to Kaldi changes --- src/batch_recognizer.cc | 4 ++-- src/batch_recognizer.h | 2 +- src/vosk_api.cc | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/batch_recognizer.cc b/src/batch_recognizer.cc index b1215a07..46a0c097 100644 --- a/src/batch_recognizer.cc +++ b/src/batch_recognizer.cc @@ -269,7 +269,7 @@ void BatchRecognizer::WaitForCompletion() dynamic_batcher_->WaitForCompletion(); } -int BatchRecognizer::GetPendingChunks(uint64_t id) +int BatchRecognizer::GetNumPendingChunks(uint64_t id) { - return dynamic_batcher_->GetPendingChunks(id); + return dynamic_batcher_->GetNumPendingChunks(id); } diff --git a/src/batch_recognizer.h b/src/batch_recognizer.h index 342bd860..c6d90ae0 100644 --- a/src/batch_recognizer.h +++ b/src/batch_recognizer.h @@ -48,7 +48,7 @@ class BatchRecognizer { const char *FrontResult(uint64_t id); void Pop(uint64_t id); void WaitForCompletion(); - int GetPendingChunks(uint64_t id); + int GetNumPendingChunks(uint64_t id); private: struct Stream { diff --git a/src/vosk_api.cc b/src/vosk_api.cc index 5df70715..c7e75403 100644 --- a/src/vosk_api.cc +++ b/src/vosk_api.cc @@ -246,7 +246,7 @@ void vosk_batch_recognizer_wait(VoskBatchRecognizer *recognizer) int vosk_batch_recognizer_get_pending_chunks(VoskBatchRecognizer *recognizer, int id) { #if HAVE_CUDA - return ((BatchRecognizer *)recognizer)->GetPendingChunks(id); + return ((BatchRecognizer *)recognizer)->GetNumPendingChunks(id); #else return 0; #endif From b63df75c300855fff891f92ef197809ceb79d6c9 Mon Sep 17 00:00:00 2001 From: mulhod Date: Fri, 11 Feb 2022 23:46:06 -0500 Subject: [PATCH 23/25] Change KaldiRecognizer reference to Recognizer in src/recognizer.cc --- src/recognizer.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/recognizer.cc b/src/recognizer.cc index b14392bd..50d84d0e 100644 --- a/src/recognizer.cc +++ b/src/recognizer.cc @@ -593,7 +593,7 @@ void ComputePhoneInfo(const TransitionModel &tmodel, const CompactLattice &clat, } -const char *KaldiRecognizer::WordandPhoneResult(CompactLattice &rlat) +const char *Recognizer::WordandPhoneResult(CompactLattice &rlat) { //Computes aligned word and phone-level results without MBR decoding CompactLattice aligned_lat; From de0bec8ca2e8e6da87ac924982f47fbd9782a487 Mon Sep 17 00:00:00 2001 From: mulhod Date: Fri, 11 Feb 2022 23:49:36 -0500 Subject: [PATCH 24/25] Change KaldiRecognizer reference to Recognizer in src/vosk_api.cc --- src/vosk_api.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/vosk_api.cc b/src/vosk_api.cc index c8b6f15b..3166cc0f 100644 --- a/src/vosk_api.cc +++ b/src/vosk_api.cc @@ -100,7 +100,7 @@ void vosk_recognizer_set_max_alternatives(VoskRecognizer *recognizer, int max_al void vosk_recognizer_set_result_options(VoskRecognizer *recognizer, const char *result_opts) { - ((KaldiRecognizer *)recognizer)->SetResultOptions(result_opts); + ((Recognizer *)recognizer)->SetResultOptions(result_opts); } void vosk_recognizer_set_words(VoskRecognizer *recognizer, int words) From 9732d1da5c153c10c3684ee75d4fc320a850cabb Mon Sep 17 00:00:00 2001 From: mulhod Date: Sat, 12 Feb 2022 00:30:56 -0500 Subject: [PATCH 25/25] Increment version to 0.3.42 --- python/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/setup.py b/python/setup.py index 502be6c8..39c8082e 100644 --- a/python/setup.py +++ b/python/setup.py @@ -44,7 +44,7 @@ def get_tag(self): setuptools.setup( name="vosk", - version="0.3.41", + version="0.3.42", author="Educational Testing Service", author_email="rubale@ets.org", description="Offline open source speech recognition API based on Kaldi and Vosk with additional features from ETS",