diff --git a/README.md b/README.md index 6218cfa4..8d96cdc3 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,10 @@ # Vosk Speech Recognition Toolkit Vosk is an offline open source speech recognition toolkit. It enables -speech recognition models for 18 languages and dialects - English, Indian +speech recognition for 20+ languages and dialects - English, Indian English, German, French, Spanish, Portuguese, Chinese, Russian, Turkish, Vietnamese, Italian, Dutch, Catalan, Arabic, Greek, Farsi, Filipino, -Ukrainian. +Ukrainian, Kazakh, Swedish, Japanese, Esperanto. More to come. Vosk models are small (50 Mb) but provide continuous large vocabulary transcription, zero-latency response with streaming API, reconfigurable diff --git a/android/lib/build-vosk.sh b/android/lib/build-vosk.sh index e455fa46..7be5f58c 100755 --- a/android/lib/build-vosk.sh +++ b/android/lib/build-vosk.sh @@ -123,7 +123,13 @@ make -j 8 online2 lm rnnlm # Vosk-api cd $WORKDIR mkdir -p $WORKDIR/vosk -make -j 8 -C ${WORKDIR_BASE}/../../../src OUTDIR=$WORKDIR/vosk KALDI_ROOT=${WORKDIR}/kaldi OPENFST_ROOT=${WORKDIR}/local OPENBLAS_ROOT=${WORKDIR}/local CXX=$CXX EXTRA_LDFLAGS="-llog -static-libstdc++" +make -j 8 -C ${WORKDIR_BASE}/../../../src \ + OUTDIR=$WORKDIR/vosk \ + KALDI_ROOT=${WORKDIR}/kaldi \ + OPENFST_ROOT=${WORKDIR}/local \ + OPENBLAS_ROOT=${WORKDIR}/local \ + CXX=$CXX \ + EXTRA_LDFLAGS="-llog -static-libstdc++ -Wl,-soname,libvosk.so" cp $WORKDIR/vosk/libvosk.so $WORKDIR/../../src/main/jniLibs/$arch/libvosk.so done diff --git a/nodejs/README.md b/nodejs/README.md index 0ac9d753..5603fae6 100644 --- a/nodejs/README.md +++ b/nodejs/README.md @@ -2,18 +2,18 @@ This is an FFI-NAPI wrapper for the Vosk library. ## Usage -It mostly follows Vosk interface, some methods are not yet fully implemented. +Bindings mostly follow the Vosk interface; some methods are not yet fully implemented. -To use it you need to compile libvosk library, see Python module build -instructions for details. You can find prebuilt library inside python -wheel. +See [demo folder](https://github.com/alphacep/vosk-api/tree/master/nodejs/demo) for +details. ## About Vosk is an offline open source speech recognition toolkit. It enables -speech recognition models for 17 languages and dialects - English, Indian +speech recognition for 20+ languages and dialects - English, Indian English, German, French, Spanish, Portuguese, Chinese, Russian, Turkish, -Vietnamese, Italian, Dutch, Catalan, Arabic, Greek, Farsi, Filipino. +Vietnamese, Italian, Dutch, Catalan, Arabic, Greek, Farsi, Filipino, +Ukrainian, Kazakh, Swedish, Japanese, Esperanto. More to come. Vosk models are small (50 Mb) but provide continuous large vocabulary transcription, zero-latency response with streaming API, reconfigurable diff --git a/python/README.md b/python/README.md index 300eaa6d..b121e9a3 100644 --- a/python/README.md +++ b/python/README.md @@ -1,9 +1,10 @@ This is a Python module for Vosk. Vosk is an offline open source speech recognition toolkit. It enables -speech recognition models for 17 languages and dialects - English, Indian +speech recognition for 20+ languages and dialects - English, Indian English, German, French, Spanish, Portuguese, Chinese, Russian, Turkish, -Vietnamese, Italian, Dutch, Catalan, Arabic, Greek, Farsi, Filipino. +Vietnamese, Italian, Dutch, Catalan, Arabic, Greek, Farsi, Filipino, +Ukrainian, Kazakh, Swedish, Japanese, Esperanto. More to come.
Vosk models are small (50 Mb) but provide continuous large vocabulary transcription, zero-latency response with streaming API, reconfigurable diff --git a/python/example/test_gpu_batch.py b/python/example/test_gpu_batch.py new file mode 100755 index 00000000..0ad9e288 --- /dev/null +++ b/python/example/test_gpu_batch.py @@ -0,0 +1,59 @@ +#!/usr/bin/env python3 + +import sys +import os +import wave +from time import sleep +import json +from timeit import default_timer as timer + + +from vosk import Model, BatchRecognizer, GpuInit + +GpuInit() + +rec = BatchRecognizer() + +# Read list of files from the file +fnames = open(sys.argv[1]).readlines() +fds = [open(x.strip(), "rb") for x in fnames] +uids = [fname.strip().split('/')[-1][:-4] for fname in fnames] +results = [""] * len(fnames) +ended = set() +tot_samples = 0 + +start_time = timer() + +while True: + + # Feed in the data + for i, fd in enumerate(fds): + if i in ended: + continue + data = fd.read(16000) + if len(data) == 0: + rec.FinishStream(i) + ended.add(i) + continue + rec.AcceptWaveform(i, data) + tot_samples += len(data) + + # Wait for results from CUDA + rec.Wait() + + # Retrieve and add results + for i, fd in enumerate(fds): + res = rec.Result(i) + if len(res) != 0: + results[i] = results[i] + " " + json.loads(res)['text'] + + if len(ended) == len(fds): + break + +end_time = timer() + +for i in range(len(results)): + print (uids[i], results[i].strip()) + +print ("Processed %d seconds of audio in %d seconds (%f xRT)" % (tot_samples / 16000.0 / 2, end_time - start_time, + (tot_samples / 16000.0 / 2 / (end_time - start_time))), file=sys.stderr) diff --git a/python/example/test_nlsml.py b/python/example/test_nlsml.py new file mode 100755 index 00000000..18132093 --- /dev/null +++ b/python/example/test_nlsml.py @@ -0,0 +1,31 @@ +#!/usr/bin/env python3 + +from vosk import Model, KaldiRecognizer, SetLogLevel +import sys +import os +import wave + +SetLogLevel(0) + +if not os.path.exists("model"): + print ("Please download the model from https://alphacephei.com/vosk/models and unpack as 'model' in the current folder.") + exit (1) + +wf = wave.open(sys.argv[1], "rb") +if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE": + print ("Audio file must be WAV format mono PCM.") + exit (1) + +model = Model("model") +rec = KaldiRecognizer(model, wf.getframerate()) +rec.SetMaxAlternatives(10) +rec.SetNLSML(True) + +while True: + data = wf.readframes(4000) + if len(data) == 0: + break + if rec.AcceptWaveform(data): + print(rec.Result()) + +print(rec.FinalResult()) diff --git a/python/setup.py b/python/setup.py index 502be6c8..39c8082e 100644 --- a/python/setup.py +++ b/python/setup.py @@ -44,7 +44,7 @@ def get_tag(self): setuptools.setup( name="vosk", - version="0.3.41", + version="0.3.42", author="Educational Testing Service", author_email="rubale@ets.org", description="Offline open source speech recognition API based on Kaldi and Vosk with additional features from ETS", diff --git a/python/vosk/__init__.py b/python/vosk/__init__.py index a250b253..18606866 100644 --- a/python/vosk/__init__.py +++ b/python/vosk/__init__.py @@ -72,6 +72,9 @@ def SetResultOptions(self, result_opts): def SetWords(self, enable_words): _c.vosk_recognizer_set_words(self._handle, 1 if enable_words else 0) + def SetNLSML(self, enable_nlsml): + _c.vosk_recognizer_set_nlsml(self._handle, 1 if enable_nlsml else 0) + def SetSpkModel(self, spk_model): _c.vosk_recognizer_set_spk_model(self._handle, spk_model._handle) @@ -104,3 +107,32 @@ def 
GpuInit(): def GpuThreadInit(): _c.vosk_gpu_thread_init() + +class BatchRecognizer(object): + + def __init__(self, *args): + self._handle = _c.vosk_batch_recognizer_new() + + if self._handle == _ffi.NULL: + raise Exception("Failed to create a recognizer") + + def __del__(self): + _c.vosk_batch_recognizer_free(self._handle) + + def AcceptWaveform(self, uid, data): + _c.vosk_batch_recognizer_accept_waveform(self._handle, uid, data, len(data)) + + def Result(self, uid): + ptr = _c.vosk_batch_recognizer_front_result(self._handle, uid) + res = _ffi.string(ptr).decode('utf-8') + _c.vosk_batch_recognizer_pop(self._handle, uid) + return res + + def FinishStream(self, uid): + _c.vosk_batch_recognizer_finish_stream(self._handle, uid) + + def Wait(self): + _c.vosk_batch_recognizer_wait(self._handle) + + def GetPendingChunks(self, uid): + return _c.vosk_batch_recognizer_get_pending_chunks(self._handle, uid) diff --git a/src/Makefile b/src/Makefile index 54e96ca7..6ee41b69 100644 --- a/src/Makefile +++ b/src/Makefile @@ -18,14 +18,14 @@ EXTRA_LDFLAGS?= OUTDIR?=. VOSK_SOURCES= \ - kaldi_recognizer.cc \ + recognizer.cc \ language_model.cc \ model.cc \ spk_model.cc \ vosk_api.cc VOSK_HEADERS= \ - kaldi_recognizer.h \ + recognizer.h \ language_model.h \ model.h \ spk_model.h \ @@ -39,13 +39,13 @@ LIBS= \ $(KALDI_ROOT)/src/decoder/kaldi-decoder.a \ $(KALDI_ROOT)/src/ivector/kaldi-ivector.a \ $(KALDI_ROOT)/src/gmm/kaldi-gmm.a \ - $(KALDI_ROOT)/src/nnet3/kaldi-nnet3.a \ $(KALDI_ROOT)/src/tree/kaldi-tree.a \ $(KALDI_ROOT)/src/feat/kaldi-feat.a \ $(KALDI_ROOT)/src/lat/kaldi-lat.a \ $(KALDI_ROOT)/src/lm/kaldi-lm.a \ $(KALDI_ROOT)/src/rnnlm/kaldi-rnnlm.a \ $(KALDI_ROOT)/src/hmm/kaldi-hmm.a \ + $(KALDI_ROOT)/src/nnet3/kaldi-nnet3.a \ $(KALDI_ROOT)/src/transform/kaldi-transform.a \ $(KALDI_ROOT)/src/cudamatrix/kaldi-cudamatrix.a \ $(KALDI_ROOT)/src/matrix/kaldi-matrix.a \ @@ -55,6 +55,8 @@ LIBS= \ $(OPENFST_ROOT)/lib/libfst.a \ $(OPENFST_ROOT)/lib/libfstngram.a +LDFLAGS = + ifeq ($(HAVE_OPENBLAS_CLAPACK), 1) CFLAGS += -I$(OPENBLAS_ROOT)/include @@ -66,23 +68,32 @@ ifeq ($(HAVE_OPENBLAS_CLAPACK), 1) endif ifeq ($(HAVE_MKL), 1) - CFLAGS += -I$(MKL_ROOT)/include - LIBS += -L$(MKL_ROOT)/lib/intel64 -Wl,-rpath=$(MKL_ROOT)/lib/intel64 -lmkl_rt -lmkl_intel_lp64 -lmkl_core -lmkl_sequential + CFLAGS += -DHAVE_MKL=1 -I$(MKL_ROOT)/include + LDFLAGS += -L$(MKL_ROOT)/lib/intel64 -Wl,-rpath=$(MKL_ROOT)/lib/intel64 -lmkl_rt -lmkl_intel_lp64 -lmkl_core -lmkl_sequential endif ifeq ($(HAVE_ACCELERATE), 1) - LIBS += -framework Accelerate + LDFLAGS += -framework Accelerate endif ifeq ($(HAVE_CUDA), 1) + VOSK_SOURCES += batch_recognizer.cc + VOSK_HEADERS += batch_recognizer.h + CFLAGS+=-DHAVE_CUDA=1 -I$(CUDA_ROOT)/include - LIBS+=-L$(CUDA_ROOT)/lib64 -lcuda -lcublas -lcusparse -lcudart -lcurand -lcufft -lcusolver -lnvToolsExt + + LIBS := \ + $(KALDI_ROOT)/src/cudadecoder/kaldi-cudadecoder.a \ + $(KALDI_ROOT)/src/cudafeat/kaldi-cudafeat.a \ + $(LIBS) + + LDFLAGS += -L$(CUDA_ROOT)/lib64 -lcuda -lcublas -lcusparse -lcudart -lcurand -lcufft -lcusolver -lnvToolsExt endif all: $(OUTDIR)/libvosk.$(EXT) -$(OUTDIR)/libvosk.$(EXT): $(VOSK_SOURCES:%.cc=$(OUTDIR)/%.o) - $(CXX) --shared -s -o $@ $^ $(LIBS) -lm -latomic $(EXTRA_LDFLAGS) +$(OUTDIR)/libvosk.$(EXT): $(VOSK_SOURCES:%.cc=$(OUTDIR)/%.o) $(LIBS) + $(CXX) --shared -s -o $@ $^ $(LDFLAGS) -lm -latomic $(EXTRA_LDFLAGS) $(OUTDIR)/%.o: %.cc $(VOSK_HEADERS) $(CXX) $(CFLAGS) -c -o $@ $< diff --git a/src/batch_recognizer.cc b/src/batch_recognizer.cc new file mode 100644 index
00000000..46a0c097 --- /dev/null +++ b/src/batch_recognizer.cc @@ -0,0 +1,275 @@ +// Copyright 2019-2020 Alpha Cephei Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "batch_recognizer.h" + +#include "fstext/fstext-utils.h" +#include "lat/sausages.h" +#include "json.h" + +#include <sys/stat.h> + +using namespace fst; +using namespace kaldi::nnet3; +using CorrelationID = CudaOnlinePipelineDynamicBatcher::CorrelationID; + +BatchRecognizer::BatchRecognizer() { + BatchedThreadedNnet3CudaOnlinePipelineConfig batched_decoder_config; + + kaldi::ParseOptions po("something"); + batched_decoder_config.Register(&po); + po.ReadConfigFile("model/conf/model.conf"); + + struct stat buffer; + + string nnet3_rxfilename_ = "model/am/final.mdl"; + string hclg_fst_rxfilename_ = "model/graph/HCLG.fst"; + string word_syms_rxfilename_ = "model/graph/words.txt"; + string winfo_rxfilename_ = "model/graph/phones/word_boundary.int"; + string std_fst_rxfilename_ = "model/rescore/G.fst"; + string carpa_rxfilename_ = "model/rescore/G.carpa"; + + trans_model_ = new kaldi::TransitionModel(); + nnet_ = new kaldi::nnet3::AmNnetSimple(); + { + bool binary; + kaldi::Input ki(nnet3_rxfilename_, &binary); + trans_model_->Read(ki.Stream(), binary); + nnet_->Read(ki.Stream(), binary); + SetBatchnormTestMode(true, &(nnet_->GetNnet())); + SetDropoutTestMode(true, &(nnet_->GetNnet())); + nnet3::CollapseModel(nnet3::CollapseModelConfig(), &(nnet_->GetNnet())); + } + + if (stat(hclg_fst_rxfilename_.c_str(), &buffer) == 0) { + KALDI_LOG << "Loading HCLG from " << hclg_fst_rxfilename_; + hclg_fst_ = fst::ReadFstKaldiGeneric(hclg_fst_rxfilename_); + } + + KALDI_LOG << "Loading words from " << word_syms_rxfilename_; + if (!(word_syms_ = fst::SymbolTable::ReadText(word_syms_rxfilename_))) { + KALDI_ERR << "Could not read symbol table from file " + << word_syms_rxfilename_; + } + KALDI_ASSERT(word_syms_); + + if (stat(winfo_rxfilename_.c_str(), &buffer) == 0) { + KALDI_LOG << "Loading winfo " << winfo_rxfilename_; + kaldi::WordBoundaryInfoNewOpts opts; + winfo_ = new kaldi::WordBoundaryInfo(opts, winfo_rxfilename_); + } + + if (stat(carpa_rxfilename_.c_str(), &buffer) == 0) { + KALDI_LOG << "Loading subtract G.fst model from " << std_fst_rxfilename_; + graph_lm_fst_ = fst::ReadAndPrepareLmFst(std_fst_rxfilename_); + KALDI_LOG << "Loading CARPA model from " << carpa_rxfilename_; + ReadKaldiObject(carpa_rxfilename_, &const_arpa_); + } + + batched_decoder_config.num_worker_threads = -1; + batched_decoder_config.max_batch_size = 32; + batched_decoder_config.num_channels = 600; + batched_decoder_config.reset_on_endpoint = true; + batched_decoder_config.use_gpu_feature_extraction = true; + + batched_decoder_config.feature_opts.feature_type = "mfcc"; + batched_decoder_config.feature_opts.mfcc_config = "model/conf/mfcc.conf"; + batched_decoder_config.feature_opts.ivector_extraction_config = "model/conf/ivector.conf"; + batched_decoder_config.decoder_opts.max_active = 7000; + batched_decoder_config.decoder_opts.default_beam = 13.0; +
batched_decoder_config.decoder_opts.lattice_beam = 6.0; + batched_decoder_config.compute_opts.acoustic_scale = 1.0; + batched_decoder_config.compute_opts.frame_subsampling_factor = 3; + + int32 nnet_left_context, nnet_right_context; + nnet3::ComputeSimpleNnetContext(nnet_->GetNnet(), &nnet_left_context, + &nnet_right_context); + + batched_decoder_config.compute_opts.frames_per_chunk = std::max(51, (nnet_right_context + 3 - nnet_right_context % 3)); + + cuda_pipeline_ = new BatchedThreadedNnet3CudaOnlinePipeline + (batched_decoder_config, *hclg_fst_, *nnet_, *trans_model_); + cuda_pipeline_->SetSymbolTable(*word_syms_); + + CudaOnlinePipelineDynamicBatcherConfig dynamic_batcher_config; + dynamic_batcher_ = new CudaOnlinePipelineDynamicBatcher(dynamic_batcher_config, + *cuda_pipeline_); + + samples_per_chunk_ = cuda_pipeline_->GetNSampsPerChunk(); +} + +BatchRecognizer::~BatchRecognizer() { + + delete trans_model_; + delete nnet_; + delete word_syms_; + delete winfo_; + delete hclg_fst_; + delete graph_lm_fst_; + + delete lm_to_subtract_; + delete carpa_to_add_; + delete carpa_to_add_scale_; + + delete cuda_pipeline_; + delete dynamic_batcher_; +} + +void BatchRecognizer::FinishStream(uint64_t id) +{ + auto it = streams_.find(id); + if (it == streams_.end()) { + return; + } + + SubVector<BaseFloat> chunk = it->second.buffer.Range(0, it->second.buffer.Dim()); + dynamic_batcher_->Push(id, !(it->second.initialized), true, chunk); + streams_.erase(it); +} + +void BatchRecognizer::PushLattice(uint64_t id, CompactLattice &clat, BaseFloat offset) +{ + fst::ScaleLattice(fst::GraphLatticeScale(0.9), &clat); + + CompactLattice aligned_lat; + WordAlignLattice(clat, *trans_model_, *winfo_, 0, &aligned_lat); + + MinimumBayesRisk mbr(aligned_lat); + const vector<BaseFloat> &conf = mbr.GetOneBestConfidences(); + const vector<int32> &words = mbr.GetOneBest(); + const vector<pair<BaseFloat, BaseFloat> > &times = + mbr.GetOneBestTimes(); + + int size = words.size(); + + json::JSON obj; + stringstream text; + + // Create JSON object + for (int i = 0; i < size; i++) { + json::JSON word; + + word["word"] = word_syms_->Find(words[i]); + word["start"] = round(times[i].first) * 0.03 + offset; + word["end"] = round(times[i].second) * 0.03 + offset; + word["conf"] = conf[i]; + obj["result"].append(word); + + if (i) { + text << " "; + } + text << word_syms_->Find(words[i]); + } + obj["text"] = text.str(); + +// KALDI_LOG << "Result " << id << " " << obj.dump(); + + streams_[id].results.push(obj.dump()); +} + +void BatchRecognizer::AcceptWaveform(uint64_t id, const char *data, int len) +{ + if (streams_.find(id) == streams_.end()) { + // Define the callback for results.
+#if 0 + cuda_pipeline_->SetBestPathCallback( + id, + [&, id](const std::string &str, bool partial, + bool endpoint_detected) { + if (partial) { + KALDI_LOG << "id #" << id << " [partial] : " << str << ":"; + } + + if (endpoint_detected) { + KALDI_LOG << "id #" << id << " [endpoint detected]"; + } + + if (!partial) { + KALDI_LOG << "id #" << id << " : " << str; + } + }); +#endif + cuda_pipeline_->SetLatticeCallback( + id, + [&, id](SegmentedLatticeCallbackParams& params) { + if (params.results.empty()) { + KALDI_WARN << "Empty result for callback"; + return; + } + CompactLattice *clat = params.results[0].GetLatticeResult(); + BaseFloat offset = params.results[0].GetTimeOffsetSeconds(); + PushLattice(id, *clat, offset); + }, + CudaPipelineResult::RESULT_TYPE_LATTICE); + } + // Collect data so we process exactly samples_per_chunk_ + Vector<BaseFloat> &buffer = streams_[id].buffer; + int32 end = buffer.Dim(); + buffer.Resize(end + len / 2, kCopyData); + for (int i = 0; i < len / 2; i++) + buffer(i + end) = *(((short *)data) + i); + end = buffer.Dim(); + + // Pick chunks and submit them to the batcher + int32 i = 0; + while (i + samples_per_chunk_ <= end) { + dynamic_batcher_->Push(id, (!streams_[id].initialized), false, + buffer.Range(i, samples_per_chunk_)); + streams_[id].initialized = true; + i += samples_per_chunk_; + } + + // Keep remaining data + if (i > 0) { + int32 tail = end - i; + for (int j = 0; j < tail; j++) { + buffer(j) = buffer(i + j); + } + buffer.Resize(tail, kCopyData); + } +} + +const char* BatchRecognizer::FrontResult(uint64_t id) +{ + auto it = streams_.find(id); + if (it == streams_.end()) { + return ""; + } + if (it->second.results.empty()) { + return ""; + } + return it->second.results.front().c_str(); +} + +void BatchRecognizer::Pop(uint64_t id) +{ + auto it = streams_.find(id); + if (it == streams_.end()) { + return; + } + if (it->second.results.empty()) { + return; + } + it->second.results.pop(); +} + +void BatchRecognizer::WaitForCompletion() +{ + dynamic_batcher_->WaitForCompletion(); +} + +int BatchRecognizer::GetNumPendingChunks(uint64_t id) +{ + return dynamic_batcher_->GetNumPendingChunks(id); +} diff --git a/src/batch_recognizer.h b/src/batch_recognizer.h new file mode 100644 index 00000000..c6d90ae0 --- /dev/null +++ b/src/batch_recognizer.h @@ -0,0 +1,88 @@ +// Copyright 2019 Alpha Cephei Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+ +#ifndef VOSK_GPU_RECOGNIZER_H +#define VOSK_GPU_RECOGNIZER_H + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "fstext/fstext-lib.h" +#include "fstext/fstext-utils.h" +#include "decoder/lattice-faster-decoder.h" +#include "feat/feature-mfcc.h" +#include "lat/kaldi-lattice.h" +#include "lat/word-align-lattice.h" +#include "lat/compose-lattice-pruned.h" +#include "nnet3/am-nnet-simple.h" +#include "nnet3/nnet-am-decodable-simple.h" +#include "nnet3/nnet-utils.h" + +#include "cudadecoder/cuda-online-pipeline-dynamic-batcher.h" +#include "cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.h" +#include "cudadecoder/batched-threaded-nnet3-cuda-pipeline2.h" +#include "cudadecoder/cuda-pipeline-common.h" + +#include "model.h" + +using namespace kaldi; +using namespace kaldi::cuda_decoder; + +class BatchRecognizer { + public: + BatchRecognizer(); + ~BatchRecognizer(); + + void FinishStream(uint64_t id); + void AcceptWaveform(uint64_t id, const char *data, int len); + const char *FrontResult(uint64_t id); + void Pop(uint64_t id); + void WaitForCompletion(); + int GetNumPendingChunks(uint64_t id); + + private: + struct Stream { + bool initialized = false; + std::queue<std::string> results; + kaldi::Vector<BaseFloat> buffer; + }; + + void PushLattice(uint64_t id, CompactLattice &clat, BaseFloat offset); + + kaldi::TransitionModel *trans_model_ = nullptr; + kaldi::nnet3::AmNnetSimple *nnet_ = nullptr; + const fst::SymbolTable *word_syms_ = nullptr; + + fst::Fst<fst::StdArc> *hclg_fst_ = nullptr; + kaldi::WordBoundaryInfo *winfo_ = nullptr; + + fst::VectorFst<fst::StdArc> *graph_lm_fst_ = nullptr; + kaldi::ConstArpaLm const_arpa_; + + BatchedThreadedNnet3CudaOnlinePipeline *cuda_pipeline_ = nullptr; + CudaOnlinePipelineDynamicBatcher *dynamic_batcher_ = nullptr; + + int32 samples_per_chunk_; + + // Input and output queues + std::map<uint64_t, Stream> streams_; + + // Rescoring + fst::ArcMapFst<fst::StdFst, kaldi::LatticeArc, fst::StdToLatticeMapper<kaldi::BaseFloat> > *lm_to_subtract_ = nullptr; + kaldi::ConstArpaLmDeterministicFst *carpa_to_add_ = nullptr; + fst::ScaleDeterministicOnDemandFst *carpa_to_add_scale_ = nullptr; + + float sample_frequency_; +}; + +#endif /* VOSK_GPU_RECOGNIZER_H */ diff --git a/src/json.h b/src/json.h index 463912ec..2159392b 100644 --- a/src/json.h +++ b/src/json.h @@ -424,7 +424,7 @@ class JSON Class Type = Class::Null; }; -JSON Array() { +inline JSON Array() { return JSON::Make( JSON::Class::Array ); } @@ -435,11 +435,11 @@ JSON Array( T...
args ) { return arr; } -JSON Object() { +inline JSON Object() { return JSON::Make( JSON::Class::Object ); } -std::ostream& operator<<( std::ostream &os, const JSON &json ) { +inline std::ostream& operator<<( std::ostream &os, const JSON &json ) { os << json.dump(); return os; } @@ -647,7 +647,7 @@ namespace { } } -JSON JSON::Load( const string &str ) { +inline JSON JSON::Load( const string &str ) { size_t offset = 0; return parse_next( str, offset ); } diff --git a/src/model.cc b/src/model.cc index 956d8c5c..61e1ef24 100644 --- a/src/model.cc +++ b/src/model.cc @@ -243,9 +243,9 @@ void Model::ReadDataFiles() SetDropoutTestMode(true, &(nnet_->GetNnet())); nnet3::CollapseModel(nnet3::CollapseModelConfig(), &(nnet_->GetNnet())); } + decodable_info_ = new nnet3::DecodableNnetSimpleLoopedInfo(decodable_opts_, nnet_); - if (stat(final_ie_rxfilename_.c_str(), &buffer) == 0) { KALDI_LOG << "Loading i-vector extractor from " << final_ie_rxfilename_; diff --git a/src/model.h b/src/model.h index ea05c2f9..856f9fdc 100644 --- a/src/model.h +++ b/src/model.h @@ -36,7 +36,8 @@ using namespace kaldi; using namespace std; -class KaldiRecognizer; +class Recognizer; +class BatchRecognizer; class Model { @@ -52,7 +53,8 @@ class Model { void ConfigureV2(); void ReadDataFiles(); - friend class KaldiRecognizer; + friend class Recognizer; + friend class BatchRecognizer; string model_path_str_; string nnet3_rxfilename_; diff --git a/src/kaldi_recognizer.cc b/src/recognizer.cc similarity index 89% rename from src/kaldi_recognizer.cc rename to src/recognizer.cc index dd1dc6ee..50d84d0e 100644 --- a/src/kaldi_recognizer.cc +++ b/src/recognizer.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "kaldi_recognizer.h" +#include "recognizer.h" #include "json.h" #include "fstext/fstext-utils.h" #include "lat/sausages.h" @@ -23,7 +23,7 @@ using namespace fst; using namespace kaldi::nnet3; -KaldiRecognizer::KaldiRecognizer(Model *model, float sample_frequency) : model_(model), spk_model_(0), sample_frequency_(sample_frequency) { +Recognizer::Recognizer(Model *model, float sample_frequency) : model_(model), spk_model_(0), sample_frequency_(sample_frequency) { model_->Ref(); @@ -48,7 +48,7 @@ KaldiRecognizer::KaldiRecognizer(Model *model, float sample_frequency) : model_( InitRescoring(); } -KaldiRecognizer::KaldiRecognizer(Model *model, float sample_frequency, char const *grammar) : model_(model), spk_model_(0), sample_frequency_(sample_frequency) +Recognizer::Recognizer(Model *model, float sample_frequency, char const *grammar) : model_(model), spk_model_(0), sample_frequency_(sample_frequency) { model_->Ref(); @@ -109,7 +109,7 @@ KaldiRecognizer::KaldiRecognizer(Model *model, float sample_frequency, char cons InitRescoring(); } -KaldiRecognizer::KaldiRecognizer(Model *model, float sample_frequency, SpkModel *spk_model) : model_(model), spk_model_(spk_model), sample_frequency_(sample_frequency) { +Recognizer::Recognizer(Model *model, float sample_frequency, SpkModel *spk_model) : model_(model), spk_model_(spk_model), sample_frequency_(sample_frequency) { model_->Ref(); spk_model->Ref(); @@ -137,7 +137,7 @@ KaldiRecognizer::KaldiRecognizer(Model *model, float sample_frequency, SpkModel InitRescoring(); } -KaldiRecognizer::~KaldiRecognizer() { +Recognizer::~Recognizer() { delete decoder_; delete feature_pipeline_; delete silence_weighting_; @@ -157,7 +157,7 @@ KaldiRecognizer::~KaldiRecognizer() { spk_model_->Unref(); } -void 
KaldiRecognizer::InitState() +void Recognizer::InitState() { frame_offset_ = 0; samples_processed_ = 0; @@ -166,7 +166,7 @@ void KaldiRecognizer::InitState() state_ = RECOGNIZER_INITIALIZED; } -void KaldiRecognizer::InitRescoring() +void Recognizer::InitRescoring() { if (model_->graph_lm_fst_) { @@ -187,7 +187,7 @@ void KaldiRecognizer::InitRescoring() } } -void KaldiRecognizer::CleanUp() +void Recognizer::CleanUp() { delete silence_weighting_; silence_weighting_ = new kaldi::OnlineSilenceWeighting(*model_->trans_model_, model_->feature_info_.silence_weighting_config, 3); @@ -225,7 +225,7 @@ void KaldiRecognizer::CleanUp() } } -void KaldiRecognizer::UpdateSilenceWeights() +void Recognizer::UpdateSilenceWeights() { if (silence_weighting_->Active() && feature_pipeline_->NumFramesReady() > 0 && feature_pipeline_->IvectorFeature() != nullptr) { @@ -238,22 +238,27 @@ void KaldiRecognizer::UpdateSilenceWeights() } } -void KaldiRecognizer::SetMaxAlternatives(int max_alternatives) +void Recognizer::SetMaxAlternatives(int max_alternatives) { max_alternatives_ = max_alternatives; } -void KaldiRecognizer::SetResultOptions(const char *result_opts) +void Recognizer::SetResultOptions(const char *result_opts) { result_opts_ = result_opts; } -void KaldiRecognizer::SetWords(bool words) +void Recognizer::SetWords(bool words) { words_ = words; } -void KaldiRecognizer::SetSpkModel(SpkModel *spk_model) +void Recognizer::SetNLSML(bool nlsml) +{ + nlsml_ = nlsml; +} + +void Recognizer::SetSpkModel(SpkModel *spk_model) { if (state_ == RECOGNIZER_RUNNING) { KALDI_ERR << "Can't add speaker model to already running recognizer"; @@ -264,7 +269,7 @@ void KaldiRecognizer::SetSpkModel(SpkModel *spk_model) spk_feature_ = new OnlineMfcc(spk_model_->spkvector_mfcc_opts); } -bool KaldiRecognizer::AcceptWaveform(const char *data, int len) +bool Recognizer::AcceptWaveform(const char *data, int len) { Vector<BaseFloat> wave; wave.Resize(len / 2, kUndefined); @@ -273,7 +278,7 @@ bool KaldiRecognizer::AcceptWaveform(const char *data, int len) return AcceptWaveform(wave); } -bool KaldiRecognizer::AcceptWaveform(const short *sdata, int len) +bool Recognizer::AcceptWaveform(const short *sdata, int len) { Vector<BaseFloat> wave; wave.Resize(len, kUndefined); @@ -282,7 +287,7 @@ bool KaldiRecognizer::AcceptWaveform(const short *sdata, int len) return AcceptWaveform(wave); } -bool KaldiRecognizer::AcceptWaveform(const float *fdata, int len) +bool Recognizer::AcceptWaveform(const float *fdata, int len) { Vector<BaseFloat> wave; wave.Resize(len, kUndefined); @@ -291,7 +296,7 @@ bool KaldiRecognizer::AcceptWaveform(const float *fdata, int len) return AcceptWaveform(wave); } -bool KaldiRecognizer::AcceptWaveform(Vector<BaseFloat> &wdata) +bool Recognizer::AcceptWaveform(Vector<BaseFloat> &wdata) { // Cleanup if we finalized previous utterance or the whole feature pipeline if (!(state_ == RECOGNIZER_RUNNING || state_ == RECOGNIZER_INITIALIZED)) { @@ -350,7 +355,7 @@ static void RunNnetComputation(const MatrixBase<BaseFloat> &features, #define MIN_SPK_FEATS 50 -bool KaldiRecognizer::GetSpkVector(Vector<BaseFloat> &out_xvector, int *num_spk_frames) +bool Recognizer::GetSpkVector(Vector<BaseFloat> &out_xvector, int *num_spk_frames) { vector<int32> nonsilence_frames; if (silence_weighting_->Active() && feature_pipeline_->NumFramesReady() > 0) { @@ -415,7 +420,8 @@ bool KaldiRecognizer::GetSpkVector(Vector<BaseFloat> &out_xvector, int *num_spk_ return true; } -const char *KaldiRecognizer::MbrResult(CompactLattice &rlat) + +const char *Recognizer::MbrResult(CompactLattice &rlat) { CompactLattice aligned_lat; if (model_->winfo_) { @@ -587,7 +593,7 @@
void ComputePhoneInfo(const TransitionModel &tmodel, const CompactLattice &clat, } -const char *KaldiRecognizer::WordandPhoneResult(CompactLattice &rlat) +const char *Recognizer::WordandPhoneResult(CompactLattice &rlat) { //Computes aligned word and phone-level results without MBR decoding CompactLattice aligned_lat; @@ -760,7 +766,7 @@ static bool CompactLatticeToWordAlignmentWeight(const CompactLattice &clat, } -const char *KaldiRecognizer::NbestResult(CompactLattice &clat) +const char *Recognizer::NbestResult(CompactLattice &clat) { Lattice lat; Lattice nbest_lat; @@ -771,7 +777,6 @@ const char *KaldiRecognizer::NbestResult(CompactLattice &clat) fst::ConvertNbestToVector(nbest_lat, &nbest_lats); json::JSON obj; - std::stringstream ss; for (int k = 0; k < nbest_lats.size(); k++) { Lattice nlat = nbest_lats[k]; @@ -798,7 +803,7 @@ const char *KaldiRecognizer::NbestResult(CompactLattice &clat) stringstream text; json::JSON entry; - for (int i = 0; i < words.size(); i++) { + for (int i = 0, first = 1; i < words.size(); i++) { json::JSON word; if (words[i] == 0) continue; @@ -808,8 +813,12 @@ const char *KaldiRecognizer::NbestResult(CompactLattice &clat) word["end"] = samples_round_start_ / sample_frequency_ + (frame_offset_ + begin_times[i] + lengths[i]) * 0.03; entry["result"].append(word); } - if (i) + + if (first) + first = 0; + else text << " "; + text << model_->word_syms_->Find(words[i]); } @@ -821,7 +830,67 @@ const char *KaldiRecognizer::NbestResult(CompactLattice &clat) return StoreReturn(obj.dump()); } -const char* KaldiRecognizer::GetResult() +const char *Recognizer::NlsmlResult(CompactLattice &clat) +{ + Lattice lat; + Lattice nbest_lat; + std::vector<Lattice> nbest_lats; + + ConvertLattice (clat, &lat); + fst::ShortestPath(lat, &nbest_lat, max_alternatives_); + fst::ConvertNbestToVector(nbest_lat, &nbest_lats); + + std::stringstream ss; + ss << "<?xml version=\"1.0\"?>\n"; + ss << "<result grammar=\"default\">\n"; + + for (int k = 0; k < nbest_lats.size(); k++) { + + Lattice nlat = nbest_lats[k]; + + CompactLattice nclat; + fst::Invert(&nlat); + DeterminizeLattice(nlat, &nclat); + + CompactLattice aligned_nclat; + if (model_->winfo_) { + WordAlignLattice(nclat, *model_->trans_model_, *model_->winfo_, 0, &aligned_nclat); + } else { + aligned_nclat = nclat; + } + + std::vector<int32> words; + std::vector<int32> begin_times; + std::vector<int32> lengths; + CompactLattice::Weight weight; + + CompactLatticeToWordAlignmentWeight(aligned_nclat, &words, &begin_times, &lengths, &weight); + float likelihood = -(weight.Weight().Value1() + weight.Weight().Value2()); + + stringstream text; + for (int i = 0, first = 1; i < words.size(); i++) { + if (words[i] == 0) + continue; + + if (first) + first = 0; + else + text << " "; + + text << model_->word_syms_->Find(words[i]); + } + + ss << "<interpretation grammar=\"default\" confidence=\"" << likelihood << "\">\n"; + ss << "<input mode=\"speech\">" << text.str() << "</input>\n"; + ss << "<instance>" << text.str() << "</instance>\n"; + ss << "</interpretation>\n"; + } + ss << "</result>\n"; + + return StoreReturn(ss.str()); +} + +const char* Recognizer::GetResult() { if (decoder_->NumFramesDecoded() == 0) { return StoreEmptyReturn(); @@ -886,13 +955,15 @@ const char* KaldiRecognizer::GetResult() } } else if (strcmp(result_opts_, "words")!=0 && strcmp(result_opts_, "phones")!=0){ KALDI_ERR << "Invalid recognizer result options"; + } else if (nlsml_) { + return NlsmlResult(rlat); } else { return NbestResult(rlat); } } -const char* KaldiRecognizer::PartialResult() +const char* Recognizer::PartialResult() { if (state_ != RECOGNIZER_RUNNING) { return StoreEmptyReturn(); @@ -923,7 +994,7 @@ return StoreReturn(res.dump()); } -const
char* KaldiRecognizer::Result() +const char* Recognizer::Result() { if (state_ != RECOGNIZER_RUNNING) { return StoreEmptyReturn(); @@ -933,7 +1004,7 @@ const char* KaldiRecognizer::Result() return GetResult(); } -const char* KaldiRecognizer::FinalResult() +const char* Recognizer::FinalResult() { if (state_ != RECOGNIZER_RUNNING) { return StoreEmptyReturn(); @@ -961,7 +1032,7 @@ const char* KaldiRecognizer::FinalResult() return last_result_.c_str(); } -void KaldiRecognizer::Reset() +void Recognizer::Reset() { if (state_ == RECOGNIZER_RUNNING) { decoder_->FinalizeDecoding(); @@ -970,17 +1041,25 @@ void KaldiRecognizer::Reset() state_ = RECOGNIZER_ENDPOINT; } -const char *KaldiRecognizer::StoreEmptyReturn() +const char *Recognizer::StoreEmptyReturn() { if (!max_alternatives_) { return StoreReturn("{\"text\": \"\"}"); + } else if (nlsml_) { + return StoreReturn("<?xml version=\"1.0\"?>\n" + "<result grammar=\"default\">\n" + "<interpretation confidence=\"1.0\">\n" + "<instance/>\n" + "<input><noinput/></input>\n" + "</interpretation>\n" + "</result>\n"); } else { return StoreReturn("{\"alternatives\" : [{\"text\": \"\", \"confidence\" : 1.0}] }"); } } // Store result in recognizer and return as const string -const char *KaldiRecognizer::StoreReturn(const string &res) +const char *Recognizer::StoreReturn(const string &res) { last_result_ = res; return last_result_.c_str(); diff --git a/src/kaldi_recognizer.h b/src/recognizer.h similarity index 89% rename from src/kaldi_recognizer.h rename to src/recognizer.h index 2349aa3b..68143f43 100644 --- a/src/kaldi_recognizer.h +++ b/src/recognizer.h @@ -33,23 +33,24 @@ using namespace kaldi; -enum KaldiRecognizerState { +enum RecognizerState { RECOGNIZER_INITIALIZED, RECOGNIZER_RUNNING, RECOGNIZER_ENDPOINT, RECOGNIZER_FINALIZED }; -class KaldiRecognizer { +class Recognizer { public: - KaldiRecognizer(Model *model, float sample_frequency); - KaldiRecognizer(Model *model, float sample_frequency, SpkModel *spk_model); - KaldiRecognizer(Model *model, float sample_frequency, char const *grammar); - ~KaldiRecognizer(); + Recognizer(Model *model, float sample_frequency); + Recognizer(Model *model, float sample_frequency, SpkModel *spk_model); + Recognizer(Model *model, float sample_frequency, char const *grammar); + ~Recognizer(); void SetMaxAlternatives(int max_alternatives); void SetResultOptions(const char *result_opts); void SetSpkModel(SpkModel *spk_model); void SetWords(bool words); + void SetNLSML(bool nlsml); bool AcceptWaveform(const char *data, int len); bool AcceptWaveform(const short *sdata, int len); bool AcceptWaveform(const float *fdata, int len); @@ -71,6 +72,7 @@ class KaldiRecognizer { const char *MbrResult(CompactLattice &clat); const char *WordandPhoneResult(CompactLattice &clat); const char *NbestResult(CompactLattice &clat); + const char *NlsmlResult(CompactLattice &clat); Model *model_ = nullptr; SingleUtteranceNnet3Decoder *decoder_ = nullptr; @@ -97,6 +99,7 @@ class KaldiRecognizer { int max_alternatives_ = 0; // Disable alternatives by default const char *result_opts_ = "words"; // By default enable only word-level results bool words_ = false; + bool nlsml_ = false; float sample_frequency_; int32 frame_offset_; @@ -104,7 +107,7 @@ class KaldiRecognizer { int64 samples_processed_; int64 samples_round_start_; - KaldiRecognizerState state_; + RecognizerState state_; string last_result_; }; diff --git a/src/spk_model.h b/src/spk_model.h index 07cbd4b0..9a76c62a 100644 --- a/src/spk_model.h +++ b/src/spk_model.h @@ -22,7 +22,7 @@ using namespace kaldi; -class KaldiRecognizer; +class Recognizer; class SpkModel { @@ -32,7 +32,7 @@ class SpkModel { void Unref(); protected: - friend class
KaldiRecognizer; + friend class Recognizer; ~SpkModel() {}; kaldi::nnet3::Nnet speaker_nnet; diff --git a/src/vosk_api.cc b/src/vosk_api.cc index 2088b6cb..3166cc0f 100644 --- a/src/vosk_api.cc +++ b/src/vosk_api.cc @@ -13,12 +13,14 @@ // limitations under the License. #include "vosk_api.h" -#include "kaldi_recognizer.h" + +#include "recognizer.h" #include "model.h" #include "spk_model.h" #if HAVE_CUDA #include "cudamatrix/cu-device.h" +#include "batch_recognizer.h" #endif #include @@ -67,7 +69,7 @@ void vosk_spk_model_free(VoskSpkModel *model) VoskRecognizer *vosk_recognizer_new(VoskModel *model, float sample_rate) { try { - return (VoskRecognizer *)new KaldiRecognizer((Model *)model, sample_rate); + return (VoskRecognizer *)new Recognizer((Model *)model, sample_rate); } catch (...) { return nullptr; } @@ -76,7 +78,7 @@ VoskRecognizer *vosk_recognizer_new(VoskModel *model, float sample_rate) VoskRecognizer *vosk_recognizer_new_spk(VoskModel *model, float sample_rate, VoskSpkModel *spk_model) { try { - return (VoskRecognizer *)new KaldiRecognizer((Model *)model, sample_rate, (SpkModel *)spk_model); + return (VoskRecognizer *)new Recognizer((Model *)model, sample_rate, (SpkModel *)spk_model); } catch (...) { return nullptr; } @@ -85,7 +87,7 @@ VoskRecognizer *vosk_recognizer_new_spk(VoskModel *model, float sample_rate, Vos VoskRecognizer *vosk_recognizer_new_grm(VoskModel *model, float sample_rate, const char *grammar) { try { - return (VoskRecognizer *)new KaldiRecognizer((Model *)model, sample_rate, grammar); + return (VoskRecognizer *)new Recognizer((Model *)model, sample_rate, grammar); } catch (...) { return nullptr; } @@ -93,17 +95,22 @@ VoskRecognizer *vosk_recognizer_new_grm(VoskModel *model, float sample_rate, con void vosk_recognizer_set_max_alternatives(VoskRecognizer *recognizer, int max_alternatives) { - ((KaldiRecognizer *)recognizer)->SetMaxAlternatives(max_alternatives); + ((Recognizer *)recognizer)->SetMaxAlternatives(max_alternatives); } void vosk_recognizer_set_result_options(VoskRecognizer *recognizer, const char *result_opts) { - ((KaldiRecognizer *)recognizer)->SetResultOptions(result_opts); + ((Recognizer *)recognizer)->SetResultOptions(result_opts); } void vosk_recognizer_set_words(VoskRecognizer *recognizer, int words) { - ((KaldiRecognizer *)recognizer)->SetWords((bool)words); + ((Recognizer *)recognizer)->SetWords((bool)words); +} + +void vosk_recognizer_set_nlsml(VoskRecognizer *recognizer, int nlsml) +{ + ((Recognizer *)recognizer)->SetNLSML((bool)nlsml); } void vosk_recognizer_set_spk_model(VoskRecognizer *recognizer, VoskSpkModel *spk_model) @@ -111,13 +118,13 @@ void vosk_recognizer_set_spk_model(VoskRecognizer *recognizer, VoskSpkModel *spk if (recognizer == nullptr || spk_model == nullptr) { return; } - ((KaldiRecognizer *)recognizer)->SetSpkModel((SpkModel *)spk_model); + ((Recognizer *)recognizer)->SetSpkModel((SpkModel *)spk_model); } int vosk_recognizer_accept_waveform(VoskRecognizer *recognizer, const char *data, int length) { try { - return ((KaldiRecognizer *)(recognizer))->AcceptWaveform(data, length); + return ((Recognizer *)(recognizer))->AcceptWaveform(data, length); } catch (...) 
{ return -1; } @@ -126,7 +133,7 @@ int vosk_recognizer_accept_waveform(VoskRecognizer *recognizer, const char *data int vosk_recognizer_accept_waveform_s(VoskRecognizer *recognizer, const short *data, int length) { try { - return ((KaldiRecognizer *)(recognizer))->AcceptWaveform(data, length); + return ((Recognizer *)(recognizer))->AcceptWaveform(data, length); } catch (...) { return -1; } @@ -135,7 +142,7 @@ int vosk_recognizer_accept_waveform_s(VoskRecognizer *recognizer, const short *d int vosk_recognizer_accept_waveform_f(VoskRecognizer *recognizer, const float *data, int length) { try { - return ((KaldiRecognizer *)(recognizer))->AcceptWaveform(data, length); + return ((Recognizer *)(recognizer))->AcceptWaveform(data, length); } catch (...) { return -1; } @@ -143,27 +150,27 @@ int vosk_recognizer_accept_waveform_f(VoskRecognizer *recognizer, const float *d const char *vosk_recognizer_result(VoskRecognizer *recognizer) { - return ((KaldiRecognizer *)recognizer)->Result(); + return ((Recognizer *)recognizer)->Result(); } const char *vosk_recognizer_partial_result(VoskRecognizer *recognizer) { - return ((KaldiRecognizer *)recognizer)->PartialResult(); + return ((Recognizer *)recognizer)->PartialResult(); } const char *vosk_recognizer_final_result(VoskRecognizer *recognizer) { - return ((KaldiRecognizer *)recognizer)->FinalResult(); + return ((Recognizer *)recognizer)->FinalResult(); } void vosk_recognizer_reset(VoskRecognizer *recognizer) { - ((KaldiRecognizer *)recognizer)->Reset(); + ((Recognizer *)recognizer)->Reset(); } void vosk_recognizer_free(VoskRecognizer *recognizer) { - delete (KaldiRecognizer *)(recognizer); + delete (Recognizer *)(recognizer); } void vosk_set_log_level(int log_level) @@ -174,6 +181,8 @@ void vosk_set_log_level(int log_level) void vosk_gpu_init() { #if HAVE_CUDA +// kaldi::CuDevice::EnableTensorCores(true); +// kaldi::CuDevice::EnableTf32Compute(true); kaldi::CuDevice::Instantiate().SelectGpuId("yes"); kaldi::CuDevice::Instantiate().AllowMultithreading(); #endif @@ -185,3 +194,65 @@ void vosk_gpu_thread_init() kaldi::CuDevice::Instantiate(); #endif } + +VoskBatchRecognizer *vosk_batch_recognizer_new() +{ +#if HAVE_CUDA + return (VoskBatchRecognizer *)(new BatchRecognizer()); +#else + return NULL; +#endif +} + +void vosk_batch_recognizer_free(VoskBatchRecognizer *recognizer) +{ +#if HAVE_CUDA + delete ((BatchRecognizer *)recognizer); +#endif +} + +void vosk_batch_recognizer_accept_waveform(VoskBatchRecognizer *recognizer, int id, const char *data, int length) +{ +#if HAVE_CUDA + ((BatchRecognizer *)recognizer)->AcceptWaveform(id, data, length); +#endif +} + +void vosk_batch_recognizer_finish_stream(VoskBatchRecognizer *recognizer, int id) +{ +#if HAVE_CUDA + ((BatchRecognizer *)recognizer)->FinishStream(id); +#endif +} + +const char *vosk_batch_recognizer_front_result(VoskBatchRecognizer *recognizer, int id) +{ +#if HAVE_CUDA + return ((BatchRecognizer *)recognizer)->FrontResult(id); +#else + return NULL; +#endif +} + +void vosk_batch_recognizer_pop(VoskBatchRecognizer *recognizer, int id) +{ +#if HAVE_CUDA + ((BatchRecognizer *)recognizer)->Pop(id); +#endif +} + +void vosk_batch_recognizer_wait(VoskBatchRecognizer *recognizer) +{ +#if HAVE_CUDA + ((BatchRecognizer *)recognizer)->WaitForCompletion(); +#endif +} + +int vosk_batch_recognizer_get_pending_chunks(VoskBatchRecognizer *recognizer, int id) +{ +#if HAVE_CUDA + return ((BatchRecognizer *)recognizer)->GetNumPendingChunks(id); +#else + return 0; +#endif +} diff --git a/src/vosk_api.h b/src/vosk_api.h 
index a1d46347..5988145d 100644 --- a/src/vosk_api.h +++ b/src/vosk_api.h @@ -39,6 +39,10 @@ typedef struct VoskSpkModel VoskSpkModel; * speaker information and so on */ typedef struct VoskRecognizer VoskRecognizer; +/** + * Batch recognizer object + */ +typedef struct VoskBatchRecognizer VoskBatchRecognizer; /** Loads model data from the file and returns the model object * @@ -282,6 +286,12 @@ void vosk_recognizer_set_result_options(VoskRecognizer *recognizer, const char * void vosk_recognizer_set_words(VoskRecognizer *recognizer, int words); +/** Set NLSML output + * @param nlsml - boolean value + */ +void vosk_recognizer_set_nlsml(VoskRecognizer *recognizer, int nlsml); + + /** Accept voice data * * accept and process new chunk of voice data @@ -380,6 +390,33 @@ void vosk_gpu_init(); */ void vosk_gpu_thread_init(); +/** Creates the batch recognizer object + + * + * @returns recognizer object or NULL if a problem occurred */ +VoskBatchRecognizer *vosk_batch_recognizer_new(); + +/** Releases batch recognizer object + * Underlying model is also unreferenced and if needed released */ +void vosk_batch_recognizer_free(VoskBatchRecognizer *recognizer); + +/** Accept batch voice data */ +void vosk_batch_recognizer_accept_waveform(VoskBatchRecognizer *recognizer, int id, const char *data, int length); + +/** Closes the stream */ +void vosk_batch_recognizer_finish_stream(VoskBatchRecognizer *recognizer, int id); + +/** Return the first pending result for the stream */ +const char *vosk_batch_recognizer_front_result(VoskBatchRecognizer *recognizer, int id); + +/** Release and free first retrieved result */ +void vosk_batch_recognizer_pop(VoskBatchRecognizer *recognizer, int id); + +/** Wait for the processing to complete */ +void vosk_batch_recognizer_wait(VoskBatchRecognizer *recognizer); + +/** Get number of pending chunks for more intelligent waiting */ +int vosk_batch_recognizer_get_pending_chunks(VoskBatchRecognizer *recognizer, int id); + #ifdef __cplusplus } #endif diff --git a/travis/Dockerfile.win b/travis/Dockerfile.win index 89081c2b..4f00bfcc 100644 --- a/travis/Dockerfile.win +++ b/travis/Dockerfile.win @@ -55,7 +55,7 @@ RUN cd /opt/kaldi \ && find . -name *.a -exec cp {} /opt/kaldi/local/lib \; RUN cd /opt/kaldi \ - && git clone -b android-mix --single-branch https://github.com/alphacep/kaldi \ + && git clone -b vosk-android --single-branch https://github.com/alphacep/kaldi \ && cd kaldi/src \ && CXX=x86_64-w64-mingw32-g++-posix CXXFLAGS="-O3 -ftree-vectorize -DFST_NO_DYNAMIC_LINKING" ./configure --shared --mingw=yes --use-cuda=no \ --mathlib=OPENBLAS_CLAPACK \ diff --git a/travis/Dockerfile.win32 b/travis/Dockerfile.win32 index 59198d9a..5a478e52 100644 --- a/travis/Dockerfile.win32 +++ b/travis/Dockerfile.win32 @@ -54,7 +54,7 @@ RUN cd /opt/kaldi \ && find .
-name *.a -exec cp {} /opt/kaldi/local/lib \; RUN cd /opt/kaldi \ - && git clone -b android-mix --single-branch https://github.com/alphacep/kaldi \ + && git clone -b vosk-android --single-branch https://github.com/alphacep/kaldi \ && cd kaldi/src \ && CXX=i686-w64-mingw32-g++-posix CXXFLAGS="-O3 -ftree-vectorize -DFST_NO_DYNAMIC_LINKING" ./configure --shared --mingw=yes --use-cuda=no \ --mathlib=OPENBLAS_CLAPACK \ diff --git a/travis/build-wheels-win.sh b/travis/build-wheels-win.sh index 750b6dd7..02bf6efc 100755 --- a/travis/build-wheels-win.sh +++ b/travis/build-wheels-win.sh @@ -5,7 +5,7 @@ set -e -x cd /opt git clone https://github.com/alphacep/vosk-api cd vosk-api/src -CXX=x86_64-w64-mingw32-g++-posix EXT=dll KALDI_ROOT=/opt/kaldi/kaldi OPENFST_ROOT=/opt/kaldi/local OPENBLAS_ROOT=/opt/kaldi/local make -j $(nproc) +EXTRA_LDFLAGS=-Wl,--out-implib,libvosk.lib CXX=x86_64-w64-mingw32-g++-posix EXT=dll KALDI_ROOT=/opt/kaldi/kaldi OPENFST_ROOT=/opt/kaldi/local OPENBLAS_ROOT=/opt/kaldi/local make -j $(nproc) # Collect dependencies cp /usr/lib/gcc/x86_64-w64-mingw32/*-posix/libstdc++-6.dll /opt/vosk-api/src @@ -14,7 +14,7 @@ cp /usr/x86_64-w64-mingw32/lib/libwinpthread-1.dll /opt/vosk-api/src # Copy dlls to output folder mkdir -p /io/wheelhouse/win64 -cp /opt/vosk-api/src/*.dll /io/wheelhouse/win64 +cp /opt/vosk-api/src/*.{dll,lib} /io/wheelhouse/win64 # Build wheel and put to the output folder export VOSK_SOURCE=/opt/vosk-api diff --git a/travis/build-wheels-win32.sh b/travis/build-wheels-win32.sh index 2b934bd3..82af745e 100755 --- a/travis/build-wheels-win32.sh +++ b/travis/build-wheels-win32.sh @@ -5,7 +5,7 @@ set -e -x cd /opt git clone https://github.com/alphacep/vosk-api cd vosk-api/src -CXX=i686-w64-mingw32-g++-posix EXT=dll KALDI_ROOT=/opt/kaldi/kaldi OPENFST_ROOT=/opt/kaldi/local OPENBLAS_ROOT=/opt/kaldi/local make -j $(nproc) +EXTRA_LDFLAGS=-Wl,--out-implib,libvosk.lib CXX=i686-w64-mingw32-g++-posix EXT=dll KALDI_ROOT=/opt/kaldi/kaldi OPENFST_ROOT=/opt/kaldi/local OPENBLAS_ROOT=/opt/kaldi/local make -j $(nproc) # Copy dependencies cp /usr/lib/gcc/i686-w64-mingw32/*-posix/libstdc++-6.dll /opt/vosk-api/src @@ -14,7 +14,7 @@ cp /usr/i686-w64-mingw32/lib/libwinpthread-1.dll /opt/vosk-api/src # Copy dlls to output folder mkdir -p /io/wheelhouse/win32 -cp /opt/vosk-api/src/*.dll /io/wheelhouse/win32 +cp /opt/vosk-api/src/*.{dll,lib} /io/wheelhouse/win32 # Build wheel and put to the output folder export VOSK_SOURCE=/opt/vosk-api
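
For reference, the new C-level batch API declared in vosk_api.h above can be driven end to end as in the sketch below. This is illustrative only: the file name batch_demo.c, the build line, and the raw 16 kHz 16-bit mono PCM input are assumptions, not part of this change. The functions used are exactly the ones this diff adds; the batch recognizer reads its files from the hardcoded "model" directory (see BatchRecognizer::BatchRecognizer() above) and vosk_batch_recognizer_new() returns NULL unless libvosk was built with HAVE_CUDA=1.

/*
 * batch_demo.c: minimal sketch of the batch recognizer C API.
 * Assumed build line: cc batch_demo.c -lvosk -o batch_demo
 */
#include <stdio.h>
#include "vosk_api.h"

int main(int argc, char **argv)
{
    if (argc != 2) {
        fprintf(stderr, "Usage: %s audio.raw\n", argv[0]);
        return 1;
    }

    vosk_gpu_init();  /* selects the GPU and allows multithreading */

    VoskBatchRecognizer *rec = vosk_batch_recognizer_new();
    if (rec == NULL) {
        fprintf(stderr, "No batch recognizer (requires a CUDA build)\n");
        return 1;
    }

    FILE *fd = fopen(argv[1], "rb");  /* raw 16 kHz 16-bit mono PCM assumed */
    if (fd == NULL) {
        perror(argv[1]);
        vosk_batch_recognizer_free(rec);
        return 1;
    }

    /* Feed the whole stream under id 0; a server would multiplex many ids */
    char buf[16000];
    size_t n;
    while ((n = fread(buf, 1, sizeof(buf), fd)) > 0)
        vosk_batch_recognizer_accept_waveform(rec, 0, buf, (int)n);
    vosk_batch_recognizer_finish_stream(rec, 0);
    fclose(fd);

    /* Block until the CUDA pipeline has drained all submitted chunks */
    vosk_batch_recognizer_wait(rec);

    /* One JSON result is queued per utterance segment; pop until empty */
    const char *res;
    while ((res = vosk_batch_recognizer_front_result(rec, 0)) && res[0] != '\0') {
        printf("%s\n", res);
        vosk_batch_recognizer_pop(rec, 0);
    }

    vosk_batch_recognizer_free(rec);
    return 0;
}

This mirrors what the Python BatchRecognizer wrapper in python/vosk/__init__.py does over FFI. The test_gpu_batch.py example above shows the higher-throughput pattern: interleave feeding and result polling across many stream ids, and use GetPendingChunks() / vosk_batch_recognizer_get_pending_chunks() to pace submission instead of a single blocking wait at the end.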