diff --git a/README.md b/README.md index 6218cfa4..8d96cdc3 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,10 @@ # Vosk Speech Recognition Toolkit Vosk is an offline open source speech recognition toolkit. It enables -speech recognition models for 18 languages and dialects - English, Indian +speech recognition for 20+ languages and dialects - English, Indian English, German, French, Spanish, Portuguese, Chinese, Russian, Turkish, Vietnamese, Italian, Dutch, Catalan, Arabic, Greek, Farsi, Filipino, -Ukrainian. +Ukrainian, Kazakh, Swedish, Japanese, Esperanto. More to come. Vosk models are small (50 Mb) but provide continuous large vocabulary transcription, zero-latency response with streaming API, reconfigurable diff --git a/android/lib/build-vosk.sh b/android/lib/build-vosk.sh index e455fa46..7be5f58c 100755 --- a/android/lib/build-vosk.sh +++ b/android/lib/build-vosk.sh @@ -123,7 +123,13 @@ make -j 8 online2 lm rnnlm # Vosk-api cd $WORKDIR mkdir -p $WORKDIR/vosk -make -j 8 -C ${WORKDIR_BASE}/../../../src OUTDIR=$WORKDIR/vosk KALDI_ROOT=${WORKDIR}/kaldi OPENFST_ROOT=${WORKDIR}/local OPENBLAS_ROOT=${WORKDIR}/local CXX=$CXX EXTRA_LDFLAGS="-llog -static-libstdc++" +make -j 8 -C ${WORKDIR_BASE}/../../../src \ + OUTDIR=$WORKDIR/vosk \ + KALDI_ROOT=${WORKDIR}/kaldi \ + OPENFST_ROOT=${WORKDIR}/local \ + OPENBLAS_ROOT=${WORKDIR}/local \ + CXX=$CXX \ + EXTRA_LDFLAGS="-llog -static-libstdc++ -Wl,-soname,libvosk.so" cp $WORKDIR/vosk/libvosk.so $WORKDIR/../../src/main/jniLibs/$arch/libvosk.so done diff --git a/nodejs/README.md b/nodejs/README.md index 0ac9d753..5603fae6 100644 --- a/nodejs/README.md +++ b/nodejs/README.md @@ -2,18 +2,18 @@ This is an FFI-NAPI wrapper for the Vosk library. ## Usage -It mostly follows Vosk interface, some methods are not yet fully implemented. +Bindings mostly follow the Vosk interface; some methods are not yet fully implemented. -To use it you need to compile libvosk library, see Python module build -instructions for details. You can find prebuilt library inside python -wheel. +See [demo folder](https://github.com/alphacep/vosk-api/tree/master/nodejs/demo) for +details. ## About Vosk is an offline open source speech recognition toolkit. It enables -speech recognition models for 17 languages and dialects - English, Indian +speech recognition for 20+ languages and dialects - English, Indian English, German, French, Spanish, Portuguese, Chinese, Russian, Turkish, -Vietnamese, Italian, Dutch, Catalan, Arabic, Greek, Farsi, Filipino. +Vietnamese, Italian, Dutch, Catalan, Arabic, Greek, Farsi, Filipino, +Ukrainian, Kazakh, Swedish, Japanese, Esperanto. More to come. Vosk models are small (50 Mb) but provide continuous large vocabulary transcription, zero-latency response with streaming API, reconfigurable diff --git a/python/README.md b/python/README.md index 300eaa6d..b121e9a3 100644 --- a/python/README.md +++ b/python/README.md @@ -1,9 +1,10 @@ This is a Python module for Vosk. Vosk is an offline open source speech recognition toolkit. It enables -speech recognition models for 17 languages and dialects - English, Indian +speech recognition for 20+ languages and dialects - English, Indian English, German, French, Spanish, Portuguese, Chinese, Russian, Turkish, -Vietnamese, Italian, Dutch, Catalan, Arabic, Greek, Farsi, Filipino. +Vietnamese, Italian, Dutch, Catalan, Arabic, Greek, Farsi, Filipino, +Ukrainian, Kazakh, Swedish, Japanese, Esperanto. More to come.
Vosk models are small (50 Mb) but provide continuous large vocabulary transcription, zero-latency response with streaming API, reconfigurable diff --git a/python/example/test_gpu_batch.py b/python/example/test_gpu_batch.py new file mode 100755 index 00000000..0ad9e288 --- /dev/null +++ b/python/example/test_gpu_batch.py @@ -0,0 +1,59 @@ +#!/usr/bin/env python3 + +import sys +import os +import wave +from time import sleep +import json +from timeit import default_timer as timer + + +from vosk import Model, BatchRecognizer, GpuInit + +GpuInit() + +rec = BatchRecognizer() + +# Read list of files from the file +fnames = open(sys.argv[1]).readlines() +fds = [open(x.strip(), "rb") for x in fnames] +uids = [fname.strip().split('/')[-1][:-4] for fname in fnames] +results = [""] * len(fnames) +ended = set() +tot_samples = 0 + +start_time = timer() + +while True: + + # Feed in the data + for i, fd in enumerate(fds): + if i in ended: + continue + data = fd.read(16000) + if len(data) == 0: + rec.FinishStream(i) + ended.add(i) + continue + rec.AcceptWaveform(i, data) + tot_samples += len(data) + + # Wait for results from CUDA + rec.Wait() + + # Retrieve and add results + for i, fd in enumerate(fds): + res = rec.Result(i) + if len(res) != 0: + results[i] = results[i] + " " + json.loads(res)['text'] + + if len(ended) == len(fds): + break + +end_time = timer() + +for i in range(len(results)): + print (uids[i], results[i].strip()) + +print ("Processed %d seconds of audio in %d seconds (%f xRT)" % (tot_samples / 16000.0 / 2, end_time - start_time, + (tot_samples / 16000.0 / 2 / (end_time - start_time))), file=sys.stderr) diff --git a/python/example/test_nlsml.py b/python/example/test_nlsml.py new file mode 100755 index 00000000..18132093 --- /dev/null +++ b/python/example/test_nlsml.py @@ -0,0 +1,31 @@ +#!/usr/bin/env python3 + +from vosk import Model, KaldiRecognizer, SetLogLevel +import sys +import os +import wave + +SetLogLevel(0) + +if not os.path.exists("model"): + print ("Please download the model from https://alphacephei.com/vosk/models and unpack as 'model' in the current folder.") + exit (1) + +wf = wave.open(sys.argv[1], "rb") +if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE": + print ("Audio file must be WAV format mono PCM.") + exit (1) + +model = Model("model") +rec = KaldiRecognizer(model, wf.getframerate()) +rec.SetMaxAlternatives(10) +rec.SetNLSML(True) + +while True: + data = wf.readframes(4000) + if len(data) == 0: + break + if rec.AcceptWaveform(data): + print(rec.Result()) + +print(rec.FinalResult()) diff --git a/python/setup.py b/python/setup.py index 502be6c8..39c8082e 100644 --- a/python/setup.py +++ b/python/setup.py @@ -44,7 +44,7 @@ def get_tag(self): setuptools.setup( name="vosk", - version="0.3.41", + version="0.3.42", author="Educational Testing Service", author_email="rubale@ets.org", description="Offline open source speech recognition API based on Kaldi and Vosk with additional features from ETS", diff --git a/python/vosk/__init__.py b/python/vosk/__init__.py index a250b253..18606866 100644 --- a/python/vosk/__init__.py +++ b/python/vosk/__init__.py @@ -72,6 +72,9 @@ def SetResultOptions(self, result_opts): def SetWords(self, enable_words): _c.vosk_recognizer_set_words(self._handle, 1 if enable_words else 0) + def SetNLSML(self, enable_nlsml): + _c.vosk_recognizer_set_nlsml(self._handle, 1 if enable_nlsml else 0) + def SetSpkModel(self, spk_model): _c.vosk_recognizer_set_spk_model(self._handle, spk_model._handle) @@ -104,3 +107,32 @@ def 
GpuInit(): def GpuThreadInit(): _c.vosk_gpu_thread_init() + +class BatchRecognizer(object): + + def __init__(self, *args): + self._handle = _c.vosk_batch_recognizer_new() + + if self._handle == _ffi.NULL: + raise Exception("Failed to create a recognizer") + + def __del__(self): + _c.vosk_batch_recognizer_free(self._handle) + + def AcceptWaveform(self, uid, data): + _c.vosk_batch_recognizer_accept_waveform(self._handle, uid, data, len(data)) + + def Result(self, uid): + ptr = _c.vosk_batch_recognizer_front_result(self._handle, uid) + res = _ffi.string(ptr).decode('utf-8') + _c.vosk_batch_recognizer_pop(self._handle, uid) + return res + + def FinishStream(self, uid): + _c.vosk_batch_recognizer_finish_stream(self._handle, uid) + + def Wait(self): + _c.vosk_batch_recognizer_wait(self._handle) + + def GetPendingChunks(self, uid): + return _c.vosk_batch_recognizer_get_pending_chunks(self._handle, uid) diff --git a/src/Makefile b/src/Makefile index 54e96ca7..6ee41b69 100644 --- a/src/Makefile +++ b/src/Makefile @@ -18,14 +18,14 @@ EXTRA_LDFLAGS?= OUTDIR?=. VOSK_SOURCES= \ - kaldi_recognizer.cc \ + recognizer.cc \ language_model.cc \ model.cc \ spk_model.cc \ vosk_api.cc VOSK_HEADERS= \ - kaldi_recognizer.h \ + recognizer.h \ language_model.h \ model.h \ spk_model.h \ @@ -39,13 +39,13 @@ LIBS= \ $(KALDI_ROOT)/src/decoder/kaldi-decoder.a \ $(KALDI_ROOT)/src/ivector/kaldi-ivector.a \ $(KALDI_ROOT)/src/gmm/kaldi-gmm.a \ - $(KALDI_ROOT)/src/nnet3/kaldi-nnet3.a \ $(KALDI_ROOT)/src/tree/kaldi-tree.a \ $(KALDI_ROOT)/src/feat/kaldi-feat.a \ $(KALDI_ROOT)/src/lat/kaldi-lat.a \ $(KALDI_ROOT)/src/lm/kaldi-lm.a \ $(KALDI_ROOT)/src/rnnlm/kaldi-rnnlm.a \ $(KALDI_ROOT)/src/hmm/kaldi-hmm.a \ + $(KALDI_ROOT)/src/nnet3/kaldi-nnet3.a \ $(KALDI_ROOT)/src/transform/kaldi-transform.a \ $(KALDI_ROOT)/src/cudamatrix/kaldi-cudamatrix.a \ $(KALDI_ROOT)/src/matrix/kaldi-matrix.a \ @@ -55,6 +55,8 @@ LIBS= \ $(OPENFST_ROOT)/lib/libfst.a \ $(OPENFST_ROOT)/lib/libfstngram.a +LDFLAGS = + ifeq ($(HAVE_OPENBLAS_CLAPACK), 1) CFLAGS += -I$(OPENBLAS_ROOT)/include @@ -66,23 +68,32 @@ ifeq ($(HAVE_OPENBLAS_CLAPACK), 1) endif ifeq ($(HAVE_MKL), 1) - CFLAGS += -I$(MKL_ROOT)/include - LIBS += -L$(MKL_ROOT)/lib/intel64 -Wl,-rpath=$(MKL_ROOT)/lib/intel64 -lmkl_rt -lmkl_intel_lp64 -lmkl_core -lmkl_sequential + CFLAGS += -DHAVE_MKL=1 -I$(MKL_ROOT)/include + LDFLAGS += -L$(MKL_ROOT)/lib/intel64 -Wl,-rpath=$(MKL_ROOT)/lib/intel64 -lmkl_rt -lmkl_intel_lp64 -lmkl_core -lmkl_sequential endif ifeq ($(HAVE_ACCELERATE), 1) - LIBS += -framework Accelerate + LDFLAGS += -framework Accelerate endif ifeq ($(HAVE_CUDA), 1) + VOSK_SOURCES += batch_recognizer.cc + VOSK_HEADERS += batch_recognizer.h + CFLAGS+=-DHAVE_CUDA=1 -I$(CUDA_ROOT)/include - LIBS+=-L$(CUDA_ROOT)/lib64 -lcuda -lcublas -lcusparse -lcudart -lcurand -lcufft -lcusolver -lnvToolsExt + + LIBS := \ + $(KALDI_ROOT)/src/cudadecoder/kaldi-cudadecoder.a \ + $(KALDI_ROOT)/src/cudafeat/kaldi-cudafeat.a \ + $(LIBS) + + LDFLAGS += -L$(CUDA_ROOT)/lib64 -lcuda -lcublas -lcusparse -lcudart -lcurand -lcufft -lcusolver -lnvToolsExt endif all: $(OUTDIR)/libvosk.$(EXT) -$(OUTDIR)/libvosk.$(EXT): $(VOSK_SOURCES:%.cc=$(OUTDIR)/%.o) - $(CXX) --shared -s -o $@ $^ $(LIBS) -lm -latomic $(EXTRA_LDFLAGS) +$(OUTDIR)/libvosk.$(EXT): $(VOSK_SOURCES:%.cc=$(OUTDIR)/%.o) $(LIBS) + $(CXX) --shared -s -o $@ $^ $(LDFLAGS) -lm -latomic $(EXTRA_LDFLAGS) $(OUTDIR)/%.o: %.cc $(VOSK_HEADERS) $(CXX) $(CFLAGS) -c -o $@ $< diff --git a/src/batch_recognizer.cc b/src/batch_recognizer.cc new file mode 100644 index
00000000..46a0c097 --- /dev/null +++ b/src/batch_recognizer.cc @@ -0,0 +1,275 @@ +// Copyright 2019-2020 Alpha Cephei Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "batch_recognizer.h" + +#include "fstext/fstext-utils.h" +#include "lat/sausages.h" +#include "json.h" + +#include <sys/stat.h> + +using namespace fst; +using namespace kaldi::nnet3; +using CorrelationID = CudaOnlinePipelineDynamicBatcher::CorrelationID; + +BatchRecognizer::BatchRecognizer() { + BatchedThreadedNnet3CudaOnlinePipelineConfig batched_decoder_config; + + kaldi::ParseOptions po("something"); + batched_decoder_config.Register(&po); + po.ReadConfigFile("model/conf/model.conf"); + + struct stat buffer; + + string nnet3_rxfilename_ = "model/am/final.mdl"; + string hclg_fst_rxfilename_ = "model/graph/HCLG.fst"; + string word_syms_rxfilename_ = "model/graph/words.txt"; + string winfo_rxfilename_ = "model/graph/phones/word_boundary.int"; + string std_fst_rxfilename_ = "model/rescore/G.fst"; + string carpa_rxfilename_ = "model/rescore/G.carpa"; + + trans_model_ = new kaldi::TransitionModel(); + nnet_ = new kaldi::nnet3::AmNnetSimple(); + { + bool binary; + kaldi::Input ki(nnet3_rxfilename_, &binary); + trans_model_->Read(ki.Stream(), binary); + nnet_->Read(ki.Stream(), binary); + SetBatchnormTestMode(true, &(nnet_->GetNnet())); + SetDropoutTestMode(true, &(nnet_->GetNnet())); + nnet3::CollapseModel(nnet3::CollapseModelConfig(), &(nnet_->GetNnet())); + } + + if (stat(hclg_fst_rxfilename_.c_str(), &buffer) == 0) { + KALDI_LOG << "Loading HCLG from " << hclg_fst_rxfilename_; + hclg_fst_ = fst::ReadFstKaldiGeneric(hclg_fst_rxfilename_); + } + + KALDI_LOG << "Loading words from " << word_syms_rxfilename_; + if (!(word_syms_ = fst::SymbolTable::ReadText(word_syms_rxfilename_))) { + KALDI_ERR << "Could not read symbol table from file " + << word_syms_rxfilename_; + } + KALDI_ASSERT(word_syms_); + + if (stat(winfo_rxfilename_.c_str(), &buffer) == 0) { + KALDI_LOG << "Loading winfo " << winfo_rxfilename_; + kaldi::WordBoundaryInfoNewOpts opts; + winfo_ = new kaldi::WordBoundaryInfo(opts, winfo_rxfilename_); + } + + if (stat(carpa_rxfilename_.c_str(), &buffer) == 0) { + KALDI_LOG << "Loading subtract G.fst model from " << std_fst_rxfilename_; + graph_lm_fst_ = fst::ReadAndPrepareLmFst(std_fst_rxfilename_); + KALDI_LOG << "Loading CARPA model from " << carpa_rxfilename_; + ReadKaldiObject(carpa_rxfilename_, &const_arpa_); + } + + batched_decoder_config.num_worker_threads = -1; + batched_decoder_config.max_batch_size = 32; + batched_decoder_config.num_channels = 600; + batched_decoder_config.reset_on_endpoint = true; + batched_decoder_config.use_gpu_feature_extraction = true; + + batched_decoder_config.feature_opts.feature_type = "mfcc"; + batched_decoder_config.feature_opts.mfcc_config = "model/conf/mfcc.conf"; + batched_decoder_config.feature_opts.ivector_extraction_config = "model/conf/ivector.conf"; + batched_decoder_config.decoder_opts.max_active = 7000; + batched_decoder_config.decoder_opts.default_beam = 13.0; +
batched_decoder_config.decoder_opts.lattice_beam = 6.0; + batched_decoder_config.compute_opts.acoustic_scale = 1.0; + batched_decoder_config.compute_opts.frame_subsampling_factor = 3; + + int32 nnet_left_context, nnet_right_context; + nnet3::ComputeSimpleNnetContext(nnet_->GetNnet(), &nnet_left_context, + &nnet_right_context); + + batched_decoder_config.compute_opts.frames_per_chunk = std::max(51, (nnet_right_context + 3 - nnet_right_context % 3)); + + cuda_pipeline_ = new BatchedThreadedNnet3CudaOnlinePipeline + (batched_decoder_config, *hclg_fst_, *nnet_, *trans_model_); + cuda_pipeline_->SetSymbolTable(*word_syms_); + + CudaOnlinePipelineDynamicBatcherConfig dynamic_batcher_config; + dynamic_batcher_ = new CudaOnlinePipelineDynamicBatcher(dynamic_batcher_config, + *cuda_pipeline_); + + samples_per_chunk_ = cuda_pipeline_->GetNSampsPerChunk(); +} + +BatchRecognizer::~BatchRecognizer() { + + delete trans_model_; + delete nnet_; + delete word_syms_; + delete winfo_; + delete hclg_fst_; + delete graph_lm_fst_; + + delete lm_to_subtract_; + delete carpa_to_add_; + delete carpa_to_add_scale_; + + delete cuda_pipeline_; + delete dynamic_batcher_; +} + +void BatchRecognizer::FinishStream(uint64_t id) +{ + auto it = streams_.find(id); + if (it == streams_.end()) { + return; + } + + SubVector<BaseFloat> chunk = it->second.buffer.Range(0, it->second.buffer.Dim()); + dynamic_batcher_->Push(id, !(it->second.initialized), true, chunk); + streams_.erase(it); +} + +void BatchRecognizer::PushLattice(uint64_t id, CompactLattice &clat, BaseFloat offset) +{ + fst::ScaleLattice(fst::GraphLatticeScale(0.9), &clat); + + CompactLattice aligned_lat; + WordAlignLattice(clat, *trans_model_, *winfo_, 0, &aligned_lat); + + MinimumBayesRisk mbr(aligned_lat); + const vector<BaseFloat> &conf = mbr.GetOneBestConfidences(); + const vector<int32> &words = mbr.GetOneBest(); + const vector<pair<BaseFloat, BaseFloat> > &times = + mbr.GetOneBestTimes(); + + int size = words.size(); + + json::JSON obj; + stringstream text; + + // Create JSON object + for (int i = 0; i < size; i++) { + json::JSON word; + + word["word"] = word_syms_->Find(words[i]); + word["start"] = round(times[i].first) * 0.03 + offset; + word["end"] = round(times[i].second) * 0.03 + offset; + word["conf"] = conf[i]; + obj["result"].append(word); + + if (i) { + text << " "; + } + text << word_syms_->Find(words[i]); + } + obj["text"] = text.str(); + +// KALDI_LOG << "Result " << id << " " << obj.dump(); + + streams_[id].results.push(obj.dump()); +} + +void BatchRecognizer::AcceptWaveform(uint64_t id, const char *data, int len) +{ + if (streams_.find(id) == streams_.end()) { + // Define the callback for results.
+#if 0 + cuda_pipeline_->SetBestPathCallback( + id, + [&, id](const std::string &str, bool partial, + bool endpoint_detected) { + if (partial) { + KALDI_LOG << "id #" << id << " [partial] : " << str << ":"; + } + + if (endpoint_detected) { + KALDI_LOG << "id #" << id << " [endpoint detected]"; + } + + if (!partial) { + KALDI_LOG << "id #" << id << " : " << str; + } + }); +#endif + cuda_pipeline_->SetLatticeCallback( + id, + [&, id](SegmentedLatticeCallbackParams& params) { + if (params.results.empty()) { + KALDI_WARN << "Empty result for callback"; + return; + } + CompactLattice *clat = params.results[0].GetLatticeResult(); + BaseFloat offset = params.results[0].GetTimeOffsetSeconds(); + PushLattice(id, *clat, offset); + }, + CudaPipelineResult::RESULT_TYPE_LATTICE); + } + // Collect data so we process exactly samples_per_chunk_ + Vector<BaseFloat> &buffer = streams_[id].buffer; + int32 end = buffer.Dim(); + buffer.Resize(end + len / 2, kCopyData); + for (int i = 0; i < len / 2; i++) + buffer(i + end) = *(((short *)data) + i); + end = buffer.Dim(); + + // Pick chunks and submit them to the batcher + int32 i = 0; + while (i + samples_per_chunk_ <= end) { + dynamic_batcher_->Push(id, (!streams_[id].initialized), false, + buffer.Range(i, samples_per_chunk_)); + streams_[id].initialized = true; + i += samples_per_chunk_; + } + + // Keep remaining data + if (i > 0) { + int32 tail = end - i; + for (int j = 0; j < tail; j++) { + buffer(j) = buffer(i + j); + } + buffer.Resize(tail, kCopyData); + } +} + +const char* BatchRecognizer::FrontResult(uint64_t id) +{ + auto it = streams_.find(id); + if (it == streams_.end()) { + return ""; + } + if (it->second.results.empty()) { + return ""; + } + return it->second.results.front().c_str(); +} + +void BatchRecognizer::Pop(uint64_t id) +{ + auto it = streams_.find(id); + if (it == streams_.end()) { + return; + } + if (it->second.results.empty()) { + return; + } + it->second.results.pop(); +} + +void BatchRecognizer::WaitForCompletion() +{ + dynamic_batcher_->WaitForCompletion(); +} + +int BatchRecognizer::GetNumPendingChunks(uint64_t id) +{ + return dynamic_batcher_->GetNumPendingChunks(id); +} diff --git a/src/batch_recognizer.h b/src/batch_recognizer.h new file mode 100644 index 00000000..c6d90ae0 --- /dev/null +++ b/src/batch_recognizer.h @@ -0,0 +1,88 @@ +// Copyright 2019 Alpha Cephei Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+ +#ifndef VOSK_GPU_RECOGNIZER_H +#define VOSK_GPU_RECOGNIZER_H + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "fstext/fstext-lib.h" +#include "fstext/fstext-utils.h" +#include "decoder/lattice-faster-decoder.h" +#include "feat/feature-mfcc.h" +#include "lat/kaldi-lattice.h" +#include "lat/word-align-lattice.h" +#include "lat/compose-lattice-pruned.h" +#include "nnet3/am-nnet-simple.h" +#include "nnet3/nnet-am-decodable-simple.h" +#include "nnet3/nnet-utils.h" + +#include "cudadecoder/cuda-online-pipeline-dynamic-batcher.h" +#include "cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.h" +#include "cudadecoder/batched-threaded-nnet3-cuda-pipeline2.h" +#include "cudadecoder/cuda-pipeline-common.h" + +#include "model.h" + +using namespace kaldi; +using namespace kaldi::cuda_decoder; + +class BatchRecognizer { + public: + BatchRecognizer(); + ~BatchRecognizer(); + + void FinishStream(uint64_t id); + void AcceptWaveform(uint64_t id, const char *data, int len); + const char *FrontResult(uint64_t id); + void Pop(uint64_t id); + void WaitForCompletion(); + int GetNumPendingChunks(uint64_t id); + + private: + struct Stream { + bool initialized = false; + std::queue<std::string> results; + kaldi::Vector<BaseFloat> buffer; + }; + + void PushLattice(uint64_t id, CompactLattice &clat, BaseFloat offset); + + kaldi::TransitionModel *trans_model_ = nullptr; + kaldi::nnet3::AmNnetSimple *nnet_ = nullptr; + const fst::SymbolTable *word_syms_ = nullptr; + + fst::Fst<fst::StdArc> *hclg_fst_ = nullptr; + kaldi::WordBoundaryInfo *winfo_ = nullptr; + + fst::VectorFst<fst::StdArc> *graph_lm_fst_ = nullptr; + kaldi::ConstArpaLm const_arpa_; + + BatchedThreadedNnet3CudaOnlinePipeline *cuda_pipeline_ = nullptr; + CudaOnlinePipelineDynamicBatcher *dynamic_batcher_ = nullptr; + + int32 samples_per_chunk_; + + // Input and output queues + std::map<uint64_t, Stream> streams_; + + // Rescoring + fst::ArcMapFst<fst::StdFst, kaldi::LatticeArc, fst::StdToLatticeMapper<kaldi::BaseFloat> > *lm_to_subtract_ = nullptr; + kaldi::ConstArpaLmDeterministicFst *carpa_to_add_ = nullptr; + fst::ScaleDeterministicOnDemandFst *carpa_to_add_scale_ = nullptr; + + float sample_frequency_; +}; + +#endif /* VOSK_GPU_RECOGNIZER_H */ diff --git a/src/json.h b/src/json.h index 463912ec..2159392b 100644 --- a/src/json.h +++ b/src/json.h @@ -424,7 +424,7 @@ class JSON Class Type = Class::Null; }; -JSON Array() { +inline JSON Array() { return JSON::Make( JSON::Class::Array ); } @@ -435,11 +435,11 @@ JSON Array( T...
args ) { return arr; } -JSON Object() { +inline JSON Object() { return JSON::Make( JSON::Class::Object ); } -std::ostream& operator<<( std::ostream &os, const JSON &json ) { +inline std::ostream& operator<<( std::ostream &os, const JSON &json ) { os << json.dump(); return os; } @@ -647,7 +647,7 @@ namespace { } } -JSON JSON::Load( const string &str ) { +inline JSON JSON::Load( const string &str ) { size_t offset = 0; return parse_next( str, offset ); } diff --git a/src/model.cc b/src/model.cc index 956d8c5c..61e1ef24 100644 --- a/src/model.cc +++ b/src/model.cc @@ -243,9 +243,9 @@ void Model::ReadDataFiles() SetDropoutTestMode(true, &(nnet_->GetNnet())); nnet3::CollapseModel(nnet3::CollapseModelConfig(), &(nnet_->GetNnet())); } + decodable_info_ = new nnet3::DecodableNnetSimpleLoopedInfo(decodable_opts_, nnet_); - if (stat(final_ie_rxfilename_.c_str(), &buffer) == 0) { KALDI_LOG << "Loading i-vector extractor from " << final_ie_rxfilename_; diff --git a/src/model.h b/src/model.h index ea05c2f9..856f9fdc 100644 --- a/src/model.h +++ b/src/model.h @@ -36,7 +36,8 @@ using namespace kaldi; using namespace std; -class KaldiRecognizer; +class Recognizer; +class BatchRecognizer; class Model { @@ -52,7 +53,8 @@ class Model { void ConfigureV2(); void ReadDataFiles(); - friend class KaldiRecognizer; + friend class Recognizer; + friend class BatchRecognizer; string model_path_str_; string nnet3_rxfilename_; diff --git a/src/kaldi_recognizer.cc b/src/recognizer.cc similarity index 89% rename from src/kaldi_recognizer.cc rename to src/recognizer.cc index dd1dc6ee..50d84d0e 100644 --- a/src/kaldi_recognizer.cc +++ b/src/recognizer.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "kaldi_recognizer.h" +#include "recognizer.h" #include "json.h" #include "fstext/fstext-utils.h" #include "lat/sausages.h" @@ -23,7 +23,7 @@ using namespace fst; using namespace kaldi::nnet3; -KaldiRecognizer::KaldiRecognizer(Model *model, float sample_frequency) : model_(model), spk_model_(0), sample_frequency_(sample_frequency) { +Recognizer::Recognizer(Model *model, float sample_frequency) : model_(model), spk_model_(0), sample_frequency_(sample_frequency) { model_->Ref(); @@ -48,7 +48,7 @@ KaldiRecognizer::KaldiRecognizer(Model *model, float sample_frequency) : model_( InitRescoring(); } -KaldiRecognizer::KaldiRecognizer(Model *model, float sample_frequency, char const *grammar) : model_(model), spk_model_(0), sample_frequency_(sample_frequency) +Recognizer::Recognizer(Model *model, float sample_frequency, char const *grammar) : model_(model), spk_model_(0), sample_frequency_(sample_frequency) { model_->Ref(); @@ -109,7 +109,7 @@ KaldiRecognizer::KaldiRecognizer(Model *model, float sample_frequency, char cons InitRescoring(); } -KaldiRecognizer::KaldiRecognizer(Model *model, float sample_frequency, SpkModel *spk_model) : model_(model), spk_model_(spk_model), sample_frequency_(sample_frequency) { +Recognizer::Recognizer(Model *model, float sample_frequency, SpkModel *spk_model) : model_(model), spk_model_(spk_model), sample_frequency_(sample_frequency) { model_->Ref(); spk_model->Ref(); @@ -137,7 +137,7 @@ KaldiRecognizer::KaldiRecognizer(Model *model, float sample_frequency, SpkModel InitRescoring(); } -KaldiRecognizer::~KaldiRecognizer() { +Recognizer::~Recognizer() { delete decoder_; delete feature_pipeline_; delete silence_weighting_; @@ -157,7 +157,7 @@ KaldiRecognizer::~KaldiRecognizer() { spk_model_->Unref(); } -void 
KaldiRecognizer::InitState() +void Recognizer::InitState() { frame_offset_ = 0; samples_processed_ = 0; @@ -166,7 +166,7 @@ void KaldiRecognizer::InitState() state_ = RECOGNIZER_INITIALIZED; } -void KaldiRecognizer::InitRescoring() +void Recognizer::InitRescoring() { if (model_->graph_lm_fst_) { @@ -187,7 +187,7 @@ void KaldiRecognizer::InitRescoring() } } -void KaldiRecognizer::CleanUp() +void Recognizer::CleanUp() { delete silence_weighting_; silence_weighting_ = new kaldi::OnlineSilenceWeighting(*model_->trans_model_, model_->feature_info_.silence_weighting_config, 3); @@ -225,7 +225,7 @@ void KaldiRecognizer::CleanUp() } } -void KaldiRecognizer::UpdateSilenceWeights() +void Recognizer::UpdateSilenceWeights() { if (silence_weighting_->Active() && feature_pipeline_->NumFramesReady() > 0 && feature_pipeline_->IvectorFeature() != nullptr) { @@ -238,22 +238,27 @@ void KaldiRecognizer::UpdateSilenceWeights() } } -void KaldiRecognizer::SetMaxAlternatives(int max_alternatives) +void Recognizer::SetMaxAlternatives(int max_alternatives) { max_alternatives_ = max_alternatives; } -void KaldiRecognizer::SetResultOptions(const char *result_opts) +void Recognizer::SetResultOptions(const char *result_opts) { result_opts_ = result_opts; } -void KaldiRecognizer::SetWords(bool words) +void Recognizer::SetWords(bool words) { words_ = words; } -void KaldiRecognizer::SetSpkModel(SpkModel *spk_model) +void Recognizer::SetNLSML(bool nlsml) +{ + nlsml_ = nlsml; +} + +void Recognizer::SetSpkModel(SpkModel *spk_model) { if (state_ == RECOGNIZER_RUNNING) { KALDI_ERR << "Can't add speaker model to already running recognizer"; @@ -264,7 +269,7 @@ void KaldiRecognizer::SetSpkModel(SpkModel *spk_model) spk_feature_ = new OnlineMfcc(spk_model_->spkvector_mfcc_opts); } -bool KaldiRecognizer::AcceptWaveform(const char *data, int len) +bool Recognizer::AcceptWaveform(const char *data, int len) { Vector<BaseFloat> wave; wave.Resize(len / 2, kUndefined); @@ -273,7 +278,7 @@ bool KaldiRecognizer::AcceptWaveform(const char *data, int len) return AcceptWaveform(wave); } -bool KaldiRecognizer::AcceptWaveform(const short *sdata, int len) +bool Recognizer::AcceptWaveform(const short *sdata, int len) { Vector<BaseFloat> wave; wave.Resize(len, kUndefined); @@ -282,7 +287,7 @@ bool KaldiRecognizer::AcceptWaveform(const short *sdata, int len) return AcceptWaveform(wave); } -bool KaldiRecognizer::AcceptWaveform(const float *fdata, int len) +bool Recognizer::AcceptWaveform(const float *fdata, int len) { Vector<BaseFloat> wave; wave.Resize(len, kUndefined); @@ -291,7 +296,7 @@ bool KaldiRecognizer::AcceptWaveform(const float *fdata, int len) return AcceptWaveform(wave); } -bool KaldiRecognizer::AcceptWaveform(Vector<BaseFloat> &wdata) +bool Recognizer::AcceptWaveform(Vector<BaseFloat> &wdata) { // Cleanup if we finalized previous utterance or the whole feature pipeline if (!(state_ == RECOGNIZER_RUNNING || state_ == RECOGNIZER_INITIALIZED)) { @@ -350,7 +355,7 @@ static void RunNnetComputation(const MatrixBase<BaseFloat> &features, #define MIN_SPK_FEATS 50 -bool KaldiRecognizer::GetSpkVector(Vector<BaseFloat> &out_xvector, int *num_spk_frames) +bool Recognizer::GetSpkVector(Vector<BaseFloat> &out_xvector, int *num_spk_frames) { vector<int32> nonsilence_frames; if (silence_weighting_->Active() && feature_pipeline_->NumFramesReady() > 0) { @@ -415,7 +420,8 @@ bool KaldiRecognizer::GetSpkVector(Vector<BaseFloat> &out_xvector, int *num_spk_ return true; } -const char *KaldiRecognizer::MbrResult(CompactLattice &rlat) + +const char *Recognizer::MbrResult(CompactLattice &rlat) { CompactLattice aligned_lat; if (model_->winfo_) { @@ -587,7 +593,7 @@
void ComputePhoneInfo(const TransitionModel &tmodel, const CompactLattice &clat, } -const char *KaldiRecognizer::WordandPhoneResult(CompactLattice &rlat) +const char *Recognizer::WordandPhoneResult(CompactLattice &rlat) { //Computes aligned word and phone-level results without MBR decoding CompactLattice aligned_lat; @@ -760,7 +766,7 @@ static bool CompactLatticeToWordAlignmentWeight(const CompactLattice &clat, } -const char *KaldiRecognizer::NbestResult(CompactLattice &clat) +const char *Recognizer::NbestResult(CompactLattice &clat) { Lattice lat; Lattice nbest_lat; @@ -771,7 +777,6 @@ const char *KaldiRecognizer::NbestResult(CompactLattice &clat) fst::ConvertNbestToVector(nbest_lat, &nbest_lats); json::JSON obj; - std::stringstream ss; for (int k = 0; k < nbest_lats.size(); k++) { Lattice nlat = nbest_lats[k]; @@ -798,7 +803,7 @@ const char *KaldiRecognizer::NbestResult(CompactLattice &clat) stringstream text; json::JSON entry; - for (int i = 0; i < words.size(); i++) { + for (int i = 0, first = 1; i < words.size(); i++) { json::JSON word; if (words[i] == 0) continue; @@ -808,8 +813,12 @@ const char *KaldiRecognizer::NbestResult(CompactLattice &clat) word["end"] = samples_round_start_ / sample_frequency_ + (frame_offset_ + begin_times[i] + lengths[i]) * 0.03; entry["result"].append(word); } - if (i) + + if (first) + first = 0; + else text << " "; + text << model_->word_syms_->Find(words[i]); } @@ -821,7 +830,67 @@ const char *KaldiRecognizer::NbestResult(CompactLattice &clat) return StoreReturn(obj.dump()); } -const char* KaldiRecognizer::GetResult() +const char *Recognizer::NlsmlResult(CompactLattice &clat) +{ + Lattice lat; + Lattice nbest_lat; + std::vector<Lattice> nbest_lats; + + ConvertLattice (clat, &lat); + fst::ShortestPath(lat, &nbest_lat, max_alternatives_); + fst::ConvertNbestToVector(nbest_lat, &nbest_lats); + + std::stringstream ss; + ss << "<?xml version=\"1.0\"?>\n"; + ss << "<result grammar=\"default\">\n"; + + for (int k = 0; k < nbest_lats.size(); k++) { + + Lattice nlat = nbest_lats[k]; + + CompactLattice nclat; + fst::Invert(&nlat); + DeterminizeLattice(nlat, &nclat); + + CompactLattice aligned_nclat; + if (model_->winfo_) { + WordAlignLattice(nclat, *model_->trans_model_, *model_->winfo_, 0, &aligned_nclat); + } else { + aligned_nclat = nclat; + } + + std::vector<int32> words; + std::vector<int32> begin_times; + std::vector<int32> lengths; + CompactLattice::Weight weight; + + CompactLatticeToWordAlignmentWeight(aligned_nclat, &words, &begin_times, &lengths, &weight); + float likelihood = -(weight.Weight().Value1() + weight.Weight().Value2()); + + stringstream text; + for (int i = 0, first = 1; i < words.size(); i++) { + if (words[i] == 0) + continue; + + if (first) + first = 0; + else + text << " "; + + text << model_->word_syms_->Find(words[i]); + } + + ss << "<interpretation grammar=\"default\" confidence=\"" << likelihood << "\">\n"; + ss << "<input mode=\"speech\">" << text.str() << "</input>\n"; + ss << "<instance>" << text.str() << "</instance>\n"; + ss << "</interpretation>\n"; + } + ss << "</result>\n"; + + return StoreReturn(ss.str()); +} + +const char* Recognizer::GetResult() { if (decoder_->NumFramesDecoded() == 0) { return StoreEmptyReturn(); @@ -886,13 +955,15 @@ const char* KaldiRecognizer::GetResult() } } else if (strcmp(result_opts_, "words")!=0 && strcmp(result_opts_, "phones")!=0){ KALDI_ERR << "Invalid recognizer result options"; + } else if (nlsml_) { + return NlsmlResult(rlat); } else { return NbestResult(rlat); } } -const char* KaldiRecognizer::PartialResult() +const char* Recognizer::PartialResult() { if (state_ != RECOGNIZER_RUNNING) { return StoreEmptyReturn(); @@ -923,7 +994,7 @@ return StoreReturn(res.dump()); } -const
char* KaldiRecognizer::Result() +const char* Recognizer::Result() { if (state_ != RECOGNIZER_RUNNING) { return StoreEmptyReturn(); @@ -933,7 +1004,7 @@ const char* KaldiRecognizer::Result() return GetResult(); } -const char* KaldiRecognizer::FinalResult() +const char* Recognizer::FinalResult() { if (state_ != RECOGNIZER_RUNNING) { return StoreEmptyReturn(); @@ -961,7 +1032,7 @@ const char* KaldiRecognizer::FinalResult() return last_result_.c_str(); } -void KaldiRecognizer::Reset() +void Recognizer::Reset() { if (state_ == RECOGNIZER_RUNNING) { decoder_->FinalizeDecoding(); @@ -970,17 +1041,25 @@ void KaldiRecognizer::Reset() state_ = RECOGNIZER_ENDPOINT; } -const char *KaldiRecognizer::StoreEmptyReturn() +const char *Recognizer::StoreEmptyReturn() { if (!max_alternatives_) { return StoreReturn("{\"text\": \"\"}"); + } else if (nlsml_) { + return StoreReturn("<?xml version=\"1.0\"?>\n" + "<result grammar=\"default\">\n" + "<interpretation confidence=\"1.0\">\n" + "<instance/>\n" + "<input><noinput/></input>\n" + "</interpretation>\n" + "</result>\n"); } else { return StoreReturn("{\"alternatives\" : [{\"text\": \"\", \"confidence\" : 1.0}] }"); } } // Store result in recognizer and return as const string -const char *KaldiRecognizer::StoreReturn(const string &res) +const char *Recognizer::StoreReturn(const string &res) { last_result_ = res; return last_result_.c_str(); diff --git a/src/kaldi_recognizer.h b/src/recognizer.h similarity index 89% rename from src/kaldi_recognizer.h rename to src/recognizer.h index 2349aa3b..68143f43 100644 --- a/src/kaldi_recognizer.h +++ b/src/recognizer.h @@ -33,23 +33,24 @@ using namespace kaldi; -enum KaldiRecognizerState { +enum RecognizerState { RECOGNIZER_INITIALIZED, RECOGNIZER_RUNNING, RECOGNIZER_ENDPOINT, RECOGNIZER_FINALIZED }; -class KaldiRecognizer { +class Recognizer { public: - KaldiRecognizer(Model *model, float sample_frequency); - KaldiRecognizer(Model *model, float sample_frequency, SpkModel *spk_model); - KaldiRecognizer(Model *model, float sample_frequency, char const *grammar); - ~KaldiRecognizer(); + Recognizer(Model *model, float sample_frequency); + Recognizer(Model *model, float sample_frequency, SpkModel *spk_model); + Recognizer(Model *model, float sample_frequency, char const *grammar); + ~Recognizer(); void SetMaxAlternatives(int max_alternatives); void SetResultOptions(const char *result_opts); void SetSpkModel(SpkModel *spk_model); void SetWords(bool words); + void SetNLSML(bool nlsml); bool AcceptWaveform(const char *data, int len); bool AcceptWaveform(const short *sdata, int len); bool AcceptWaveform(const float *fdata, int len); @@ -71,6 +72,7 @@ class KaldiRecognizer { const char *MbrResult(CompactLattice &clat); const char *WordandPhoneResult(CompactLattice &clat); const char *NbestResult(CompactLattice &clat); + const char *NlsmlResult(CompactLattice &clat); Model *model_ = nullptr; SingleUtteranceNnet3Decoder *decoder_ = nullptr; @@ -97,6 +99,7 @@ class KaldiRecognizer { int max_alternatives_ = 0; // Disable alternatives by default const char *result_opts_ = "words"; // By default enable only word-level results bool words_ = false; + bool nlsml_ = false; float sample_frequency_; int32 frame_offset_; @@ -104,7 +107,7 @@ class KaldiRecognizer { int64 samples_processed_; int64 samples_round_start_; - KaldiRecognizerState state_; + RecognizerState state_; string last_result_; }; diff --git a/src/spk_model.h b/src/spk_model.h index 07cbd4b0..9a76c62a 100644 --- a/src/spk_model.h +++ b/src/spk_model.h @@ -22,7 +22,7 @@ using namespace kaldi; -class KaldiRecognizer; +class Recognizer; class SpkModel { @@ -32,7 +32,7 @@ class SpkModel { void Unref(); protected: - friend class
KaldiRecognizer; + friend class Recognizer; ~SpkModel() {}; kaldi::nnet3::Nnet speaker_nnet; diff --git a/src/vosk_api.cc b/src/vosk_api.cc index 2088b6cb..3166cc0f 100644 --- a/src/vosk_api.cc +++ b/src/vosk_api.cc @@ -13,12 +13,14 @@ // limitations under the License. #include "vosk_api.h" -#include "kaldi_recognizer.h" + +#include "recognizer.h" #include "model.h" #include "spk_model.h" #if HAVE_CUDA #include "cudamatrix/cu-device.h" +#include "batch_recognizer.h" #endif #include @@ -67,7 +69,7 @@ void vosk_spk_model_free(VoskSpkModel *model) VoskRecognizer *vosk_recognizer_new(VoskModel *model, float sample_rate) { try { - return (VoskRecognizer *)new KaldiRecognizer((Model *)model, sample_rate); + return (VoskRecognizer *)new Recognizer((Model *)model, sample_rate); } catch (...) { return nullptr; } @@ -76,7 +78,7 @@ VoskRecognizer *vosk_recognizer_new(VoskModel *model, float sample_rate) VoskRecognizer *vosk_recognizer_new_spk(VoskModel *model, float sample_rate, VoskSpkModel *spk_model) { try { - return (VoskRecognizer *)new KaldiRecognizer((Model *)model, sample_rate, (SpkModel *)spk_model); + return (VoskRecognizer *)new Recognizer((Model *)model, sample_rate, (SpkModel *)spk_model); } catch (...) { return nullptr; } @@ -85,7 +87,7 @@ VoskRecognizer *vosk_recognizer_new_spk(VoskModel *model, float sample_rate, Vos VoskRecognizer *vosk_recognizer_new_grm(VoskModel *model, float sample_rate, const char *grammar) { try { - return (VoskRecognizer *)new KaldiRecognizer((Model *)model, sample_rate, grammar); + return (VoskRecognizer *)new Recognizer((Model *)model, sample_rate, grammar); } catch (...) { return nullptr; } @@ -93,17 +95,22 @@ VoskRecognizer *vosk_recognizer_new_grm(VoskModel *model, float sample_rate, con void vosk_recognizer_set_max_alternatives(VoskRecognizer *recognizer, int max_alternatives) { - ((KaldiRecognizer *)recognizer)->SetMaxAlternatives(max_alternatives); + ((Recognizer *)recognizer)->SetMaxAlternatives(max_alternatives); } void vosk_recognizer_set_result_options(VoskRecognizer *recognizer, const char *result_opts) { - ((KaldiRecognizer *)recognizer)->SetResultOptions(result_opts); + ((Recognizer *)recognizer)->SetResultOptions(result_opts); } void vosk_recognizer_set_words(VoskRecognizer *recognizer, int words) { - ((KaldiRecognizer *)recognizer)->SetWords((bool)words); + ((Recognizer *)recognizer)->SetWords((bool)words); +} + +void vosk_recognizer_set_nlsml(VoskRecognizer *recognizer, int nlsml) +{ + ((Recognizer *)recognizer)->SetNLSML((bool)nlsml); } void vosk_recognizer_set_spk_model(VoskRecognizer *recognizer, VoskSpkModel *spk_model) @@ -111,13 +118,13 @@ void vosk_recognizer_set_spk_model(VoskRecognizer *recognizer, VoskSpkModel *spk if (recognizer == nullptr || spk_model == nullptr) { return; } - ((KaldiRecognizer *)recognizer)->SetSpkModel((SpkModel *)spk_model); + ((Recognizer *)recognizer)->SetSpkModel((SpkModel *)spk_model); } int vosk_recognizer_accept_waveform(VoskRecognizer *recognizer, const char *data, int length) { try { - return ((KaldiRecognizer *)(recognizer))->AcceptWaveform(data, length); + return ((Recognizer *)(recognizer))->AcceptWaveform(data, length); } catch (...) 
{ return -1; } @@ -126,7 +133,7 @@ int vosk_recognizer_accept_waveform(VoskRecognizer *recognizer, const char *data int vosk_recognizer_accept_waveform_s(VoskRecognizer *recognizer, const short *data, int length) { try { - return ((KaldiRecognizer *)(recognizer))->AcceptWaveform(data, length); + return ((Recognizer *)(recognizer))->AcceptWaveform(data, length); } catch (...) { return -1; } @@ -135,7 +142,7 @@ int vosk_recognizer_accept_waveform_s(VoskRecognizer *recognizer, const short *d int vosk_recognizer_accept_waveform_f(VoskRecognizer *recognizer, const float *data, int length) { try { - return ((KaldiRecognizer *)(recognizer))->AcceptWaveform(data, length); + return ((Recognizer *)(recognizer))->AcceptWaveform(data, length); } catch (...) { return -1; } @@ -143,27 +150,27 @@ int vosk_recognizer_accept_waveform_f(VoskRecognizer *recognizer, const float *d const char *vosk_recognizer_result(VoskRecognizer *recognizer) { - return ((KaldiRecognizer *)recognizer)->Result(); + return ((Recognizer *)recognizer)->Result(); } const char *vosk_recognizer_partial_result(VoskRecognizer *recognizer) { - return ((KaldiRecognizer *)recognizer)->PartialResult(); + return ((Recognizer *)recognizer)->PartialResult(); } const char *vosk_recognizer_final_result(VoskRecognizer *recognizer) { - return ((KaldiRecognizer *)recognizer)->FinalResult(); + return ((Recognizer *)recognizer)->FinalResult(); } void vosk_recognizer_reset(VoskRecognizer *recognizer) { - ((KaldiRecognizer *)recognizer)->Reset(); + ((Recognizer *)recognizer)->Reset(); } void vosk_recognizer_free(VoskRecognizer *recognizer) { - delete (KaldiRecognizer *)(recognizer); + delete (Recognizer *)(recognizer); } void vosk_set_log_level(int log_level) @@ -174,6 +181,8 @@ void vosk_set_log_level(int log_level) void vosk_gpu_init() { #if HAVE_CUDA +// kaldi::CuDevice::EnableTensorCores(true); +// kaldi::CuDevice::EnableTf32Compute(true); kaldi::CuDevice::Instantiate().SelectGpuId("yes"); kaldi::CuDevice::Instantiate().AllowMultithreading(); #endif @@ -185,3 +194,65 @@ void vosk_gpu_thread_init() kaldi::CuDevice::Instantiate(); #endif } + +VoskBatchRecognizer *vosk_batch_recognizer_new() +{ +#if HAVE_CUDA + return (VoskBatchRecognizer *)(new BatchRecognizer()); +#else + return NULL; +#endif +} + +void vosk_batch_recognizer_free(VoskBatchRecognizer *recognizer) +{ +#if HAVE_CUDA + delete ((BatchRecognizer *)recognizer); +#endif +} + +void vosk_batch_recognizer_accept_waveform(VoskBatchRecognizer *recognizer, int id, const char *data, int length) +{ +#if HAVE_CUDA + ((BatchRecognizer *)recognizer)->AcceptWaveform(id, data, length); +#endif +} + +void vosk_batch_recognizer_finish_stream(VoskBatchRecognizer *recognizer, int id) +{ +#if HAVE_CUDA + ((BatchRecognizer *)recognizer)->FinishStream(id); +#endif +} + +const char *vosk_batch_recognizer_front_result(VoskBatchRecognizer *recognizer, int id) +{ +#if HAVE_CUDA + return ((BatchRecognizer *)recognizer)->FrontResult(id); +#else + return NULL; +#endif +} + +void vosk_batch_recognizer_pop(VoskBatchRecognizer *recognizer, int id) +{ +#if HAVE_CUDA + ((BatchRecognizer *)recognizer)->Pop(id); +#endif +} + +void vosk_batch_recognizer_wait(VoskBatchRecognizer *recognizer) +{ +#if HAVE_CUDA + ((BatchRecognizer *)recognizer)->WaitForCompletion(); +#endif +} + +int vosk_batch_recognizer_get_pending_chunks(VoskBatchRecognizer *recognizer, int id) +{ +#if HAVE_CUDA + return ((BatchRecognizer *)recognizer)->GetNumPendingChunks(id); +#else + return 0; +#endif +} diff --git a/src/vosk_api.h b/src/vosk_api.h 
index a1d46347..5988145d 100644 --- a/src/vosk_api.h +++ b/src/vosk_api.h @@ -39,6 +39,10 @@ typedef struct VoskSpkModel VoskSpkModel; * speaker information and so on */ typedef struct VoskRecognizer VoskRecognizer; +/** + * Batch recognizer object + */ +typedef struct VoskBatchRecognizer VoskBatchRecognizer; /** Loads model data from the file and returns the model object * @@ -282,6 +286,12 @@ void vosk_recognizer_set_result_options(VoskRecognizer *recognizer, const char * void vosk_recognizer_set_words(VoskRecognizer *recognizer, int words); +/** Set NLSML output + * @param nlsml - boolean value + */ +void vosk_recognizer_set_nlsml(VoskRecognizer *recognizer, int nlsml); + + /** Accept voice data * * accept and process new chunk of voice data @@ -380,6 +390,33 @@ void vosk_gpu_init(); */ void vosk_gpu_thread_init(); +/** Creates the batch recognizer object + + * + * @returns recognizer object or NULL if a problem occurred */ +VoskBatchRecognizer *vosk_batch_recognizer_new(); + +/** Releases batch recognizer object + * Underlying model is also unreferenced and if needed released */ +void vosk_batch_recognizer_free(VoskBatchRecognizer *recognizer); + +/** Accept batch voice data */ +void vosk_batch_recognizer_accept_waveform(VoskBatchRecognizer *recognizer, int id, const char *data, int length); + +/** Closes the stream */ +void vosk_batch_recognizer_finish_stream(VoskBatchRecognizer *recognizer, int id); + +/** Return the first pending result for the stream */ +const char *vosk_batch_recognizer_front_result(VoskBatchRecognizer *recognizer, int id); + +/** Release and free first retrieved result */ +void vosk_batch_recognizer_pop(VoskBatchRecognizer *recognizer, int id); + +/** Wait for the processing to complete */ +void vosk_batch_recognizer_wait(VoskBatchRecognizer *recognizer); + +/** Get number of pending chunks for more intelligent waiting */ +int vosk_batch_recognizer_get_pending_chunks(VoskBatchRecognizer *recognizer, int id); + #ifdef __cplusplus } #endif diff --git a/travis/Dockerfile.win b/travis/Dockerfile.win index 89081c2b..4f00bfcc 100644 --- a/travis/Dockerfile.win +++ b/travis/Dockerfile.win @@ -55,7 +55,7 @@ RUN cd /opt/kaldi \ && find . -name *.a -exec cp {} /opt/kaldi/local/lib \; RUN cd /opt/kaldi \ - && git clone -b android-mix --single-branch https://github.com/alphacep/kaldi \ + && git clone -b vosk-android --single-branch https://github.com/alphacep/kaldi \ && cd kaldi/src \ && CXX=x86_64-w64-mingw32-g++-posix CXXFLAGS="-O3 -ftree-vectorize -DFST_NO_DYNAMIC_LINKING" ./configure --shared --mingw=yes --use-cuda=no \ --mathlib=OPENBLAS_CLAPACK \ diff --git a/travis/Dockerfile.win32 b/travis/Dockerfile.win32 index 59198d9a..5a478e52 100644 --- a/travis/Dockerfile.win32 +++ b/travis/Dockerfile.win32 @@ -54,7 +54,7 @@ RUN cd /opt/kaldi \ && find .
-name *.a -exec cp {} /opt/kaldi/local/lib \; RUN cd /opt/kaldi \ - && git clone -b android-mix --single-branch https://github.com/alphacep/kaldi \ + && git clone -b vosk-android --single-branch https://github.com/alphacep/kaldi \ && cd kaldi/src \ && CXX=i686-w64-mingw32-g++-posix CXXFLAGS="-O3 -ftree-vectorize -DFST_NO_DYNAMIC_LINKING" ./configure --shared --mingw=yes --use-cuda=no \ --mathlib=OPENBLAS_CLAPACK \ diff --git a/travis/build-wheels-win.sh b/travis/build-wheels-win.sh index 750b6dd7..02bf6efc 100755 --- a/travis/build-wheels-win.sh +++ b/travis/build-wheels-win.sh @@ -5,7 +5,7 @@ set -e -x cd /opt git clone https://github.com/alphacep/vosk-api cd vosk-api/src -CXX=x86_64-w64-mingw32-g++-posix EXT=dll KALDI_ROOT=/opt/kaldi/kaldi OPENFST_ROOT=/opt/kaldi/local OPENBLAS_ROOT=/opt/kaldi/local make -j $(nproc) +EXTRA_LDFLAGS=-Wl,--out-implib,libvosk.lib CXX=x86_64-w64-mingw32-g++-posix EXT=dll KALDI_ROOT=/opt/kaldi/kaldi OPENFST_ROOT=/opt/kaldi/local OPENBLAS_ROOT=/opt/kaldi/local make -j $(nproc) # Collect dependencies cp /usr/lib/gcc/x86_64-w64-mingw32/*-posix/libstdc++-6.dll /opt/vosk-api/src @@ -14,7 +14,7 @@ cp /usr/x86_64-w64-mingw32/lib/libwinpthread-1.dll /opt/vosk-api/src # Copy dlls to output folder mkdir -p /io/wheelhouse/win64 -cp /opt/vosk-api/src/*.dll /io/wheelhouse/win64 +cp /opt/vosk-api/src/*.{dll,lib} /io/wheelhouse/win64 # Build wheel and put to the output folder export VOSK_SOURCE=/opt/vosk-api diff --git a/travis/build-wheels-win32.sh b/travis/build-wheels-win32.sh index 2b934bd3..82af745e 100755 --- a/travis/build-wheels-win32.sh +++ b/travis/build-wheels-win32.sh @@ -5,7 +5,7 @@ set -e -x cd /opt git clone https://github.com/alphacep/vosk-api cd vosk-api/src -CXX=i686-w64-mingw32-g++-posix EXT=dll KALDI_ROOT=/opt/kaldi/kaldi OPENFST_ROOT=/opt/kaldi/local OPENBLAS_ROOT=/opt/kaldi/local make -j $(nproc) +EXTRA_LDFLAGS=-Wl,--out-implib,libvosk.lib CXX=i686-w64-mingw32-g++-posix EXT=dll KALDI_ROOT=/opt/kaldi/kaldi OPENFST_ROOT=/opt/kaldi/local OPENBLAS_ROOT=/opt/kaldi/local make -j $(nproc) # Copy dependencies cp /usr/lib/gcc/i686-w64-mingw32/*-posix/libstdc++-6.dll /opt/vosk-api/src @@ -14,7 +14,7 @@ cp /usr/i686-w64-mingw32/lib/libwinpthread-1.dll /opt/vosk-api/src # Copy dlls to output folder mkdir -p /io/wheelhouse/win32 -cp /opt/vosk-api/src/*.dll /io/wheelhouse/win32 +cp /opt/vosk-api/src/*.{dll,lib} /io/wheelhouse/win32 # Build wheel and put to the output folder export VOSK_SOURCE=/opt/vosk-api
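
For reference, the new C-level batch API declared in vosk_api.h above can be driven end to end as in the sketch below. This is illustrative only: the file name batch_demo.c, the build line, and the raw 16 kHz 16-bit mono PCM input are assumptions, not part of this change. The functions used are exactly the ones this diff adds; the batch recognizer reads its files from the hardcoded "model" directory (see BatchRecognizer::BatchRecognizer() above) and vosk_batch_recognizer_new() returns NULL unless libvosk was built with HAVE_CUDA=1.

/*
 * batch_demo.c: minimal sketch of the batch recognizer C API.
 * Assumed build line: cc batch_demo.c -lvosk -o batch_demo
 */
#include <stdio.h>
#include "vosk_api.h"

int main(int argc, char **argv)
{
    if (argc != 2) {
        fprintf(stderr, "Usage: %s audio.raw\n", argv[0]);
        return 1;
    }

    vosk_gpu_init();  /* selects the GPU and allows multithreading */

    VoskBatchRecognizer *rec = vosk_batch_recognizer_new();
    if (rec == NULL) {
        fprintf(stderr, "No batch recognizer (requires a CUDA build)\n");
        return 1;
    }

    FILE *fd = fopen(argv[1], "rb");  /* raw 16 kHz 16-bit mono PCM assumed */
    if (fd == NULL) {
        perror(argv[1]);
        vosk_batch_recognizer_free(rec);
        return 1;
    }

    /* Feed the whole stream under id 0; a server would multiplex many ids */
    char buf[16000];
    size_t n;
    while ((n = fread(buf, 1, sizeof(buf), fd)) > 0)
        vosk_batch_recognizer_accept_waveform(rec, 0, buf, (int)n);
    vosk_batch_recognizer_finish_stream(rec, 0);
    fclose(fd);

    /* Block until the CUDA pipeline has drained all submitted chunks */
    vosk_batch_recognizer_wait(rec);

    /* One JSON result is queued per utterance segment; pop until empty */
    const char *res;
    while ((res = vosk_batch_recognizer_front_result(rec, 0)) && res[0] != '\0') {
        printf("%s\n", res);
        vosk_batch_recognizer_pop(rec, 0);
    }

    vosk_batch_recognizer_free(rec);
    return 0;
}

This mirrors what the Python BatchRecognizer wrapper in python/vosk/__init__.py does over FFI. The test_gpu_batch.py example above shows the higher-throughput pattern: interleave feeding and result polling across many stream ids, and use GetPendingChunks() / vosk_batch_recognizer_get_pending_chunks() to pace submission instead of a single blocking wait at the end.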