@@ -121,7 +121,8 @@ namespace ctranslate2 {
                                               const Padder* input_padder,
                                               const Padder* memory_padder,
                                               bool return_normalized_attention,
-                                              StorageView* position_bias) const {
+                                              StorageView* position_bias,
+                                              dim_t offset) const {
       PROFILE("TransformerDecoderLayer");
 
       const DataType dtype = input.dtype();
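The hunk above threads a new dim_t offset argument through TransformerDecoderLayer::operator() so the self-attention call can be told how far into the full sequence the current chunk starts. A minimal standalone sketch of that bookkeeping (plain C++, not the CTranslate2 API; chunk_positions is a hypothetical helper):

#include <cstdint>
#include <vector>

using dim_t = int64_t;

// Absolute position indices for a chunk of `chunk_len` tokens that starts
// `offset` tokens into the full prompt; position-dependent computations can
// consume these instead of 0..chunk_len-1.
std::vector<dim_t> chunk_positions(dim_t offset, dim_t chunk_len) {
  std::vector<dim_t> positions(chunk_len);
  for (dim_t t = 0; t < chunk_len; ++t)
    positions[t] = offset + t;
  return positions;
}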
@@ -149,7 +150,8 @@ namespace ctranslate2 {
                       input_padder,
                       input_padder,
                       true,
-                      position_bias);
+                      position_bias,
+                      offset);
 
       if (_post_attention_layer_norm)
         (*_post_attention_layer_norm)(input, hidden);
@@ -172,7 +174,8 @@ namespace ctranslate2 {
                       input_padder,
                       input_padder,
                       true,
-                      position_bias);
+                      position_bias,
+                      offset);
 
       StorageView context(dtype, device);
       if (_encoder_attention) {
@@ -330,7 +333,8 @@ namespace ctranslate2 {
                            ? nullptr
                            : build_position_encoder(model, scope + "/position_encodings", _embeddings))
       , _with_encoder_attention(_layers.front()->has_cross_attention())
-      , _proj(model, scope + "/projection") {
+      , _proj(model, scope + "/projection")
+      , _sliding_window(model.get_attribute_with_default<int32_t>(scope + "/sliding_window", 0)) {
 
       dim_t alignment_layer = (
         model.get_attribute_with_default<int32_t>(scope + "/alignment_layer", -1));
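_sliding_window is initialized from the model attribute <scope>/sliding_window with a default of 0, which the rest of this diff treats as "sliding window disabled". A hedged sketch of that lookup convention, with a plain map standing in for the real model object:

#include <cstdint>
#include <string>
#include <unordered_map>

// Stand-in for an attribute lookup with a default: return the stored value
// if the attribute exists, otherwise 0, meaning the sliding-window code
// paths added in this diff are skipped entirely.
int32_t get_sliding_window(const std::unordered_map<std::string, int32_t>& attributes,
                           const std::string& scope) {
  const auto it = attributes.find(scope + "/sliding_window");
  return it != attributes.end() ? it->second : 0;
}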
@@ -467,7 +471,13 @@ namespace ctranslate2 {
         (*_layernorm_embedding)(layer_in, layer_in);
 
       const dim_t batch_size = layer_in.dim(0);
-      const dim_t max_time = layer_in.dim(1);
+      dim_t max_time;
+
+      if (_sliding_window > 0 && layer_in.dim(1) > _sliding_window) {
+        max_time = _sliding_window;
+      } else
+        max_time = layer_in.dim(1);
+
       const bool allow_padding_removal = Padder::allow_padding_removal(_device, _compute_type);
 
       std::unique_ptr<const Padder> input_padder;
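The replacement of max_time above caps the padder/mask time dimension at the window size when the prompt is longer than the configured window. The same decision in isolation, as a sketch that assumes sliding_window <= 0 means "disabled":

#include <cstdint>

using dim_t = int64_t;

// Effective time dimension for padding removal and the length mask: the full
// prompt length, unless a sliding window is configured and the prompt exceeds
// it, in which case only one window's worth of tokens is handled at a time.
dim_t effective_max_time(dim_t prompt_len, dim_t sliding_window) {
  if (sliding_window > 0 && prompt_len > sliding_window)
    return sliding_window;
  return prompt_len;
}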
@@ -479,14 +489,14 @@ namespace ctranslate2 {
         lengths = input_lengths.get();
       }
 
+      bool multi_query = _layers.front()->get_self_attention().multi_query();
+
       if (lengths) {
         if (allow_padding_removal) {
           input_padder = std::make_unique<Padder>(*lengths, max_time);
           input_padder->remove_padding(layer_in);
         }
 
-        const bool multi_query = _layers.front()->get_self_attention().multi_query();
-
         StorageView lengths_mask = layers::MultiHeadAttention::prepare_length_mask(
           *lengths,
           _num_heads,
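multi_query is hoisted out of the if (lengths) block because the chunk loop further down rebuilds a length mask for every chunk after the first, even when no explicit lengths were given. A rough sketch of the mask shape that per-chunk rebuild aims for, using a boolean matrix instead of a StorageView and ignoring the exact layout produced by prepare_length_mask and ops::Slide:

#include <cstdint>
#include <vector>

using dim_t = int64_t;

// For a chunk after the first, each of the chunk_len query positions may
// attend to the `window` cached key positions of the previous chunk plus the
// causal prefix of the current chunk. This mirrors building a causal mask
// over window + chunk_len keys and keeping only the last chunk_len query rows.
std::vector<std::vector<bool>> chunk_attention_mask(dim_t window, dim_t chunk_len) {
  const dim_t max_tokens = window + chunk_len;
  std::vector<std::vector<bool>> mask(chunk_len, std::vector<bool>(max_tokens, false));
  for (dim_t q = 0; q < chunk_len; ++q)
    for (dim_t k = 0; k < max_tokens; ++k)
      mask[q][k] = (k <= window + q);
  return mask;
}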
@@ -531,47 +541,86 @@ namespace ctranslate2 {
 
       StorageView position_bias(dtype, device);
 
-      for (size_t l = 0; l < _layers.size(); ++l) {
-        StorageView* cached_self_attn_keys = nullptr;
-        StorageView* cached_self_attn_values = nullptr;
-        StorageView* cached_attn_keys = nullptr;
-        StorageView* cached_attn_values = nullptr;
-
-        if (step >= 0) {
-          const std::string l_str = std::to_string(l);
-          cached_self_attn_keys = &state.at("self_keys_" + l_str);
-          cached_self_attn_values = &state.at("self_values_" + l_str);
-          if (_with_encoder_attention) {
-            cached_attn_keys = &state.at("memory_keys_" + l_str);
-            cached_attn_values = &state.at("memory_values_" + l_str);
-          }
+      std::vector<StorageView> layer_ins;
+
+      while (true) {
+        dim_t prompt_size = layer_in.dim(1);
+        if (_sliding_window == 0 || prompt_size <= _sliding_window) {
+          layer_ins.push_back(std::move(layer_in));
+          break;
         }
+        if (layer_in.dim(1) > _sliding_window) {
+          StorageView tmp(dtype, device);
+          const ops::Split split_op(1, {_sliding_window, prompt_size - _sliding_window});
+          split_op(layer_in, tmp, layer_in);
+          layer_ins.push_back(std::move(tmp));
+        }
+      }
 
-        std::unique_ptr<StorageView> heads_to_select = get_layer_alignment_heads(l, batch_size);
-        std::unique_ptr<StorageView> layer_attention;
-        if (attention && heads_to_select)
-          layer_attention = std::make_unique<StorageView>(dtype, device);
+      for (size_t i = 0; i < layer_ins.size(); ++i) {
+        auto layer_in_chunk = layer_ins[i];
+        for (size_t l = 0; l < _layers.size(); ++l) {
+          StorageView* cached_self_attn_keys = nullptr;
+          StorageView* cached_self_attn_values = nullptr;
+          StorageView* cached_attn_keys = nullptr;
+          StorageView* cached_attn_values = nullptr;
+
+          if (step >= 0) {
+            const std::string l_str = std::to_string(l);
+            cached_self_attn_keys = &state.at("self_keys_" + l_str);
+            cached_self_attn_values = &state.at("self_values_" + l_str);
+            if (_with_encoder_attention) {
+              cached_attn_keys = &state.at("memory_keys_" + l_str);
+              cached_attn_values = &state.at("memory_values_" + l_str);
+            }
+          }
 
-        (*_layers[l])(layer_in,
-                      input_lengths_mask.get(),
-                      memory,
-                      memory_lengths_mask.get(),
-                      cached_self_attn_keys,
-                      cached_self_attn_values,
-                      cached_attn_keys,
-                      cached_attn_values,
-                      layer_out,
-                      layer_attention.get(),
-                      input_padder.get(),
-                      memory_padder.get(),
-                      return_normalized_attention(),
-                      &position_bias);
-        layer_in = std::move(layer_out);
+          std::unique_ptr<StorageView> heads_to_select = get_layer_alignment_heads(l, batch_size);
+          std::unique_ptr<StorageView> layer_attention;
+          if (attention && heads_to_select)
+            layer_attention = std::make_unique<StorageView>(dtype, device);
+
+          dim_t offset = _sliding_window * i + step;
+          offset = offset < 0 ? 0 : offset;
+          if (i > 0) {
+            auto max_tokens = _sliding_window + layer_in_chunk.dim(1);
+            StorageView tmp_lengths = StorageView(Shape{layer_in_chunk.dim(0)}, int32_t(max_tokens), device);
+            StorageView lengths_mask = layers::MultiHeadAttention::prepare_length_mask(
+              tmp_lengths,
+              _num_heads,
+              max_tokens,
+              /*mask_future=*/true,
+              multi_query);
+
+            const ops::Slide slide_lengths_op(2, _sliding_window, layer_in_chunk.dim(1));
+            // reuse tmp_lengths
+            slide_lengths_op(lengths_mask, tmp_lengths);
+            input_lengths_mask = std::make_unique<StorageView>(std::move(tmp_lengths));
+          }
 
-        if (layer_attention) {
-          alignment_heads.emplace_back(dtype, device);
-          ops::Gather(1, 1)(*layer_attention, *heads_to_select, alignment_heads.back());
+          (*_layers[l])(layer_in_chunk,
+                        input_lengths_mask.get(),
+                        memory,
+                        memory_lengths_mask.get(),
+                        cached_self_attn_keys,
+                        cached_self_attn_values,
+                        cached_attn_keys,
+                        cached_attn_values,
+                        layer_out,
+                        layer_attention.get(),
+                        input_padder.get(),
+                        memory_padder.get(),
+                        return_normalized_attention(),
+                        &position_bias,
+                        offset);
+          layer_in_chunk = std::move(layer_out);
+
+          if (layer_attention) {
+            alignment_heads.emplace_back(dtype, device);
+            ops::Gather(1, 1)(*layer_attention, *heads_to_select, alignment_heads.back());
+          }
         }
+        layer_in = std::move(layer_in_chunk);
       }
 
       if (step == 0) {
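Putting the new control flow together: the while loop splits a long prompt into window-sized chunks, and each chunk is pushed through all layers with an offset of _sliding_window * i + step (clamped at 0) so the cache and position handling line up across chunks. A simplified standalone sketch of that splitting and offset arithmetic, using plain vectors instead of StorageView and ops::Split, and assuming prompt_len > 0:

#include <algorithm>
#include <cstdint>
#include <vector>

using dim_t = int64_t;

struct Chunk {
  dim_t start;   // first token index covered by this chunk
  dim_t length;  // number of tokens in the chunk
  dim_t offset;  // value passed to the layers: window * index + step, clamped at 0
};

std::vector<Chunk> split_prompt(dim_t prompt_len, dim_t sliding_window, dim_t step) {
  std::vector<Chunk> chunks;
  if (sliding_window <= 0) {
    // Sliding window disabled: process the whole prompt as a single chunk.
    chunks.push_back({0, prompt_len, std::max<dim_t>(step, 0)});
    return chunks;
  }
  dim_t start = 0;
  dim_t i = 0;
  while (start < prompt_len) {
    const dim_t length = std::min(sliding_window, prompt_len - start);
    const dim_t offset = std::max<dim_t>(sliding_window * i + step, 0);
    chunks.push_back({start, length, offset});
    start += length;
    ++i;
  }
  return chunks;
}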