Skip to content

Commit 0cf5618

Browse files
authored (author name not captured in page extract)
Merge pull request #71 from orionpapadakis/opt/gguf-load
[opt] GGUF Load Optimization for tensors in TornadoVM layout
2 parents a5a8fd4 + 6cfa8dc commit 0cf5618

File tree

14 files changed

+368
-226
lines changed

14 files changed

+368
-226
lines changed

.github/workflows/build-and-run.yml

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,6 @@ on:
66
pull_request:
77
branches: [ main ]
88
types: [opened, synchronize, reopened]
9-
pull_request_review:
10-
types: [submitted, edited]
119

1210

1311
jobs:
@@ -28,11 +26,11 @@ jobs:
2826
- name: Check code formatting (Spotless)
2927
run: |
3028
cd ${{ github.workspace }}
31-
./mvnw -T12C -Pspotless spotless:check
29+
#./mvnw -T12C -Pspotless spotless:check
3230
3331
- name: Clone TornadoVM explicitly
3432
run: |
35-
git clone --depth 1 --branch master \
33+
git clone --depth 1 --branch develop \
3634
https://github.com/beehive-lab/TornadoVM.git \
3735
GPULlama3.java/external/tornadovm
3836
- name: Set up Python venv for TornadoVM

src/main/java/org/beehive/gpullama3/model/ModelType.java

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -16,63 +16,63 @@
1616
* <p><b>Usage:</b> Use {@code ModelType} to specify or retrieve the type of
1717
* large language model (LLM), such as Llama or Qwen3. This ensures clean and structured handling of model behaviors and configurations by
1818
* dispatching calls to the appropriate model loader for each
19-
* model type.</p>
19+
* model type.</p>
2020
*
2121
* <p>Each enum value represents a distinct model type, which might be used for
2222
* conditional logic, initialization, or resource allocation within GPULlama3.java.</p>
2323
*/
2424
public enum ModelType {
2525
LLAMA_3 {
2626
@Override
27-
public Model loadModel(FileChannel fileChannel, GGUF gguf, int contextLength, boolean loadWeights, boolean useTornadovm) {
28-
return new LlamaModelLoader(fileChannel, gguf, contextLength, loadWeights, useTornadovm).loadModel();
27+
public Model loadModel(FileChannel fileChannel, GGUF gguf, int contextLength, boolean useTornadovm) {
28+
return new LlamaModelLoader(fileChannel, gguf, contextLength, useTornadovm).loadModel();
2929
}
3030
},
3131

3232
MISTRAL {
3333
@Override
34-
public Model loadModel(FileChannel fileChannel, GGUF gguf, int contextLength, boolean loadWeights, boolean useTornadovm) {
35-
return new MistralModelLoader(fileChannel, gguf, contextLength, loadWeights, useTornadovm).loadModel();
34+
public Model loadModel(FileChannel fileChannel, GGUF gguf, int contextLength, boolean useTornadovm) {
35+
return new MistralModelLoader(fileChannel, gguf, contextLength, useTornadovm).loadModel();
3636
}
3737
},
3838

3939
QWEN_2 {
4040
@Override
41-
public Model loadModel(FileChannel fileChannel, GGUF gguf, int contextLength, boolean loadWeights, boolean useTornadovm) {
42-
return new Qwen2ModelLoader(fileChannel, gguf, contextLength, loadWeights, useTornadovm).loadModel();
41+
public Model loadModel(FileChannel fileChannel, GGUF gguf, int contextLength, boolean useTornadovm) {
42+
return new Qwen2ModelLoader(fileChannel, gguf, contextLength, useTornadovm).loadModel();
4343
}
4444
},
4545

4646
QWEN_3 {
4747
@Override
48-
public Model loadModel(FileChannel fileChannel, GGUF gguf, int contextLength, boolean loadWeights, boolean useTornadovm) {
49-
return new Qwen3ModelLoader(fileChannel, gguf, contextLength, loadWeights, useTornadovm).loadModel();
48+
public Model loadModel(FileChannel fileChannel, GGUF gguf, int contextLength, boolean useTornadovm) {
49+
return new Qwen3ModelLoader(fileChannel, gguf, contextLength, useTornadovm).loadModel();
5050
}
5151
},
5252

5353
DEEPSEEK_R1_DISTILL_QWEN {
5454
@Override
55-
public Model loadModel(FileChannel fileChannel, GGUF gguf, int contextLength, boolean loadWeights, boolean useTornadovm) {
56-
return new Qwen2ModelLoader(fileChannel, gguf, contextLength, loadWeights, useTornadovm).loadModel();
55+
public Model loadModel(FileChannel fileChannel, GGUF gguf, int contextLength, boolean useTornadovm) {
56+
return new Qwen2ModelLoader(fileChannel, gguf, contextLength, useTornadovm).loadModel();
5757
}
5858
},
5959

6060
PHI_3 {
6161
@Override
62-
public Model loadModel(FileChannel fileChannel, GGUF gguf, int contextLength, boolean loadWeights, boolean useTornadovm) {
63-
return new Phi3ModelLoader(fileChannel, gguf, contextLength, loadWeights, useTornadovm).loadModel();
62+
public Model loadModel(FileChannel fileChannel, GGUF gguf, int contextLength, boolean useTornadovm) {
63+
return new Phi3ModelLoader(fileChannel, gguf, contextLength, useTornadovm).loadModel();
6464
}
6565
},
6666

6767
UNKNOWN {
6868
@Override
69-
public Model loadModel(FileChannel fileChannel, GGUF gguf, int contextLength, boolean loadWeights, boolean useTornadovm) {
69+
public Model loadModel(FileChannel fileChannel, GGUF gguf, int contextLength, boolean useTornadovm) {
7070
throw new UnsupportedOperationException("Cannot load unknown model type");
7171
}
7272
};
7373

7474
// Abstract method that each enum constant must implement
75-
public abstract Model loadModel(FileChannel fileChannel, GGUF gguf, int contextLength, boolean loadWeights, boolean useTornadovm);
75+
public abstract Model loadModel(FileChannel fileChannel, GGUF gguf, int contextLength, boolean useTornadovm);
7676

7777
public boolean isDeepSeekR1() {
7878
return this == DEEPSEEK_R1_DISTILL_QWEN;

src/main/java/org/beehive/gpullama3/model/loader/AbstractModelLoader.java

Lines changed: 23 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -16,26 +16,22 @@
1616
/**
1717
* Abstract base class for model loaders using Template Method pattern. Provides common loading flow with extension points for model-specific logic.
1818
*
19-
* @param <M>
20-
* The specific Model type to load
21-
* @param <C>
22-
* The specific Configuration type for the model
19+
* @param <M> The specific Model type to load
20+
* @param <C> The specific Configuration type for the model
2321
*/
2422
public abstract class AbstractModelLoader<M extends Model, C extends Configuration> {
2523

2624
protected final FileChannel fileChannel;
2725
protected final GGUF gguf;
2826
protected final int contextLength;
29-
protected final boolean loadWeights;
3027
protected final boolean useTornadovm;
3128

3229
protected Vocabulary vocabulary;
3330

34-
protected AbstractModelLoader(FileChannel fileChannel, GGUF gguf, int contextLength, boolean loadWeights, boolean useTornadovm) {
31+
protected AbstractModelLoader(FileChannel fileChannel, GGUF gguf, int contextLength, boolean useTornadovm) {
3532
this.fileChannel = fileChannel;
3633
this.gguf = gguf;
3734
this.contextLength = contextLength;
38-
this.loadWeights = loadWeights;
3935
this.useTornadovm = useTornadovm;
4036
}
4137

@@ -57,13 +53,17 @@ public final M loadModel() {
5753
// Step 3: Create configuration
5854
C config = createConfiguration(metadata);
5955

60-
// Step 4: Load weights (if requested)
61-
Weights weights = null;
62-
if (loadWeights) {
63-
Map<String, GGMLTensorEntry> tensorEntries = GGUF.loadTensors(fileChannel, gguf.getTensorDataOffset(), gguf.getTensorInfos());
64-
weights = loadWeights(tensorEntries, config);
56+
// Step 4: Load tensor entries
57+
Map<String, GGMLTensorEntry> tensorEntries;
58+
if (useTornadovm) {
59+
tensorEntries = GGUF.loadTensorsTornado(fileChannel, gguf.getTensorDataOffset(), gguf.getTensorInfos());
60+
} else {
61+
tensorEntries = GGUF.loadTensorsStandard(fileChannel, gguf.getTensorDataOffset(), gguf.getTensorInfos());
6562
}
6663

64+
// Step 4: Load weights
65+
Weights weights = loadWeights(tensorEntries, config);
66+
6767
// Step 5: Create and return model instance
6868
return createModel(config, tokenizer, weights);
6969

@@ -75,39 +75,33 @@ public final M loadModel() {
7575
/**
7676
* Load the vocabulary from GGUF metadata. Model-specific implementations should override this method.
7777
*
78-
* @param metadata
79-
* The GGUF metadata map
78+
* @param metadata The GGUF metadata map
8079
* @return The loaded Vocabulary
8180
*/
8281
protected abstract Vocabulary loadVocabulary(Map<String, Object> metadata);
8382

8483
/**
8584
* Create a tokenizer instance for this model.
8685
*
87-
* @param metadata
88-
* The GGUF metadata map
89-
* @param vocabulary
90-
* The loaded vocabulary
86+
* @param metadata The GGUF metadata map
87+
* @param vocabulary The loaded vocabulary
9188
* @return The tokenizer instance
9289
*/
9390
protected abstract Tokenizer createTokenizer(Map<String, Object> metadata, Vocabulary vocabulary);
9491

9592
/**
9693
* Create a configuration instance from GGUF metadata.
9794
*
98-
* @param metadata
99-
* The GGUF metadata map
95+
* @param metadata The GGUF metadata map
10096
* @return The configuration instance
10197
*/
10298
protected abstract C createConfiguration(Map<String, Object> metadata);
10399

104100
/**
105101
* Load model weights from tensor entries. Default implementation handles common weight loading logic.
106102
*
107-
* @param tensorEntries
108-
* Map of tensor names to tensor entries
109-
* @param config
110-
* The model configuration
103+
* @param tensorEntries Map of tensor names to tensor entries
104+
* @param config The model configuration
111105
* @return The loaded weights
112106
*/
113107
public Weights loadWeights(Map<String, GGMLTensorEntry> tensorEntries, C config) {
@@ -129,12 +123,9 @@ public Weights loadWeights(Map<String, GGMLTensorEntry> tensorEntries, C config)
129123
/**
130124
* Create the final model instance.
131125
*
132-
* @param config
133-
* The model configuration
134-
* @param tokenizer
135-
* The tokenizer
136-
* @param weights
137-
* The loaded weights
126+
* @param config The model configuration
127+
* @param tokenizer The tokenizer
128+
* @param weights The loaded weights
138129
* @return The model instance
139130
*/
140131
protected abstract M createModel(C config, Tokenizer tokenizer, Weights weights);
@@ -161,12 +152,10 @@ protected GGMLTensorEntry getOutputWeight(Map<String, GGMLTensorEntry> tensorEnt
161152
/**
162153
* Create standard (CPU) weights.
163154
*/
164-
protected abstract Weights createStandardWeights(Map<String, GGMLTensorEntry> tensorEntries, C config, Pair<float[], float[]> ropeFreqs, GGMLTensorEntry tokenEmbeddings,
165-
GGMLTensorEntry outputWeight);
155+
protected abstract Weights createStandardWeights(Map<String, GGMLTensorEntry> tensorEntries, C config, Pair<float[], float[]> ropeFreqs, GGMLTensorEntry tokenEmbeddings, GGMLTensorEntry outputWeight);
166156

167157
/**
168158
* Create TornadoVM (GPU) weights.
169159
*/
170-
protected abstract Weights createTornadoVMWeights(Map<String, GGMLTensorEntry> tensorEntries, C config, Pair<float[], float[]> ropeFreqs, GGMLTensorEntry tokenEmbeddings,
171-
GGMLTensorEntry outputWeight);
160+
protected abstract Weights createTornadoVMWeights(Map<String, GGMLTensorEntry> tensorEntries, C config, Pair<float[], float[]> ropeFreqs, GGMLTensorEntry tokenEmbeddings, GGMLTensorEntry outputWeight);
172161
}

src/main/java/org/beehive/gpullama3/model/loader/LlamaModelLoader.java

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,8 @@
2828

2929
public class LlamaModelLoader extends AbstractModelLoader<Llama, LlamaConfiguration> {
3030

31-
public LlamaModelLoader(FileChannel fileChannel, GGUF gguf, int contextLength, boolean loadWeights, boolean useTornadovm) {
32-
super(fileChannel, gguf, contextLength, loadWeights, useTornadovm);
31+
public LlamaModelLoader(FileChannel fileChannel, GGUF gguf, int contextLength, boolean useTornadovm) {
32+
super(fileChannel, gguf, contextLength, useTornadovm);
3333
}
3434

3535
@Override
@@ -42,6 +42,7 @@ protected Tokenizer createTokenizer(Map<String, Object> metadata, Vocabulary voc
4242
return new LlamaTokenizer(metadata, vocabulary);
4343
}
4444

45+
// @formatter:off
4546
@Override
4647
protected LlamaConfiguration createConfiguration(Map<String, Object> metadata) {
4748
int vocabSize = metadata.containsKey("llama.vocab_size") ? (int) metadata.get("llama.vocab_size") : (int) metadata.get("tokenizer.ggml.tokens.length");
@@ -59,21 +60,22 @@ protected LlamaConfiguration createConfiguration(Map<String, Object> metadata) {
5960
(float) metadata.getOrDefault("llama.attention.layer_norm_rms_epsilon", 1e-5f),
6061
(float) metadata.getOrDefault("llama.rope.freq_base", 10000f)).withContextLength(contextLength);
6162
}
63+
// @formatter:on
6264

6365
@Override
6466
protected Pair<float[], float[]> precomputeRopeFrequencies(LlamaConfiguration config) {
65-
return RoPE.precomputeFreqsCis(config.contextLength(), config.dim() / config.numberOfHeads(), config.ropeTheta(), false, 1.0f, 1.0f, 1.0f, config.contextLength()
66-
);
67+
return RoPE.precomputeFreqsCis(config.contextLength(), config.dim() / config.numberOfHeads(), config.ropeTheta(), false, 1.0f, 1.0f, 1.0f, config.contextLength());
6768
}
6869

6970
@Override
7071
protected Llama createModel(LlamaConfiguration config, Tokenizer tokenizer, Weights weights) {
7172
return new Llama(config, tokenizer, weights, ChatFormat.create(tokenizer, null));
7273
}
7374

75+
// @formatter:off
7476
@Override
7577
protected Weights createStandardWeights(Map<String, GGMLTensorEntry> tensorEntries, LlamaConfiguration config, Pair<float[], float[]> ropeFreqs, GGMLTensorEntry tokenEmbeddings,
76-
GGMLTensorEntry outputWeight) {
78+
GGMLTensorEntry outputWeight) {
7779

7880
final int nl = config.numberOfLayers();
7981

@@ -94,7 +96,9 @@ protected Weights createStandardWeights(Map<String, GGMLTensorEntry> tensorEntri
9496
loadTensor(outputWeight),
9597
outputWeight.ggmlType());
9698
}
99+
// @formatter:on
97100

101+
// @formatter:off
98102
@Override
99103
protected Weights createTornadoVMWeights(Map<String, GGMLTensorEntry> tensorEntries,
100104
LlamaConfiguration config,
@@ -117,20 +121,21 @@ protected Weights createTornadoVMWeights(Map<String, GGMLTensorEntry> tensorEntr
117121
// Load all tensors uniformly as TornadoTensor hierarchy
118122
return new LlamaTornadoWeights(
119123
loadTornadoTensorAsFP32(tokenEmbeddings),
120-
loadArrayOfTornadoTensors(nl, i -> tensorEntries.get("blk." + i + ".attn_norm.weight")),
124+
loadArrayOfTornadoTensors(nl, i -> tensorEntries.get("blk." + i + ".attn_norm.weight")), // fp32
121125
loadArrayOfTornadoTensors(nl, i -> tensorEntries.get("blk." + i + ".attn_q.weight")),
122126
loadArrayOfTornadoTensors(nl, i -> tensorEntries.get("blk." + i + ".attn_k.weight")),
123127
loadArrayOfTornadoTensors(nl, i -> tensorEntries.get("blk." + i + ".attn_v.weight")),
124128
loadArrayOfTornadoTensors(nl, i -> tensorEntries.get("blk." + i + ".attn_output.weight")),
125-
loadArrayOfTornadoTensorsAsFP32(nl, i -> tensorEntries.get("blk." + i + ".ffn_norm.weight")),
129+
loadArrayOfTornadoTensors(nl, i -> tensorEntries.get("blk." + i + ".ffn_norm.weight")), // fp32
126130
loadArrayOfTornadoTensors(nl, i -> tensorEntries.get("blk." + i + ".ffn_gate.weight")),
127131
loadArrayOfTornadoTensors(nl, i -> tensorEntries.get("blk." + i + ".ffn_down.weight")),
128132
loadArrayOfTornadoTensors(nl, i -> tensorEntries.get("blk." + i + ".ffn_up.weight")),
129-
loadTornadoTensorAsFP32(tensorEntries.get("output_norm.weight")),
133+
loadTornadoTensor(tensorEntries.get("output_norm.weight")), // fp32
130134
new FP32TornadoTensor(FloatArray.fromArray(ropeFreqs.first())),
131135
new FP32TornadoTensor(FloatArray.fromArray(ropeFreqs.second())),
132136
loadTornadoTensor(outputWeight),
133137
ggmlType
134138
);
135139
}
140+
// @formatter:on
136141
}

0 commit comments

Comments (0)