diff --git a/.env b/.env
index 31165d6..d2fbac8 100644
--- a/.env
+++ b/.env
@@ -9,7 +9,7 @@ RUNNING_MODE=full
 
 DATA_FOLDER=./data/experiments
 
-POPULATION_SIZE=3000
+POPULATION_SIZE=30
 MUTATION_PROBABILITY=0.50
 CROSSOVER_PROBABILITY=0.50
 
diff --git a/.gitignore b/.gitignore
index a08037f..6fea541 100644
--- a/.gitignore
+++ b/.gitignore
@@ -53,4 +53,5 @@ workspace.xml
 .idea/
 
 data/
-programs.txt
\ No newline at end of file
+programs.txt
+/qwen_model
\ No newline at end of file
diff --git a/cuda-keyring_1.1-1_all.deb b/cuda-keyring_1.1-1_all.deb
new file mode 100644
index 0000000..d022941
Binary files /dev/null and b/cuda-keyring_1.1-1_all.deb differ
diff --git a/gp.py b/gp.py
index f6e29c7..a31b64f 100644
--- a/gp.py
+++ b/gp.py
@@ -137,7 +137,7 @@ def evaluate_population(population, simulator):
 
     elitism_individual = None
 
-    for epoch in tqdm(range(1500)):
+    for epoch in tqdm(range(5)):
         min_max_length, elitism_individual = get_top_individual(population)
 
         logger.info(
diff --git a/llm.py b/llm.py
index 6bee733..062fc24 100644
--- a/llm.py
+++ b/llm.py
@@ -7,6 +7,7 @@
 import importlib
 from openai import OpenAI
 from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+import importlib.util
 
 from logger_config import getLogger
 
@@ -31,6 +32,102 @@ def get_model(model_name: Literal["qwen", "deepseek"] = "qwen") -> int:
 
 
 class LLMModel(ABC):
+
+    def __init__(self):
+        """Start with nothing loaded; subclasses fill these in before calling load_model()."""
+        # Checkpoint name plus the model/tokenizer objects created from it.
+        self.model_name = self.model = self.tokenizer = None
+        # Optional knobs: bitsandbytes config, AWQ flag, attention implementation.
+        self.bnb_config = self.awq = self.attn = None
+        # Prefer the GPU whenever torch can see one.
+        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
+        self.do_sampling = False # sampling is switched off by default
+
+    def check_flash_att_compatibility(self) -> bool:
+        """
+        Report whether flash attention can be used on this machine.
+
+        Prints the reason (or the detected GPU details) along the way.
+
+        Returns:
+            True only if CUDA is available, the flash_attn package can be
+            found, and the current GPU has compute capability >= 8.x.
+        """
+        if not torch.cuda.is_available():
+            print("No GPU available, defaulting to CPU")
+            return False
+
+        # find_spec only locates the package; it does not import it.
+        if importlib.util.find_spec('flash_attn') is None:
+            print("No package flash_attn available for import. Ensure it is installed and try again!")
+            return False
+
+        gpu_name = torch.cuda.get_device_name()
+        gpu_idx = torch.cuda.current_device()
+        # (major, minor) compute capability of the current device
+        major, minor = torch.cuda.get_device_capability(gpu_idx)
+
+        print("The following GPU is available: ")
+        print(f"\tname: {gpu_name}")
+        print(f"\tindex: {gpu_idx}")
+        print(f"\tCapability: {major}.{minor}")
+
+        # ampere corresponds to major version 8
+        return major >= 8
+
+    def load_model(self, device:str, vllm: bool) -> None:
+        """
+        Load the LLM into self.model via Hugging Face transformers or vLLM.
+
+        Args:
+            device: the device onto which the model should be loaded
+            vllm: use vllm as inference mechanism (also sets self.vllm_sampling)
+
+        Raises:
+            ValueError: if self.model_name has not been set
+        """
+        if self.model_name is None:
+            raise ValueError("No model name specified, please set attribute model_name")
+        if self.attn is None:
+            print("attention is not specified. To change this set attribute attn")
+
+        if not vllm:
+            if self.bnb_config is None:
+                print("No quantization mechanism is implemented. To change this set attribute bnb_config")
+            self.model = AutoModelForCausalLM.from_pretrained(
+                    self.model_name,
+                    torch_dtype=torch.float16,
+                    device_map=device,
+                    quantization_config=self.bnb_config,
+                    use_sliding_window=False,
+                    do_sample=self.do_sampling, # but we need to look at accuracy
+                    attn_implementation="sdpa" if self.attn is None else self.attn
+                )
+            logger.info(f"Qwen|HF|{self.model_name}|{self.bnb_config}")
+        else:
+            print("using vllm as inference mechanism")
+            # vLLM takes no attention argument: pick the backend via the environment before importing it
+            if self.attn != "flash_attention_2":
+                os.environ["VLLM_ATTENTION_BACKEND"] = "TORCH_SDPA"
+            from vllm import LLM as vLLM, SamplingParams
+            # NOTE(review): the local snapshot path is hard-coded and ignores self.model_name - confirm
+            self.model = vLLM(model="./qwen_model/snapshots/2e1fd397ee46e1388853d2af2c993145b0f1098a", # much faster to download model first
+                              tensor_parallel_size=1,
+                              device=device,
+                              quantization="awq" if self.awq is not None else None, # Using AWQ (activation aware weight quantization) 4-bit quantization
+                              dtype="half",
+                              trust_remote_code=True,
+                              model_impl="llama" )
+            sampling = bool(getattr(self, 'do_sampling', False))
+            self.vllm_sampling = SamplingParams(
+                temperature=0.7 if sampling else 0.0,  # 0.0 selects greedy decoding
+                top_p=0.95 if sampling else 1.0,
+                max_tokens=1500
+            )
+            logger.info(f"Qwen|vLLM|{self.model_name}|{self.awq}")
+
+        print(f"Model loaded on {device}")
+
     @abstractmethod
     def __call__(self, system_prompt: str, user_prompt: str) -> str:
         """
@@ -103,32 +200,32 @@ def __init__(
             "Qwen/Qwen2.5-Coder-1.5B-Instruct", "Qwen/Qwen2.5-Coder-7B-Instruct"
         ] = "Qwen/Qwen2.5-Coder-1.5B-Instruct",
         bit_config: Literal["8bit", "4bit", "none"] = "4bit",
+        vllm: bool=True
     ):
+        super().__init__()
+        self.vllm = vllm
+        self.model_name = model_name
         if bit_config == "8bit":
-            bnb_config = BitsAndBytesConfig(load_in_8bit=True)
+            self.bnb_config = BitsAndBytesConfig(load_in_8bit=True)
         elif bit_config == "4bit":
-            bnb_config = BitsAndBytesConfig(
+            self.bnb_config = BitsAndBytesConfig(
                 load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16
             )
-
-        logger.info(f"Qwen|{model_name}|{bnb_config}")
-
-        if self.check_flash_att_compatibility():
-            logger.info("Flash attention will be used as the attention mechanism")
-            self.attn = "flash_attention_2"
+        if not self.vllm:
+            if self.check_flash_att_compatibility():
+                print("Flash attention will be used as the attention mechanism")
+                self.attn = "flash_attention_2"
+            else:
+                print(f"Unable to run on GPU using flash attention, will run on {self.device} using sdpa attention mechanism")
+                self.attn = "sdpa"
         else:
-            logger.info("Unable to run on GPU using flash attention, will run on CPU using sdpa attention mechanism")
-            self.attn = "sdpa"
-
-        self.model = AutoModelForCausalLM.from_pretrained(
-            model_name,
-            torch_dtype=torch.float16,
-            device_map="cuda",
-            quantization_config=bnb_config,
-            use_sliding_window=False,
-            attn_implementation=self.attn
-        )
-        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+            self.awq = True
+            self.bnb_config = None
+
+        
+        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
+        self.load_model(device=self.device, vllm=self.vllm)
+        
 
     def __call__(self, system_prompt, user_prompt) -> str:
         print(user_prompt)
@@ -143,19 +240,21 @@ def __call__(self, system_prompt, user_prompt) -> str:
         text = self.tokenizer.apply_chat_template(
             messages, tokenize=False, add_generation_prompt=True
         )
+        if not self.vllm:
+            model_inputs = self.tokenizer([text], return_tensors="pt").to(self.model.device)
 
-        model_inputs = self.tokenizer([text], return_tensors="pt").to(self.model.device)
-
-        with torch.no_grad():
-            generated_ids = self.model.generate(**model_inputs, max_new_tokens=1500)
+            with torch.no_grad():
+                generated_ids = self.model.generate(**model_inputs, max_new_tokens=1500)
 
-            generated_ids = [
-                output_ids[len(input_ids) :]
-                for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
-            ]
-
-            response = self.tokenizer.batch_decode(
-                generated_ids, skip_special_tokens=True
-            )[0]
+                generated_ids = [
+                    output_ids[len(input_ids) :]
+                    for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
+                ]
 
+                response = self.tokenizer.batch_decode(
+                    generated_ids, skip_special_tokens=True
+                )[0]
+        else:
+            outputs = self.model.generate([text], self.vllm_sampling)
+            response = outputs[0].outputs[0].text  # one prompt in: return its text as a str, like the HF branch
         return response
diff --git a/llm_performance_improvements.ipynb b/llm_performance_improvements.ipynb
new file mode 100644
index 0000000..998d153
--- /dev/null
+++ b/llm_performance_improvements.ipynb
@@ -0,0 +1,263 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# optimisations for LLM generation"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/pieter/anaconda3/envs/gp/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+      "  from .autonotebook import tqdm as notebook_tqdm\n"
+     ]
+    }
+   ],
+   "source": [
+    "from llm import Qwen\n",
+    "from tqdm import tqdm\n",
+    "import time\n",
+    "import numpy as np"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "attention is not specified. To change this set attribute attn\n",
+      "using vllm as inference mechanism\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2025-03-10 20:52:46,493\tINFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO 03-10 20:52:48 __init__.py:207] Automatically detected platform cuda.\n",
+      "WARNING 03-10 20:52:50 config.py:2448] Casting torch.bfloat16 to torch.float16.\n"
+     ]
+    }
+   ],
+   "source": [
+    "model = Qwen(vllm=True)\n",
+    "model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "system = \"You are a data structure and algorithm expert. You are also an expert in python programming\"\n",
+    "user = \"Generate a python program for dijkstra's algorithm\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model(system, user)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "timings = []\n",
+    "\n",
+    "\n",
+    "for n in tqdm(range(10)):\n",
+    "    start = time.time()\n",
+    "    model(system, user)\n",
+    "    end = time.time()\n",
+    "    duration = end - start\n",
+    "    timings.append(duration)\n",
+    "\n",
+    "timings\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def summary(timings):\n",
+    "    print(f\"mean: {np.mean(timings)}\")\n",
+    "    print(f\"median: {np.median(timings)}\")\n",
+    "    print(f\"std: {np.std(timings)}\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "baseline = [15.129080295562744,\n",
+    " 15.720300912857056,\n",
+    " 19.821353912353516,\n",
+    " 16.801652431488037,\n",
+    " 15.112785339355469,\n",
+    " 15.36365556716919,\n",
+    " 23.151497840881348,\n",
+    " 23.019291162490845,\n",
+    " 22.601115465164185,\n",
+    " 18.89482569694519]\n",
+    "\n",
+    "summary(baseline)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# kv_cache\n",
+    "kv = [20.304307222366333,\n",
+    " 12.694454908370972,\n",
+    " 13.19345235824585,\n",
+    " 13.327292442321777,\n",
+    " 21.780697345733643,\n",
+    " 17.99206781387329,\n",
+    " 19.965080499649048,\n",
+    " 23.245574712753296,\n",
+    " 18.404191255569458,\n",
+    " 16.01033854484558]\n",
+    "\n",
+    "summary(kv)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# do_sample\n",
+    "\n",
+    "kv = [20.304307222366333,\n",
+    " 12.694454908370972,\n",
+    " 13.19345235824585,\n",
+    " 13.327292442321777,\n",
+    " 21.780697345733643,\n",
+    " 17.99206781387329,\n",
+    " 19.965080499649048,\n",
+    " 23.245574712753296,\n",
+    " 18.404191255569458,\n",
+    " 16.01033854484558]\n",
+    "\n",
+    "summary(kv)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "do_sample = [17.019907236099243,\n",
+    " 32.08466076850891,\n",
+    " 16.29255437850952,\n",
+    " 26.581018447875977,\n",
+    " 17.966830253601074,\n",
+    " 33.007309436798096,\n",
+    " 33.01791763305664,\n",
+    " 21.371530055999756,\n",
+    " 21.834694385528564,\n",
+    " 26.789570808410645]\n",
+    "\n",
+    "summary(do_sample)\n",
+    "\n",
+    "# Sampling does take longer\n",
+    "\n",
+    "#Compared to baseline\n",
+    "# TtestResult(statistic=np.float64(-2.5633995416211652), pvalue=np.float64(0.01954406059593905), df=np.float64(18.0))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "compile = [20.496086835861206,\n",
+    " 26.187604188919067,\n",
+    " 19.345702409744263,\n",
+    " 26.974491119384766,\n",
+    " 24.00583553314209,\n",
+    " 32.89661765098572,\n",
+    " 29.74660015106201,\n",
+    " 22.171958208084106,\n",
+    " 26.075289487838745,\n",
+    " 25.507501363754272]\n",
+    "\n",
+    "summary(compile)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import scipy.stats as stats\n",
+    "stats.ttest_ind(a=baseline, b=compile, equal_var=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "gp",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.16"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/onnx_export.py b/onnx_export.py
new file mode 100644
index 0000000..9349fc8
--- /dev/null
+++ b/onnx_export.py
@@ -0,0 +1,27 @@
+# onnx_export.py
+import torch
+import os
+from pathlib import Path
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from optimum.exporters.onnx import main_export
+
+# Clear the CUDA cache before the export starts
+torch.cuda.empty_cache()
+
+# Hub id of the model to convert and the folder that receives the ONNX files
+model_name = "Qwen/Qwen2.5-Coder-1.5B-Instruct"  # Use the smaller model for testing
+onnx_path = Path("./onnx_model")
+os.makedirs(onnx_path, exist_ok=True)
+
+# Options for exporting directly from the hub to ONNX
+export_options = {
+    "model_name_or_path": model_name,
+    "output": onnx_path,
+    "task": "text-generation",
+    "opset": 14,
+    "device": "cpu",  # Force CPU
+    "no_post_process": True,  # Skip post-processing to save memory
+    "trust_remote_code": True,
+}
+main_export(**export_options)
+print(f"Model successfully exported to {onnx_path}")
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index bb0b64d..c66dd24 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,4 +4,11 @@ tqdm
 transformers
 torch
 bitsandbytes
-accelerate
\ No newline at end of file
+accelerate
+scipy
+flash-attn
+# onnx
+# onnxruntime-gpu
+# transformers-onnx
+# optimum
+vllm
\ No newline at end of file