diff --git a/.env b/.env index 31165d6..d2fbac8 100644 --- a/.env +++ b/.env @@ -9,7 +9,7 @@ RUNNING_MODE=full DATA_FOLDER=./data/experiments -POPULATION_SIZE=3000 +POPULATION_SIZE=30 MUTATION_PROBABILITY=0.50 CROSSOVER_PROBABILITY=0.50 diff --git a/.gitignore b/.gitignore index a08037f..6fea541 100644 --- a/.gitignore +++ b/.gitignore @@ -53,4 +53,5 @@ workspace.xml .idea/ data/ -programs.txt \ No newline at end of file +programs.txt +/qwen_model \ No newline at end of file diff --git a/cuda-keyring_1.1-1_all.deb b/cuda-keyring_1.1-1_all.deb new file mode 100644 index 0000000..d022941 Binary files /dev/null and b/cuda-keyring_1.1-1_all.deb differ diff --git a/gp.py b/gp.py index f6e29c7..a31b64f 100644 --- a/gp.py +++ b/gp.py @@ -137,7 +137,7 @@ def evaluate_population(population, simulator): elitism_individual = None - for epoch in tqdm(range(1500)): + for epoch in tqdm(range(5)): min_max_length, elitism_individual = get_top_individual(population) logger.info( diff --git a/llm.py b/llm.py index 6bee733..062fc24 100644 --- a/llm.py +++ b/llm.py @@ -7,6 +7,7 @@ import importlib from openai import OpenAI from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig +import importlib.util from logger_config import getLogger @@ -31,6 +32,102 @@ def get_model(model_name: Literal["qwen", "deepseek"] = "qwen") -> int: class LLMModel(ABC): + + def __init__(self): + self.model_name = None + self.model = None + self.tokenizer = None + self.bnb_config = None + self.awq = None + self.attn = None + self.device = 'cuda' if torch.cuda.is_available() else 'cpu' + self.do_sampling = False # use the model default as the default here + + def check_flash_att_compatibility(self) -> bool: + """ + Check if flash_attention can be used, otherwise default to sdpa + Args: + None + Returns: + bool value indicating if flash_attn can be used + """ + + if not torch.cuda.is_available(): + print("No GPU available, defaulting to CPU") + return False + + if importlib.util.find_spec('flash_attn') is None: + print(f"No package flash_attn available for import. Ensure it is installed and try again!") + return False + + gpu_name = torch.cuda.get_device_name() + gpu_idx = torch.cuda.current_device() + # tuple value representing the minor and major capability of the gpu + gpu_capability = torch.cuda.get_device_capability(gpu_idx) + + print(f"The following GPU is available: ") + print(f"\tname: {gpu_name}") + print(f"\tindex: {gpu_idx}") + print(f"\tCapability: {gpu_capability[0]}.{gpu_capability[1]}") + + if gpu_capability[0] >= 8: # ampere=8 + return True + else: + return False + + def load_model(self, device:str, vllm: bool) -> None: + """ + Load a specific LLM model + + Args: + device: the device onto which the model should be loaded. For onnx it is cpu + vllm: use of vllm as inference mechanism + """ + + if self.model_name is None: + raise ValueError("No model name specified, please set attribute model_name") + if self.attn is None: + print("attention is not specified. To change this set attribute attn") + + if not vllm: + if self.bnb_config is None: + print("No quantization mechanism is implemented. To change this set attribute bnb_config") + + + self.model = AutoModelForCausalLM.from_pretrained( + self.model_name, + torch_dtype=torch.float16, + device_map=device, + quantization_config=None if self.bnb_config is None else self.bnb_config, + use_sliding_window=False, + do_sample=self.do_sampling, # but we need to look at accuracy + attn_implementation="sdpa" if self.attn is None else self.attn + ) + logger.info(f"Qwen|HF|{self.model_name}|{self.bnb_config}") + + else: + print(f"using vllm as inference mechanism") + # no parameter to pass to vLLM and thus set it in the environment + if self.attn != "flash_attention2": + os.environ["VLLM_ATTENTION_BACKEND"] = "TORCH_SDPA" + from vllm import LLM as vLLM, SamplingParams + # print(os.getcwd()) + self.model = vLLM(model="./qwen_model/snapshots/2e1fd397ee46e1388853d2af2c993145b0f1098a", # much faster to download model first + tensor_parallel_size=1, + device=self.device, + quantization="awq" if self.awq is not None else None, # Using AWQ (activation aware weight quantization) 4-bit quantization + dtype="half", + trust_remote_code=True, + model_impl="llama" ) + self.vllm_sampling = SamplingParams( + temperature=0.7 if hasattr(self, 'do_sampling') and self.do_sampling else 0.0, + top_p=0.95 if hasattr(self, 'do_sampling') and self.do_sampling else 1.0, + max_tokens=1500 + ) + logger.info(f"Qwen|HF|{self.model_name}|{self.awq}") + + print(f"Model loaded on {device}") + @abstractmethod def __call__(self, system_prompt: str, user_prompt: str) -> str: """ @@ -103,32 +200,32 @@ def __init__( "Qwen/Qwen2.5-Coder-1.5B-Instruct", "Qwen/Qwen2.5-Coder-7B-Instruct" ] = "Qwen/Qwen2.5-Coder-1.5B-Instruct", bit_config: Literal["8bit", "4bit", "none"] = "4bit", + vllm: bool=True ): + super().__init__() + self.vllm = vllm + self.model_name = model_name if bit_config == "8bit": - bnb_config = BitsAndBytesConfig(load_in_8bit=True) + self.bnb_config = BitsAndBytesConfig(load_in_8bit=True) elif bit_config == "4bit": - bnb_config = BitsAndBytesConfig( + self.bnb_config = BitsAndBytesConfig( load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16 ) - - logger.info(f"Qwen|{model_name}|{bnb_config}") - - if self.check_flash_att_compatibility(): - logger.info("Flash attention will be used as the attention mechanism") - self.attn = "flash_attention_2" + if not self.vllm: + if self.check_flash_att_compatibility(): + print("Flash attention will be used as the attention mechanism") + self.attn = "flash_attention_2" + else: + print(f"Unable to run on GPU using flash attention, will run on {self.device} using sdpa attention mechanism") + self.attn = "sdpa" else: - logger.info("Unable to run on GPU using flash attention, will run on CPU using sdpa attention mechanism") - self.attn = "sdpa" - - self.model = AutoModelForCausalLM.from_pretrained( - model_name, - torch_dtype=torch.float16, - device_map="cuda", - quantization_config=bnb_config, - use_sliding_window=False, - attn_implementation=self.attn - ) - self.tokenizer = AutoTokenizer.from_pretrained(model_name) + self.awq = True + self.bnb_config = None + + + self.tokenizer = AutoTokenizer.from_pretrained(self.model_name) + self.load_model(device=self.device, vllm=self.vllm) + def __call__(self, system_prompt, user_prompt) -> str: print(user_prompt) @@ -143,19 +240,21 @@ def __call__(self, system_prompt, user_prompt) -> str: text = self.tokenizer.apply_chat_template( messages, tokenize=False, add_generation_prompt=True ) + if not self.vllm: + model_inputs = self.tokenizer([text], return_tensors="pt").to(self.model.device) - model_inputs = self.tokenizer([text], return_tensors="pt").to(self.model.device) - - with torch.no_grad(): - generated_ids = self.model.generate(**model_inputs, max_new_tokens=1500) + with torch.no_grad(): + generated_ids = self.model.generate(**model_inputs, max_new_tokens=1500) - generated_ids = [ - output_ids[len(input_ids) :] - for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids) - ] - - response = self.tokenizer.batch_decode( - generated_ids, skip_special_tokens=True - )[0] + generated_ids = [ + output_ids[len(input_ids) :] + for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids) + ] + response = self.tokenizer.batch_decode( + generated_ids, skip_special_tokens=True + )[0] + else: + outputs = self.model.generate([text], self.vllm_sampling) + response = [out.outputs[0].text for out in outputs] return response diff --git a/llm_performance_improvements.ipynb b/llm_performance_improvements.ipynb new file mode 100644 index 0000000..998d153 --- /dev/null +++ b/llm_performance_improvements.ipynb @@ -0,0 +1,263 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# optimisatons for LLM generation" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/pieter/anaconda3/envs/gp/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], + "source": [ + "from llm import Qwen\n", + "from tqdm import tqdm\n", + "import time\n", + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "attention is not specified. To change this set attribute attn\n", + "using vllm as inference mechanism\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-03-10 20:52:46,493\tINFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "INFO 03-10 20:52:48 __init__.py:207] Automatically detected platform cuda.\n", + "WARNING 03-10 20:52:50 config.py:2448] Casting torch.bfloat16 to torch.float16.\n" + ] + } + ], + "source": [ + "model = Qwen(vllm=True)\n", + "model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "system = \"You are a data structure and algorithm expert. You are also an expert in python programming\"\n", + "user = \"Generate a python program for dijkstra's algorithm\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model(system, user)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "timings = []\n", + "\n", + "\n", + "for n in tqdm(range(10)):\n", + " start = time.time()\n", + " model(system, user)\n", + " end = time.time()\n", + " duration = end - start\n", + " timings.append(duration)\n", + "\n", + "timings\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def summary(timings):\n", + " print(f\"mean: {np.mean(timings)}\")\n", + " print(f\"median: {np.median(timings)}\")\n", + " print(f\"std: {np.std(timings)}\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "baseline = [15.129080295562744,\n", + " 15.720300912857056,\n", + " 19.821353912353516,\n", + " 16.801652431488037,\n", + " 15.112785339355469,\n", + " 15.36365556716919,\n", + " 23.151497840881348,\n", + " 23.019291162490845,\n", + " 22.601115465164185,\n", + " 18.89482569694519]\n", + "\n", + "summary(baseline)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# kv_cache\n", + "kv = [20.304307222366333,\n", + " 12.694454908370972,\n", + " 13.19345235824585,\n", + " 13.327292442321777,\n", + " 21.780697345733643,\n", + " 17.99206781387329,\n", + " 19.965080499649048,\n", + " 23.245574712753296,\n", + " 18.404191255569458,\n", + " 16.01033854484558]\n", + "\n", + "summary(kv)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# do_sample\n", + "\n", + "kv = [20.304307222366333,\n", + " 12.694454908370972,\n", + " 13.19345235824585,\n", + " 13.327292442321777,\n", + " 21.780697345733643,\n", + " 17.99206781387329,\n", + " 19.965080499649048,\n", + " 23.245574712753296,\n", + " 18.404191255569458,\n", + " 16.01033854484558]\n", + "\n", + "summary(kv)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "do_sample = [17.019907236099243,\n", + " 32.08466076850891,\n", + " 16.29255437850952,\n", + " 26.581018447875977,\n", + " 17.966830253601074,\n", + " 33.007309436798096,\n", + " 33.01791763305664,\n", + " 21.371530055999756,\n", + " 21.834694385528564,\n", + " 26.789570808410645]\n", + "\n", + "summary(do_sample)\n", + "\n", + "# Sampling does take longer\n", + "\n", + "#Compared to baseline\n", + "# TtestResult(statistic=np.float64(-2.5633995416211652), pvalue=np.float64(0.01954406059593905), df=np.float64(18.0))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "compile = [20.496086835861206,\n", + " 26.187604188919067,\n", + " 19.345702409744263,\n", + " 26.974491119384766,\n", + " 24.00583553314209,\n", + " 32.89661765098572,\n", + " 29.74660015106201,\n", + " 22.171958208084106,\n", + " 26.075289487838745,\n", + " 25.507501363754272]\n", + "\n", + "summary(compile)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import scipy.stats as stats\n", + "stats.ttest_ind(a=baseline, b=compile, equal_var=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "gp", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.16" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/onnx_export.py b/onnx_export.py new file mode 100644 index 0000000..9349fc8 --- /dev/null +++ b/onnx_export.py @@ -0,0 +1,27 @@ +# export_onnx.py +import torch +import os +from pathlib import Path +from transformers import AutoModelForCausalLM, AutoTokenizer +from optimum.exporters.onnx import main_export + +# Ensure CUDA cache is cleared +torch.cuda.empty_cache() + +# Set the model name and output path +model_name = "Qwen/Qwen2.5-Coder-1.5B-Instruct" # Use the smaller model for testing +onnx_path = Path("./onnx_model") +os.makedirs(onnx_path, exist_ok=True) + +# Export directly from hub to ONNX +main_export( + model_name_or_path=model_name, + output=onnx_path, + task="text-generation", + opset=14, + device="cpu", # Force CPU + no_post_process=True, # Skip post-processing to save memory + trust_remote_code=True +) + +print(f"Model successfully exported to {onnx_path}") \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index bb0b64d..c66dd24 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,4 +4,11 @@ tqdm transformers torch bitsandbytes -accelerate \ No newline at end of file +accelerate +scipy +flash-attn +# onnx +# onnxruntime-gpu +# transformers-onnx +# optimum +vllm \ No newline at end of file