2 changes: 1 addition & 1 deletion .env
@@ -9,7 +9,7 @@ RUNNING_MODE=full

DATA_FOLDER=./data/experiments

POPULATION_SIZE=3000
POPULATION_SIZE=30
MUTATION_PROBABILITY=0.50
CROSSOVER_PROBABILITY=0.50

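The values above are the genetic-programming hyperparameters kept in .env. A minimal sketch of how they might be read at startup, assuming python-dotenv and these exact variable names (the loader shown here is illustrative, not necessarily how the project reads them):

import os
from dotenv import load_dotenv  # assumed; the project may read .env differently

load_dotenv()  # pull .env entries into the process environment

POPULATION_SIZE = int(os.getenv("POPULATION_SIZE", "30"))
MUTATION_PROBABILITY = float(os.getenv("MUTATION_PROBABILITY", "0.50"))
CROSSOVER_PROBABILITY = float(os.getenv("CROSSOVER_PROBABILITY", "0.50"))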
3 changes: 2 additions & 1 deletion .gitignore
@@ -53,4 +53,5 @@ workspace.xml
.idea/

data/
programs.txt
programs.txt
/qwen_model
Binary file added cuda-keyring_1.1-1_all.deb
Binary file not shown.
2 changes: 1 addition & 1 deletion gp.py
@@ -137,7 +137,7 @@ def evaluate_population(population, simulator):

elitism_individual = None

for epoch in tqdm(range(1500)):
for epoch in tqdm(range(5)):
min_max_length, elitism_individual = get_top_individual(population)

logger.info(
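The changed line shortens the GP generation loop from 1500 to 5 epochs. For orientation, a minimal self-contained sketch of the elitism pattern visible in the surrounding context, with toy stand-ins for the helpers; the real get_top_individual and evaluate_population in gp.py differ:

from tqdm import tqdm

# Toy stand-ins so the sketch runs on its own; gp.py defines the real versions.
population = [[1, 2], [1], [1, 2, 3]]

def get_top_individual(pop):
    best = min(pop, key=len)          # pretend shorter programs are better
    return len(best), best

def evaluate_population(pop, simulator=None):
    pass                              # real version scores each individual

NUM_EPOCHS = 5  # value introduced by this change; was 1500

for epoch in tqdm(range(NUM_EPOCHS)):
    # remember the current best individual before the population changes (elitism)
    min_max_length, elitism_individual = get_top_individual(population)
    evaluate_population(population)
    # ... selection, crossover and mutation would happen here ...
    population.append(elitism_individual)  # re-insert the elite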
163 changes: 131 additions & 32 deletions llm.py
@@ -7,6 +7,7 @@
import importlib
from openai import OpenAI
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import importlib.util

from logger_config import getLogger

@@ -31,6 +32,102 @@ def get_model(model_name: Literal["qwen", "deepseek"] = "qwen") -> int:


class LLMModel(ABC):

def __init__(self):
self.model_name = None
self.model = None
self.tokenizer = None
self.bnb_config = None
self.awq = None
self.attn = None
self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
self.do_sampling = False # use the model default as the default here

def check_flash_att_compatibility(self) -> bool:
"""
Check if flash_attention can be used, otherwise default to sdpa
Args:
None
Returns:
bool value indicating if flash_attn can be used
"""

if not torch.cuda.is_available():
print("No GPU available, defaulting to CPU")
return False

if importlib.util.find_spec('flash_attn') is None:
print(f"No package flash_attn available for import. Ensure it is installed and try again!")
return False

gpu_name = torch.cuda.get_device_name()
gpu_idx = torch.cuda.current_device()
# tuple of (major, minor) compute capability of the gpu
gpu_capability = torch.cuda.get_device_capability(gpu_idx)

print(f"The following GPU is available: ")
print(f"\tname: {gpu_name}")
print(f"\tindex: {gpu_idx}")
print(f"\tCapability: {gpu_capability[0]}.{gpu_capability[1]}")

if gpu_capability[0] >= 8: # ampere=8
return True
else:
return False

def load_model(self, device: str, vllm: bool) -> None:
"""
Load a specific LLM model

Args:
device: the device onto which the model should be loaded. For onnx it is cpu
vllm: use of vllm as inference mechanism
"""

if self.model_name is None:
raise ValueError("No model name specified, please set attribute model_name")
if self.attn is None:
print("attention is not specified. To change this set attribute attn")

if not vllm:
if self.bnb_config is None:
print("No quantization mechanism is implemented. To change this set attribute bnb_config")


self.model = AutoModelForCausalLM.from_pretrained(
self.model_name,
torch_dtype=torch.float16,
device_map=device,
quantization_config=None if self.bnb_config is None else self.bnb_config,
use_sliding_window=False,
do_sample=self.do_sampling, # but we need to look at accuracy
attn_implementation="sdpa" if self.attn is None else self.attn
)
logger.info(f"Qwen|HF|{self.model_name}|{self.bnb_config}")

else:
print(f"using vllm as inference mechanism")
# no parameter to pass to vLLM and thus set it in the environment
if self.attn != "flash_attention2":
os.environ["VLLM_ATTENTION_BACKEND"] = "TORCH_SDPA"
from vllm import LLM as vLLM, SamplingParams
# print(os.getcwd())
self.model = vLLM(model="./qwen_model/snapshots/2e1fd397ee46e1388853d2af2c993145b0f1098a", # much faster to download model first
tensor_parallel_size=1,
device=self.device,
quantization="awq" if self.awq is not None else None, # Using AWQ (activation aware weight quantization) 4-bit quantization
dtype="half",
trust_remote_code=True,
model_impl="llama" )
self.vllm_sampling = SamplingParams(
temperature=0.7 if hasattr(self, 'do_sampling') and self.do_sampling else 0.0,
top_p=0.95 if hasattr(self, 'do_sampling') and self.do_sampling else 1.0,
max_tokens=1500
)
logger.info(f"Qwen|HF|{self.model_name}|{self.awq}")

print(f"Model loaded on {device}")

@abstractmethod
def __call__(self, system_prompt: str, user_prompt: str) -> str:
"""
@@ -103,32 +200,32 @@ def __init__(
"Qwen/Qwen2.5-Coder-1.5B-Instruct", "Qwen/Qwen2.5-Coder-7B-Instruct"
] = "Qwen/Qwen2.5-Coder-1.5B-Instruct",
bit_config: Literal["8bit", "4bit", "none"] = "4bit",
vllm: bool=True
):
super().__init__()
self.vllm = vllm
self.model_name = model_name
if bit_config == "8bit":
bnb_config = BitsAndBytesConfig(load_in_8bit=True)
self.bnb_config = BitsAndBytesConfig(load_in_8bit=True)
elif bit_config == "4bit":
bnb_config = BitsAndBytesConfig(
self.bnb_config = BitsAndBytesConfig(
load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16
)

logger.info(f"Qwen|{model_name}|{bnb_config}")

if self.check_flash_att_compatibility():
logger.info("Flash attention will be used as the attention mechanism")
self.attn = "flash_attention_2"
if not self.vllm:
if self.check_flash_att_compatibility():
print("Flash attention will be used as the attention mechanism")
self.attn = "flash_attention_2"
else:
print(f"Unable to run on GPU using flash attention, will run on {self.device} using sdpa attention mechanism")
self.attn = "sdpa"
else:
logger.info("Unable to run on GPU using flash attention, will run on CPU using sdpa attention mechanism")
self.attn = "sdpa"

self.model = AutoModelForCausalLM.from_pretrained(
model_name,
torch_dtype=torch.float16,
device_map="cuda",
quantization_config=bnb_config,
use_sliding_window=False,
attn_implementation=self.attn
)
self.tokenizer = AutoTokenizer.from_pretrained(model_name)
self.awq = True
self.bnb_config = None


self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
self.load_model(device=self.device, vllm=self.vllm)


def __call__(self, system_prompt, user_prompt) -> str:
print(user_prompt)
@@ -143,19 +240,21 @@ def __call__(self, system_prompt, user_prompt) -> str:
text = self.tokenizer.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True
)
if not self.vllm:
model_inputs = self.tokenizer([text], return_tensors="pt").to(self.model.device)

model_inputs = self.tokenizer([text], return_tensors="pt").to(self.model.device)

with torch.no_grad():
generated_ids = self.model.generate(**model_inputs, max_new_tokens=1500)
with torch.no_grad():
generated_ids = self.model.generate(**model_inputs, max_new_tokens=1500)

generated_ids = [
output_ids[len(input_ids) :]
for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]

response = self.tokenizer.batch_decode(
generated_ids, skip_special_tokens=True
)[0]
generated_ids = [
output_ids[len(input_ids) :]
for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]

response = self.tokenizer.batch_decode(
generated_ids, skip_special_tokens=True
)[0]
else:
outputs = self.model.generate([text], self.vllm_sampling)
response = outputs[0].outputs[0].text  # single prompt, so return a plain string like the HF path
return response
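Taken together, the refactor moves model construction into LLMModel.load_model and routes generation through either the Hugging Face generate path or vLLM, selected by the vllm flag. A minimal usage sketch of the resulting interface, assuming the Qwen subclass shown in the second hunk is exported as QwenModel (the class name is not visible in the diff) and that a CUDA GPU is present; the prompts are illustrative:

from llm import QwenModel  # class name assumed; only the __init__/__call__ signatures are shown above

# vllm=True uses the vLLM engine with AWQ quantization; vllm=False falls back
# to the Hugging Face path with bitsandbytes quantization and sdpa/flash attention.
model = QwenModel(
    model_name="Qwen/Qwen2.5-Coder-1.5B-Instruct",
    bit_config="4bit",
    vllm=False,
)

system_prompt = "You are a code-generating assistant."
user_prompt = "Write a Python function that reverses a string."
print(model(system_prompt, user_prompt))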