[QEff Finetune]: Use logger in place of print statements in finetuning scripts #371

Draft: wants to merge 3 commits into base: main
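
This PR swaps ad-hoc print statements for a shared, rank-zero-aware logger across the finetuning scripts. The logger itself lives in QEfficient/finetune/utils/logging_utils.py, which is not part of the diff shown here, so the sketch below is only a minimal assumption of the interface the call sites rely on (log_rank_zero, raise_error, prepare_for_logs), not the repository's actual implementation.

```python
# Hypothetical sketch of the logger interface used throughout this diff.
# logging_utils.py is not included in the shown changes, so the class name,
# rank detection via LOCAL_RANK, and file-handler behavior are assumptions.
import logging
import os


class _FTLogger:
    def __init__(self, name: str = "QEfficient-Finetune"):
        self.logger = logging.getLogger(name)
        self.logger.setLevel(logging.INFO)
        self.logger.addHandler(logging.StreamHandler())

    def prepare_for_logs(self, output_dir: str, dump_logs: bool, level: int) -> None:
        # Set verbosity and, optionally, mirror logs to a file under output_dir.
        self.logger.setLevel(level)
        if dump_logs:
            os.makedirs(output_dir, exist_ok=True)
            self.logger.addHandler(
                logging.FileHandler(os.path.join(output_dir, "finetune.log"))
            )

    def log_rank_zero(self, msg: str, level: int = logging.INFO) -> None:
        # Emit the message only from the rank-0 process in distributed runs.
        if int(os.getenv("LOCAL_RANK", "0")) == 0:
            self.logger.log(level, msg)

    def raise_error(self, msg: str, exc_type: type = RuntimeError) -> None:
        # Log the message, then raise it as the given exception type.
        self.logger.error(msg)
        raise exc_type(msg)


logger = _FTLogger()
```
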
48 changes: 29 additions & 19 deletions QEfficient/cloud/finetune.py
@@ -5,6 +5,7 @@
#
# -----------------------------------------------------------------------------

import logging
import random
import warnings
from typing import Any, Dict, Optional, Union
@@ -17,7 +18,7 @@
import torch.utils.data
from peft import PeftModel, get_peft_model
from torch.optim.lr_scheduler import StepLR
from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer
from transformers import AutoModel, AutoModelForCausalLM, AutoModelForSequenceClassification, AutoTokenizer

from QEfficient.finetune.configs.training import TrainConfig
from QEfficient.finetune.utils.config_utils import (
@@ -26,18 +27,22 @@
update_config,
)
from QEfficient.finetune.utils.dataset_utils import get_dataloader
from QEfficient.finetune.utils.logging_utils import logger
from QEfficient.finetune.utils.parser import get_finetune_parser
from QEfficient.finetune.utils.train_utils import get_longest_seq_length, print_model_size, train
from QEfficient.utils._utils import login_and_download_hf_lm
from QEfficient.finetune.utils.train_utils import (
get_longest_seq_length,
print_model_size,
print_trainable_parameters,
train,
)
from QEfficient.utils._utils import hf_download

# Try importing QAIC-specific module, proceed without it if unavailable
try:
import torch_qaic # noqa: F401
except ImportError as e:
print(f"Warning: {e}. Proceeding without QAIC modules.")

logger.log_rank_zero(f"{e}. Moving ahead without these qaic modules.", logging.WARNING)

from transformers import AutoModelForSequenceClassification

# Suppress all warnings
warnings.filterwarnings("ignore")
@@ -106,7 +111,8 @@ def load_model_and_tokenizer(
- Resizes model embeddings if tokenizer vocab size exceeds model embedding size.
- Sets pad_token_id to eos_token_id if not defined in the tokenizer.
"""
pretrained_model_path = login_and_download_hf_lm(train_config.model_name)
logger.log_rank_zero(f"Loading HuggingFace model for {train_config.model_name}")
pretrained_model_path = hf_download(train_config.model_name)
if train_config.task_type == "seq_classification":
model = AutoModelForSequenceClassification.from_pretrained(
pretrained_model_path,
@@ -116,7 +122,7 @@
)

if not hasattr(model, "base_model_prefix"):
raise RuntimeError("Given huggingface model does not have 'base_model_prefix' attribute.")
logger.raise_error("Given huggingface model does not have 'base_model_prefix' attribute.", RuntimeError)

for param in getattr(model, model.base_model_prefix).parameters():
param.requires_grad = False
@@ -141,11 +147,10 @@
# If there is a mismatch between tokenizer vocab size and embedding matrix,
# throw a warning and then expand the embedding matrix
if len(tokenizer) > model.get_input_embeddings().weight.shape[0]:
print("WARNING: Resizing embedding matrix to match tokenizer vocab size.")
logger.log_rank_zero("Resizing the embedding matrix to match the tokenizer vocab size.", logging.WARNING)
model.resize_token_embeddings(len(tokenizer))

# FIXME (Meet): Cover below line inside the logger once it is implemented.
print_model_size(model, train_config)
print_model_size(model)

# Note: Need to call this before calling PeftModel.from_pretrained or get_peft_model.
# Because, both makes model.is_gradient_checkpointing = True which is used in peft library to
@@ -157,7 +162,9 @@
if hasattr(model, "supports_gradient_checkpointing") and model.supports_gradient_checkpointing:
model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"preserve_rng_state": False})
else:
raise RuntimeError("Given model doesn't support gradient checkpointing. Please disable it and run it.")
logger.raise_error(
"Given model doesn't support gradient checkpointing. Please disable it and run it.", RuntimeError
)

model = apply_peft(model, train_config, peft_config_file, **kwargs)

@@ -192,7 +199,7 @@ def apply_peft(
else:
peft_config = generate_peft_config(train_config, peft_config_file, **kwargs)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
print_trainable_parameters(model)

return model

@@ -217,25 +224,26 @@ def setup_dataloaders(
- Length of longest sequence in the dataset.

Raises:
ValueError: If validation is enabled but the validation set is too small.
RuntimeError: If validation is enabled but the validation set is too small.

Notes:
- Applies a custom data collator if provided by get_custom_data_collator.
- Configures DataLoader kwargs using get_dataloader_kwargs for train and val splits.
"""

train_dataloader = get_dataloader(tokenizer, dataset_config, train_config, split="train")
print(f"--> Num of Training Set Batches loaded = {len(train_dataloader)}")
logger.log_rank_zero(f"Number of Training Set Batches loaded = {len(train_dataloader)}")

eval_dataloader = None
if train_config.run_validation:
eval_dataloader = get_dataloader(tokenizer, dataset_config, train_config, split="val")
if len(eval_dataloader) == 0:
raise ValueError(
f"The eval set size is too small for dataloader to load even one batch. Please increase the size of eval set. ({len(eval_dataloader)=})"
logger.raise_error(
f"The eval set size is too small for dataloader to load even one batch. Please increase the size of eval set. ({len(eval_dataloader)=})",
ValueError,
)
else:
print(f"--> Num of Validation Set Batches loaded = {len(eval_dataloader)}")
logger.log_rank_zero(f"Number of Validation Set Batches loaded = {len(eval_dataloader)}")

longest_seq_length, _ = get_longest_seq_length(
torch.utils.data.ConcatDataset([train_dataloader.dataset, eval_dataloader.dataset])
@@ -274,13 +282,15 @@ def main(peft_config_file: str = None, **kwargs) -> None:
dataset_config = generate_dataset_config(train_config.dataset)
update_config(dataset_config, **kwargs)

logger.prepare_for_logs(train_config.output_dir, train_config.dump_logs, train_config.log_level)

setup_distributed_training(train_config)
setup_seeds(train_config.seed)
model, tokenizer = load_model_and_tokenizer(train_config, dataset_config, peft_config_file, **kwargs)

# Create DataLoaders for the training and validation dataset
train_dataloader, eval_dataloader, longest_seq_length = setup_dataloaders(train_config, dataset_config, tokenizer)
print(
logger.log_rank_zero(
f"The longest sequence length in the train data is {longest_seq_length}, "
f"passed context length is {train_config.context_length} and overall model's context length is "
f"{model.config.max_position_embeddings}"
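
The diff above also replaces PEFT's model.print_trainable_parameters() with a print_trainable_parameters helper imported from train_utils, so that the parameter summary goes through the rank-zero logger. That helper is not included in the shown changes; here is a minimal sketch, assuming it mirrors PEFT's parameter-counting logic:

```python
# Hypothetical sketch of print_trainable_parameters from
# QEfficient/finetune/utils/train_utils.py (not part of the shown diff).
from QEfficient.finetune.utils.logging_utils import logger


def print_trainable_parameters(model) -> None:
    # Count trainable vs. total parameters and report via the rank-zero logger.
    trainable, total = 0, 0
    for param in model.parameters():
        total += param.numel()
        if param.requires_grad:
            trainable += param.numel()
    logger.log_rank_zero(
        f"Trainable params: {trainable:,} || All params: {total:,} "
        f"|| Trainable %: {100 * trainable / total:.4f}"
    )
```
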
5 changes: 4 additions & 1 deletion QEfficient/finetune/configs/training.py
@@ -5,6 +5,7 @@
#
# -----------------------------------------------------------------------------

import logging
from dataclasses import dataclass


@@ -94,5 +95,7 @@ class TrainConfig:
use_profiler: bool = False # Enable pytorch profiler, can not be used with flop counter at the same time.
# profiler_dir: str = "PATH/to/save/profiler/results" # will be used if using profiler

dump_root_dir: str = "mismatches/step_"
opByOpVerifier: bool = False

dump_logs: bool = True
log_level: str = logging.INFO
10 changes: 9 additions & 1 deletion QEfficient/finetune/dataset/alpaca_dataset.py
@@ -11,6 +11,8 @@
import torch
from torch.utils.data import Dataset

from QEfficient.finetune.utils.logging_utils import logger

PROMPT_DICT = {
"prompt_input": (
"Below is an instruction that describes a task, paired with an input that provides further context. "
@@ -27,7 +29,13 @@

class InstructionDataset(Dataset):
def __init__(self, dataset_config, tokenizer, partition="train", context_length=None):
self.ann = json.load(open(dataset_config.data_path))
try:
self.ann = json.load(open(dataset_config.data_path))
except FileNotFoundError:
logger.raise_error(
"Loading of alpaca dataset failed! Please use (wget -c https://raw.githubusercontent.com/tatsu-lab/stanford_alpaca/refs/heads/main/alpaca_data.json -P dataset/) to download the alpaca dataset.",
FileNotFoundError,
)
# Use 5% of the dataset for evaluation
eval_length = int(len(self.ann) / 20)
if partition == "train":
28 changes: 18 additions & 10 deletions QEfficient/finetune/dataset/custom_dataset.py
@@ -8,6 +8,8 @@
import importlib
from pathlib import Path

from QEfficient.finetune.utils.logging_utils import logger


def load_module_from_py_file(py_file: str) -> object:
"""
@@ -30,20 +32,22 @@ def get_custom_dataset(dataset_config, tokenizer, split: str, context_length=Non
module_path, func_name = dataset_config.file, "get_custom_dataset"

if not module_path.endswith(".py"):
raise ValueError(f"Dataset file {module_path} is not a .py file.")
logger.raise_error(f"Dataset file {module_path} is not a .py file.", ValueError)

module_path = Path(module_path)
if not module_path.is_file():
raise FileNotFoundError(f"Dataset py file {module_path.as_posix()} does not exist or is not a file.")
logger.raise_error(
f"Dataset py file {module_path.as_posix()} does not exist or is not a file.", FileNotFoundError
)

module = load_module_from_py_file(module_path.as_posix())
try:
return getattr(module, func_name)(dataset_config, tokenizer, split, context_length)
except AttributeError as e:
print(
f"It seems like the given method name ({func_name}) is not present in the dataset .py file ({module_path.as_posix()})."
except AttributeError:
logger.raise_error(
f"It seems like the given method name ({func_name}) is not present in the dataset .py file ({module_path.as_posix()}).",
AttributeError,
)
raise e


def get_data_collator(dataset_processer, dataset_config):
@@ -53,16 +57,20 @@ def get_data_collator(dataset_processer, dataset_config):
module_path, func_name = dataset_config.file, "get_data_collator"

if not module_path.endswith(".py"):
raise ValueError(f"Dataset file {module_path} is not a .py file.")
logger.raise_error(f"Dataset file {module_path} is not a .py file.", ValueError)

module_path = Path(module_path)
if not module_path.is_file():
raise FileNotFoundError(f"Dataset py file {module_path.as_posix()} does not exist or is not a file.")
logger.raise_error(
f"Dataset py file {module_path.as_posix()} does not exist or is not a file.", FileNotFoundError
)

module = load_module_from_py_file(module_path.as_posix())
try:
return getattr(module, func_name)(dataset_processer)
except AttributeError:
print(f"Can not find the custom data_collator in the dataset.py file ({module_path.as_posix()}).")
print("Using the default data_collator instead.")
logger.log_rank_zero(
f"Can not find the custom data_collator in the dataset.py file ({module_path.as_posix()})."
)
logger.log_rank_zero("Using the default data_collator instead.")
return None
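
For context, get_custom_dataset and get_data_collator above look up functions with those exact names in a user-supplied .py file. A minimal hypothetical example of such a file follows; the function names match what the loader expects, while the toy dataset body is purely illustrative:

```python
# Hypothetical user-provided custom dataset file consumed by the loader above.
from torch.utils.data import Dataset


class _ToyDataset(Dataset):
    def __init__(self, tokenizer, context_length):
        texts = ["hello world", "finetuning example"]
        self.samples = [
            tokenizer(t, truncation=True, max_length=context_length) for t in texts
        ]

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        return self.samples[idx]


def get_custom_dataset(dataset_config, tokenizer, split, context_length=None):
    # The split argument is ignored in this toy example.
    return _ToyDataset(tokenizer, context_length)


def get_data_collator(dataset_processer):
    # Returning None makes the caller fall back to the default data collator.
    return None
```
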
17 changes: 8 additions & 9 deletions QEfficient/finetune/dataset/grammar_dataset.py
@@ -10,6 +10,8 @@
from datasets import load_dataset
from torch.utils.data import Dataset

from QEfficient.finetune.utils.logging_utils import logger


class grammar(Dataset):
def __init__(self, tokenizer, csv_name=None, context_length=None):
@@ -19,11 +21,11 @@ def __init__(self, tokenizer, csv_name=None, context_length=None):
data_files={"train": [csv_name]}, # "eval": "grammar_validation.csv"},
delimiter=",",
)
except Exception as e:
print(
"Loading of grammar dataset failed! Please see [here](https://github.com/meta-llama/llama-recipes/blob/main/src/llama_recipes/datasets/grammar_dataset/grammar_dataset_process.ipynb) for details on how to download the dataset."
except FileNotFoundError:
logger.raise_error(
"Loading of grammar dataset failed! Please check (https://github.com/meta-llama/llama-recipes/blob/main/src/llama_recipes/datasets/grammar_dataset/grammar_dataset_process.ipynb) for details on how to download the dataset.",
FileNotFoundError,
)
raise e

self.context_length = context_length
self.tokenizer = tokenizer
@@ -36,7 +38,7 @@ def convert_to_features(self, example_batch):
# Create prompt and tokenize contexts and questions

if self.print_text:
print("Input Text: ", self.clean_text(example_batch["text"]))
logger.log_rank_zero("Input Text: ", self.clean_text(example_batch["text"]))

input_ = example_batch["input"]
target_ = example_batch["target"]
@@ -71,9 +73,6 @@ def get_dataset(dataset_config, tokenizer, csv_name=None, context_length=None):
"""cover function for handling loading the working dataset"""
"""dataset loading"""
currPath = Path.cwd() / "datasets_grammar" / "grammar_train.csv"
print(f"Loading dataset {currPath}")
csv_name = str(currPath)
print(csv_name)
dataset = grammar(tokenizer=tokenizer, csv_name=csv_name, context_length=context_length)
dataset = grammar(tokenizer=tokenizer, csv_name=str(currPath), context_length=context_length)

return dataset
20 changes: 8 additions & 12 deletions QEfficient/finetune/eval.py
@@ -19,13 +19,14 @@
from utils.train_utils import evaluation, print_model_size

from QEfficient.finetune.configs.training import TrainConfig
from QEfficient.finetune.utils.logging_utils import logger

try:
import torch_qaic # noqa: F401

device = "qaic:0"
except ImportError as e:
print(f"Warning: {e}. Moving ahead without these qaic modules.")
logger.log_rank_zero(f"{e}. Moving ahead without these qaic modules.")
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Suppress all warnings
@@ -77,25 +78,20 @@ def main(**kwargs):
# If there is a mismatch between tokenizer vocab size and embedding matrix,
# throw a warning and then expand the embedding matrix
if len(tokenizer) > model.get_input_embeddings().weight.shape[0]:
print("WARNING: Resizing the embedding matrix to match the tokenizer vocab size.")
logger.log_rank_zero("Resizing the embedding matrix to match the tokenizer vocab size.")
model.resize_token_embeddings(len(tokenizer))

print_model_size(model, train_config)
print_model_size(model)

if train_config.run_validation:
# TODO: vbaddi enable packing later in entire infra.
# if train_config.batching_strategy == "packing":
# dataset_val = ConcatDataset(dataset_val, chunk_size=train_config.context_length)

eval_dataloader = get_dataloader(tokenizer, dataset_config, train_config, split="test")

print(f"--> Num of Validation Set Batches loaded = {len(eval_dataloader)}")
if len(eval_dataloader) == 0:
raise ValueError(
f"The eval set size is too small for dataloader to load even one batch. Please increase the size of eval set. ({len(eval_dataloader)=})"
logger.raise_error(
f"The eval set size is too small for dataloader to load even one batch. Please increase the size of eval set. ({len(eval_dataloader)=})",
ValueError,
)
else:
print(f"--> Num of Validation Set Batches loaded = {len(eval_dataloader)}")
logger.log_rank_zero(f"Number of Validation Set Batches loaded = {len(eval_dataloader)}")

model.to(device)
_ = evaluation(model, train_config, eval_dataloader, None, tokenizer, device)