diff --git a/ais_bench/benchmark/configs/models/mf_models/mf_model.py b/ais_bench/benchmark/configs/models/mf_models/mf_model.py
new file mode 100644
index 00000000..1c39f8aa
--- /dev/null
+++ b/ais_bench/benchmark/configs/models/mf_models/mf_model.py
@@ -0,0 +1,32 @@
+from ais_bench.benchmark.models import MindFormerModel
+
+models = [
+    dict(
+        attr="local",  # local or service
+        type=MindFormerModel,  # local model loaded and run through MindFormers
+        abbr='mindformer-model',
+        path='THUDM/chatglm-6b',  # path to the model dir; the current value is just an example
+        checkpoint='THUDM/your_checkpoint',  # path to the checkpoint file; the current value is just an example
+        yaml_cfg_file='THUDM/your.yaml',  # MindFormers YAML config file; the current value is just an example
+        tokenizer_path='THUDM/chatglm-6b',  # path to the tokenizer dir; the current value is just an example
+        model_kwargs=dict(  # model kwargs, see huggingface.co/docs/transformers/v4.50.0/en/model_doc/auto#transformers.AutoModel.from_pretrained
+            device_map='npu',
+        ),
+        tokenizer_kwargs=dict(  # tokenizer kwargs, see huggingface.co/docs/transformers/v4.50.0/en/internal/tokenization_utils#transformers.PreTrainedTokenizerBase
+            padding_side='right',
+        ),
+        generation_kwargs=dict(  # generation kwargs, see huggingface.co/docs/transformers/main_classes/text_generation
+            temperature=0.5,
+            top_k=10,
+            top_p=0.95,
+            do_sample=True,
+            seed=None,
+            repetition_penalty=1.03,
+        ),
+        run_cfg=dict(num_gpus=1, num_procs=1),  # multi-card / multi-node launch settings; this model is launched with msrun rather than torchrun
+        max_out_len=100,  # maximum number of output tokens
+        batch_size=2,  # batch size per inference step
+        max_seq_len=2048,
+        batch_padding=True,
+    )
+]
\ No newline at end of file
diff --git a/ais_bench/benchmark/models/__init__.py b/ais_bench/benchmark/models/__init__.py
index 5908d946..a75274bc 100644
--- a/ais_bench/benchmark/models/__init__.py
+++ b/ais_bench/benchmark/models/__init__.py
@@ -14,4 +14,5 @@
 from ais_bench.benchmark.models.api_models.triton_api import TritonCustomAPIStream  # noqa: F401
 from ais_bench.benchmark.models.api_models.tgi_api import TGICustomAPIStream  # noqa: F401
 from ais_bench.benchmark.models.api_models.vllm_custom_api_chat import VllmMultiturnAPIChatStream  # noqa: F401
-from ais_bench.benchmark.models.local_models.vllm_offline_vl import VLLMOfflineVLModel
\ No newline at end of file
+from ais_bench.benchmark.models.local_models.vllm_offline_vl import VLLMOfflineVLModel
+from ais_bench.benchmark.models.local_models.mindformers_model import MindFormerModel
\ No newline at end of file
diff --git a/ais_bench/benchmark/models/local_models/mindformers_model.py b/ais_bench/benchmark/models/local_models/mindformers_model.py
new file mode 100644
index 00000000..61796211
--- /dev/null
+++ b/ais_bench/benchmark/models/local_models/mindformers_model.py
@@ -0,0 +1,306 @@
+import os, sys
+from typing import Dict, List, Optional, Union
+
+import numpy as np
+import torch
+import transformers
+
+from ais_bench.benchmark.models.base import BaseModel
+from ais_bench.benchmark.models.base_api import APITemplateParser
+from ais_bench.benchmark.registry import MODELS
+from ais_bench.benchmark.utils.logging import get_logger
+from ais_bench.benchmark.utils.prompt import PromptList
+
+from mindspore import Tensor, Model
+from mindformers import MindFormerConfig, build_context
+from mindformers.models import build_network
+from mindformers.core.parallel_config import build_parallel_config
+from mindformers.utils.load_checkpoint_utils import get_load_path_after_hf_convert
+from mindformers.trainer.utils import transform_and_load_checkpoint
+
+PromptType = Union[PromptList, str, dict]
+
+
+class MultiTokenEOSCriteria(transformers.StoppingCriteria):
+    """Criteria to stop on the specified multi-token sequence."""
+
+    def __init__(
+        self,
+        sequence: str,
+        tokenizer: transformers.PreTrainedTokenizer,
+        batch_size: int,
+    ):
+        self.done_tracker = [False] * batch_size
+        self.sequence = sequence
+        self.sequence_ids = tokenizer.encode(sequence,
+                                             add_special_tokens=False)
+        self.sequence_id_len = len(self.sequence_ids)
+        self.tokenizer = tokenizer
+
+    def __call__(self, input_ids, scores, **kwargs) -> bool:
+        # compare the last len(stop) tokens
+        lookback_ids_batch = input_ids[:, -self.sequence_id_len:]
+        lookback_tokens_batch = self.tokenizer.batch_decode(lookback_ids_batch)
+        for i, done in enumerate(self.done_tracker):
+            if done:
+                continue
+            self.done_tracker[i] = self.sequence in lookback_tokens_batch[i]
+        return False not in self.done_tracker
+
+
+def drop_error_generation_kwargs(generation_kwargs: dict) -> dict:
+    for key in ['is_synthetic', 'batch_size', 'do_performance']:
+        if key in generation_kwargs:
+            generation_kwargs.pop(key)
+    return generation_kwargs
+
+
+@MODELS.register_module()
+class MindFormerModel(BaseModel):
+
+    def __init__(self,
+                 path: str,
+                 checkpoint: Optional[str] = None,
+                 yaml_cfg_file: Optional[str] = None,
+                 batch_size: int = 1,
+                 max_seq_len: int = 2048,
+                 tokenizer_path: Optional[str] = None,
+                 tokenizer_kwargs: dict = dict(),
+                 tokenizer_only: bool = False,
+                 generation_kwargs: dict = dict(),
+                 meta_template: Optional[Dict] = None,
+                 extract_pred_after_decode: bool = False,
+                 batch_padding: bool = False,
+                 pad_token_id: Optional[int] = None,
+                 mode: str = 'none',
+                 use_fastchat_template: bool = False,
+                 end_str: Optional[str] = None,
+                 **kwargs):
+        super().__init__(path=path,
+                         max_seq_len=max_seq_len,
+                         tokenizer_only=tokenizer_only,
+                         meta_template=meta_template)
+        self.logger = get_logger()
+        self.batch_size = batch_size
+        self.pad_token_id = pad_token_id
+        self.pretrained_model_path = path
+        if mode not in ['none', 'mid']:
+            raise ValueError(f"mode must be 'none' or 'mid', but got {mode}")
+        self.mode = mode
+        if not yaml_cfg_file:
+            raise ValueError('`yaml_cfg_file` is required for MindFormerModel')
+        self.config = MindFormerConfig(yaml_cfg_file)
+        self.checkpoint = checkpoint
+        self._load_tokenizer(path=path,
+                             tokenizer_path=tokenizer_path,
+                             tokenizer_kwargs=tokenizer_kwargs)
+        self.batch_padding = batch_padding
+        self.extract_pred_after_decode = extract_pred_after_decode
+        if not tokenizer_only:
+            self._load_model(self.config, self.batch_size, self.max_seq_len)
+        self.generation_kwargs = generation_kwargs
+        self.use_fastchat_template = use_fastchat_template
+        self.end_str = end_str
+
+    def _load_tokenizer(self, path: str, tokenizer_path: Optional[str],
+                        tokenizer_kwargs: dict):
+        from transformers import AutoTokenizer, GenerationConfig
+
+        DEFAULT_TOKENIZER_KWARGS = dict(padding_side='left', truncation_side='left', trust_remote_code=True)
+        kwargs = DEFAULT_TOKENIZER_KWARGS.copy()
+        kwargs.update(tokenizer_kwargs)
+
+        load_path = tokenizer_path if tokenizer_path else path
+        self.tokenizer = AutoTokenizer.from_pretrained(load_path, **kwargs)
+
+        pad_token_id = self.pad_token_id
+
+        # A patch for some models without pad_token_id
+        if pad_token_id is not None:
+            if self.tokenizer.pad_token_id is None:
+                self.logger.debug(f'Using {pad_token_id} as pad_token_id')
+            elif self.tokenizer.pad_token_id != pad_token_id:
+                self.logger.warning(f'pad_token_id is not consistent. Using {pad_token_id} as pad_token_id')
+            self.tokenizer.pad_token_id = pad_token_id
+            return
+        if self.tokenizer.pad_token_id is not None:
+            return
+        self.logger.warning('pad_token_id is not set for the tokenizer.')
+
+        try:
+            generation_config = GenerationConfig.from_pretrained(path)
+        except Exception:
+            generation_config = None
+
+        if generation_config and generation_config.pad_token_id is not None:
+            self.logger.warning(f'Using {generation_config.pad_token_id} as pad_token_id.')
+            self.tokenizer.pad_token_id = generation_config.pad_token_id
+            return
+        if self.tokenizer.eos_token_id is not None:
+            self.logger.warning(f'Using eos_token_id {self.tokenizer.eos_token_id} as pad_token_id.')
+            self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
+            return
+        raise ValueError('pad_token_id is not set for this tokenizer. Please set `pad_token_id={PAD_TOKEN_ID}` in model_cfg.')
+
+    def _set_config_from_yaml(self):
+        if self.checkpoint is not None:
+            self.config.load_checkpoint = self.checkpoint
+        elif self.checkpoint is None and self.config.load_checkpoint is None:
+            self.config.load_checkpoint = self.path
+        self.config.model.pretrained_model_dir = self.pretrained_model_path
+        self.config.model.model_config.seq_length = self.max_seq_len
+        build_context(self.config)
+        build_parallel_config(self.config)
+
+    def _load_model(self, config, batch_size, max_seq_len):
+        self._set_config_from_yaml()
+        try:
+            self.model = build_network(
+                config.model,
+                default_args={
+                    "parallel_config": config.parallel_config,
+                    "moe_config": config.moe_config
+                })
+            self.logger.info("..........Network Built Successfully..........")
+            self.model.set_train(False)
+            config.load_checkpoint = get_load_path_after_hf_convert(config, self.model)
+            self.logger.info(f"load checkpoint path : {config.load_checkpoint}")
+            run_mode = config.get("run_mode", None)
+            if run_mode == "predict":
+                self.model.load_weights(config.load_checkpoint)
+            else:
+                model = Model(self.model)
+                input_ids = Tensor(np.ones((batch_size, max_seq_len), dtype=np.int32))
+                infer_data = self.model.prepare_inputs_for_predict_layout(input_ids)
+                transform_and_load_checkpoint(config, model, self.model, infer_data, do_eval=True)
+            self.logger.info("..........Checkpoint Load Successfully..........")
+        except ValueError as e:
+            raise ValueError('Failed to load MindFormers model, please check configuration') from e
+
+    def generate(self,
+                 inputs: List[str],
+                 max_out_len: int,
+                 min_out_len: Optional[int] = None,
+                 stopping_criteria: List[str] = [],
+                 **kwargs) -> List[str]:
+        """Generate results given a list of inputs.
+
+        Args:
+            inputs (List[str]): A list of strings.
+            max_out_len (int): The maximum length of the output.
+            min_out_len (Optional[int]): The minimum length of the output.
+
+        Returns:
+            List[str]: A list of generated strings.
+        """
+        generation_kwargs = kwargs.copy()
+        generation_kwargs.update(self.generation_kwargs)
+
+        messages = list(inputs)
+        batch_size = len(messages)
+        prompt_char_lens = None
+
+        if self.extract_pred_after_decode:
+            prompt_char_lens = [len(text) for text in messages]
+
+        if self.use_fastchat_template:
+            try:
+                from fastchat.model import get_conversation_template
+            except ModuleNotFoundError:
+                raise ModuleNotFoundError(
+                    'fastchat is not installed. You can use '
+                    "'pip install \"fschat[model_worker,webui]\"' "
+                    'to install it.')
+            for idx, text in enumerate(messages):
+                conv = get_conversation_template('vicuna')
+                conv.append_message(conv.roles[0], text)
+                conv.append_message(conv.roles[1], None)
+                messages[idx] = conv.get_prompt()
+
+        if self.mode == 'mid':  # keep the head and tail of an over-long prompt, dropping the middle
+            assert len(messages) == 1
+            tokens = self.tokenizer(messages, padding=False, truncation=False, return_tensors='np')
+            input_ids = tokens['input_ids']
+            if input_ids.shape[-1] > self.max_seq_len:
+                input_ids = np.concatenate(
+                    [input_ids[:, :self.max_seq_len // 2], input_ids[:, -self.max_seq_len // 2:]],
+                    axis=-1)
+            tokens = {'input_ids': input_ids}
+        else:
+            tokenize_kwargs = dict(
+                padding=True,
+                truncation=True,
+                max_length=self.max_seq_len,
+                return_tensors='np'
+            )
+            tokens = self.tokenizer(messages, **tokenize_kwargs)
+
+        input_ids = tokens['input_ids']
+        if len(messages) > 1:
+            attention_mask = tokens.get('attention_mask')
+            prompt_token_lens = (
+                attention_mask.sum(axis=1).astype(int).tolist()
+                if attention_mask is not None else
+                [input_ids.shape[1]] * batch_size
+            )
+        else:
+            prompt_token_lens = [len(ids) for ids in input_ids]
+
+        input_ids_tensor = Tensor(input_ids)
+
+        if min_out_len is not None:
+            generation_kwargs['min_new_tokens'] = min_out_len
+        generation_kwargs['max_new_tokens'] = max_out_len
+        generation_kwargs.setdefault('top_k', 1)
+        generation_kwargs.setdefault('return_dict_in_generate', False)
+
+        origin_stopping_criteria = list(stopping_criteria)
+        if stopping_criteria:
+            if self.tokenizer.eos_token is not None:
+                stopping_criteria = stopping_criteria + [
+                    self.tokenizer.eos_token
+                ]
+            stopping_list = transformers.StoppingCriteriaList([
+                *[
+                    MultiTokenEOSCriteria(sequence, self.tokenizer,
+                                          input_ids_tensor.shape[0])
+                    for sequence in stopping_criteria
+                ],
+            ])
+            generation_kwargs['stopping_criteria'] = stopping_list
+
+        generation_kwargs = drop_error_generation_kwargs(generation_kwargs)
+
+        outputs = self.model.generate(input_ids=input_ids_tensor,
+                                      **generation_kwargs)
+
+        if isinstance(outputs, dict):
+            outputs = outputs.get('sequences')
+            if outputs is None:
+                raise ValueError("Model output dictionary is missing 'sequences' key.")
+
+        sequences = [seq.tolist() for seq in outputs]
+
+        if not self.extract_pred_after_decode:
+            sequences = [
+                seq[prompt_len:]
+                for seq, prompt_len in zip(sequences, prompt_token_lens)
+            ]
+
+        decodeds = [
+            self.tokenizer.decode(seq, skip_special_tokens=True)
+            for seq in sequences
+        ]
+
+        if self.extract_pred_after_decode and prompt_char_lens is not None:
+            decodeds = [
+                text[length:]
+                for text, length in zip(decodeds, prompt_char_lens)
+            ]
+
+        if self.end_str:
+            decodeds = [text.split(self.end_str)[0] for text in decodeds]
+        if origin_stopping_criteria:
+            for token in origin_stopping_criteria:
+                decodeds = [text.split(token)[0] for text in decodeds]
+        return decodeds
diff --git a/ais_bench/benchmark/tasks/openicl_infer.py b/ais_bench/benchmark/tasks/openicl_infer.py
index 5d20d67e..23982146 100644
--- a/ais_bench/benchmark/tasks/openicl_infer.py
+++ b/ais_bench/benchmark/tasks/openicl_infer.py
@@ -38,6 +38,8 @@ def __init__(self, cfg: ConfigDict):
         super().__init__(cfg)
         run_cfg = self.model_cfg.get('run_cfg', {})
         self.num_gpus = run_cfg.get('num_gpus', 0)
+        self.worker_num = run_cfg.get('worker_num', 0)
+        self.local_worker_num = run_cfg.get('local_worker_num', 0)
         self.num_procs = run_cfg.get('num_procs', 1)
         self.nnodes = run_cfg.get('nnodes', 1)
         self.node_rank = run_cfg.get('node_rank', 0)
@@
 -61,12 +63,38 @@ def get_command(self, cfg_path, template):
                            for key in backend_keys)
         if self.num_gpus > 1 and not use_backend and self.nnodes == 1:
             port = random.randint(12000, 32000)
-            command = (f'torchrun --master_port={port} '
-                       f'--nproc_per_node {self.num_procs} '
-                       f'{script_path} {cfg_path}')
+            if self.abbr == 'mindformer-model':  # MindFormers models are launched with msrun instead of torchrun
+                command = (
+                    f"msrun "
+                    f"--worker_num={self.num_gpus} "
+                    f"--local_worker_num={self.num_gpus} "
+                    f"--master_port={port} "
+                    f"--log_dir='output/msrun_log' "
+                    f"--join=True "
+                    f"--cluster_time_out=7200 "
+                    f'{script_path} {cfg_path}'
+                )
+            else:
+                command = (f'torchrun --master_port={port} '
+                           f'--nproc_per_node {self.num_procs} '
+                           f'{script_path} {cfg_path}')
         elif self.nnodes > 1:
             port = 12345
-            command = (f'torchrun --master_port={port} '
+            if self.abbr == 'mindformer-model':  # multi-node MindFormers launch also goes through msrun
+                command = (
+                    f"msrun "
+                    f"--worker_num={self.worker_num} "
+                    f"--local_worker_num={self.local_worker_num} "
+                    f"--master_port={port} "
+                    f"--master_addr={self.master_addr} "
+                    f"--node_rank={self.node_rank} "
+                    f"--log_dir='output/msrun_log' "
+                    f"--join=True "
+                    f"--cluster_time_out=7200 "
+                    f'{script_path} {cfg_path}'
+                )
+            else:
+                command = (f'torchrun --master_port={port} '
                        f'--nproc_per_node {self.num_procs} '
                        f'--nnodes {self.nnodes} '
                        f'--node_rank {self.node_rank} '
diff --git a/ais_bench/benchmark/utils/config/build.py b/ais_bench/benchmark/utils/config/build.py
index 46c3b99f..3ca401de 100644
--- a/ais_bench/benchmark/utils/config/build.py
+++ b/ais_bench/benchmark/utils/config/build.py
@@ -133,15 +133,18 @@ def build_model_from_cfg(model_cfg: ConfigDict):
     )
     model_cfg.pop("run_cfg", None)
     model_cfg.pop("request_rate", None)
-    model_cfg.pop("batch_size", None)
-    model_cfg.pop("abbr", None)
-    model_cfg.pop("attr", None)
+    batch_size = model_cfg.pop("batch_size", None)
+    abbr = model_cfg.pop("abbr", None)
+    attr = model_cfg.pop("attr", None)
     model_cfg.pop("summarizer_abbr", None)
     model_cfg.pop("pred_postprocessor", None)
     model_cfg.pop("min_out_len", None)
     model_cfg.pop("returns_tool_calls", None)
     model_cfg.pop("traffic_cfg", None)
-    return MODELS.build(model_cfg)
+    if attr == "local" and abbr == "mindformer-model":  # pass batch_size through to MindFormerModel
+        return MODELS.build(model_cfg, batch_size=batch_size)
+    else:
+        return MODELS.build(model_cfg)
 
 def build_perf_metric_calculator_from_cfg(metric_cfg: ConfigDict):
     logger.debug(f"Building perf metric calculator config: type={metric_cfg.get('type')}")
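
For reference, the multi-node msrun branch added to get_command() reads worker_num, local_worker_num and node_rank from run_cfg and interpolates self.master_addr, while the sample mf_model.py config above only sets num_gpus and num_procs. Below is a rough sketch of a two-node run_cfg that would exercise that branch; the key names come from this diff, but the values are placeholders and the assumption that master_addr is also supplied through run_cfg is untested:

    run_cfg = dict(
        num_gpus=8,               # devices per node
        num_procs=8,
        nnodes=2,                 # > 1 selects the multi-node branch of get_command()
        node_rank=0,              # 0 on the master node, 1 on the other node
        worker_num=16,            # total msrun workers across both nodes
        local_worker_num=8,       # msrun workers started on this node
        master_addr='192.0.2.1',  # assumed placeholder address of the master node
    ),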