2 changes: 2 additions & 0 deletions lmdeploy/messages.py
@@ -223,6 +223,7 @@ class TurbomindEngineConfig:
    max_prefill_token_num: int = 8192
    num_tokens_per_iter: int = 0
    max_prefill_iters: int = 1
    devices: List[int] = field(default_factory=lambda: [0])
Collaborator:
Can we specify the cuda devices by the env var CUDA_VISIBLE_DEVICES?

Author:
No, trl specifies which GPU to load the model on:
https://github.com/huggingface/trl/blob/main/trl/trainer/grpo_trainer.py#L416

lvhan028 (Collaborator) · Feb 19, 2025:
https://github.com/vllm-project/vllm/blob/d0a7a2769d92619afdcdc3b91c78098eaa9e38c0/vllm/engine/arg_utils.py#L718
According to vllm's EngineArgs definition, the value of device can be one of the following:

DEVICE_OPTIONS = [
    "auto",
    "cuda",
    "neuron",
    "cpu",
    "openvino",
    "tpu",
    "xpu",
    "hpu",
]

I haven't found a case that builds the vllm engine with specific device ids.
Could you please provide an example?


    def __post_init__(self):
        """Check input validation."""

@@ -297,6 +298,7 @@ class PytorchEngineConfig:
    download_dir: str = None
    revision: str = None
    quant_policy: Literal[0, 4, 8] = 0
    devices: List[int] = field(default_factory=lambda: [0])

    def __post_init__(self):
        """Check input validation."""
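
A minimal usage sketch for the new `devices` field, assuming a trl-style multi-process training setup where each process loads its own engine on a single GPU; the `LOCAL_RANK` wiring and the model id below are illustrative assumptions, not part of this PR:

# Hypothetical sketch: pin each worker process to one GPU via `devices`
# instead of CUDA_VISIBLE_DEVICES (assumption based on the discussion above).
import os

from lmdeploy import TurbomindEngineConfig, pipeline

local_rank = int(os.environ.get('LOCAL_RANK', '0'))  # set by torchrun/accelerate

engine_config = TurbomindEngineConfig(
    tp=1,
    devices=[local_rank],  # load the model only on this process's GPU
)
pipe = pipeline('internlm/internlm2_5-7b-chat', backend_config=engine_config)
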
28 changes: 27 additions & 1 deletion lmdeploy/turbomind/deploy/loader.py
@@ -1,9 +1,10 @@
# Copyright (c) OpenMMLab. All rights reserved.
import json
import os
import os.path as osp
import re
from abc import ABC, abstractmethod
from collections import defaultdict
from collections import OrderedDict, defaultdict
from functools import partial
from glob import glob
from typing import Iterator, Tuple
@@ -145,6 +146,31 @@ def items(self):


def create_loader(model_path: str, pattern: str) -> BaseLoader:
    if not isinstance(model_path, (str, os.PathLike)):

        def generate():
            generator = OrderedDict()
            model_dict = {}
            if not isinstance(model_path, dict):
                for key, value in list(model_path):
                    model_dict[key] = value
            else:
                model_dict = model_path
            for key, value in model_dict.items():
Collaborator:
Where is model_dict stored? Is it in CPU memory or GPU memory?
                match = re.findall(pattern, key)
                if not match:
                    if -1 not in generator:
                        generator[-1] = {}
                    generator[-1][key] = value
                else:
                    layer = int(match[0])
                    if layer not in generator:
                        generator[layer] = {}
                    generator[layer][key] = value
            return generator

        return generate()

    args = (model_path, pattern)

    if osp.exists(osp.join(model_path, SAFE_WEIGHT_INDEX_NAME)):
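
To make the new non-path branch concrete: when `create_loader` is handed a state dict (or an iterable of `(name, tensor)` pairs) rather than a checkpoint directory, it groups the tensors by the layer index matched with `pattern`, collecting non-layer weights under the key -1. A small sketch, with made-up tensor names and an illustrative pattern:

import torch

from lmdeploy.turbomind.deploy.loader import create_loader

# Toy "state dict"; real keys would come from the HF model being trained.
state_dict = {
    'model.embed_tokens.weight': torch.zeros(4, 4),
    'model.layers.0.attention.wq.weight': torch.zeros(4, 4),
    'model.layers.1.attention.wq.weight': torch.zeros(4, 4),
}

grouped = create_loader(state_dict, pattern=r'layers\.([0-9]+)\.')
# grouped[-1] holds the non-layer weights: {'model.embed_tokens.weight': ...}
# grouped[0] and grouped[1] hold the per-layer weights keyed by full name.
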
42 changes: 28 additions & 14 deletions lmdeploy/turbomind/turbomind.py
@@ -99,6 +99,7 @@ def __init__(self,
            f' greater than 0, but got {_engine_config.max_batch_size}'

        self.gpu_count = _engine_config.tp
        self.gpu_list = _engine_config.devices

        self.tokenizer = tokenizer
        if model_source == ModelSource.WORKSPACE:

@@ -112,10 +113,13 @@

        with ThreadPoolExecutor(max_workers=self.gpu_count) as e:
            ranks = [self.node_id * self.gpu_count + device_id for device_id in range(self.gpu_count)]
            for _ in e.map(self.model_comm.process_weight, range(self.gpu_count), ranks):
                pass
            # This is for load_state_dict
            # process_weight will optimize the kernel by col-major matrix and pack_b
            # This will result in the failure of get_params
            # for _ in e.map(self.model_comm.process_weight, range(self.gpu_count), ranks):
            #     pass
            # implicit synchronization
            for _ in e.map(self.model_comm.create_engine, range(self.gpu_count), ranks, repeat(self.nccl_params)):
            for _ in e.map(self.model_comm.create_engine, self.gpu_list, ranks, repeat(self.nccl_params)):
                pass

        self.session_len = self.config.session_len

@@ -130,30 +134,30 @@ def _create_weight(self, model_comm):
        torch.cuda.synchronize()

        # create weight
        def _create_weight_func(device_id):
            rank = self.node_id * self.gpu_count + device_id
        def _create_weight_func(index, device_id):
            rank = self.node_id * self.gpu_count + index
            model_comm.create_shared_weights(device_id, rank)

        with ThreadPoolExecutor(max_workers=self.gpu_count) as executor:
            futures = []
            for device_id in range(self.gpu_count):
                futures.append(executor.submit(_create_weight_func, device_id))
            for idx, device_id in enumerate(self.gpu_list):
                futures.append(executor.submit(_create_weight_func, idx, device_id))
            for future in futures:
                future.result()

    def _get_model_params(self, model_comm, tm_params):
        """Get turbomind model params when loading from hf."""

        def _get_params(device_id, que):
            rank = self.node_id * self.gpu_count + device_id
        def _get_params(idx, device_id, que):
            rank = self.node_id * self.gpu_count + idx
            out = model_comm.get_params(device_id, rank)
            que.put(out)

        que = Queue()
        with ThreadPoolExecutor(max_workers=self.gpu_count) as executor:
            futures = []
            for device_id in range(self.gpu_count):
                futures.append(executor.submit(_get_params, device_id, que))
            for idx, device_id in enumerate(self.gpu_list):
                futures.append(executor.submit(_get_params, idx, device_id, que))
            for future in futures:
                future.result()

@@ -215,13 +219,23 @@ def _from_hf(self, model_source: ModelSource, model_path: str, engine_config: Tu
        self._get_model_params(model_comm, tm_params)
        logger.warning(f'get {len(tm_params)} model params')
        tm_model.export()
        self.tm_model = tm_model
        # there should be no leftover turbomind params.
        if len(tm_params) > 0:
            uninitialized = list(tm_params.keys())
            logger.warning('the model may not be loaded successfully '
                           f'with {len(tm_params)} uninitialized params:\n{uninitialized}')
        return model_comm

    def load_weights(self, state_dict):
        tm_params = self.tm_model.tm_params
        self._get_model_params(self.model_comm, tm_params)
        input_model = self.tm_model.input_model
        model_path = input_model.model_path
        input_model.model_path = state_dict
        self.tm_model.export()
        input_model.model_path = model_path
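
A rough sketch of how the new `load_weights` entry point might be driven from a trainer loop to refresh a live engine without reloading from disk; the construction calls and model id are illustrative assumptions, and only the `load_weights(state_dict)` call itself comes from this PR:

# Hypothetical usage sketch (model id, tokenizer wiring and integration point
# are assumptions; only load_weights() is introduced by this PR).
from transformers import AutoModelForCausalLM

from lmdeploy import TurbomindEngineConfig
from lmdeploy.tokenizer import Tokenizer
from lmdeploy.turbomind import TurboMind

model_id = 'internlm/internlm2_5-7b-chat'  # placeholder
tokenizer = Tokenizer(model_id)
engine = TurboMind.from_pretrained(model_id,
                                   tokenizer=tokenizer,
                                   engine_config=TurbomindEngineConfig(tp=1, devices=[0]))
trained = AutoModelForCausalLM.from_pretrained(model_id)

# ... optimizer steps update `trained` in place ...

engine.load_weights(trained.state_dict())  # re-export HF tensors into turbomind params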

    def _from_workspace(self, model_path: str, engine_config: TurbomindEngineConfig):
        """Load model which is converted by `lmdeploy convert`"""
        config_path = osp.join(model_path, 'triton_models', 'weights', 'config.yaml')

@@ -302,7 +316,7 @@ def create_instance(self, cuda_stream_id=0):
        Returns:
            TurboMindInstance: an instance of turbomind
        """
        return TurboMindInstance(self, self.config, cuda_stream_id)
        return TurboMindInstance(self, self.config, cuda_stream_id, self.gpu_list[0])


def _get_logits(outputs, offset: int):
@@ -396,7 +410,7 @@ class TurboMindInstance:
        cuda_stream_id(int): identity of a cuda stream
    """

    def __init__(self, tm_model: TurboMind, config: TurbomindModelConfig, cuda_stream_id: int = 0):
    def __init__(self, tm_model: TurboMind, config: TurbomindModelConfig, cuda_stream_id: int = 0, device_id: int = 0):
        self.tm_model = tm_model
        self.cuda_stream_id = cuda_stream_id

@@ -408,7 +422,7 @@ def __init__(self, tm_model: TurboMind, config: TurbomindModelConfig, cuda_strea
        self.nccl_params = tm_model.nccl_params

        # create model instances
        self.model_inst = self._create_model_instance(0)
        self.model_inst = self._create_model_instance(device_id)

        self.config = config
        self.lock = None