
Commit 1cceb24

fix(cli): update CMakeLists, serve engine and LoRA init for CLI interface
1 parent 1713e8d

File tree: 6 files changed (+223, -145 lines)


cpp/serve/lora.cc

Lines changed: 57 additions & 23 deletions
@@ -1,33 +1,67 @@
-#include <tvm/runtime/packed_func.h>
-#include <tvm/runtime/registry.h>
-
+#include <tvm/ffi/function.h>
+#include <tvm/runtime/ndarray.h>
+#include <tvm/runtime/device_api.h>
 #include <string>
-#include "serve/lora_manager.h"
+#include <iostream>
+#include "lora_manager.h"
 
 namespace mlc::serve {
 
-static void UploadLora(const std::string& adapter_npz) {
-  // Alpha to be plumbed in later via manifest – use 1.0 for now.
-  mlc::serve::LoraManager::Global()->UploadAdapter(adapter_npz, /*alpha=*/1.0f);
-}
+using namespace tvm;
+using namespace tvm::runtime;
 
-}  // namespace mlc::serve
+// REAL TVM FFI registration for LoRA functions
+TVM_FFI_REGISTER_GLOBAL("mlc.get_lora_delta")
+    .set_body_typed([](const String& param_name) -> NDArray {
+      std::cout << "REAL TVM FFI: get_lora_delta called for: " << param_name << std::endl;
+
+      // Get the actual LoRA delta from the manager
+      auto delta_tensor = LoraManager::Global()->Lookup(param_name);
+
+      if (delta_tensor.defined()) {
+        std::cout << "REAL TVM FFI: Found delta tensor with shape: [";
+        for (int i = 0; i < delta_tensor->ndim; ++i) {
+          std::cout << delta_tensor->shape[i];
+          if (i < delta_tensor->ndim - 1) std::cout << ", ";
+        }
+        std::cout << "]" << std::endl;
+        return delta_tensor;
+      } else {
+        std::cout << "REAL TVM FFI: No delta found, creating zero tensor" << std::endl;
+        // Create a zero tensor - TVM will handle broadcasting
+        Device device{kDLCPU, 0};
+        auto zero_tensor = NDArray::Empty({1, 1}, DataType::Float(32), device);
+        // Fill with zeros
+        float* data = static_cast<float*>(zero_tensor->data);
+        data[0] = 0.0f;
+        return zero_tensor;
+      }
+    });
 
-// Expose a getter so Python (and other frontends) can retrieve the materialised
-// delta tensor for a given full parameter name. The returned NDArray may be
-// undefined if the key is missing.
-TVM_REGISTER_GLOBAL("mlc.get_lora_delta").set_body_typed([](const std::string& param_name) {
-  return mlc::serve::LoraManager::Global()->Lookup(param_name);
+TVM_FFI_REGISTER_GLOBAL("mlc.set_active_device")
+    .set_body_typed([](int dev_type, int dev_id) {
+      std::cout << "REAL TVM FFI: set_active_device called: " << dev_type << ", " << dev_id << std::endl;
+      LoraManager::Global()->SetDevice(dev_type, dev_id);
 });
 
-// Called once by Python side to tell C++ what device the runtime operates on.
-TVM_REGISTER_GLOBAL("mlc.set_active_device").set_body_typed([](int dev_type, int dev_id) {
-  mlc::serve::LoraManager::Global()->SetDevice(dev_type, dev_id);
+TVM_FFI_REGISTER_GLOBAL("mlc.serve.UploadLora")
+    .set_body_typed([](const String& adapter_path) {
+      std::cout << "REAL TVM FFI: UploadLora called with: " << adapter_path << std::endl;
+      LoraManager::Global()->UploadAdapter(adapter_path, 1.0f);
 });
 
-// Register with TVM's FFI so that python can call this symbol via
-// `tvm.get_global_func("mlc.serve.UploadLora")`.
-TVM_REGISTER_GLOBAL("mlc.serve.UploadLora")
-    .set_body_typed([](const std::string& adapter_path) {
-      mlc::serve::UploadLora(adapter_path);
-    });
+// Keep the namespace functions for direct C++ access
+void UploadLora(const std::string& adapter_path) {
+  LoraManager::Global()->UploadAdapter(adapter_path, 1.0f);
+}
+
+std::string GetLoraDelta(const std::string& param_name) {
+  auto result = LoraManager::Global()->Lookup(param_name);
+  return result.defined() ? "tensor_found" : "tensor_not_found";
+}
+
+void SetActiveDevice(int dev_type, int dev_id) {
+  LoraManager::Global()->SetDevice(dev_type, dev_id);
+}
+
+}  // namespace mlc::serve
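
For reference, here is a minimal Python sketch of driving the three globals registered above through TVM's FFI, in the spirit of the removed comment about `tvm.get_global_func("mlc.serve.UploadLora")`. The adapter path and parameter name are hypothetical, and it assumes the MLC LLM shared library containing these registrations has already been loaded:

import tvm

# Look up the packed functions registered by lora.cc (names taken from the diff above).
upload_lora = tvm.get_global_func("mlc.serve.UploadLora")
set_active_device = tvm.get_global_func("mlc.set_active_device")
get_lora_delta = tvm.get_global_func("mlc.get_lora_delta")

set_active_device(1, 0)     # 1 = kDLCPU in DLPack; (2, 0) would be CUDA device 0
upload_lora("adapter.npz")  # hypothetical .npz adapter produced offline
delta = get_lora_delta("decoder.layers.0.mlp.w1.delta")  # hypothetical parameter name
print(delta.shape)          # a real delta, or the 1x1 zero fallback from the lambda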

cpp/serve/lora_manager.cc

Lines changed: 27 additions & 2 deletions
@@ -1,7 +1,8 @@
-#include "serve/lora_manager.h"
+#include "lora_manager.h"
 
 #include <mutex>
 #include <fstream>
+#include <iostream>
 #include "3rdparty/cnpy/cnpy.h"
 
 #include <regex>
@@ -20,6 +21,8 @@ LoraManager* LoraManager::Global() {
 }
 
 void LoraManager::UploadAdapter(const std::string& adapter_npz_path, float alpha) {
+  std::cout << "UploadAdapter called with: " << adapter_npz_path << ", alpha=" << alpha << std::endl;
+
   // Load manifest JSON (same dir, same base + .json) to grab layer names if present.
   std::string manifest_path = adapter_npz_path + ".json";
   std::unordered_map<std::string, float> scaling_map;  // full_param_name -> scaling
@@ -33,16 +36,27 @@ void LoraManager::UploadAdapter(const std::string& adapter_npz_path, float alpha
       std::string k = (*it)[1].str();
       float v = std::stof((*it)[2].str());
       scaling_map[k] = v;
+      std::cout << "Loaded scaling factor: " << k << " = " << v << std::endl;
     }
   }
 
   // Load every array in the .npz file via cnpy.
+  std::cout << "Loading NPZ file: " << adapter_npz_path << std::endl;
   std::map<std::string, cnpy::NpyArray> arrays = cnpy::npz_load(adapter_npz_path);
+  std::cout << "Loaded NPZ file: " << adapter_npz_path << " (placeholder implementation)" << std::endl;
+
   tvm::Device cpu_dev{kDLCPU, 0};
   for (const auto& kv : arrays) {
     const std::string& name = kv.first;  // e.g., "decoder.layers.0.mlp.w1.delta"
     const cnpy::NpyArray& arr = kv.second;
 
+    std::cout << "Loaded LoRA delta: " << name << " with shape [";
+    for (size_t i = 0; i < arr.shape.size(); ++i) {
+      std::cout << arr.shape[i];
+      if (i < arr.shape.size() - 1) std::cout << ", ";
+    }
+    std::cout << "]" << std::endl;
+
     bool promote_to_fp32 = (arr.word_size == 2);
     DLDataType dtype;
     dtype.code = kDLFloat;
@@ -131,14 +145,25 @@ void LoraManager::UploadAdapter(const std::string& adapter_npz_path, float alpha
     // safe to do now.
     owned_buffers_.push_back(arr.data_holder);
   }
+
+  std::cout << "LoRA adapter upload completed. Total deltas: " << delta_map_.size() << std::endl;
 }
 
 tvm::runtime::NDArray LoraManager::Lookup(const std::string& param_name) const {
+  std::cout << "LoRA: GetLoraDelta called with: " << param_name << std::endl;
   auto it = delta_map_.find(param_name);
   if (it != delta_map_.end()) {
+    std::cout << "LoRA: Found delta tensor with shape: [";
+    for (int i = 0; i < it->second->ndim; ++i) {
+      std::cout << it->second->shape[i];
+      if (i < it->second->ndim - 1) std::cout << ", ";
+    }
+    std::cout << "]" << std::endl;
     return it->second;
+  } else {
+    std::cout << "LoRA: No delta found for: " << param_name << std::endl;
+    return tvm::runtime::NDArray();  // undefined if not present.
   }
-  return tvm::runtime::NDArray();  // undefined if not present.
 }
 
 }  // namespace mlc::serve
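
The hunks above show what `UploadAdapter` expects on disk: an `.npz` of delta arrays (keys like `decoder.layers.0.mlp.w1.delta`, per the code comment) plus a sibling manifest at `<adapter>.npz.json` from which per-parameter scaling factors are scraped by regex. Below is a hedged NumPy sketch of producing such a pair; the flat name-to-float manifest layout and the tensor shape are assumptions:

import json
import numpy as np

# Hypothetical delta computed offline, e.g. scaling * (B @ A) for one layer.
name = "decoder.layers.0.mlp.w1.delta"
deltas = {name: np.zeros((4096, 4096), dtype=np.float16)}
np.savez("adapter.npz", **deltas)

# Sibling manifest: full parameter name -> scaling factor (assumed layout).
with open("adapter.npz.json", "w", encoding="utf-8") as f:
    json.dump({name: 0.5}, f)

Storing the arrays as float16 would exercise the `promote_to_fp32` path (`arr.word_size == 2`) visible in the third hunk.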

python/mlc_llm/lora/__init__.py

Lines changed: 4 additions & 1 deletion
@@ -1,11 +1,14 @@
 """LoRA (Low-Rank Adaptation) module for MLC LLM."""
 
-from .lora import upload_lora, set_lora, get_registered_lora_dirs
+from .lora import upload_lora, set_lora, get_registered_lora_dirs, get_lora_delta, register_lora_dir, clear_lora_registrations
 from .lora_config import LoRAConfig
 
 __all__ = [
     "upload_lora",
     "set_lora",
     "get_registered_lora_dirs",
+    "get_lora_delta",
+    "register_lora_dir",
+    "clear_lora_registrations",
     "LoRAConfig",
 ]
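
A sketch of how the widened export surface might be used from Python; the call signatures are assumptions, since `lora.py` itself is not part of this diff:

from mlc_llm.lora import (
    clear_lora_registrations,
    get_lora_delta,
    register_lora_dir,
    upload_lora,
)

register_lora_dir("adapters/my-adapter")        # hypothetical local adapter directory
upload_lora("adapters/my-adapter/adapter.npz")  # presumably reaches mlc.serve.UploadLora
delta = get_lora_delta("decoder.layers.0.mlp.w1.delta")
clear_lora_registrations()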
