@@ -357,6 +357,7 @@ struct cmd_params {
357
357
std::vector<bool > use_mmap;
358
358
std::vector<bool > embeddings;
359
359
std::vector<bool > no_op_offload;
360
+ std::vector<bool > no_host;
360
361
ggml_numa_strategy numa;
361
362
int reps;
362
363
ggml_sched_priority prio;
@@ -394,6 +395,7 @@ static const cmd_params cmd_params_defaults = {
394
395
/* use_mmap */ { true },
395
396
/* embeddings */ { false },
396
397
/* no_op_offload */ { false },
398
+ /* no_host */ { false },
397
399
/* numa */ GGML_NUMA_STRATEGY_DISABLED,
398
400
/* reps */ 5 ,
399
401
/* prio */ GGML_SCHED_PRIO_NORMAL,
@@ -474,6 +476,8 @@ static void print_usage(int /* argc */, char ** argv) {
474
476
printf (" -ot --override-tensor <tensor name pattern>=<buffer type>;...\n " );
475
477
printf (" (default: disabled)\n " );
476
478
printf (" -nopo, --no-op-offload <0|1> (default: 0)\n " );
479
+ printf (" --no-host <0|1> (default: %s)\n " ,
480
+ join (cmd_params_defaults.no_host , " ," ).c_str ());
477
481
printf (" \n " );
478
482
printf (
479
483
" Multiple values can be given for each parameter by separating them with ','\n "
@@ -803,6 +807,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
803
807
}
804
808
auto p = string_split<bool >(argv[i], split_delim);
805
809
params.no_op_offload .insert (params.no_op_offload .end (), p.begin (), p.end ());
810
+ } else if (arg == " --no-host" ) {
811
+ if (++i >= argc) {
812
+ invalid_param = true ;
813
+ break ;
814
+ }
815
+ auto p = string_split<bool >(argv[i], split_delim);
816
+ params.no_host .insert (params.no_host .end (), p.begin (), p.end ());
806
817
} else if (arg == " -ts" || arg == " --tensor-split" ) {
807
818
if (++i >= argc) {
808
819
invalid_param = true ;
@@ -1024,6 +1035,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
1024
1035
if (params.no_op_offload .empty ()) {
1025
1036
params.no_op_offload = cmd_params_defaults.no_op_offload ;
1026
1037
}
1038
+ if (params.no_host .empty ()) {
1039
+ params.no_host = cmd_params_defaults.no_host ;
1040
+ }
1027
1041
if (params.n_threads .empty ()) {
1028
1042
params.n_threads = cmd_params_defaults.n_threads ;
1029
1043
}
@@ -1065,6 +1079,7 @@ struct cmd_params_instance {
1065
1079
bool use_mmap;
1066
1080
bool embeddings;
1067
1081
bool no_op_offload;
1082
+ bool no_host;
1068
1083
1069
1084
llama_model_params to_llama_mparams () const {
1070
1085
llama_model_params mparams = llama_model_default_params ();
@@ -1077,6 +1092,7 @@ struct cmd_params_instance {
1077
1092
mparams.main_gpu = main_gpu;
1078
1093
mparams.tensor_split = tensor_split.data ();
1079
1094
mparams.use_mmap = use_mmap;
1095
+ mparams.no_host = no_host;
1080
1096
1081
1097
if (n_cpu_moe <= 0 ) {
1082
1098
if (tensor_buft_overrides.empty ()) {
@@ -1159,6 +1175,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
1159
1175
for (const auto & mmp : params.use_mmap )
1160
1176
for (const auto & embd : params.embeddings )
1161
1177
for (const auto & nopo : params.no_op_offload )
1178
+ for (const auto & noh : params.no_host )
1162
1179
for (const auto & nb : params.n_batch )
1163
1180
for (const auto & nub : params.n_ubatch )
1164
1181
for (const auto & tk : params.type_k )
@@ -1199,6 +1216,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
1199
1216
/* .use_mmap = */ mmp,
1200
1217
/* .embeddings = */ embd,
1201
1218
/* .no_op_offload= */ nopo,
1219
+ /* .no_host = */ noh,
1202
1220
};
1203
1221
instances.push_back (instance);
1204
1222
}
@@ -1232,6 +1250,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
1232
1250
/* .use_mmap = */ mmp,
1233
1251
/* .embeddings = */ embd,
1234
1252
/* .no_op_offload= */ nopo,
1253
+ /* .no_host = */ noh,
1235
1254
};
1236
1255
instances.push_back (instance);
1237
1256
}
@@ -1265,6 +1284,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
1265
1284
/* .use_mmap = */ mmp,
1266
1285
/* .embeddings = */ embd,
1267
1286
/* .no_op_offload= */ nopo,
1287
+ /* .no_host = */ noh,
1268
1288
};
1269
1289
instances.push_back (instance);
1270
1290
}
@@ -1303,6 +1323,7 @@ struct test {
1303
1323
bool use_mmap;
1304
1324
bool embeddings;
1305
1325
bool no_op_offload;
1326
+ bool no_host;
1306
1327
int n_prompt;
1307
1328
int n_gen;
1308
1329
int n_depth;
@@ -1339,6 +1360,7 @@ struct test {
1339
1360
use_mmap = inst.use_mmap ;
1340
1361
embeddings = inst.embeddings ;
1341
1362
no_op_offload = inst.no_op_offload ;
1363
+ no_host = inst.no_host ;
1342
1364
n_prompt = inst.n_prompt ;
1343
1365
n_gen = inst.n_gen ;
1344
1366
n_depth = inst.n_depth ;
@@ -1386,8 +1408,8 @@ struct test {
1386
1408
" type_k" , " type_v" , " n_gpu_layers" , " n_cpu_moe" , " split_mode" ,
1387
1409
" main_gpu" , " no_kv_offload" , " flash_attn" , " devices" , " tensor_split" ,
1388
1410
" tensor_buft_overrides" , " use_mmap" , " embeddings" , " no_op_offload" ,
1389
- " n_prompt " , " n_gen " , " n_depth " , " test_time " , " avg_ns " ,
1390
- " stddev_ns" , " avg_ts" , " stddev_ts"
1411
+ " no_host " , " n_prompt " , " n_gen " , " n_depth " , " test_time " ,
1412
+ " avg_ns " , " stddev_ns" , " avg_ts" , " stddev_ts"
1391
1413
};
1392
1414
return fields;
1393
1415
}
@@ -1402,7 +1424,7 @@ struct test {
1402
1424
return INT;
1403
1425
}
1404
1426
if (field == " f16_kv" || field == " no_kv_offload" || field == " cpu_strict" || field == " flash_attn" ||
1405
- field == " use_mmap" || field == " embeddings" ) {
1427
+ field == " use_mmap" || field == " embeddings" || field == " no_host " ) {
1406
1428
return BOOL;
1407
1429
}
1408
1430
if (field == " avg_ts" || field == " stddev_ts" ) {
@@ -1477,6 +1499,7 @@ struct test {
1477
1499
std::to_string (use_mmap),
1478
1500
std::to_string (embeddings),
1479
1501
std::to_string (no_op_offload),
1502
+ std::to_string (no_host),
1480
1503
std::to_string (n_prompt),
1481
1504
std::to_string (n_gen),
1482
1505
std::to_string (n_depth),
@@ -1665,6 +1688,9 @@ struct markdown_printer : public printer {
1665
1688
if (field == " no_op_offload" ) {
1666
1689
return 4 ;
1667
1690
}
1691
+ if (field == " no_host" ) {
1692
+ return 4 ;
1693
+ }
1668
1694
1669
1695
int width = std::max ((int ) field.length (), 10 );
1670
1696
@@ -1699,6 +1725,9 @@ struct markdown_printer : public printer {
1699
1725
if (field == " no_op_offload" ) {
1700
1726
return " nopo" ;
1701
1727
}
1728
+ if (field == " no_host" ) {
1729
+ return " noh" ;
1730
+ }
1702
1731
if (field == " devices" ) {
1703
1732
return " dev" ;
1704
1733
}
@@ -1779,6 +1808,9 @@ struct markdown_printer : public printer {
1779
1808
if (params.no_op_offload .size () > 1 || params.no_op_offload != cmd_params_defaults.no_op_offload ) {
1780
1809
fields.emplace_back (" no_op_offload" );
1781
1810
}
1811
+ if (params.no_host .size () > 1 || params.no_host != cmd_params_defaults.no_host ) {
1812
+ fields.emplace_back (" no_host" );
1813
+ }
1782
1814
fields.emplace_back (" test" );
1783
1815
fields.emplace_back (" t/s" );
1784
1816
0 commit comments