
Commit 0907efa

Implement llama-pull tool
Complete llama-pull tool with documentation

Signed-off-by: Eric Curtin <[email protected]>
1 parent 4e0388a commit 0907efa

5 files changed: +123 −0 lines changed


common/arg.cpp

Lines changed: 6 additions & 0 deletions
```diff
@@ -1393,6 +1393,11 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
     }
 
+    // Both cannot be specified at the same time
+    if (!params.model.hf_repo.empty() && !params.model.docker_repo.empty()) {
+        throw std::invalid_argument("error: cannot specify both -hf and -dr options\n");
+    }
+
     // handle model and download
     {
         auto res = common_params_handle_model(params.model, params.hf_token, DEFAULT_MODEL_PATH, params.offline);
@@ -1727,6 +1732,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params &) {
             fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
             fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET);
+            fprintf(stderr, "model cache path: %s\n", fs_get_cache_directory().c_str());
             exit(0);
         }
     ));
```
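
The new check makes the two download sources mutually exclusive at argument-parsing time. A minimal sketch of the resulting behaviour, using an illustrative model reference (the error text is the one thrown in the diff above):

```bash
# -hf and -dr can no longer be combined in a single invocation
llama-pull -hf bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M -dr gemma3
# => error: cannot specify both -hf and -dr options
```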

tools/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
```diff
@@ -18,6 +18,7 @@ else()
     add_subdirectory(gguf-split)
     add_subdirectory(imatrix)
     add_subdirectory(llama-bench)
+    add_subdirectory(pull)
     add_subdirectory(main)
     add_subdirectory(perplexity)
     add_subdirectory(quantize)
```

tools/pull/CMakeLists.txt

Lines changed: 8 additions & 0 deletions
```cmake
set(TARGET llama-pull)
add_executable(${TARGET} pull.cpp)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

if(LLAMA_TOOLS_INSTALL)
    install(TARGETS ${TARGET} RUNTIME)
endif()
```
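
A minimal build sketch, assuming the usual llama.cpp CMake workflow from the repository root; the target name follows the `TARGET` variable set above:

```bash
# Configure; curl-based downloads are enabled by default (LLAMA_USE_CURL=ON)
cmake -B build
# Build only the new tool
cmake --build build --target llama-pull
```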

tools/pull/README.md

Lines changed: 43 additions & 0 deletions
````markdown
# llama-pull - Model Download Tool

A command-line tool for downloading AI models from HuggingFace and [Docker Hub](https://hub.docker.com/u/ai) for use with llama.cpp.

## Usage

```bash
# Download from HuggingFace
llama-pull -hf <user>/<model>[:<quant>]

# Download from Docker Hub
llama-pull -dr [<repo>/]<model>[:<quant>]
```

## Options

- `-hf, --hf-repo REPO` - Download model from HuggingFace repository
- `-dr, --docker-repo REPO` - Download model from Docker Hub
- `--hf-token TOKEN` - HuggingFace token for private repositories
- `-h, --help` - Show help message

## Examples

```bash
# Download a HuggingFace model
llama-pull -hf microsoft/DialoGPT-medium

# Download a Docker model (ai/ repo is default)
llama-pull -dr gemma3

# Download with specific quantization
llama-pull -hf bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M
```

## Model Storage

Downloaded models are stored in the standard llama.cpp cache directory:

- Linux: `~/.cache/llama.cpp/`
- macOS: `~/Library/Caches/llama.cpp/`

The models can then be used with other llama.cpp tools.

## Requirements

- Built with `LLAMA_USE_CURL=ON` (default) for download functionality
````
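
A short sketch of the workflow the README describes, with an illustrative model reference: a model pulled once lands in the cache directory listed above, and passing the same reference to another llama.cpp tool should then resolve to the cached file rather than triggering a fresh download.

```bash
# Fetch the model into the llama.cpp cache
llama-pull -hf bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M

# Reuse the cached file with another llama.cpp tool
llama-cli -hf bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M -p "Hello"
```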

tools/pull/pull.cpp

Lines changed: 65 additions & 0 deletions
```cpp
#include "arg.h"
#include "common.h"
#include "log.h"

#include <cstdio>
#include <string>

static void print_usage(int, char ** argv) {
    LOG("Usage: %s [options]\n", argv[0]);
    LOG("\n");
    LOG("Download models from HuggingFace or Docker Hub\n");
    LOG("\n");
    LOG("Options:\n");
    LOG("  -h, --help                 show this help message and exit\n");
    LOG("  -hf, -hfr, --hf-repo REPO  download model from HuggingFace repo\n");
    LOG("                             format: <user>/<model>[:<quant>]\n");
    LOG("                             example: microsoft/DialoGPT-medium\n");
    LOG("  -dr, --docker-repo REPO    download model from Docker Hub\n");
    LOG("                             format: [<repo>/]<model>[:<quant>]\n");
    LOG("                             example: gemma3\n");
    LOG("  --hf-token TOKEN           HuggingFace token for private repos\n");
    LOG("\n");
    LOG("Examples:\n");
    LOG("  %s -hf microsoft/DialoGPT-medium\n", argv[0]);
    LOG("  %s -dr gemma3\n", argv[0]);
    LOG("  %s -hf bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M\n", argv[0]);
    LOG("\n");
}

int main(int argc, char ** argv) {
    common_params params;

    // Parse command line arguments
    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) {
        print_usage(argc, argv);
        return 1;
    }

    // Check that a download source was provided
    if (params.model.hf_repo.empty() && params.model.docker_repo.empty()) {
        LOG_ERR("error: must specify either -hf <repo> or -dr <repo>\n");
        print_usage(argc, argv);
        return 1;
    }

    LOG_INF("llama-pull: downloading model...\n");

    try {
        // Use the existing model handling logic which downloads the model
        common_init_result llama_init = common_init_from_params(params);

        if (llama_init.model != nullptr) {
            LOG_INF("Model downloaded and loaded successfully to: %s\n", params.model.path.c_str());

            // We only want to download, not keep the model loaded
            // The download happens during common_init_from_params
        } else {
            LOG_ERR("Failed to download or load model\n");
            return 1;
        }
    } catch (const std::exception & e) {
        LOG_ERR("Error: %s\n", e.what());
        return 1;
    }

    return 0;
}
```
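
For reference, invoking the tool with no model source takes the error path in `main` above: the error is logged, the usage text is printed, and the process exits with a non-zero status.

```bash
llama-pull
# => error: must specify either -hf <repo> or -dr <repo>
#    (followed by the usage text from print_usage, exit status 1)
```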
