From 24941181631355962664646b61c2f11c7b500e17 Mon Sep 17 00:00:00 2001
From: Eric Curtin
Date: Tue, 28 Jan 2025 10:50:17 +0000
Subject: [PATCH] Add perplexity subcommand to RamaLama CLI

- Added a new subcommand `perplexity` to the RamaLama CLI in `cli.py`.
- Implemented the `perplexity` method in the `Model` class in `model.py`.
- Updated the documentation in `ramalama.1.md` to include the new `perplexity` command.

Signed-off-by: Eric Curtin
---
 docs/ramalama-cuda.7.md       |  2 +-
 docs/ramalama-perplexity.1.md | 44 +++++++++++++++++++++++++++++++++++
 docs/ramalama.1.md            |  4 +++-
 ramalama/cli.py               | 12 ++++++++++
 ramalama/model.py             | 22 +++++++++++++++++-
 5 files changed, 81 insertions(+), 3 deletions(-)
 create mode 100644 docs/ramalama-perplexity.1.md

diff --git a/docs/ramalama-cuda.7.md b/docs/ramalama-cuda.7.md
index f2ab8197..b76d5cd8 100644
--- a/docs/ramalama-cuda.7.md
+++ b/docs/ramalama-cuda.7.md
@@ -77,7 +77,7 @@ Follow the installation instructions provided in the [NVIDIA Container Toolkit i
 ```

 # **Expected Output**
-  Verry everything is configured correctly, with output similar to this:
+  Verify everything is configured correctly, with output similar to this:

 ```text
 Thu Dec 5 19:58:40 2024
diff --git a/docs/ramalama-perplexity.1.md b/docs/ramalama-perplexity.1.md
new file mode 100644
index 00000000..cb449a08
--- /dev/null
+++ b/docs/ramalama-perplexity.1.md
@@ -0,0 +1,44 @@
+% ramalama-perplexity 1
+
+## NAME
+ramalama\-perplexity - calculate the perplexity value of an AI Model
+
+## SYNOPSIS
+**ramalama perplexity** [*options*] *model* [arg ...]
+
+## MODEL TRANSPORTS
+
+| Transports | Prefix | Web Site |
+| ------------- | ------ | --------------------------------------------------- |
+| URL based | https://, http://, file:// | `https://web.site/ai.model`, `file://tmp/ai.model`|
+| HuggingFace | huggingface://, hf://, hf.co/ | [`huggingface.co`](https://www.huggingface.co) |
+| Ollama | ollama:// | [`ollama.com`](https://www.ollama.com) |
+| OCI Container Registries | oci:// | [`opencontainers.org`](https://opencontainers.org)|
+|||Examples: [`quay.io`](https://quay.io), [`Docker Hub`](https://docker.io), [`Artifactory`](https://artifactory.com)|
+
+RamaLama defaults to the Ollama registry transport. This default can be overridden in the `ramalama.conf` file or via the RAMALAMA_TRANSPORT
+environment variable. `export RAMALAMA_TRANSPORT=huggingface` changes RamaLama to use the Hugging Face transport.
+
+Modify individual model transports by specifying the `huggingface://`, `oci://`, `ollama://`, `https://`, `http://`, `file://` prefix to the model.
+
+URL support means if a model is on a web site or even on your local system, you can run it directly.
+
+## OPTIONS
+
+#### **--help**, **-h**
+show this help message and exit
+
+## DESCRIPTION
+Calculate the perplexity of an AI Model. Perplexity measures how well the model predicts the next token; lower values indicate better predictive performance.
+
+## EXAMPLES
+
+```
+ramalama perplexity granite-moe3
+```
+
+## SEE ALSO
+**[ramalama(1)](ramalama.1.md)**
+
+## HISTORY
+Jan 2025, Originally compiled by Eric Curtin
diff --git a/docs/ramalama.1.md b/docs/ramalama.1.md
index 35f87c4f..425f2b3a 100644
--- a/docs/ramalama.1.md
+++ b/docs/ramalama.1.md
@@ -139,9 +139,11 @@ show RamaLama version
 | [ramalama-push(1)](ramalama-push.1.md) | push AI Models from local storage to remote registries |
 | [ramalama-rm(1)](ramalama-rm.1.md) | remove AI Models from local storage |
 | [ramalama-run(1)](ramalama-run.1.md) | run specified AI Model as a chatbot |
+| [ramalama-perplexity(1)](ramalama-perplexity.1.md)| calculate the perplexity value of an AI Model |
 | [ramalama-serve(1)](ramalama-serve.1.md) | serve REST API on specified AI Model |
 | [ramalama-stop(1)](ramalama-stop.1.md) | stop named container that is running AI Model |
-| [ramalama-version(1)](ramalama-version.1.md) | display version of RamaLama
+| [ramalama-version(1)](ramalama-version.1.md) | display version of RamaLama |
+

 ## CONFIGURATION FILES

diff --git a/ramalama/cli.py b/ramalama/cli.py
index e069bc9c..0b3577b4 100644
--- a/ramalama/cli.py
+++ b/ramalama/cli.py
@@ -228,6 +228,7 @@ def configure_subcommands(parser):
     list_parser(subparsers)
     login_parser(subparsers)
     logout_parser(subparsers)
+    perplexity_parser(subparsers)
     pull_parser(subparsers)
     push_parser(subparsers)
     rm_parser(subparsers)
@@ -874,3 +875,14 @@ def New(model, args):
         return OCI(model, args.engine)

     raise KeyError(f'transport "{transport}" not supported. Must be oci, huggingface, or ollama.')
+
+
+def perplexity_parser(subparsers):
+    parser = subparsers.add_parser("perplexity", help="calculate perplexity for specified AI Model")
+    parser.add_argument("MODEL")  # positional argument
+    parser.set_defaults(func=perplexity_cli)
+
+
+def perplexity_cli(args):
+    model = New(args.MODEL, args)
+    model.perplexity(args)
diff --git a/ramalama/model.py b/ramalama/model.py
index c2cd1741..710a27ac 100644
--- a/ramalama/model.py
+++ b/ramalama/model.py
@@ -197,7 +197,8 @@ def gpu_args(self, force=False, runner=False):
             or os.getenv("CUDA_VISIBLE_DEVICES")
             or (
                 # linux and macOS report aarch64 differently
-                platform.machine() in {"aarch64", "arm64"} and os.path.exists("/dev/dri")
+                platform.machine() in {"aarch64", "arm64"}
+                and os.path.exists("/dev/dri")
             )
         ):
             if runner:
@@ -244,6 +245,25 @@ def run(self, args):
         exec_args = self.build_exec_args_run(args, model_path, prompt)
         self.execute_model(model_path, exec_args, args)

+    def perplexity(self, args):
+        self.check_name_and_container(args)
+        model_path = self.get_model_path(args)
+        exec_args = self.build_exec_args_perplexity(args, model_path)
+        self.execute_model(model_path, exec_args, args)
+
+    def build_exec_args_perplexity(self, args, model_path):
+        exec_model_path = MNT_FILE if args.container else model_path
+        exec_args = ["llama-perplexity"]
+
+        get_gpu()
+        gpu_args = self.gpu_args(force=args.gpu)
+        if gpu_args is not None:
+            exec_args.extend(gpu_args)
+
+        exec_args += ["-m", exec_model_path]
+
+        return exec_args
+
     def check_name_and_container(self, args):
         if hasattr(args, "name") and args.name:
             if not args.container:
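
For reviewers, a minimal sketch of the quantity the new man page describes, assuming per-token log-probabilities are available; the values below are hypothetical and the snippet is illustrative only, not part of the patch:

```python
import math

# Perplexity is exp(mean negative log-likelihood) over the evaluated tokens;
# lower values mean the model assigns higher probability to the actual text.
# Hypothetical natural-log probabilities, used purely for illustration.
token_logprobs = [-0.9, -1.4, -0.3, -2.1, -0.7]

mean_nll = -sum(token_logprobs) / len(token_logprobs)  # mean negative log-likelihood
perplexity = math.exp(mean_nll)
print(f"perplexity: {perplexity:.2f}")
```

The subcommand itself delegates the actual computation: `ramalama perplexity <model>` resolves the model path (or the in-container mount point), then execs `llama-perplexity -m <path>` together with any GPU arguments returned by `gpu_args()`, as implemented in `build_exec_args_perplexity` above.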