From 24941181631355962664646b61c2f11c7b500e17 Mon Sep 17 00:00:00 2001
From: Eric Curtin
Date: Tue, 28 Jan 2025 10:50:17 +0000
Subject: [PATCH] Add perplexity subcommand to RamaLama CLI

- Added a new subcommand `perplexity` to the RamaLama CLI in `cli.py`.
- Implemented the `perplexity` method in the `Model` class in `model.py`.
- Updated the documentation in `ramalama.1.md` to include the new `perplexity` command.

Signed-off-by: Eric Curtin
---
 docs/ramalama-cuda.7.md       |  2 +-
 docs/ramalama-perplexity.1.md | 44 +++++++++++++++++++++++++++++++++++
 docs/ramalama.1.md            |  4 +++-
 ramalama/cli.py               | 12 ++++++++++
 ramalama/model.py             | 22 +++++++++++++++++-
 5 files changed, 81 insertions(+), 3 deletions(-)
 create mode 100644 docs/ramalama-perplexity.1.md

diff --git a/docs/ramalama-cuda.7.md b/docs/ramalama-cuda.7.md
index f2ab8197..b76d5cd8 100644
--- a/docs/ramalama-cuda.7.md
+++ b/docs/ramalama-cuda.7.md
@@ -77,7 +77,7 @@ Follow the installation instructions provided in the [NVIDIA Container Toolkit i
 ```

 # **Expected Output**
-  Verry everything is configured correctly, with output similar to this:
+  Verify everything is configured correctly, with output similar to this:

 ```text
 Thu Dec 5 19:58:40 2024
diff --git a/docs/ramalama-perplexity.1.md b/docs/ramalama-perplexity.1.md
new file mode 100644
index 00000000..cb449a08
--- /dev/null
+++ b/docs/ramalama-perplexity.1.md
@@ -0,0 +1,44 @@
+% ramalama-perplexity 1
+
+## NAME
+ramalama\-perplexity - calculate the perplexity value of an AI Model
+
+## SYNOPSIS
+**ramalama perplexity** [*options*] *model* [arg ...]
+
+## MODEL TRANSPORTS
+
+| Transports | Prefix | Web Site |
+| ------------- | ------ | --------------------------------------------------- |
+| URL based | https://, http://, file:// | `https://web.site/ai.model`, `file://tmp/ai.model`|
+| HuggingFace | huggingface://, hf://, hf.co/ | [`huggingface.co`](https://www.huggingface.co) |
+| Ollama | ollama:// | [`ollama.com`](https://www.ollama.com) |
+| OCI Container Registries | oci:// | [`opencontainers.org`](https://opencontainers.org)|
+|||Examples: [`quay.io`](https://quay.io), [`Docker Hub`](https://docker.io), [`Artifactory`](https://artifactory.com)|
+
+RamaLama defaults to the Ollama registry transport. This default can be overridden in the `ramalama.conf` file or via the RAMALAMA_TRANSPORT
+environment variable. `export RAMALAMA_TRANSPORT=huggingface` changes RamaLama to use the Hugging Face transport.
+
+Modify individual model transports by specifying the `huggingface://`, `oci://`, `ollama://`, `https://`, `http://`, `file://` prefix to the model.
+
+URL support means if a model is on a web site or even on your local system, you can run it directly.
+
+## OPTIONS
+
+#### **--help**, **-h**
+show this help message and exit
+
+## DESCRIPTION
+Calculate the perplexity of an AI Model. Perplexity measures how well the model predicts the next token; lower values indicate better predictive performance.
+
+## EXAMPLES
+
+```
+ramalama perplexity granite-moe3
+```
+
+## SEE ALSO
+**[ramalama(1)](ramalama.1.md)**
+
+## HISTORY
+Jan 2025, Originally compiled by Eric Curtin
diff --git a/docs/ramalama.1.md b/docs/ramalama.1.md
index 35f87c4f..425f2b3a 100644
--- a/docs/ramalama.1.md
+++ b/docs/ramalama.1.md
@@ -139,9 +139,11 @@ show RamaLama version
 | [ramalama-push(1)](ramalama-push.1.md) | push AI Models from local storage to remote registries |
 | [ramalama-rm(1)](ramalama-rm.1.md) | remove AI Models from local storage |
 | [ramalama-run(1)](ramalama-run.1.md) | run specified AI Model as a chatbot |
+| [ramalama-perplexity(1)](ramalama-perplexity.1.md)| calculate the perplexity value of an AI Model |
 | [ramalama-serve(1)](ramalama-serve.1.md) | serve REST API on specified AI Model |
 | [ramalama-stop(1)](ramalama-stop.1.md) | stop named container that is running AI Model |
-| [ramalama-version(1)](ramalama-version.1.md) | display version of RamaLama
+| [ramalama-version(1)](ramalama-version.1.md) | display version of RamaLama |
+

 ## CONFIGURATION FILES

diff --git a/ramalama/cli.py b/ramalama/cli.py
index e069bc9c..0b3577b4 100644
--- a/ramalama/cli.py
+++ b/ramalama/cli.py
@@ -228,6 +228,7 @@ def configure_subcommands(parser):
     list_parser(subparsers)
     login_parser(subparsers)
     logout_parser(subparsers)
+    perplexity_parser(subparsers)
     pull_parser(subparsers)
     push_parser(subparsers)
     rm_parser(subparsers)
@@ -874,3 +875,14 @@ def New(model, args):
         return OCI(model, args.engine)

     raise KeyError(f'transport "{transport}" not supported. Must be oci, huggingface, or ollama.')
+
+
+def perplexity_parser(subparsers):
+    parser = subparsers.add_parser("perplexity", help="calculate perplexity for specified AI Model")
+    parser.add_argument("MODEL")  # positional argument
+    parser.set_defaults(func=perplexity_cli)
+
+
+def perplexity_cli(args):
+    model = New(args.MODEL, args)
+    model.perplexity(args)
diff --git a/ramalama/model.py b/ramalama/model.py
index c2cd1741..710a27ac 100644
--- a/ramalama/model.py
+++ b/ramalama/model.py
@@ -197,7 +197,8 @@ def gpu_args(self, force=False, runner=False):
             or os.getenv("CUDA_VISIBLE_DEVICES")
             or (
                 # linux and macOS report aarch64 differently
-                platform.machine() in {"aarch64", "arm64"} and os.path.exists("/dev/dri")
+                platform.machine() in {"aarch64", "arm64"}
+                and os.path.exists("/dev/dri")
             )
         ):
             if runner:
@@ -244,6 +245,25 @@ def run(self, args):
         exec_args = self.build_exec_args_run(args, model_path, prompt)
         self.execute_model(model_path, exec_args, args)

+    def perplexity(self, args):
+        self.check_name_and_container(args)
+        model_path = self.get_model_path(args)
+        exec_args = self.build_exec_args_perplexity(args, model_path)
+        self.execute_model(model_path, exec_args, args)
+
+    def build_exec_args_perplexity(self, args, model_path):
+        exec_model_path = MNT_FILE if args.container else model_path
+        exec_args = ["llama-perplexity"]
+
+        get_gpu()
+        gpu_args = self.gpu_args(force=args.gpu)
+        if gpu_args is not None:
+            exec_args.extend(gpu_args)
+
+        exec_args += ["-m", exec_model_path]
+
+        return exec_args
+
     def check_name_and_container(self, args):
         if hasattr(args, "name") and args.name:
             if not args.container:
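
For reviewers, a minimal sketch of the quantity the new man page describes, assuming per-token log-probabilities are available; the values below are hypothetical and the snippet is illustrative only, not part of the patch:

```python
import math

# Perplexity is exp(mean negative log-likelihood) over the evaluated tokens;
# lower values mean the model assigns higher probability to the actual text.
# Hypothetical natural-log probabilities, used purely for illustration.
token_logprobs = [-0.9, -1.4, -0.3, -2.1, -0.7]

mean_nll = -sum(token_logprobs) / len(token_logprobs)  # mean negative log-likelihood
perplexity = math.exp(mean_nll)
print(f"perplexity: {perplexity:.2f}")
```

The subcommand itself delegates the actual computation: `ramalama perplexity <model>` resolves the model path (or the in-container mount point), then execs `llama-perplexity -m <path>` together with any GPU arguments returned by `gpu_args()`, as implemented in `build_exec_args_perplexity` above.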