Commit 982da2e

Functioning FastAPI implementation
FastAPI service exposing BitNet inference framework functions:

* Benchmark BitNet models
* Calculate BitNet model perplexity
* Run BitNet model inference

Includes a Dockerfile for running the FastAPI app in a containerized environment.

1 parent 7bcd27d commit 982da2e

File tree

10 files changed: +539 -1 lines changed


.gitignore

+4

@@ -160,3 +160,7 @@ cython_debug/
 # and can be added to the global gitignore or merged into this file. For a more nuclear
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
+app/models/*/*
+
+# Allow all files in app/lib/
+!app/lib

Dockerfile

+45

@@ -0,0 +1,45 @@
FROM python:3.9

WORKDIR /code

COPY ./app /code

RUN if [ -z "$(ls -A /code/models)" ]; then \
        echo "Error: No models found in /code/models" && exit 1; \
    fi

RUN apt-get update && apt-get install -y \
    wget \
    lsb-release \
    software-properties-common \
    gnupg \
    cmake && \
    bash -c "$(wget -O - https://apt.llvm.org/llvm.sh)" && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

RUN git clone --recursive https://github.com/microsoft/BitNet.git /tmp/BitNet && \
    cp -r /tmp/BitNet/* /code && \
    rm -rf /tmp/BitNet

RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt && \
    pip install "fastapi[standard]" "uvicorn[standard]"

RUN if [ -d "/code/models/Llama3-8B-1.58-100B-tokens" ]; then \
        python /code/setup_env.py -md /code/models/Llama3-8B-1.58-100B-tokens -q i2_s --use-pretuned && \
        find /code/models/Llama3-8B-1.58-100B-tokens -type f -name "*f32*.gguf" -delete; \
    fi

RUN if [ -d "/code/models/bitnet_b1_58-large" ]; then \
        python /code/setup_env.py -md /code/models/bitnet_b1_58-large -q i2_s --use-pretuned && \
        find /code/models/bitnet_b1_58-large -type f -name "*f32*.gguf" -delete; \
    fi

RUN if [ -d "/code/models/bitnet_b1_58-3B" ]; then \
        python /code/setup_env.py -md /code/models/bitnet_b1_58-3B -q i2_s --use-pretuned && \
        find /code/models/bitnet_b1_58-3B -type f -name "*f32*.gguf" -delete; \
    fi

EXPOSE 8080

CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8080"]

README.md

+44 -1

@@ -1 +1,44 @@

# FastAPI-BitNet

Install Conda: https://anaconda.org/anaconda/conda

Initialize the Python environment:
```
conda init
conda create -n bitnet python=3.9
conda activate bitnet
```

Install the Hugging Face CLI tool to download the models:
```
pip install -U "huggingface_hub[cli]"
```

Download one or more of the 1-bit models from Hugging Face:
```
huggingface-cli download 1bitLLM/bitnet_b1_58-large --local-dir app/models/bitnet_b1_58-large
huggingface-cli download 1bitLLM/bitnet_b1_58-3B --local-dir app/models/bitnet_b1_58-3B
huggingface-cli download HF1BitLLM/Llama3-8B-1.58-100B-tokens --local-dir app/models/Llama3-8B-1.58-100B-tokens
```

Build the Docker image:
```
docker build -t fastapi_bitnet .
```

Run the Docker image:
```
docker run -d --name ai_container -p 8080:8080 fastapi_bitnet
```

Once it's running, navigate to http://127.0.0.1:8080/docs

---

Note:

If you intend to use this in production, make sure to extend the Docker image with additional [authentication security](https://github.com/mjhea0/awesome-fastapi?tab=readme-ov-file#auth) steps. In its current state it is intended for local use only.

Building the Docker image requires upwards of 40GB of RAM for `Llama3-8B-1.58-100B-tokens`; if you have less than 64GB of RAM you will probably run into issues.

The Dockerfile deletes the larger f32 `.gguf` files to reduce the image build time. If you want the f32 files included, comment out the `find /code/models/....` lines.
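A minimal Python client sketch for exercising the API is shown below. The route path and model value are placeholders chosen for illustration (the routes are registered in `main.py`, which is not shown in this diff); check `/docs` or `/openapi.json` for the actual paths, parameter names, and model identifiers.

```
# Hypothetical client sketch -- the route path and model value are placeholders;
# confirm both against http://127.0.0.1:8080/docs before using.
import requests

BASE_URL = "http://127.0.0.1:8080"

# Discover the real routes from the OpenAPI schema FastAPI serves.
schema = requests.get(f"{BASE_URL}/openapi.json", timeout=30).json()
print(sorted(schema["paths"].keys()))

# Example inference request, assuming a GET route backed by run_inference_endpoint
# (defined in app/lib/endpoints.py below).
resp = requests.get(
    f"{BASE_URL}/inference",
    params={
        "model": "models/bitnet_b1_58-large/ggml-model-i2_s.gguf",  # placeholder; pick a value from /docs
        "prompt": "What is 1-bit quantization?",
        "n_predict": 64,
        "threads": 2,
    },
    timeout=600,
)
print(resp.status_code, resp.json())
```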

app/__init__.py

Whitespace-only changes.

app/lib/__init__.py

Whitespace-only changes.

app/lib/endpoints.py

+153

@@ -0,0 +1,153 @@
from fastapi import FastAPI, HTTPException, Query, Depends
from .models import ModelEnum, BenchmarkRequest, PerplexityRequest, InferenceRequest
from .utils import run_command, parse_benchmark_data, parse_perplexity_data
import os
import subprocess

async def run_benchmark(
    model: ModelEnum,
    n_token: int = Query(128, gt=0),
    threads: int = Query(2, gt=0, le=os.cpu_count()),
    n_prompt: int = Query(32, gt=0)
):
    """Run benchmark on specified model"""
    request = BenchmarkRequest(model=model, n_token=n_token, threads=threads, n_prompt=n_prompt)

    build_dir = os.getenv("BUILD_DIR", "build")
    bench_path = os.path.join(build_dir, "bin", "llama-bench")

    if not os.path.exists(bench_path):
        raise HTTPException(status_code=500, detail="Benchmark binary not found")

    command = [
        bench_path,
        '-m', request.model.value,
        '-n', str(request.n_token),
        '-ngl', '0',
        '-b', '1',
        '-t', str(request.threads),
        '-p', str(request.n_prompt),
        '-r', '5'
    ]

    try:
        result = subprocess.run(command, capture_output=True, text=True, check=True)
        parsed_data = parse_benchmark_data(result.stdout)
        return parsed_data
    except subprocess.CalledProcessError as e:
        raise HTTPException(status_code=500, detail=f"Benchmark failed: {str(e)}")

def validate_prompt_length(prompt: str = Query(..., description="Input text for perplexity calculation"), ctx_size: int = Query(10, gt=0)) -> str:
    token_count = len(prompt.split())
    min_tokens = 2 * ctx_size

    if token_count < min_tokens:
        raise HTTPException(
            status_code=400,
            detail=f"Prompt too short. Needs at least {min_tokens} tokens, got {token_count}"
        )
    return prompt

async def run_perplexity(
    model: ModelEnum,
    prompt: str = Depends(validate_prompt_length),
    threads: int = Query(2, gt=0, le=os.cpu_count()),
    ctx_size: int = Query(10, gt=3),
    ppl_stride: int = Query(0, ge=0)
):
    """Calculate perplexity for given text and model"""
    try:
        request = PerplexityRequest(
            model=model,
            prompt=prompt,
            threads=threads,
            ctx_size=ctx_size,
            ppl_stride=ppl_stride
        )
    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e))

    build_dir = os.getenv("BUILD_DIR", "build")
    ppl_path = os.path.join(build_dir, "bin", "llama-perplexity")

    if not os.path.exists(ppl_path):
        raise HTTPException(status_code=500, detail="Perplexity binary not found")

    command = [
        ppl_path,
        '--model', request.model.value,
        '--prompt', request.prompt,
        '--threads', str(request.threads),
        '--ctx-size', str(request.ctx_size),
        '--perplexity',
        '--ppl-stride', str(request.ppl_stride)
    ]

    try:
        result = subprocess.run(command, capture_output=True, text=True, check=True)
        parsed_data = parse_perplexity_data(result.stderr)
        return parsed_data
    except subprocess.CalledProcessError as e:
        raise HTTPException(status_code=500, detail=str(e))

def get_model_sizes():
    """Endpoint to get the file sizes of supported .gguf models."""
    model_sizes = {}
    models_dir = "models"
    for subdir in os.listdir(models_dir):
        subdir_path = os.path.join(models_dir, subdir)
        if os.path.isdir(subdir_path):
            for file in os.listdir(subdir_path):
                if file.endswith(".gguf"):
                    file_path = os.path.join(subdir_path, file)
                    file_size_bytes = os.path.getsize(file_path)
                    file_size_mb = round(file_size_bytes / (1024 * 1024), 3)
                    file_size_gb = round(file_size_bytes / (1024 * 1024 * 1024), 3)
                    model_sizes[file] = {
                        "bytes": file_size_bytes,
                        "MB": file_size_mb,
                        "GB": file_size_gb
                    }
    return model_sizes

async def run_inference_endpoint(
    model: ModelEnum,
    n_predict: int = Query(128, gt=0, le=100000),
    prompt: str = "",
    threads: int = Query(2, gt=0, le=os.cpu_count()),
    ctx_size: int = Query(2048, gt=0),
    temperature: float = Query(0.8, gt=0.0, le=2.0)
):
    """Endpoint to run inference with the given parameters."""
    request = InferenceRequest(
        model=model,
        n_predict=n_predict,
        prompt=prompt,
        threads=threads,
        ctx_size=ctx_size,
        temperature=temperature
    )
    output = run_inference(request)
    return {"result": output}

def run_inference(args: InferenceRequest) -> str:
    """Run the inference command with the given arguments."""
    build_dir = os.getenv("BUILD_DIR", "build")
    main_path = os.path.join(build_dir, "bin", "llama-cli")

    if not os.path.exists(main_path):
        raise HTTPException(status_code=500, detail="Inference binary not found")

    command = [
        main_path,
        '-m', args.model.value,
        '-n', str(args.n_predict),
        '-t', str(args.threads),
        '-p', args.prompt,
        '-ngl', '0',
        '-c', str(args.ctx_size),
        '--temp', str(args.temperature),
        "-b", "1"
    ]
    output = run_command(command)
    return output
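These handlers are plain (async) functions rather than decorated routes, so they are presumably registered on the FastAPI app elsewhere; the commit's `main.py` is not included in the diff above. A minimal wiring sketch under that assumption, with the route paths chosen purely for illustration:

```
# Hypothetical main.py wiring sketch -- the route paths and module layout are
# assumptions; the actual main.py in this commit is not shown in the diff.
from fastapi import FastAPI
from lib.endpoints import (
    run_benchmark,
    run_perplexity,
    get_model_sizes,
    run_inference_endpoint,
)

app = FastAPI(title="FastAPI-BitNet")

# add_api_route lets the existing functions serve as endpoints without
# editing endpoints.py; their Query/Depends defaults are resolved as usual.
app.add_api_route("/benchmark", run_benchmark, methods=["GET"])
app.add_api_route("/perplexity", run_perplexity, methods=["GET"])
app.add_api_route("/model-sizes", get_model_sizes, methods=["GET"])
app.add_api_route("/inference", run_inference_endpoint, methods=["GET"])
```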

app/lib/models.py

+103

@@ -0,0 +1,103 @@
from typing import Dict, Any
from pydantic import BaseModel, validator, root_validator
from enum import Enum
import os

def create_model_enum(directory: str):
    """Dynamically create an Enum for models based on files in the directory."""
    models = {}
    for subdir in os.listdir(directory):
        subdir_path = os.path.join(directory, subdir)
        if os.path.isdir(subdir_path):
            for file in os.listdir(subdir_path):
                if file.endswith(".gguf"):
                    model_name = f"{subdir}_{file.replace('-', '_').replace('.', '_')}"
                    models[model_name] = os.path.join(subdir_path, file)
    return Enum("ModelEnum", models)

# Create the ModelEnum based on the models directory
ModelEnum = create_model_enum("models")

max_n_predict = 100000

class BenchmarkRequest(BaseModel):
    model: ModelEnum
    n_token: int = 128
    threads: int = 2
    n_prompt: int = 32

    @validator('threads')
    def validate_threads(cls, v):
        max_threads = os.cpu_count()
        if v > max_threads:
            raise ValueError(f"Number of threads cannot exceed {max_threads}")
        return v

    @validator('n_token', 'n_prompt', 'threads')
    def validate_positive(cls, v):
        if v <= 0:
            raise ValueError("Value must be positive")
        return v

class PerplexityRequest(BaseModel):
    model: ModelEnum
    prompt: str
    threads: int = 2
    ctx_size: int = 3
    ppl_stride: int = 0

    @validator('threads')
    def validate_threads(cls, v):
        max_threads = os.cpu_count()
        if v > max_threads:
            raise ValueError(f"Number of threads cannot exceed {max_threads}")
        elif v <= 0:
            raise ValueError("Value must be positive")
        return v

    @validator('ctx_size')
    def validate_positive(cls, v):
        if v < 3:
            raise ValueError("Value must be greater than 3")
        return v

    @root_validator(pre=True)
    def validate_prompt_length(cls, values: Dict[str, Any]) -> Dict[str, Any]:
        prompt = values.get('prompt')
        ctx_size = values.get('ctx_size')

        if prompt and ctx_size:
            token_count = len(prompt.split())
            min_tokens = 2 * ctx_size

            if token_count < min_tokens:
                raise ValueError(f"Prompt too short. Needs at least {min_tokens} tokens, got {token_count}")

        return values

class InferenceRequest(BaseModel):
    model: ModelEnum
    n_predict: int = 128
    prompt: str
    threads: int = 2
    ctx_size: int = 2048
    temperature: float = 0.8

    @validator('threads')
    def validate_threads(cls, v):
        max_threads = os.cpu_count()
        if v > max_threads:
            raise ValueError(f"Number of threads cannot exceed {max_threads}")
        return v

    @validator('n_predict')
    def validate_n_predict(cls, v):
        if v > max_n_predict:
            raise ValueError(f"Number of predictions cannot exceed {max_n_predict}")
        return v

    @validator('threads', 'ctx_size', 'temperature', 'n_predict')
    def validate_positive(cls, v):
        if v <= 0:
            raise ValueError("Value must be positive")
        return v
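For a sense of how `create_model_enum` and the pydantic validators behave, here is a small illustrative sketch. The directory layout and `.gguf` file name are fabricated for the example, and it assumes the `app/` directory is on `sys.path` so `lib.models` imports cleanly.

```
# Illustration only: fabricate a models/ tree so the import-time scan in
# lib.models finds one placeholder .gguf file, then trigger a validator.
import os
import tempfile

os.chdir(tempfile.mkdtemp())
os.makedirs("models/bitnet_b1_58-large", exist_ok=True)
open("models/bitnet_b1_58-large/ggml-model-i2_s.gguf", "wb").close()  # placeholder name

from lib.models import ModelEnum, PerplexityRequest  # assumes app/ is on sys.path

print([m.name for m in ModelEnum])  # e.g. ['bitnet_b1_58-large_ggml_model_i2_s_gguf']

# The root_validator requires at least 2 * ctx_size whitespace-separated tokens.
try:
    PerplexityRequest(model=list(ModelEnum)[0], prompt="too short", ctx_size=10)
except ValueError as err:
    print(err)
```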
