Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
93 changes: 93 additions & 0 deletions examples/quantized_inference.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
"""
Quantized Inference Example for OpenMythos.

Demonstrates running mythos_1b with INT4 quantization and expert offloading
on consumer hardware (RTX 3060 12GB). When CUDA is unavailable, the demo runs
on CPU and skips expert offloading.

Usage:
    python examples/quantized_inference.py
"""

import torch
import time
import sys
import os

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from open_mythos import OpenMythos, mythos_1b
from open_mythos.quantization import quantize_model, print_quantization_summary
from open_mythos.expert_offloader import ExpertOffloader


def main():
    """Run the INT4-quantized mythos_1b inference demo end to end.

    Steps, in order:
      1. Build the ``mythos_1b`` model and report its parameter count.
      2. Quantize it with ``quantize_model(bits=4, group_size=128)``.
      3. When CUDA is available, wrap the model in an ``ExpertOffloader``.
      4. Run a short warmup, then time a 64-token generation.
      5. Print GPU memory usage and offloader statistics (CUDA only).

    Everything is reported via ``print``; the function returns ``None``.
    """
    banner = "=" * 60
    # Query CUDA availability once so every step agrees on the device.
    use_cuda = torch.cuda.is_available()
    # Stays None on CPU so the final stats section has a defined value.
    offloader = None

    print(banner)
    print("OpenMythos Quantized Inference Demo")
    print(banner)

    # 1. Create model
    print("\n[1/5] Creating mythos_1b model...")
    cfg = mythos_1b()
    model = OpenMythos(cfg)
    print(f" Parameters: {sum(p.numel() for p in model.parameters()):,}")

    # 2. Quantize to INT4
    print("\n[2/5] Quantizing to INT4 (expert FFN layers only)...")
    model = quantize_model(model, bits=4, group_size=128)
    print_quantization_summary(model)

    # 3. Setup expert offloading
    print("\n[3/5] Setting up expert offloading...")
    print(f" GPU: {torch.cuda.get_device_name(0) if use_cuda else 'CPU'}")

    if use_cuda:
        gpu_experts = 4  # Keep 4 experts on GPU
        cache_experts = 16  # Keep 16 in CPU RAM
        # NOTE(review): the model is never moved to the GPU explicitly in this
        # script; presumably offloader.prepare() handles placement - confirm.
        offloader = ExpertOffloader(
            model,
            gpu_experts=gpu_experts,
            cache_experts=cache_experts,
        )
        offloader.prepare()
        print(
            f" GPU experts: {gpu_experts} | CPU cache: {cache_experts}"
            " | Disk: rest"
        )
    else:
        print(" Running on CPU (no offloading needed)")

    # 4. Generate text
    print("\n[4/5] Generating text...")
    input_ids = torch.randint(0, cfg.vocab_size, (1, 32))
    if use_cuda:
        input_ids = input_ids.cuda()

    # Warmup. Run under no_grad like the benchmark so no autograd state is
    # retained on a memory-constrained device.
    with torch.no_grad():
        model.generate(input_ids, max_new_tokens=4, n_loops=2)

    # Benchmark. CUDA kernels launch asynchronously, so synchronize before
    # starting and before stopping the clock; otherwise the timer can stop
    # while work is still queued on the device.
    if use_cuda:
        torch.cuda.synchronize()
    start = time.perf_counter()
    with torch.no_grad():
        output = model.generate(input_ids, max_new_tokens=64, n_loops=4)
    if use_cuda:
        torch.cuda.synchronize()
    elapsed = time.perf_counter() - start

    tokens_generated = output.shape[1] - input_ids.shape[1]
    # Guard against a zero interval on coarse clocks.
    tokens_per_sec = tokens_generated / elapsed if elapsed > 0 else float("inf")

    print(f" Generated {tokens_generated} tokens in {elapsed:.2f}s")
    print(f" Speed: {tokens_per_sec:.1f} tokens/sec")

    # 5. Memory usage
    print("\n[5/5] Memory usage:")
    if use_cuda:
        mib = 1024 * 1024
        allocated = torch.cuda.memory_allocated() / mib
        reserved = torch.cuda.memory_reserved() / mib
        print(f" GPU allocated: {allocated:.1f} MB")
        print(f" GPU reserved: {reserved:.1f} MB")
    else:
        print(" GPU memory statistics unavailable (running on CPU)")

    if offloader is not None:
        print("\nOffloader stats:")
        offloader.print_stats()

    print("\n" + banner)
    print("Done! Model runs successfully with INT4 quantization.")
    print(banner)


# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
22 changes: 22 additions & 0 deletions open_mythos/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,18 @@
precompute_rope_freqs,
)
from open_mythos.tokenizer import MythosTokenizer
from open_mythos.quantization import (
QuantizedLinear,
quantize_linear_layer,
quantize_moe_experts,
quantize_model,
get_model_memory_mb,
print_quantization_summary,
)
from open_mythos.expert_offloader import (
ExpertOffloader,
create_offloaded_model,
)
from open_mythos.variants import (
mythos_1b,
mythos_1t,
Expand Down Expand Up @@ -52,4 +64,14 @@
"load_tokenizer",
"get_vocab_size",
"MythosTokenizer",
# Quantization
"QuantizedLinear",
"quantize_linear_layer",
"quantize_moe_experts",
"quantize_model",
"get_model_memory_mb",
"print_quantization_summary",
# Expert Offloading
"ExpertOffloader",
"create_offloaded_model",
]
Loading