From 9632be38d4ab66c3c0b3a01adcc121e8890cea1c Mon Sep 17 00:00:00 2001
From: Aleksandr Bobrov <al.bobrov.mail@gmail.com>
Date: Sun, 15 Mar 2026 17:21:59 +0500
Subject: [PATCH] feat: add MLX inference for Apple Silicon (CTC + RNNT)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add native MLX inference for GigaAM v3 on Apple Silicon:

- Full Conformer encoder (16 layers, 768d) with RoPE attention
- CTC and RNNT heads with greedy decoding
- PyTorch → MLX weight conversion (safetensors + config.json)
- Streaming transcription (growing buffer, live mic + file)
- CLI tools: gigaam-cli, gigaam-stream, gigaam-transcribe
- Python API: load_model, transcribe, stream_generate, stream_live

Performance on Apple M4:
- CTC: 139x realtime (11s audio in 81ms), fp16 421 MB
- RNNT: 48x realtime (11s in 230ms), ~9% lower WER than CTC

New files in mlx_convert/:
  gigaam_mlx.py              — MLX model + inference + streaming
  convert_gigaam_to_mlx.py   — PyTorch → MLX conversion
  gigaam-cli                 — single-file transcription CLI
  gigaam-stream              — real-time streaming CLI
  gigaam-transcribe          — shell wrapper
  README.md                  — documentation, API, benchmarks
---
 .gitignore                           |   7 +-
 mlx_convert/README.md                | 224 ++++++++
 mlx_convert/convert_gigaam_to_mlx.py | 235 +++++++++
 mlx_convert/gigaam-cli               |  64 +++
 mlx_convert/gigaam-stream            | 204 ++++++++
 mlx_convert/gigaam-transcribe        |  15 +
 mlx_convert/gigaam_mlx.py            | 743 +++++++++++++++++++++++++++
 7 files changed, 1491 insertions(+), 1 deletion(-)
 create mode 100644 mlx_convert/README.md
 create mode 100644 mlx_convert/convert_gigaam_to_mlx.py
 create mode 100755 mlx_convert/gigaam-cli
 create mode 100755 mlx_convert/gigaam-stream
 create mode 100755 mlx_convert/gigaam-transcribe
 create mode 100644 mlx_convert/gigaam_mlx.py

diff --git a/.gitignore b/.gitignore
index 4775fdd..ff28261 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,4 +4,9 @@ build
 *.wav
 .DS_Store
 *tmp*
-onnx
\ No newline at end of file
+onnx
+.venv
+mlx_convert/.venv
+mlx_convert/gigaam-v3-ctc-mlx/
+mlx_convert/gigaam-v3-ctc-mlx-fp32/
+mlx_convert/gigaam-v3-rnnt-mlx/
diff --git a/mlx_convert/README.md b/mlx_convert/README.md
new file mode 100644
index 0000000..0265b06
--- /dev/null
+++ b/mlx_convert/README.md
@@ -0,0 +1,224 @@
+# GigaAM v3 MLX — Russian ASR on Apple Silicon
+
+GigaAM v3 (Conformer encoder, 16 layers, 768d) converted to [MLX](https://github.com/ml-explore/mlx) for fast inference on Apple Silicon.
+Supports both **CTC** and **RNNT** models.
+
+**139x realtime** for CTC on M4 — transcribes 11 seconds of Russian speech in 81ms.
+**48x realtime** for RNNT on M4 — transcribes 11 seconds in 230ms (higher quality, sequential decode).
+
+## Quick Start
+
+### 1. Install dependencies
+
+```bash
+uv venv .venv
+uv pip install mlx safetensors numpy
+# For streaming from microphone:
+uv pip install sounddevice
+```
+
+### 2. Convert the model (one-time)
+
+```bash
+# CTC model (fastest, 139x realtime)
+python convert_gigaam_to_mlx.py --model v3_ctc --output ./gigaam-v3-ctc-mlx
+
+# RNNT model (higher quality, ~9% lower WER, 48x realtime)
+python convert_gigaam_to_mlx.py --model v3_rnnt --output ./gigaam-v3-rnnt-mlx
+
+# Optional: fp32 version
+python convert_gigaam_to_mlx.py --model v3_ctc --output ./gigaam-v3-ctc-mlx-fp32 --dtype float32
+```
+
+This creates a directory with:
+- `model.safetensors` — weights (421 MB fp16, 842 MB fp32)
+- `config.json` — model configuration + vocabulary
+
+### 3. Transcribe
+
+```bash
+# Single file
+python gigaam-cli -f audio.wav
+
+# Streaming from file
+python gigaam-stream --file audio.wav
+
+# Live microphone streaming
+python gigaam-stream
+```
+
+---
+
+## Python API
+
+### Basic transcription
+
+```python
+from gigaam_mlx import load_model, load_audio
+
+model = load_model("./gigaam-v3-ctc-mlx")
+audio = load_audio("audio.wav")  # any format, resampled to 16kHz via ffmpeg
+text = model.transcribe(audio)
+print(text)
+# → ничьих не требуя похвал счастлив уж я надеждой сладкой
+```
+
+### Streaming (pre-recorded file)
+
+Process audio incrementally, yielding results every N seconds:
+
+```python
+from gigaam_mlx import load_model, load_audio, StreamingConfig
+
+model = load_model("./gigaam-v3-ctc-mlx")
+audio = load_audio("audio.wav")
+
+config = StreamingConfig(step_duration=1.0)  # yield every 1s
+
+for result in model.stream_generate(audio, config):
+    print(f"[{result.audio_position:.1f}s] {result.cumulative_text}")
+    # [1.0s] ничьих не требуя
+    # [2.0s] ничьих не требуя похвал
+    # [3.0s] ничьих не требуя похвал счастлив уж я надеж
+    # ...
+```
+
+`StreamingResult` fields:
+
+| Field | Type | Description |
+|-------|------|-------------|
+| `text` | `str` | New text since last emission |
+| `cumulative_text` | `str` | Full transcription so far |
+| `is_final` | `bool` | `True` if last chunk |
+| `audio_position` | `float` | Current position in seconds |
+| `audio_duration` | `float` | Total audio duration |
+| `progress` | `float` | 0.0–1.0 |
+| `language` | `str` | Always `"ru"` |
+
+### Streaming (live microphone)
+
+For real-time transcription, call `stream_live()` with a growing audio buffer:
+
+```python
+import numpy as np
+import mlx.core as mx
+from gigaam_mlx import load_model
+
+model = load_model("./gigaam-v3-ctc-mlx")
+
+# Accumulate audio from microphone (16kHz float32 mono)
+buffer = np.zeros(0, dtype=np.float32)
+
+# Called every N ms with new audio
+def on_audio_chunk(chunk: np.ndarray):
+    global buffer
+    buffer = np.concatenate([buffer, chunk])
+
+    result = model.stream_live(mx.array(buffer))
+    print(f"\r{result.cumulative_text}", end="", flush=True)
+```
+
+### StreamingConfig options
+
+```python
+from gigaam_mlx import StreamingConfig
+
+config = StreamingConfig(
+    step_duration=1.0,      # process every 1s (default: 2s)
+    chunk_duration=2.0,     # unused for stream_generate (kept for compat)
+    context_duration=3.0,   # unused for stream_generate (kept for compat)
+)
+```
+
+---
+
+## mlx-audio Compatibility
+
+The `StreamingResult` dataclass follows the same contract as [mlx-audio](https://github.com/Blaizzy/mlx-audio) Parakeet/Whisper streaming, making it straightforward to integrate GigaAM as an mlx-audio STT model.
+
+---
+
+## CLI Tools
+
+### `gigaam-cli` — Single-file transcription
+
+```bash
+python gigaam-cli -f audio.wav                    # default model
+python gigaam-cli -f audio.wav -m /path/to/model  # custom model path
+python gigaam-cli -f audio.wav --no-prints         # only text to stdout
+```
+
+### `gigaam-stream` — Real-time streaming
+
+```bash
+# Live microphone
+python gigaam-stream
+python gigaam-stream --step 1000              # update every 1s
+python gigaam-stream --step 500               # update every 0.5s
+
+# File streaming (simulates real-time)
+python gigaam-stream --file audio.wav
+python gigaam-stream --file audio.wav --step 1000 --no-overwrite
+```
+
+Options:
+
+| Flag | Default | Description |
+|------|---------|-------------|
+| `--step N` | 2000 | Process every N ms |
+| `--file PATH` | — | File mode instead of microphone |
+| `--model PATH` | auto | Model directory |
+| `--no-overwrite` | off | Print incrementally (don't clear line) |
+| `--vad-threshold` | 0.003 | Energy threshold for speech detection |
+
+### `gigaam-transcribe` — Shell wrapper
+
+```bash
+# Uses bundled Python venv automatically
+gigaam-transcribe -f audio.wav --no-prints
+
+# Symlink for PATH access
+ln -s /path/to/mlx_convert/gigaam-transcribe /usr/local/bin/gigaam-transcribe
+```
+
+---
+
+## Benchmarks (Apple M4)
+
+| | GigaAM CTC MLX | GigaAM RNNT MLX | GigaAM PyTorch | Whisper CPP (small) |
+|--|---|---|---|---|
+| **Batch (11s audio)** | **81ms** | **230ms** | 400ms | 1130ms |
+| **Realtime factor** | **139x** | **48x** | 28x | 10x |
+| **Stream (1s step)** | **57ms/step** | — | — | ~300ms/step |
+| **Model size** | 421 MB | 423 MB | 842 MB | 465 MB |
+| **Language** | Russian | Russian | Russian | Multilingual |
+
+RNNT provides ~9% lower WER than CTC due to autoregressive joint language modeling.
+
+## Architecture
+
+Both CTC and RNNT share the same Conformer encoder:
+
+```
+Audio (16kHz) → Log-Mel Spectrogram (64 bins)
+             → Conv1d Subsampling (4x stride)
+             → 16× Conformer Layers:
+                  ├─ FFN₁ (half-step residual)
+                  ├─ RoPE Multi-Head Self-Attention (16 heads)
+                  ├─ Convolution Module (GLU + depthwise conv)
+                  └─ FFN₂ (half-step residual)
+             → CTC Head (Conv1d → 35 classes → greedy decode)
+                or
+             → RNNT Head (Joint + LSTM Decoder → greedy decode)
+```
+
+Key implementation details:
+- **RoPE before projections**: GigaAM applies rotary embeddings to raw input *before* Q/K/V linear projections (non-standard)
+- **Exact mel filterbank**: Saved from PyTorch to avoid HTK recomputation differences
+- **All Conv1d weights transposed**: `[out, in, K]` → `[out, K, in]` for MLX convention
+- **RNNT LSTM weights**: PyTorch `(weight_ih, weight_hh, bias_ih, bias_hh)` mapped to MLX `(Wx, Wh, bias)` layout
+
+## License
+
+GigaAM model weights: [ai-sage/GigaAM](https://huggingface.co/ai-sage/GigaAM) — check their license.
+MLX conversion code: MIT.
diff --git a/mlx_convert/convert_gigaam_to_mlx.py b/mlx_convert/convert_gigaam_to_mlx.py
new file mode 100644
index 0000000..63c7606
--- /dev/null
+++ b/mlx_convert/convert_gigaam_to_mlx.py
@@ -0,0 +1,235 @@
+"""
+Convert GigaAM v3 PyTorch checkpoint to MLX safetensors format.
+
+Handles weight shape transpositions required by MLX conventions:
+- PyTorch Conv1d: [out_ch, in_ch, kernel] → MLX Conv1d: [out_ch, kernel, in_ch]
+- PyTorch Linear: same (no change needed, MLX nn.Linear uses same layout)
+- RNNT LSTM weights properly mapped to MLX layout
+- BatchNorm running stats need special handling
+
+Usage:
+    python convert_gigaam_to_mlx.py --model v3_ctc --output ./gigaam-v3-ctc-mlx
+    python convert_gigaam_to_mlx.py --model v3_rnnt --output ./gigaam-v3-rnnt-mlx
+"""
+import argparse
+import json
+import os
+import sys
+
+sys.path.insert(0, "..")
+
+import numpy as np
+import torch
+from safetensors.numpy import save_file
+
+VOCABULARY_V3 = [
+    " ", "а", "б", "в", "г", "д", "е", "ж", "з", "и", "й", "к", "л", "м",
+    "н", "о", "п", "р", "с", "т", "у", "ф", "х", "ц", "ч", "ш", "щ", "ъ",
+    "ы", "ь", "э", "ю", "я"
+]
+
+
+def transpose_conv1d_weight(w: np.ndarray) -> np.ndarray:
+    """PyTorch Conv1d: [out, in, kernel] → MLX Conv1d: [out, kernel, in]"""
+    return np.transpose(w, (0, 2, 1))
+
+
+def sanitize_weights(state_dict: dict) -> dict:
+    """
+    Convert PyTorch state_dict to MLX-compatible weight dict.
+    
+    Key transformations:
+    1. Conv1d weights: transpose [out, in, K] → [out, K, in]
+    2. Skip preprocessor (mel filterbank computed at runtime in MLX)
+    3. Handle LayerNorm batch_norm naming
+    """
+    mlx_weights = {}
+    
+    skipped = []
+    for key, tensor in state_dict.items():
+        w = tensor.detach().cpu().float().numpy()
+        
+        # Keep preprocessor mel filterbank and window for exact reproduction
+        if key == "preprocessor.featurizer.0.mel_scale.fb":
+            mlx_weights["mel_filterbank"] = w  # [n_fft//2+1, n_mels]
+            skipped.append(key + " → mel_filterbank")
+            continue
+        if key == "preprocessor.featurizer.0.spectrogram.window":
+            mlx_weights["stft_window"] = w  # [win_length]
+            skipped.append(key + " → stft_window")
+            continue
+        if key.startswith("preprocessor."):
+            skipped.append(key)
+            continue
+        
+        # ALL 3D weights are Conv1d weights and need transposition
+        # PyTorch Conv1d: [out, in, kernel] → MLX Conv1d: [out, kernel, in]
+        # This covers: encoder.pre_encode.conv.*, encoder.layers.*.conv.*, head.decoder_layers.*
+        if "weight" in key and len(w.shape) == 3:
+            w = transpose_conv1d_weight(w)
+            
+        # RNNT LSTM conversions
+        if "lstm.weight_ih" in key:
+            mlx_weights[key.replace(".weight_ih_l0", ".Wx")] = w
+            continue
+        if "lstm.weight_hh" in key:
+            mlx_weights[key.replace(".weight_hh_l0", ".Wh")] = w
+            continue
+        if "lstm.bias_ih" in key:
+            # We must add bias_ih and bias_hh
+            hh_key = key.replace(".bias_ih_l0", ".bias_hh_l0")
+            hh_w = state_dict[hh_key].detach().cpu().float().numpy()
+            mlx_weights[key.replace(".bias_ih_l0", ".bias")] = w + hh_w
+            continue
+        if "lstm.bias_hh" in key:
+            # Already handled with bias_ih
+            continue
+            
+        # RNNT Joint conversions
+        if "joint.joint_net.1.weight" in key:
+            mlx_weights[key.replace("joint_net.1.weight", "joint_net_linear.weight")] = w
+            continue
+        if "joint.joint_net.1.bias" in key:
+            mlx_weights[key.replace("joint_net.1.bias", "joint_net_linear.bias")] = w
+            continue
+            
+        # BatchNorm / LayerNorm:
+        # GigaAM v3 uses layer_norm for conv_norm_type, so batch_norm is actually LayerNorm
+        # The keys already use "batch_norm" name but the module is nn.LayerNorm
+        # In MLX, this maps to nn.LayerNorm with weight/bias — same key structure works
+        
+        mlx_weights[key] = w
+    
+    if skipped:
+        print(f"Skipped {len(skipped)} preprocessor keys: {skipped}")
+    
+    return mlx_weights
+
+
+def build_config(model_name: str, cfg) -> dict:
+    """Build config.json for the MLX model."""
+    from omegaconf import OmegaConf
+    cfg_dict = OmegaConf.to_container(cfg, resolve=True)
+    
+    config = {
+        "model_type": "gigaam",
+        "model_name": model_name,
+        "sample_rate": cfg_dict.get("sample_rate", 16000),
+        "preprocessor": {
+            "sample_rate": cfg_dict["preprocessor"].get("sample_rate", 16000),
+            "features": cfg_dict["preprocessor"].get("features", 64),
+            "win_length": cfg_dict["preprocessor"].get("win_length", 320),
+            "hop_length": cfg_dict["preprocessor"].get("hop_length", 160),
+            "n_fft": cfg_dict["preprocessor"].get("n_fft", 320),
+            "center": cfg_dict["preprocessor"].get("center", False),
+        },
+        "encoder": {
+            "feat_in": cfg_dict["encoder"].get("feat_in", 64),
+            "n_layers": cfg_dict["encoder"].get("n_layers", 16),
+            "d_model": cfg_dict["encoder"].get("d_model", 768),
+            "subsampling": cfg_dict["encoder"].get("subsampling", "conv1d"),
+            "subs_kernel_size": cfg_dict["encoder"].get("subs_kernel_size", 5),
+            "subsampling_factor": cfg_dict["encoder"].get("subsampling_factor", 4),
+            "ff_expansion_factor": cfg_dict["encoder"].get("ff_expansion_factor", 4),
+            "self_attention_model": cfg_dict["encoder"].get("self_attention_model", "rotary"),
+            "pos_emb_max_len": cfg_dict["encoder"].get("pos_emb_max_len", 5000),
+            "n_heads": cfg_dict["encoder"].get("n_heads", 16),
+            "conv_kernel_size": cfg_dict["encoder"].get("conv_kernel_size", 5),
+            "conv_norm_type": cfg_dict["encoder"].get("conv_norm_type", "layer_norm"),
+        },
+    }
+    
+    # Model-specific head config
+    if "ctc" in model_name:
+        config["head_type"] = "ctc"
+        config["head"] = {
+            "feat_in": cfg_dict["head"].get("feat_in", 768),
+            "num_classes": cfg_dict["head"].get("num_classes", 34),
+        }
+        config["vocabulary"] = cfg_dict["decoding"].get("vocabulary", VOCABULARY_V3)
+    elif "rnnt" in model_name:
+        config["head_type"] = "rnnt"
+        config["head"] = {
+            "decoder": cfg_dict["head"]["decoder"],
+            "joint": cfg_dict["head"]["joint"],
+        }
+        if "vocabulary" in cfg_dict["decoding"]:
+            config["vocabulary"] = cfg_dict["decoding"]["vocabulary"]
+        else:
+            config["vocabulary"] = VOCABULARY_V3
+        # RNNT uses tokenizer
+        config["tokenizer_model"] = "tokenizer.model"
+    
+    return config
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Convert GigaAM to MLX format")
+    parser.add_argument("--model", type=str, default="v3_ctc",
+                        help="Model name: v3_ctc, v3_rnnt, v3_e2e_ctc, v3_e2e_rnnt, v3_ssl")
+    parser.add_argument("--output", type=str, default="./gigaam-v3-ctc-mlx",
+                        help="Output directory")
+    parser.add_argument("--dtype", type=str, default="float16",
+                        choices=["float16", "bfloat16", "float32"],
+                        help="Output dtype")
+    args = parser.parse_args()
+    
+    print(f"Loading GigaAM model: {args.model}")
+    import gigaam
+    model = gigaam.load_model(args.model, device="cpu", fp16_encoder=False, use_flash=False)
+    
+    print("Extracting state dict...")
+    state_dict = model.state_dict()
+    print(f"  Total PyTorch keys: {len(state_dict)}")
+    print(f"  Total parameters: {sum(p.numel() for p in model.parameters()):,}")
+    
+    print("Sanitizing weights for MLX...")
+    mlx_weights = sanitize_weights(state_dict)
+    print(f"  Total MLX keys: {len(mlx_weights)}")
+    
+    # Convert dtype
+    np_dtype = {
+        "float16": np.float16,
+        "bfloat16": np.float16,  # safetensors doesn't support bf16 natively in numpy
+        "float32": np.float32,
+    }[args.dtype]
+    
+    mlx_weights = {k: v.astype(np_dtype) for k, v in mlx_weights.items()}
+    
+    # Build config
+    print("Building config.json...")
+    config = build_config(args.model, model.cfg)
+    
+    # Save
+    os.makedirs(args.output, exist_ok=True)
+    
+    safetensors_path = os.path.join(args.output, "model.safetensors")
+    print(f"Saving weights to {safetensors_path}...")
+    save_file(mlx_weights, safetensors_path)
+    
+    config_path = os.path.join(args.output, "config.json")
+    print(f"Saving config to {config_path}...")
+    with open(config_path, "w") as f:
+        json.dump(config, f, indent=2, ensure_ascii=False)
+    
+    # Copy tokenizer if rnnt
+    if "rnnt" in args.model:
+        import shutil
+        tokenizer_src = os.path.join(os.path.expanduser("~/.cache/gigaam"), 
+                                      f"{args.model}_tokenizer.model")
+        if os.path.exists(tokenizer_src):
+            tokenizer_dst = os.path.join(args.output, "tokenizer.model")
+            shutil.copy2(tokenizer_src, tokenizer_dst)
+            print(f"Copied tokenizer to {tokenizer_dst}")
+    
+    # Summary
+    total_bytes = os.path.getsize(safetensors_path)
+    print(f"\n✅ Conversion complete!")
+    print(f"   Output: {args.output}")
+    print(f"   Weights: {total_bytes / 1024 / 1024:.1f} MB ({args.dtype})")
+    print(f"   Keys: {len(mlx_weights)}")
+    print(f"   Config: {config_path}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/mlx_convert/gigaam-cli b/mlx_convert/gigaam-cli
new file mode 100755
index 0000000..a8acb07
--- /dev/null
+++ b/mlx_convert/gigaam-cli
@@ -0,0 +1,64 @@
+#!/usr/bin/env python3
+"""
+gigaam-cli — GigaAM v3 transcription CLI for MLX (CTC and RNNT).
+
+Usage:
+    gigaam-cli -f audio.wav [-m /path/to/model_dir]
+    gigaam-cli --help
+"""
+import argparse
+import os
+import sys
+import time
+
+# Resolve default model dir relative to this script
+SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+DEFAULT_MODEL = os.path.join(SCRIPT_DIR, "gigaam-v3-ctc-mlx")
+
+
+def main():
+    parser = argparse.ArgumentParser(description="GigaAM v3 transcription (MLX)")
+    parser.add_argument("-f", "--file", required=True, help="Audio file path")
+    parser.add_argument("-m", "--model", default=DEFAULT_MODEL, help="Model directory")
+    parser.add_argument("-l", "--language", default="ru", help="Language (ignored, always ru)")
+    parser.add_argument("-nt", "--no-timestamps", action="store_true", help="(compat flag, ignored)")
+    parser.add_argument("--no-prints", action="store_true", help="Suppress info messages")
+    args = parser.parse_args()
+
+    if not os.path.isfile(args.file):
+        print(f"Error: file not found: {args.file}", file=sys.stderr)
+        sys.exit(1)
+
+    if not os.path.isdir(args.model):
+        print(f"Error: model dir not found: {args.model}", file=sys.stderr)
+        sys.exit(1)
+
+    # Add script directory to path so gigaam_mlx can be imported
+    sys.path.insert(0, SCRIPT_DIR)
+    from gigaam_mlx import load_model, load_audio
+
+    if not args.no_prints:
+        print(f"gigaam-cli: loading model from {args.model}", file=sys.stderr)
+
+    t0 = time.time()
+    model = load_model(args.model)
+    load_time = time.time() - t0
+
+    if not args.no_prints:
+        print(f"gigaam-cli: model loaded in {load_time:.2f}s", file=sys.stderr)
+
+    audio = load_audio(args.file)
+
+    t0 = time.time()
+    text = model.transcribe(audio)
+    transcribe_time = time.time() - t0
+
+    if not args.no_prints:
+        print(f"gigaam-cli: transcribed in {transcribe_time:.2f}s", file=sys.stderr)
+
+    # Output ONLY the text to stdout (like whisper-cli -nt)
+    print(text)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/mlx_convert/gigaam-stream b/mlx_convert/gigaam-stream
new file mode 100755
index 0000000..3841685
--- /dev/null
+++ b/mlx_convert/gigaam-stream
@@ -0,0 +1,204 @@
+#!/usr/bin/env python3
+"""
+gigaam-stream — Real-time speech recognition using GigaAM v3 on MLX.
+
+Growing-buffer pseudo-streaming: captures audio, transcribes from start
+up to current position each step. GigaAM is fast enough on Apple Silicon
+that even 30s audio processes in well under a second.
+
+Usage:
+    gigaam-stream                           # live microphone
+    gigaam-stream --file audio.wav          # file streaming
+    gigaam-stream --step 1000 --context 5000  # custom timing
+
+Requirements:
+    pip install sounddevice   (for live microphone capture)
+"""
+import argparse
+import os
+import sys
+import time
+import threading
+import signal
+
+import numpy as np
+import mlx.core as mx
+
+SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+DEFAULT_MODEL = os.path.join(SCRIPT_DIR, "gigaam-v3-ctc-mlx")
+SAMPLE_RATE = 16000
+
+
+def parse_args():
+    p = argparse.ArgumentParser(description="GigaAM v3 real-time streaming (MLX)")
+    p.add_argument("-m", "--model", default=DEFAULT_MODEL, help="Model directory")
+    p.add_argument("--step", type=int, default=2000,
+                   help="Process every N ms (default: 2000)")
+    p.add_argument("--file", "-f", type=str, default=None,
+                   help="Transcribe file instead of microphone")
+    p.add_argument("--no-overwrite", action="store_true",
+                   help="Print incrementally instead of overwriting line")
+    p.add_argument("--vad-threshold", type=float, default=0.003,
+                   help="VAD energy threshold (default: 0.003)")
+    return p.parse_args()
+
+
+def clear_line():
+    sys.stdout.write("\33[2K\r")
+    sys.stdout.flush()
+
+
+def stream_from_file(model, audio, args):
+    """Stream a pre-recorded file with growing buffer."""
+    from gigaam_mlx import StreamingConfig
+
+    config = StreamingConfig(step_duration=args.step / 1000.0)
+    total_dur = audio.shape[0] / SAMPLE_RATE
+
+    print(f"\033[90m[Streaming {total_dur:.1f}s audio, step={args.step}ms]\033[0m\n")
+
+    t0 = time.time()
+    for result in model.stream_generate(audio, config=config):
+        elapsed = time.time() - t0
+        if args.no_overwrite:
+            if result.text:
+                sys.stdout.write(result.text)
+                sys.stdout.flush()
+        else:
+            clear_line()
+            # Show current transcription with position indicator
+            pos = f"[{result.audio_position:.1f}s]"
+            sys.stdout.write(f"\r\033[90m{pos}\033[0m {result.cumulative_text}")
+            sys.stdout.flush()
+
+        if result.is_final:
+            print()
+            break
+
+    total_time = time.time() - t0
+    print(f"\n\033[90m[Done in {total_time:.2f}s | {total_dur/total_time:.0f}x realtime]\033[0m")
+
+
+def stream_from_microphone(model, args):
+    """Live streaming from microphone with growing buffer."""
+    try:
+        import sounddevice as sd
+    except ImportError:
+        print("Error: sounddevice not installed. Run: uv pip install sounddevice",
+              file=sys.stderr)
+        sys.exit(1)
+
+    step_ms = args.step
+    step_sec = step_ms / 1000.0
+
+    # Shared state
+    audio_buffer = np.zeros(0, dtype=np.float32)
+    buffer_lock = threading.Lock()
+    running = True
+    prev_text = ""
+
+    def audio_callback(indata, frames, time_info, status):
+        nonlocal audio_buffer
+        if status:
+            print(f"\033[90m[audio: {status}]\033[0m", file=sys.stderr)
+        chunk = indata[:, 0].copy()
+        with buffer_lock:
+            audio_buffer = np.concatenate([audio_buffer, chunk])
+            # Cap at 30s
+            max_buf = SAMPLE_RATE * 30
+            if len(audio_buffer) > max_buf:
+                audio_buffer = audio_buffer[-max_buf:]
+
+    def signal_handler(sig, frame):
+        nonlocal running
+        running = False
+
+    signal.signal(signal.SIGINT, signal_handler)
+
+    print(f"\033[90m[GigaAM streaming — step={step_ms}ms, Ctrl+C to stop]\033[0m")
+    print(f"\033[90m[Start speaking...]\033[0m\n")
+
+    stream = sd.InputStream(
+        samplerate=SAMPLE_RATE,
+        channels=1,
+        dtype="float32",
+        blocksize=int(SAMPLE_RATE * 0.1),
+        callback=audio_callback,
+    )
+    stream.start()
+
+    last_process_time = 0
+    silence_count = 0
+    MAX_SILENCE = 5  # stop after 5 consecutive silent steps
+
+    try:
+        while running:
+            now = time.time()
+            if now - last_process_time < step_sec:
+                time.sleep(0.05)
+                continue
+
+            last_process_time = now
+
+            with buffer_lock:
+                if len(audio_buffer) < int(SAMPLE_RATE * 0.3):
+                    continue
+                buf_copy = audio_buffer.copy()
+
+            # Simple energy VAD on latest step
+            latest = buf_copy[-min(int(step_sec * SAMPLE_RATE), len(buf_copy)):]
+            energy = np.sqrt(np.mean(latest ** 2))
+
+            if energy < args.vad_threshold:
+                silence_count += 1
+                continue
+            else:
+                silence_count = 0
+
+            # Transcribe entire buffer (growing buffer approach)
+            t0 = time.time()
+            audio_mx = mx.array(buf_copy)
+            result = model.stream_live(audio_mx)
+            dt = time.time() - t0
+
+            text = result.cumulative_text
+
+            if text and text != prev_text:
+                clear_line()
+                dur = len(buf_copy) / SAMPLE_RATE
+                sys.stdout.write(
+                    f"\r\033[90m[{dur:.1f}s {dt*1000:.0f}ms]\033[0m {text}"
+                )
+                sys.stdout.flush()
+                prev_text = text
+
+    except KeyboardInterrupt:
+        pass
+    finally:
+        stream.stop()
+        stream.close()
+        print(f"\n")
+        if prev_text:
+            print(f"\033[1mFinal:\033[0m {prev_text}")
+
+
+def main():
+    args = parse_args()
+
+    sys.path.insert(0, SCRIPT_DIR)
+    from gigaam_mlx import load_model, load_audio
+
+    print(f"\033[90mLoading GigaAM v3 MLX...\033[0m", file=sys.stderr)
+    t0 = time.time()
+    model = load_model(args.model)
+    print(f"\033[90mLoaded in {time.time() - t0:.2f}s\033[0m", file=sys.stderr)
+
+    if args.file:
+        audio = load_audio(args.file)
+        stream_from_file(model, audio, args)
+    else:
+        stream_from_microphone(model, args)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/mlx_convert/gigaam-transcribe b/mlx_convert/gigaam-transcribe
new file mode 100755
index 0000000..af387fa
--- /dev/null
+++ b/mlx_convert/gigaam-transcribe
@@ -0,0 +1,15 @@
+#!/bin/bash
+# gigaam-transcribe — shell wrapper for gigaam-cli
+# Activates the correct Python venv and runs gigaam-cli
+# Install: ln -s /path/to/gigaam-transcribe /usr/local/bin/gigaam-transcribe
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+PYTHON="${SCRIPT_DIR}/.venv/bin/python"
+
+if [ ! -f "$PYTHON" ]; then
+    echo "Error: Python venv not found at ${SCRIPT_DIR}/.venv" >&2
+    echo "Run: cd ${SCRIPT_DIR} && uv venv .venv && uv pip install mlx safetensors numpy" >&2
+    exit 1
+fi
+
+exec "$PYTHON" "${SCRIPT_DIR}/gigaam-cli" "$@"
diff --git a/mlx_convert/gigaam_mlx.py b/mlx_convert/gigaam_mlx.py
new file mode 100644
index 0000000..dd461b0
--- /dev/null
+++ b/mlx_convert/gigaam_mlx.py
@@ -0,0 +1,743 @@
+"""
+GigaAM v3 CTC model in MLX.
+
+Architecture:
+  - FeatureExtractor: log-mel spectrogram (computed via numpy/MLX at runtime)
+  - ConformerEncoder: conv1d subsampling + rotary positional embeddings + 16 conformer layers
+  - CTCHead: Conv1d(768, num_classes, kernel=1)
+  - CTC greedy decoding
+
+Supports pseudo-streaming via sliding window (like whisper-stream / mlx-audio Parakeet).
+"""
+import json
+import math
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Generator, List, Optional, Tuple
+
+import mlx.core as mx
+import mlx.nn as nn
+import numpy as np
+
+
+# ─────────────────────────── Config ───────────────────────────
+
+@dataclass
+class GigaAMConfig:
+    model_type: str = "gigaam"
+    model_name: str = "v3_ctc"
+    sample_rate: int = 16000
+    # preprocessor
+    features: int = 64
+    win_length: int = 320
+    hop_length: int = 160
+    n_fft: int = 320
+    center: bool = False
+    # encoder
+    feat_in: int = 64
+    n_layers: int = 16
+    d_model: int = 768
+    subsampling: str = "conv1d"
+    subs_kernel_size: int = 5
+    subsampling_factor: int = 4
+    ff_expansion_factor: int = 4
+    self_attention_model: str = "rotary"
+    pos_emb_max_len: int = 5000
+    n_heads: int = 16
+    conv_kernel_size: int = 5
+    conv_norm_type: str = "layer_norm"
+    # head
+    head_type: str = "ctc"
+    num_classes: int = 34
+    
+    # rnnt specific
+    rnnt_pred_hidden: int = 320
+    rnnt_joint_hidden: int = 320
+    rnnt_max_symbols: int = 10
+    
+    # vocabulary
+    vocabulary: Optional[List[str]] = None
+
+    @classmethod
+    def from_file(cls, path: str) -> "GigaAMConfig":
+        with open(path) as f:
+            d = json.load(f)
+        enc = d.get("encoder", {})
+        pre = d.get("preprocessor", {})
+        head = d.get("head", {})
+        return cls(
+            model_type=d.get("model_type", "gigaam"),
+            model_name=d.get("model_name", "v3_ctc"),
+            sample_rate=d.get("sample_rate", 16000),
+            features=pre.get("features", 64),
+            win_length=pre.get("win_length", 320),
+            hop_length=pre.get("hop_length", 160),
+            n_fft=pre.get("n_fft", 320),
+            center=pre.get("center", False),
+            feat_in=enc.get("feat_in", 64),
+            n_layers=enc.get("n_layers", 16),
+            d_model=enc.get("d_model", 768),
+            subsampling=enc.get("subsampling", "conv1d"),
+            subs_kernel_size=enc.get("subs_kernel_size", 5),
+            subsampling_factor=enc.get("subsampling_factor", 4),
+            ff_expansion_factor=enc.get("ff_expansion_factor", 4),
+            self_attention_model=enc.get("self_attention_model", "rotary"),
+            pos_emb_max_len=enc.get("pos_emb_max_len", 5000),
+            n_heads=enc.get("n_heads", 16),
+            conv_kernel_size=enc.get("conv_kernel_size", 5),
+            conv_norm_type=enc.get("conv_norm_type", "layer_norm"),
+            head_type=d.get("head_type", "ctc"),
+            num_classes=head.get("num_classes", 34) if "num_classes" in head else head.get("decoder", {}).get("num_classes", 34),
+            rnnt_pred_hidden=head.get("decoder", {}).get("pred_hidden", 320),
+            rnnt_joint_hidden=head.get("joint", {}).get("joint_hidden", 320),
+            vocabulary=d.get("vocabulary"),
+        )
+
+
+# ─────────────────────── Audio preprocessing ───────────────────────
+
+def mel_filters(sr: int, n_fft: int, n_mels: int) -> mx.array:
+    """HTK mel filterbank (matching torchaudio default for GigaAM)."""
+    def hz_to_mel(f):
+        return 2595.0 * math.log10(1.0 + f / 700.0)
+    def mel_to_hz(m):
+        return 700.0 * (10.0 ** (m / 2595.0) - 1.0)
+
+    f_min, f_max = 0.0, sr / 2.0
+    mel_min, mel_max = hz_to_mel(f_min), hz_to_mel(f_max)
+    mel_points = np.linspace(mel_min, mel_max, n_mels + 2)
+    hz_points = np.array([mel_to_hz(m) for m in mel_points])
+
+    bins = np.floor((n_fft + 1) * hz_points / sr).astype(int)
+
+    fb = np.zeros((n_fft // 2 + 1, n_mels), dtype=np.float32)
+    for i in range(n_mels):
+        lo, mid, hi = bins[i], bins[i + 1], bins[i + 2]
+        for k in range(lo, mid):
+            if mid != lo:
+                fb[k, i] = (k - lo) / (mid - lo)
+        for k in range(mid, hi):
+            if hi != mid:
+                fb[k, i] = (hi - k) / (hi - mid)
+    return mx.array(fb)
+
+
+def hanning_window(size: int) -> mx.array:
+    """Hanning window."""
+    n = np.arange(size, dtype=np.float32)
+    return mx.array(0.5 - 0.5 * np.cos(2.0 * np.pi * n / size))
+
+
+def stft(signal: mx.array, n_fft: int, hop: int, win_len: int, window: mx.array) -> mx.array:
+    """Simple STFT using mx operations."""
+    # Pad if necessary
+    pad_amount = n_fft // 2
+    # For center=False, no padding
+    length = signal.shape[-1]
+    # Number of frames
+    n_frames = 1 + (length - win_len) // hop
+
+    # Build frames via strided indexing
+    indices = mx.arange(win_len)[None, :] + (mx.arange(n_frames) * hop)[:, None]
+    frames = signal[indices] * window[None, :]
+
+    # Zero-pad to n_fft if needed
+    if win_len < n_fft:
+        pad_size = n_fft - win_len
+        frames = mx.pad(frames, ((0, 0), (0, pad_size)))
+
+    # Real FFT
+    spectrum = mx.fft.rfft(frames)
+    return spectrum  # [n_frames, n_fft//2 + 1]
+
+
+def log_mel_spectrogram(audio: mx.array, cfg: GigaAMConfig,
+                        mel_fb: Optional[mx.array] = None,
+                        stft_win: Optional[mx.array] = None) -> mx.array:
+    """Compute log-mel spectrogram matching GigaAM FeatureExtractor."""
+    window = stft_win if stft_win is not None else hanning_window(cfg.win_length)
+    spec = stft(audio, cfg.n_fft, cfg.hop_length, cfg.win_length, window)
+    # Power spectrum
+    power = mx.square(mx.abs(spec))  # [T, n_fft//2+1]
+    # Mel filterbank
+    if mel_fb is not None:
+        filters = mel_fb
+    else:
+        filters = mel_filters(cfg.sample_rate, cfg.n_fft, cfg.features)  # [n_fft//2+1, n_mels]
+    mel = power @ filters  # [T, n_mels]
+    # Log
+    log_mel = mx.log(mx.clip(mel, 1e-9, 1e9))
+    return log_mel  # [T, n_mels]
+
+
+# ─────────────────────── Model layers ───────────────────────
+
+class Conv1dSubsampling(nn.Module):
+    """Conv1d striding subsampling: 2 conv1d layers with stride=2, ReLU."""
+    def __init__(self, cfg: GigaAMConfig):
+        super().__init__()
+        ks = cfg.subs_kernel_size
+        pad = (ks - 1) // 2
+        n_subs = int(math.log2(cfg.subsampling_factor))
+        
+        layers = []
+        in_ch = cfg.feat_in
+        for _ in range(n_subs):
+            layers.append(nn.Conv1d(in_ch, cfg.d_model, kernel_size=ks, stride=2, padding=pad))
+            layers.append(nn.ReLU())
+            in_ch = cfg.d_model
+        self.conv = layers
+        self._n_subs = n_subs
+        self._ks = ks
+        self._pad = pad
+
+    def __call__(self, x: mx.array, lengths: mx.array) -> Tuple[mx.array, mx.array]:
+        """x: [B, T, feat_in] → [B, T', d_model]"""
+        for layer in self.conv:
+            x = layer(x)
+        # Compute output lengths
+        for _ in range(self._n_subs):
+            lengths = mx.floor((lengths.astype(mx.float32) + 2 * self._pad - self._ks) / 2 + 1).astype(mx.int32)
+        return x, lengths
+
+
+class RotaryPositionalEmbedding(nn.Module):
+    """Rotary positional embeddings (RoPE)."""
+    def __init__(self, dim: int, base: int = 10000, max_len: int = 5000):
+        super().__init__()
+        self.dim = dim
+        self.base = base
+        inv_freq = 1.0 / (base ** (np.arange(0, dim, 2, dtype=np.float32) / dim))
+        self._inv_freq = mx.array(inv_freq)
+
+    def __call__(self, seq_len: int) -> Tuple[mx.array, mx.array]:
+        t = mx.arange(seq_len).astype(mx.float32)
+        freqs = mx.outer(t, self._inv_freq)
+        emb = mx.concatenate([freqs, freqs], axis=-1)
+        return mx.cos(emb), mx.sin(emb)
+
+
+def rotate_half(x: mx.array) -> mx.array:
+    """Rotates half the hidden dims of the input."""
+    d = x.shape[-1] // 2
+    x1, x2 = x[..., :d], x[..., d:]
+    return mx.concatenate([-x2, x1], axis=-1)
+
+
+def apply_rotary_emb(q: mx.array, k: mx.array, cos: mx.array, sin: mx.array) -> Tuple[mx.array, mx.array]:
+    """Apply rotary embeddings. q,k: [B, H, T, D], cos,sin: [T, D]"""
+    cos = cos[None, None, :, :]  # [1, 1, T, D]
+    sin = sin[None, None, :, :]
+    q_rot = q * cos + rotate_half(q) * sin
+    k_rot = k * cos + rotate_half(k) * sin
+    return q_rot, k_rot
+
+
+class RotaryMultiHeadAttention(nn.Module):
+    """Multi-head attention with rotary positional embeddings.
+    
+    GigaAM applies RoPE BEFORE linear projections:
+    1. Reshape raw input to multi-head: [B, T, D] → [B, T, H, d_k]
+    2. Apply RoPE to query/key heads
+    3. Reshape back to [B, T, D]
+    4. Project through linear_q/k/v
+    5. Standard scaled dot-product attention
+    """
+    def __init__(self, d_model: int, n_heads: int):
+        super().__init__()
+        self.n_heads = n_heads
+        self.d_k = d_model // n_heads
+        self.linear_q = nn.Linear(d_model, d_model)
+        self.linear_k = nn.Linear(d_model, d_model)
+        self.linear_v = nn.Linear(d_model, d_model)
+        self.linear_out = nn.Linear(d_model, d_model)
+
+    def __call__(self, x: mx.array, cos: mx.array, sin: mx.array,
+                 mask: Optional[mx.array] = None) -> mx.array:
+        B, T, D = x.shape
+        H, d_k = self.n_heads, self.d_k
+
+        # 1. Reshape raw input to multi-head for RoPE: [B, T, H, d_k]
+        x_heads = x.reshape(B, T, H, d_k)
+
+        # 2. Apply RoPE to query and key (same input for self-attention)
+        # cos, sin: [T, d_k] → [1, T, 1, d_k]
+        cos_e = cos[None, :, None, :]
+        sin_e = sin[None, :, None, :]
+        q_rot = x_heads * cos_e + rotate_half(x_heads) * sin_e
+        k_rot = x_heads * cos_e + rotate_half(x_heads) * sin_e
+
+        # 3. Reshape back to [B, T, D]
+        q_rot = q_rot.reshape(B, T, D)
+        k_rot = k_rot.reshape(B, T, D)
+
+        # 4. Project through linear layers
+        q = self.linear_q(q_rot).reshape(B, T, H, d_k).transpose(0, 2, 1, 3)  # [B, H, T, d_k]
+        k = self.linear_k(k_rot).reshape(B, T, H, d_k).transpose(0, 2, 1, 3)
+        v = self.linear_v(x).reshape(B, T, H, d_k).transpose(0, 2, 1, 3)  # value uses original x
+
+        # 5. Scaled dot-product attention
+        scale = math.sqrt(d_k)
+        attn = (q @ k.transpose(0, 1, 3, 2)) / scale
+        if mask is not None:
+            attn = attn + mask
+        attn = mx.softmax(attn, axis=-1)
+        out = attn @ v  # [B, H, T, d_k]
+        out = out.transpose(0, 2, 1, 3).reshape(B, T, -1)
+        return self.linear_out(out)
+
+
+class ConformerConvolution(nn.Module):
+    """Conformer convolution module with LayerNorm."""
+    def __init__(self, d_model: int, kernel_size: int):
+        super().__init__()
+        pad = (kernel_size - 1) // 2
+        self.pointwise_conv1 = nn.Conv1d(d_model, d_model * 2, kernel_size=1)
+        self.depthwise_conv = nn.Conv1d(d_model, d_model, kernel_size=kernel_size,
+                                         padding=pad, groups=d_model)
+        # GigaAM v3 uses layer_norm for conv_norm_type
+        self.batch_norm = nn.LayerNorm(d_model)
+        self.activation = nn.SiLU()
+        self.pointwise_conv2 = nn.Conv1d(d_model, d_model, kernel_size=1)
+
+    def __call__(self, x: mx.array) -> mx.array:
+        # x: [B, T, D]
+        x = self.pointwise_conv1(x)  # → [B, T, 2*D]
+        # GLU: split last dim in half, apply sigmoid gate
+        half = x.shape[-1] // 2
+        x = x[..., :half] * mx.sigmoid(x[..., half:])  # GLU
+        x = self.depthwise_conv(x)   # → [B, T, D]
+        x = self.batch_norm(x)
+        x = self.activation(x)
+        x = self.pointwise_conv2(x)  # → [B, T, D]
+        return x
+
+
+class ConformerFeedForward(nn.Module):
+    """Conformer feed-forward module."""
+    def __init__(self, d_model: int, d_ff: int):
+        super().__init__()
+        self.linear1 = nn.Linear(d_model, d_ff)
+        self.activation = nn.SiLU()
+        self.linear2 = nn.Linear(d_ff, d_model)
+
+    def __call__(self, x: mx.array) -> mx.array:
+        return self.linear2(self.activation(self.linear1(x)))
+
+
+class ConformerLayer(nn.Module):
+    """Single conformer layer."""
+    def __init__(self, cfg: GigaAMConfig):
+        super().__init__()
+        d = cfg.d_model
+        d_ff = d * cfg.ff_expansion_factor
+
+        self.norm_feed_forward1 = nn.LayerNorm(d)
+        self.feed_forward1 = ConformerFeedForward(d, d_ff)
+        self.norm_self_att = nn.LayerNorm(d)
+        self.self_attn = RotaryMultiHeadAttention(d, cfg.n_heads)
+        self.norm_conv = nn.LayerNorm(d)
+        self.conv = ConformerConvolution(d, cfg.conv_kernel_size)
+        self.norm_feed_forward2 = nn.LayerNorm(d)
+        self.feed_forward2 = ConformerFeedForward(d, d_ff)
+        self.norm_out = nn.LayerNorm(d)
+
+    def __call__(self, x: mx.array, cos: mx.array, sin: mx.array,
+                 mask: Optional[mx.array] = None) -> mx.array:
+        # FF1
+        residual = x
+        x = self.norm_feed_forward1(x)
+        x = self.feed_forward1(x)
+        residual = residual + x * 0.5
+
+        # Self-attention
+        x = self.norm_self_att(residual)
+        x = self.self_attn(x, cos, sin, mask=mask)
+        residual = residual + x
+
+        # Conv
+        x = self.norm_conv(residual)
+        x = self.conv(x)
+        residual = residual + x
+
+        # FF2
+        x = self.norm_feed_forward2(residual)
+        x = self.feed_forward2(x)
+        residual = residual + x * 0.5
+
+        return self.norm_out(residual)
+
+
+class ConformerEncoder(nn.Module):
+    """GigaAM Conformer encoder."""
+    def __init__(self, cfg: GigaAMConfig):
+        super().__init__()
+        self.pre_encode = Conv1dSubsampling(cfg)
+        self.pos_enc = RotaryPositionalEmbedding(
+            cfg.d_model // cfg.n_heads,
+            base=10000,
+            max_len=cfg.pos_emb_max_len,
+        )
+        self.layers = [ConformerLayer(cfg) for _ in range(cfg.n_layers)]
+
+    def __call__(self, features: mx.array, lengths: mx.array) -> Tuple[mx.array, mx.array]:
+        """features: [B, T, feat_in] → encoded: [B, T', D], lengths: [B]"""
+        x, lengths = self.pre_encode(features, lengths)
+        T = x.shape[1]
+        cos, sin = self.pos_enc(T)
+        for layer in self.layers:
+            x = layer(x, cos, sin)
+        return x, lengths
+
+
+class RNNTJoint(nn.Module):
+    def __init__(self, enc_hidden: int, pred_hidden: int, joint_hidden: int, num_classes: int):
+        super().__init__()
+        self.enc = nn.Linear(enc_hidden, joint_hidden)
+        self.pred = nn.Linear(pred_hidden, joint_hidden)
+        self.joint_net_linear = nn.Linear(joint_hidden, num_classes)
+
+    def __call__(self, encoder_out: mx.array, decoder_out: mx.array) -> mx.array:
+        enc = mx.expand_dims(self.enc(encoder_out), 2)
+        pred = mx.expand_dims(self.pred(decoder_out), 1)
+        joint_rep = mx.maximum(enc + pred, 0.0)
+        return self.joint_net_linear(joint_rep)
+
+
+class RNNTDecoder(nn.Module):
+    def __init__(self, num_classes: int, pred_hidden: int):
+        super().__init__()
+        self.blank_id = num_classes - 1
+        self.pred_hidden = pred_hidden
+        self.embed = nn.Embedding(num_classes, pred_hidden)
+        self.lstm = nn.LSTM(pred_hidden, pred_hidden)
+
+    def predict(self, x: Optional[int], state: Optional[Tuple[mx.array, mx.array]], batch_size: int = 1):
+        if x is not None:
+            emb = self.embed(mx.array([[x]]))
+        else:
+            emb = mx.zeros((batch_size, 1, self.pred_hidden))
+
+        h, c = state if state is not None else (None, None)
+        out, cell = self.lstm(emb, hidden=h, cell=c)
+        return out, (out[:, -1, :], cell[:, -1, :])
+
+
+class RNNTHead(nn.Module):
+    def __init__(self, feat_in: int, pred_hidden: int, joint_hidden: int, num_classes: int):
+        super().__init__()
+        self.decoder = RNNTDecoder(num_classes, pred_hidden)
+        self.joint = RNNTJoint(feat_in, pred_hidden, joint_hidden, num_classes)
+
+
+class CTCHead(nn.Module):
+    """CTC decoder head: Conv1d(d_model, num_classes, kernel=1)."""
+    def __init__(self, feat_in: int, num_classes: int):
+        super().__init__()
+        self.decoder_layers = [nn.Conv1d(feat_in, num_classes, kernel_size=1)]
+
+    def __call__(self, x: mx.array) -> mx.array:
+        """x: [B, T, D] → log_probs: [B, T, C]"""
+        logits = self.decoder_layers[0](x)  # [B, T, C]
+        return mx.softmax(logits, axis=-1)  # we'll use log later
+
+
+# ─────────────────────── Streaming dataclasses ───────────────────────
+
+@dataclass
+class StreamingConfig:
+    """Configuration for pseudo-streaming transcription.
+
+    Attributes:
+        chunk_duration: Duration of each audio chunk in seconds.
+        context_duration: Extra audio from before the chunk to keep as context.
+            The model always sees [context + chunk] for better accuracy.
+        step_duration: How much to advance between steps. If None, equals chunk_duration.
+    """
+    chunk_duration: float = 2.0
+    context_duration: float = 3.0
+    step_duration: Optional[float] = None
+
+    def __post_init__(self):
+        if self.step_duration is None:
+            self.step_duration = self.chunk_duration
+
+
+@dataclass
+class StreamingResult:
+    """Result from streaming transcription — mlx-audio compatible.
+
+    Attributes:
+        text: New text since last emission.
+        is_final: True if this is the final result.
+        start_time: Start time in seconds.
+        end_time: End time in seconds.
+        progress: Progress 0.0–1.0.
+        audio_position: Current position in audio (seconds).
+        audio_duration: Total audio duration (seconds), 0 for live.
+        cumulative_text: Full accumulated transcription so far.
+        language: Language code.
+    """
+    text: str
+    is_final: bool
+    start_time: float
+    end_time: float
+    progress: float = 0.0
+    audio_position: float = 0.0
+    audio_duration: float = 0.0
+    cumulative_text: str = ""
+    language: str = "ru"
+
+
+class GigaAM(nn.Module):
+    """Full GigaAM model (supports CTC and RNNT)."""
+    def __init__(self, cfg: GigaAMConfig):
+        super().__init__()
+        self.cfg = cfg
+        self.encoder = ConformerEncoder(cfg)
+        
+        if cfg.head_type == "rnnt":
+            self.head = RNNTHead(
+                feat_in=cfg.d_model,
+                pred_hidden=cfg.rnnt_pred_hidden,
+                joint_hidden=cfg.rnnt_joint_hidden,
+                num_classes=cfg.num_classes
+            )
+        else:
+            self.head = CTCHead(cfg.d_model, cfg.num_classes)
+            
+        self.mel_filterbank: Optional[mx.array] = None
+        self.stft_window: Optional[mx.array] = None
+
+    def __call__(self, features: mx.array, lengths: mx.array) -> Tuple[mx.array, mx.array]:
+        """Returns encoded features and their lengths."""
+        encoded, enc_lengths = self.encoder(features, lengths)
+        return encoded, enc_lengths
+
+    def _ctc_decode(self, log_probs: mx.array, enc_length: int) -> str:
+        """CTC greedy decode: collapse repeated + remove blanks."""
+        vocab = self.cfg.vocabulary
+        blank_id = len(vocab)
+        labels = mx.argmax(log_probs, axis=-1)
+        labels_list = labels.tolist()
+
+        result = []
+        prev = -1
+        for t, label in enumerate(labels_list):
+            if t >= enc_length:
+                break
+            if label == blank_id:
+                prev = label
+                continue
+            if label == prev:
+                continue
+            result.append(label)
+            prev = label
+
+        return "".join(vocab[i] for i in result)
+
+    def _compute_features(self, audio: mx.array) -> Tuple[mx.array, mx.array]:
+        """Audio → mel features [1, T, features] + lengths [1]."""
+        mel = log_mel_spectrogram(audio, self.cfg,
+                                   mel_fb=self.mel_filterbank,
+                                   stft_win=self.stft_window)
+        mel = mx.expand_dims(mel, 0)
+        lengths = mx.array([mel.shape[1]])
+        return mel, lengths
+
+    def transcribe(self, audio: mx.array) -> str:
+        """Transcribe raw audio waveform → text."""
+        mel, lengths = self._compute_features(audio)
+        encoded, enc_lengths = self(mel, lengths)
+        mx.eval(encoded, enc_lengths)
+        
+        if self.cfg.head_type == "rnnt":
+            return self._rnnt_decode(encoded[0], int(enc_lengths[0]))
+        else:
+            log_probs = self.head(encoded)
+            return self._ctc_decode(log_probs[0], int(enc_lengths[0]))
+
+    def _rnnt_decode(self, encoded: mx.array, enc_length: int) -> str:
+        """Greedy decode for RNN-T."""
+        vocab = self.cfg.vocabulary
+        blank_id = len(vocab)
+        max_symbols = self.cfg.rnnt_max_symbols
+        
+        hyp = []
+        dec_state = None
+        last_label = None
+        
+        for t in range(enc_length):
+            # encoded: [T, D] -> [1, 1, D]
+            f = encoded[t:t+1, :].reshape(1, 1, -1)
+            
+            not_blank = True
+            new_symbols = 0
+            while not_blank and new_symbols < max_symbols:
+                g, hidden = self.head.decoder.predict(last_label, dec_state, batch_size=1)
+                # g: [1, 1, D]
+                # joint output is [1, 1, 1, num_classes], we want argmax over classes
+                logits = self.head.joint(f, g)  # [1, 1, 1, C]
+                k = int(mx.argmax(logits[0, 0, 0, :]).item())
+                
+                if k == blank_id:
+                    not_blank = False
+                else:
+                    hyp.append(k)
+                    dec_state = hidden
+                    last_label = k
+                    new_symbols += 1
+                    
+        return "".join(vocab[i] for i in hyp)
+
+    def transcribe_chunk(self, audio: mx.array) -> str:
+        """Transcribe a single chunk (for streaming). Same as transcribe but clearer name."""
+        return self.transcribe(audio)
+
+    def stream_generate(
+        self,
+        audio: mx.array,
+        config: Optional[StreamingConfig] = None,
+    ) -> Generator[StreamingResult, None, None]:
+        """Pseudo-streaming transcription over pre-recorded audio.
+
+        Uses growing buffer approach: each step transcribes from the start
+        up to the current position. GigaAM at 85x realtime makes this fast
+        even for 30s audio (~0.4s inference).
+
+        For very long audio (>30s), falls back to sliding window of last 30s.
+
+        Args:
+            audio: Raw audio waveform, mx.array, 16kHz mono.
+            config: StreamingConfig with step duration.
+
+        Yields:
+            StreamingResult with incremental text for each step.
+        """
+        if config is None:
+            config = StreamingConfig()
+
+        sr = self.cfg.sample_rate
+        total_samples = audio.shape[0]
+        audio_duration = total_samples / sr
+
+        step_samples = int(config.step_duration * sr)
+        max_window = int(30.0 * sr)  # cap at 30s for memory/speed
+
+        previous_text = ""
+        position = step_samples  # start with first step
+
+        while position <= total_samples:
+            is_last = position >= total_samples
+            # Growing buffer from start, capped at 30s
+            window_start = max(0, position - max_window)
+            window = audio[window_start:position]
+
+            current_text = self.transcribe(window)
+
+            # Incremental text
+            new_text = _incremental_text(previous_text, current_text)
+            previous_text = current_text
+
+            audio_pos = position / sr
+            yield StreamingResult(
+                text=new_text,
+                is_final=is_last,
+                start_time=window_start / sr,
+                end_time=audio_pos,
+                progress=position / total_samples,
+                audio_position=audio_pos,
+                audio_duration=audio_duration,
+                cumulative_text=current_text,
+                language="ru",
+            )
+
+            if is_last:
+                break
+
+            position = min(position + step_samples, total_samples)
+
+    def stream_live(
+        self,
+        audio_buffer: mx.array,
+    ) -> StreamingResult:
+        """Transcribe a growing audio buffer for live microphone use.
+
+        Call this repeatedly as new audio arrives. Transcribes the full buffer
+        (capped at 30s from the end). GigaAM is 85x realtime so this is fast.
+
+        Args:
+            audio_buffer: The full accumulated audio so far.
+
+        Returns:
+            StreamingResult with current full transcription.
+        """
+        sr = self.cfg.sample_rate
+        total_samples = audio_buffer.shape[0]
+        max_window = int(30.0 * sr)
+
+        start = max(0, total_samples - max_window)
+        window = audio_buffer[start:]
+
+        text = self.transcribe(window)
+
+        return StreamingResult(
+            text=text,
+            is_final=False,
+            start_time=start / sr,
+            end_time=total_samples / sr,
+            progress=0.0,
+            audio_position=total_samples / sr,
+            audio_duration=0.0,
+            cumulative_text=text,
+            language="ru",
+        )
+
+
+def _incremental_text(previous: str, current: str) -> str:
+    """Find new text added to current vs previous.
+
+    If current starts with previous → return the new suffix.
+    If model corrected → return full current (with marker).
+    """
+    if not previous:
+        return current
+    if current.startswith(previous):
+        return current[len(previous):]
+    # Model self-corrected — return full updated text
+    return current
+
+
+def load_model(model_dir: str) -> GigaAM:
+    """Load converted GigaAM MLX model from directory."""
+    model_dir = Path(model_dir)
+    cfg = GigaAMConfig.from_file(str(model_dir / "config.json"))
+    model = GigaAM(cfg)
+    weights = mx.load(str(model_dir / "model.safetensors"))
+    
+    # Extract preprocessing weights
+    mel_fb = weights.pop("mel_filterbank", None)
+    stft_win = weights.pop("stft_window", None)
+    
+    if mel_fb is not None:
+        model.mel_filterbank = mel_fb.astype(mx.float32)
+    if stft_win is not None:
+        model.stft_window = stft_win.astype(mx.float32)
+    
+    # Load model weights
+    model.load_weights(list(weights.items()), strict=False)
+    mx.eval(model.parameters())
+    return model
+
+
+def load_audio(path: str, sample_rate: int = 16000) -> mx.array:
+    """Load audio via ffmpeg → mx.array."""
+    import subprocess
+    cmd = [
+        "ffmpeg", "-nostdin", "-threads", "0", "-i", path,
+        "-f", "s16le", "-ac", "1", "-acodec", "pcm_s16le",
+        "-ar", str(sample_rate), "-"
+    ]
+    result = subprocess.run(cmd, capture_output=True, check=True)
+    audio_np = np.frombuffer(result.stdout, dtype=np.int16).astype(np.float32) / 32768.0
+    return mx.array(audio_np)