10 changes: 10 additions & 0 deletions demos/README.md
@@ -21,3 +21,13 @@ python3 demos/check_ckpt_for_gelu_shift.py \

`adam_vs_adamw.sh` trains two tiny Shakespeare models, one with Adam and one
with AdamW, then compares their statistics using `view_model_stats.py`.

## ExecuTorch export

Use `export_ckpt_to_executorch.sh` to convert a training checkpoint into an ExecuTorch `.pte` program.

```bash
./demos/export_ckpt_to_executorch.sh out/ckpt.pt
```

Pass a second argument to set the output `.pte` path; any arguments after it are forwarded to the underlying Python exporter.
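For instance, to write the program to a custom location (the paths here are illustrative):

```bash
# Hypothetical paths; anything after the .pte path is passed through unchanged
# to model_exports.executorch.export_checkpoint.
./demos/export_ckpt_to_executorch.sh out/ckpt.pt exports/shakespeare.pte
```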
16 changes: 16 additions & 0 deletions demos/export_ckpt_to_executorch.sh
@@ -0,0 +1,16 @@
#!/usr/bin/env bash
set -euo pipefail

if [[ ${1:-} == "" ]]; then
  echo "Usage: $0 <ckpt-path> [pte-path]"
  exit 1
fi

CKPT_PATH=$1
PTE_PATH=${2:-}

if [[ -n "$PTE_PATH" ]]; then
  python -m model_exports.executorch.export_checkpoint --ckpt "$CKPT_PATH" --pte-path "$PTE_PATH" "${@:3}"
else
  python -m model_exports.executorch.export_checkpoint --ckpt "$CKPT_PATH" "${@:2}"
fi
7 changes: 7 additions & 0 deletions hardware_targets/README.md
@@ -0,0 +1,7 @@
# Hardware profiling targets

This directory contains automation helpers for running exported ExecuTorch programs on specific devices.

## Android

Use `android/profile_pte.py` to stage a runner and `.pte` file onto an attached device via `adb`, invoke the runner, and parse energy/latency metrics emitted between `EXECUTORCH_METRICS_BEGIN` and `EXECUTORCH_METRICS_END` markers.
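A typical invocation looks like the following sketch; the runner and `.pte` paths are placeholders, while the flags match the script's argument parser:

```bash
# Stages both artifacts under /data/local/tmp/nanogpt (the default --remote-dir),
# runs the binary over adb, and prints any metrics found in its output.
python hardware_targets/android/profile_pte.py \
    --runner build/android/nanogpt_runner \
    --pte out/executorch/model.pte \
    --prompt "Hello world!" \
    --serial emulator-5554
```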
139 changes: 139 additions & 0 deletions hardware_targets/android/profile_pte.py
@@ -0,0 +1,139 @@
"""Utility for profiling ExecuTorch `.pte` programs on Android devices via `adb`."""

from __future__ import annotations

import argparse
import json
import re
import shlex
import subprocess
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, Optional

METRICS_BEGIN = "EXECUTORCH_METRICS_BEGIN"
METRICS_END = "EXECUTORCH_METRICS_END"


@dataclass(slots=True)
class MetricsSummary:
    phase: str
    tokens: int
    latency_ms: float
    energy_mj: float

    @property
    def latency_per_token_ms(self) -> float:
        return self.latency_ms / max(self.tokens, 1)

    @property
    def energy_per_token_mj(self) -> float:
        return self.energy_mj / max(self.tokens, 1)


def _adb_cmd(args: list[str], serial: Optional[str] = None, **kwargs: Any) -> subprocess.CompletedProcess[str]:
base = ["adb"]
if serial:
base += ["-s", serial]
result = subprocess.run(base + args, check=True, capture_output=True, text=True, **kwargs)
return result


def _extract_metrics(stdout: str) -> Dict[str, MetricsSummary]:
    pattern = re.compile(rf"{METRICS_BEGIN}(.*?){METRICS_END}", re.DOTALL)
    match = pattern.search(stdout)
    if not match:
        return {}
    payload = match.group(1).strip()
    data = json.loads(payload)
    summaries: Dict[str, MetricsSummary] = {}
    for phase, values in data.items():
        summaries[phase] = MetricsSummary(
            phase=phase,
            tokens=int(values.get("tokens", 0)),
            latency_ms=float(values.get("latency_ms", 0.0)),
            energy_mj=float(values.get("energy_mj", 0.0)),
        )
    return summaries


def _format_summary(summary: MetricsSummary) -> str:
    return (
        f"{summary.phase}: tokens={summary.tokens} "
        f"latency={summary.latency_ms:.2f}ms (per token {summary.latency_per_token_ms:.2f}ms) "
        f"energy={summary.energy_mj:.3f}mJ (per token {summary.energy_per_token_mj:.3f}mJ)"
    )


def profile(args: argparse.Namespace) -> None:
    remote_dir = Path(args.remote_dir)
    remote_dir_str = str(remote_dir)
    remote_runner = remote_dir / Path(args.runner).name
    remote_pte = remote_dir / Path(args.pte).name

    print(f"[INFO] Pushing runner to {remote_runner}")
    _adb_cmd(["push", args.runner, str(remote_runner)], serial=args.serial)
    print(f"[INFO] Pushing PTE to {remote_pte}")
    _adb_cmd(["push", args.pte, str(remote_pte)], serial=args.serial)

    prompt = args.prompt or "Hello world!"
    runner_invocation = (
        f"cd {shlex.quote(remote_dir_str)} && "
        f"chmod +x {shlex.quote(remote_runner.name)} && "
        f"echo {shlex.quote(prompt)} | "
        f"{shlex.quote('./' + remote_runner.name)}"
    )

    print(f"[INFO] Launching runner via adb shell: {runner_invocation}")
    result = _adb_cmd(["shell", runner_invocation], serial=args.serial)
    stdout = result.stdout
    if stdout:
        print("[DEVICE OUTPUT]")
        print(stdout)

    summaries = _extract_metrics(stdout)
    if not summaries:
        print(
            "[WARN] No ExecuTorch metrics detected. Ensure the runner prints JSON between "
            f"{METRICS_BEGIN} and {METRICS_END}."
        )
        return

    print("[INFO] Parsed metrics:")
    for summary in summaries.values():
        print(" " + _format_summary(summary))


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--runner", required=True, help="Path to the compiled ExecuTorch runner binary.")
    parser.add_argument("--pte", required=True, help="Path to the exported ExecuTorch .pte program.")
    parser.add_argument(
        "--remote-dir",
        default="/data/local/tmp/nanogpt",
        help="Directory on the device where artifacts will be staged.",
    )
    parser.add_argument(
        "--prompt",
        help="Prompt text to feed into the runner. Defaults to 'Hello world!'.",
    )
    parser.add_argument(
        "--serial",
        help="Optional adb serial number when multiple devices are connected.",
    )
    return parser.parse_args()


def main() -> None:
    args = parse_args()
    try:
        profile(args)
    except FileNotFoundError as exc:
        print(f"[ERROR] Failed to invoke external tool: {exc}")
    except subprocess.CalledProcessError as exc:
        print("[ERROR] adb command failed:")
        print(exc.stderr)


if __name__ == "__main__":
    main()
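For reference, `_extract_metrics` above expects the device's stdout to carry a JSON object keyed by phase between the two markers. A minimal sketch of an accepted payload (phase names and numbers are made up):

```python
# Illustrative runner output; "prefill" and "decode" are placeholder phase names.
sample_stdout = (
    "EXECUTORCH_METRICS_BEGIN"
    '{"prefill": {"tokens": 8, "latency_ms": 41.7, "energy_mj": 5.2},'
    ' "decode": {"tokens": 32, "latency_ms": 803.5, "energy_mj": 61.0}}'
    "EXECUTORCH_METRICS_END"
)

summaries = _extract_metrics(sample_stdout)  # {"prefill": MetricsSummary(...), "decode": ...}
for summary in summaries.values():
    print(_format_summary(summary))
```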
128 changes: 121 additions & 7 deletions hyperparam_search.py
@@ -21,6 +21,7 @@
import os
import subprocess
import sys
from dataclasses import dataclass
from contextlib import contextmanager
import re
from copy import deepcopy
@@ -34,6 +35,56 @@
import ast


# ExecuTorch export settings
@dataclass(slots=True)
class ExecuTorchExportOptions:
enabled: bool = False
delegate: str = 'none'
smoke_test_tokens: int = 0
smoke_test_prompt: str | None = None
tokenizer_vocab: Path | None = None
max_output_tokens: int = 32


def maybe_export_executorch(ckpt_dir: Path, run_label: str, options: ExecuTorchExportOptions) -> None:
if not options.enabled:
return

ckpt_path = ckpt_dir / 'ckpt.pt'
if not ckpt_path.exists():
print(f"[WARN] ExecuTorch export skipped (missing {ckpt_path}).")
return

try:
from model_exports.executorch.exporter import ExportConfig, export_checkpoint_to_pte
except ImportError as exc:
print(f"[WARN] ExecuTorch export unavailable: {exc}")
return

export_dir = ckpt_path.parent / 'executorch'
export_dir.mkdir(parents=True, exist_ok=True)
safe_name = re.sub(r'[^A-Za-z0-9._-]+', '_', run_label).strip('_') or 'model'
pte_path = export_dir / f"{safe_name}.pte"

config = ExportConfig(
delegate=options.delegate,
generate_etrecord=False,
smoke_test_tokens=max(0, options.smoke_test_tokens),
smoke_test_prompt=options.smoke_test_prompt,
tokenizer_path=options.tokenizer_vocab,
max_output_tokens=options.max_output_tokens,
metadata=True,
)

try:
export_checkpoint_to_pte(ckpt_path, pte_path, config)
print(f"[INFO] ExecuTorch export ready: {pte_path}")
except ImportError as exc:
print(f"[WARN] ExecuTorch export failed (missing dependency): {exc}")
except Exception as exc:
print(f"[ERROR] ExecuTorch export failed for {run_label}: {exc}")


# ───────────────────────── helpers ──────────────────────────
def dict_to_cli(d: Dict[str, Any]) -> List[str]:
"""
@@ -70,7 +121,11 @@ def patched_argv(argv: List[str]):
sys.argv = old


def run_trial_inproc(cfg: Dict[str, Any]) -> Tuple[float, float, int, float, float]:
def run_trial_inproc(
cfg: Dict[str, Any],
export_options: ExecuTorchExportOptions,
run_label: str,
) -> Tuple[float, float, int, float, float]:
"""Return (best_val_loss, num_params, best_iter, peak_gpu_mb, iter_latency_ms)."""
from train import Trainer
from train_args import parse_args as parse_train_args
@@ -85,13 +140,18 @@ def run_trial_inproc(cfg: Dict[str, Any]) -> Tuple[float, float, int, float, flo
best_iter = int(getattr(tr, "iter_num_best_val_loss", 0))
peak_gpu_mb = float(getattr(tr, "peak_gpu_usage", 0.0) / (1024 ** 2))
iter_latency_ms = float(getattr(tr, "iter_latency_avg", 0.0))
maybe_export_executorch(Path(cfg.get("out_dir", "out")), run_label, export_options)
del tr
torch.cuda.empty_cache()
gc.collect()
return loss, nparam, best_iter, peak_gpu_mb, iter_latency_ms


def run_trial_subproc(cfg: Dict[str, Any]) -> Tuple[float, float, int, float, float]:
def run_trial_subproc(
cfg: Dict[str, Any],
export_options: ExecuTorchExportOptions,
run_label: str,
) -> Tuple[float, float, int, float, float]:
script_dir = Path(__file__).parent
cmd = [sys.executable, str(script_dir / "train.py")] + dict_to_cli(cfg)
env = {k: v for k, v in os.environ.items() if k not in {"RANK", "WORLD_SIZE"}}
@@ -101,12 +161,13 @@ def run_trial_subproc(cfg: Dict[str, Any]) -> Tuple[float, float, int, float, fl
raise RuntimeError("train.py failed")

out_dir = Path(cfg.get("out_dir", "out"))
line = (out_dir / "best_val_loss_and_iter.txt").read_text().strip().split(",")
line = (out_dir / "best_val_loss_and_iter.txt").read_text().strip().split(',')
loss = float(line[0])
best_iter = int(line[1])
nparam = float(line[2])
peak_gpu_mb = float(line[5])
iter_latency_ms = float(line[6])
maybe_export_executorch(out_dir, run_label, export_options)
torch.cuda.empty_cache()
gc.collect()
return loss, nparam, best_iter, peak_gpu_mb, iter_latency_ms
@@ -174,6 +235,45 @@ def main():
"'vram' for peak GPU memory in MB, or 'iter' for average iteration latency in ms."
),
)
ap.add_argument(
"--executorch_export",
dest="executorch_export",
action='store_true',
default=True,

Copilot AI Oct 26, 2025


Same issue as in run_experiments.py: default=True with action='store_true' is redundant. Remove the default=True parameter.

Suggested change: remove the `default=True,` line.

help="Automatically export ExecuTorch programs for each candidate run.",
)
ap.add_argument(
"--no-executorch-export",
dest="executorch_export",
action='store_false',
help="Disable ExecuTorch exports.",
)
ap.add_argument(
"--executorch_delegate",
choices=['none', 'xnnpack'],
default='none',
help="Delegate to target when exporting to ExecuTorch.",
)
ap.add_argument(
"--executorch_smoke_test_tokens",
type=int,
default=0,
help="If >0, run a random-token smoke test after export.",
)
ap.add_argument(
"--executorch_smoke_test_prompt",
help="Optional prompt to evaluate with the exported program.",
)
ap.add_argument(
"--executorch_tokenizer_vocab",
help="Path to a vocab.json for ExecuTorch prompt smoke tests.",
)
ap.add_argument(
"--executorch_max_output_tokens",
type=int,
default=32,
help="Maximum decode tokens when running ExecuTorch smoke tests.",
)



@@ -186,7 +286,21 @@
sys.exit("--increments length mismatch")

inc_map = dict(zip(args.param_names, args.increments))
run_fn = run_trial_subproc if args.spawn_subprocess else run_trial_inproc
export_options = ExecuTorchExportOptions(
enabled=args.executorch_export,
delegate=args.executorch_delegate,
smoke_test_tokens=max(0, args.executorch_smoke_test_tokens),
smoke_test_prompt=args.executorch_smoke_test_prompt,
tokenizer_vocab=Path(args.executorch_tokenizer_vocab) if args.executorch_tokenizer_vocab else None,
max_output_tokens=args.executorch_max_output_tokens,
)

if args.spawn_subprocess:
def run_trial(cfg: Dict[str, Any], label: str) -> Tuple[float, float, int, float, float]:
return run_trial_subproc(cfg, export_options, label)
else:
def run_trial(cfg: Dict[str, Any], label: str) -> Tuple[float, float, int, float, float]:
return run_trial_inproc(cfg, export_options, label)

baseline_cfg_master = yaml.safe_load(Path(args.orig_settings).read_text())
log_path = Path(args.results_file)
@@ -259,9 +373,9 @@ def _extend_layerlists(cfg: Dict[str, Any], dup_idx: int) -> None:
_apply_overrides_to_active_config(baseline_cfg, args.override_cfg, "initial baseline_cfg for new sweep")

print("[BASELINE] measuring initial config …")
# run_fn receives a deepcopy of the (potentially overridden) baseline_cfg
# run_trial receives a deepcopy of the (potentially overridden) baseline_cfg

base_loss, base_params, base_best_iter, base_gpu, base_iter_ms = run_fn(deepcopy(baseline_cfg))
base_loss, base_params, base_best_iter, base_gpu, base_iter_ms = run_trial(deepcopy(baseline_cfg), 'baseline')
base_score = 1 / math.exp(base_loss)
log["iterations"].append(
{
@@ -319,7 +433,7 @@ def _evaluate(cfg_template: Dict[str, Any],

print(f"[TEST] {label_for_log}={value_for_log} seed={cfg_run['seed']}")
try:
loss, nparam, best_it, peak_mb, iter_ms = run_fn(cfg_run)
loss, nparam, best_it, peak_mb, iter_ms = run_trial(cfg_run, f"{label_for_log}-seed{cfg_run['seed']}")
except Exception as exc:
print(" ⚠", exc)
return # discard this candidate
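With the flags added above, every candidate run can emit a `.pte` automatically. A sketch of an invocation follows; the elided arguments depend on the rest of `hyperparam_search.py`'s CLI and are not shown:

```bash
# Only the ExecuTorch flags introduced in this diff are spelled out; "..." stands in
# for the sweep's other required arguments.
python hyperparam_search.py ... \
    --executorch_delegate xnnpack \
    --executorch_smoke_test_tokens 16 \
    --executorch_max_output_tokens 32

# Exports are on by default; disable them with:
python hyperparam_search.py ... --no-executorch-export
```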
5 changes: 5 additions & 0 deletions model_exports/executorch/__init__.py
@@ -0,0 +1,5 @@
"""ExecuTorch export utilities for nanoGPT checkpoints."""

from .exporter import ExportConfig, export_checkpoint_to_pte

__all__ = ["ExportConfig", "export_checkpoint_to_pte"]
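The same entry points can be driven directly, mirroring `maybe_export_executorch` in `hyperparam_search.py`. A minimal sketch (paths and values are illustrative; the field names follow the `ExportConfig(...)` call in that script):

```python
from pathlib import Path

from model_exports.executorch import ExportConfig, export_checkpoint_to_pte

# Convert a training checkpoint into an ExecuTorch program, optionally running a
# random-token smoke test after export. Values here are placeholders.
config = ExportConfig(
    delegate="xnnpack",
    generate_etrecord=False,
    smoke_test_tokens=16,
    smoke_test_prompt=None,
    tokenizer_path=None,
    max_output_tokens=32,
    metadata=True,
)
export_checkpoint_to_pte(Path("out/ckpt.pt"), Path("out/executorch/model.pte"), config)
```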