32 changes: 32 additions & 0 deletions containers/Dockerfile.cuda-aarch64
@@ -0,0 +1,32 @@
# Dockerfile for NeMo Safe Synthesizer on DGX Spark (aarch64)
#
# Base: NVIDIA vLLM container with torch + vLLM + Triton pre-installed
#
# Build:
# docker build -f containers/Dockerfile.cuda-aarch64 -t nss-spark .
#
# Run:
# docker run --gpus all --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 \
# -it nss-spark
#
FROM nvcr.io/nvidia/vllm:26.02-py3

ENV TRITON_CACHE_DIR=/workspace/.triton_cache
ENV BNB_CUDA_VERSION=130

WORKDIR /workspace/Safe-Synthesizer
COPY . .

# 1. Install NSS package (no deps — we manage them explicitly)
RUN pip install --no-deps -e .

# 2. Torch-dependent packages — --no-deps preserves the container's torch/CUDA
RUN pip install --no-deps \
peft accelerate bitsandbytes datasets==4.3.0 trl==0.26.1 \
**Collaborator:**
🤔 should we add an extras group for them? mostly to not have these versions be different than what's in the lockfile

**Collaborator (Author):**
--no-deps install is intentional to preserve the container's torch/CUDA

hf_transfer unsloth unsloth_zoo \
opacus sentence-transformers gliner kernels

# 3. Remaining deps (safe to resolve normally)
RUN pip install -e ".[engine,cuda-aarch64]"

ENTRYPOINT ["/usr/bin/bash"]
107 changes: 107 additions & 0 deletions docs/developer-guide/dgx-spark.md
@@ -0,0 +1,107 @@
<!-- SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -->
<!-- SPDX-License-Identifier: Apache-2.0 -->

# NeMo Safe Synthesizer on DGX Spark

Run NeMo Safe Synthesizer on DGX Spark (aarch64 / GB10) using a pre-built container with the correct Triton, vLLM, and PyTorch versions.

## Quick Start

### 1. Build and launch the container

```bash
git clone https://github.com/NVIDIA-NeMo/Safe-Synthesizer.git
cd Safe-Synthesizer
docker build -f containers/Dockerfile.cuda-aarch64 -t nss-spark .
docker run --gpus all --ipc=host --ulimit memlock=-1 -it --ulimit stack=67108864 nss-spark
```

### 2. Run

```bash
python -c "
import pandas as pd, numpy as np
from nemo_safe_synthesizer.sdk.library_builder import SafeSynthesizer

# Sample data — replace with your own CSV or DataFrame
np.random.seed(42)
df = pd.DataFrame({
'age': np.random.randint(18, 85, 500),
'income': np.random.lognormal(10.5, 0.8, 500).astype(int),
'credit_score': np.random.randint(300, 850, 500),
'default': np.random.choice(['yes', 'no'], 500, p=[0.15, 0.85]),
})

builder = (
SafeSynthesizer()
.with_data_source(df)
.with_replace_pii()
.with_generate(num_records=500)
.with_evaluate()
)
builder.run()

s = builder.results.summary
print(f'Quality (SQS): {s.synthetic_data_quality_score}/10')
print(f'Privacy (DPS): {s.data_privacy_score}/10')
builder.save_results()
"
```

Expected: SQS ~8-9, DPS ~9-10.

> **First run is slower.** Model weights (~6 GB) download from HuggingFace and Triton
> JIT-compiles LoRA kernels for the GB10. Subsequent runs reuse cached weights and kernels.

## Use Your Own Data

```python
from nemo_safe_synthesizer.sdk.library_builder import SafeSynthesizer

builder = (
SafeSynthesizer()
.with_data_source("your_data.csv") # or pass a DataFrame
.with_replace_pii()
.with_generate(num_records=1000)
.with_evaluate()
)
builder.run()
builder.save_results()
```

Outputs are saved to `safe-synthesizer-artifacts/` — synthetic CSV and an HTML evaluation report.

## Optional: Improve PII Detection

Set a NIM API key for LLM-based column classification (more accurate than NER-only):

```bash
export NIM_ENDPOINT_URL="https://integrate.api.nvidia.com/v1"
export NIM_API_KEY="<your-api-key>" # pragma: allowlist secret # get one at build.nvidia.com/settings/api-keys
```

## Optional: Differential Privacy

```python
builder = (
SafeSynthesizer()
.with_data_source(df)
.with_replace_pii()
.with_generate(num_records=1000)
.with_differential_privacy(dp_enabled=True, epsilon=8.0)
.with_evaluate()
)
```

## Troubleshooting

**Slow first generation batch?** Triton JIT-compiles LoRA kernels for the GB10 on first use. This is normal and only happens once per container session.

**Memory issues between runs?** Flush the cache:
```bash
sudo sh -c 'sync; echo 3 > /proc/sys/vm/drop_caches'
```

**Why a container?** DGX Spark's CUDA 13 + aarch64 requires specific Triton, vLLM, and PyTorch versions. The container (`nvcr.io/nvidia/vllm:26.02-py3`) provides a tested stack where Unsloth training and vLLM generation work natively.

**Full documentation:** [Safe Synthesizer User Guide](https://nvidia-nemo.github.io/Safe-Synthesizer/user-guide/getting-started/)
1 change: 1 addition & 0 deletions mkdocs.yml
@@ -172,6 +172,7 @@ nav:
- Architecture: developer-guide/architecture.md
- Example Generation: developer-guide/example-generation.md
- Docker: developer-guide/docker.md
- DGX Spark: developer-guide/dgx-spark.md
- API Reference: reference/
- Developer Notes:
- blog/index.md
21 changes: 12 additions & 9 deletions pyproject.toml
@@ -148,26 +148,29 @@ cu128 = [
"kernels>=0.12.1",
"nvidia-cublas-cu12; sys_platform == 'linux'",
"nvidia-ml-py; sys_platform == 'linux'",
"onnxruntime",
"opacus",
"opt_einsum",
"peft",
"sentence-transformers",
"torch==2.9.1+cu128; sys_platform == 'linux'",
"torch-c-dlpack-ext",
"torchvision==0.24.1+cu128; sys_platform == 'linux'",
"torchao==0.15.0; sys_platform == 'linux'",
"torchvision==0.24.1+cu128; sys_platform == 'linux' and platform_machine == 'x86_64'",
"torchvision==0.24.1; sys_platform == 'linux' and platform_machine == 'aarch64'",
"torchao==0.15.0; sys_platform == 'linux' and platform_machine == 'x86_64'",
"transformers==4.57.3",
"triton>=2.0.0; sys_platform == 'linux'",
"trl>=0.23.0",
"unsloth[cu128-torch291]==2025.12.4; sys_platform == 'linux'",
"unsloth_zoo==2025.12.4; sys_platform == 'linux'",
"vllm==0.15.0; sys_platform == 'linux'",
"xformers==v0.0.33.post2; sys_platform == 'linux'",
"unsloth[cu128-torch291]==2025.12.4",
"unsloth_zoo==2025.12.4",
"vllm==0.15.0",
"xformers==v0.0.33.post2; sys_platform == 'linux' and platform_machine == 'x86_64'",
]
**Collaborator:**
this is probably going to be a whole thing but we should make a new dep group or do better platform resolution for cross-platform deps.

**Collaborator (Author):**
Agree this needs a proper spike. For now the platform markers are minimal and don't break x86_64, right?

**Collaborator:**
hmmm do we want to keep the sys_platform markers? locking is borked in CI right now and I've been fighting it by putting linux markers in

**Collaborator:**
We really will need a Spark and/or Station as part of our GitHub runners so we can make sure this continues to work.

**Collaborator (Author):**
Let me know what to change here.


# at some point, do per-subpackage dependencies

[tool.uv]
required-version = ">=0.9.14, <0.11.0" # Allow current 0.10.x line while staying below the next minor
required-version = ">=0.9.14, <0.12.0" # Allow current 0.11.x line while staying below the next minor
cache-keys = [
{ file = "pyproject.toml" }, { git = { commit = true, tags = true } },
{ file = "uv.lock" }
@@ -187,8 +190,8 @@ dependency-metadata = [


override-dependencies = [
"flashinfer-python==0.6.1; sys_platform != 'darwin'", # uv locking won't find the matching versions of flashinfer-python and -cubin without overriding
"flashinfer-cubin==0.6.1; sys_platform != 'darwin'", # perhaps because the published wheels have some wrong metadata
"flashinfer-python==0.6.1; sys_platform != 'darwin' and platform_machine != 'aarch64'", # uv locking won't find the matching versions of flashinfer-python and -cubin without overriding
"flashinfer-cubin==0.6.1; sys_platform != 'darwin' and platform_machine != 'aarch64'", # perhaps because the published wheels have some wrong metadata
"xgrammar>=0.1.32,<1.0.0", # CVE-2026-25048: override vllm's pin on 0.1.29
]

10 changes: 10 additions & 0 deletions src/nemo_safe_synthesizer/config/training.py
@@ -3,13 +3,16 @@

from __future__ import annotations

import platform
from typing import (
Annotated,
Literal,
Self,
)

from pydantic import (
Field,
model_validator,
)

from ..configurator.parameters import (
@@ -269,3 +272,10 @@ class TrainingHyperparams(Parameters):
),
),
] = "kernels-community/vllm-flash-attn3"

@model_validator(mode="after")
def _resolve_platform_defaults(self) -> Self:
"""Override defaults that are incompatible with the current platform."""
if platform.machine() == "aarch64" and self.attn_implementation == "kernels-community/vllm-flash-attn3":
self.attn_implementation = "sdpa"
return self
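The validator above can be illustrated standalone. This sketch uses a plain dataclass instead of pydantic (an assumption made for self-containment) to show the same platform-dependent default resolution:

```python
import platform
from dataclasses import dataclass


@dataclass
class AttnDefaults:
    # Same default as TrainingHyperparams.attn_implementation
    attn_implementation: str = "kernels-community/vllm-flash-attn3"

    def __post_init__(self) -> None:
        # Mirror _resolve_platform_defaults: the FA3 kernel package is not
        # available on aarch64, so fall back to PyTorch's SDPA attention.
        if (
            platform.machine() == "aarch64"
            and self.attn_implementation == "kernels-community/vllm-flash-attn3"
        ):
            self.attn_implementation = "sdpa"
```

Note that an explicitly set value (e.g. `AttnDefaults(attn_implementation="sdpa")`) is left untouched on any platform; only the incompatible default is rewritten.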
31 changes: 21 additions & 10 deletions src/nemo_safe_synthesizer/generation/vllm_backend.py
@@ -203,16 +203,27 @@ def initialize(self, **kwargs) -> None:
# check this when updating unsloth in the future.
enforce_eager = self.config.training.use_unsloth is True

with heartbeat("Model loading", logger_name=__name__, model=self.config.training.pretrained_model):
self.llm = vLLM(
model=self.config.training.pretrained_model,
gpu_memory_utilization=max_vram,
enable_lora=True,
max_lora_rank=self.config.training.lora_r,
structured_outputs_config=structured_outputs_config,
enforce_eager=enforce_eager,
attention_config=attention_config,
)
model = self.config.training.pretrained_model
vllm_kwargs = dict(
gpu_memory_utilization=max_vram,
enable_lora=True,
max_lora_rank=self.config.training.lora_r,
structured_outputs_config=structured_outputs_config,
enforce_eager=enforce_eager,
)
# attention_config was added in vLLM 0.12+ but is not present in NGC
# container builds (e.g. nvcr.io/nvidia/vllm:26.02-py3 ships 0.15.1 without it).
# Fall back to VLLM_ATTENTION_BACKEND env var if the kwarg is not accepted.
with heartbeat("Model loading", logger_name=__name__, model=model):
if attention_config is not None:
try:
self.llm = vLLM(model, **vllm_kwargs, attention_config=attention_config)
except TypeError:
if attn_backend not in (None, "auto"):
os.environ["VLLM_ATTENTION_BACKEND"] = attn_backend
self.llm = vLLM(model, **vllm_kwargs)
else:
self.llm = vLLM(model, **vllm_kwargs)

def _build_structured_output_params(self) -> StructuredOutputsParams | None:
"""Build structured output parameters based on generation config.
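The `attention_config` fallback in `vllm_backend.py` above is an instance of kwarg feature-detection: try the newer keyword argument, and on `TypeError` retry with the older signature. A minimal standalone sketch, with hypothetical `new_api`/`old_api` stand-ins for the two vLLM constructor signatures:

```python
def new_api(model, *, enforce_eager=False, attention_config=None):
    # Stands in for a vLLM build whose constructor accepts attention_config
    return ("new", attention_config)


def old_api(model, *, enforce_eager=False):
    # Stands in for an older build without the attention_config kwarg
    return ("old", None)


def load_model(ctor, model, attention_config=None, **kwargs):
    """Call ctor, passing attention_config only if the signature accepts it."""
    if attention_config is not None:
        try:
            return ctor(model, attention_config=attention_config, **kwargs)
        except TypeError:
            # Older signature rejected the kwarg: drop it and fall back
            pass
    return ctor(model, **kwargs)
```

One caveat of this pattern (present in the real code path too): a `TypeError` raised inside the constructor for an unrelated reason also triggers the fallback; inspecting the signature with `inspect.signature` beforehand is a stricter alternative.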