Commit c3bf79b

feat: add DGX Spark (aarch64) support
- containers/Dockerfile.spark: container-based install using nvcr.io/nvidia/vllm:26.02-py3
- docs/DGX_SPARK.md: quick start guide (build + run in 2 steps)
- pyproject.toml: platform markers for aarch64-incompatible packages (faiss-gpu-cu12, torchvision+cu128, torchao, xformers)
- config/training.py: auto-fallback Flash Attention 3 to sdpa on aarch64
- vllm_backend.py: handle vllm versions without attention_config kwarg
1 parent 9a4a150 commit c3bf79b

5 files changed

Lines changed: 177 additions & 15 deletions

containers/Dockerfile.spark

Lines changed: 37 additions & 0 deletions
```dockerfile
# Dockerfile for NeMo Safe Synthesizer on DGX Spark (aarch64)
#
# Base: NVIDIA vLLM container with torch 2.11 + vLLM 0.15.1 + Triton 3.6
#
# Build:
#   docker build -f containers/Dockerfile.spark -t nss-spark .
#
# Run:
#   docker run --gpus all --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 \
#     -it nss-spark
#
FROM nvcr.io/nvidia/vllm:26.02-py3

ENV TRITON_CACHE_DIR=/workspace/.triton_cache
ENV BNB_CUDA_VERSION=130

WORKDIR /workspace/Safe-Synthesizer
COPY . .

# Torch-dependent packages — install with --no-deps to preserve container's torch/CUDA
RUN pip install --no-deps \
    peft accelerate bitsandbytes datasets==4.3.0 trl==0.26.1 \
    hf_transfer unsloth unsloth_zoo \
    opacus sentence-transformers gliner kernels

# Safe Synthesizer + remaining deps
RUN pip install --no-deps -e . && \
    pip install \
    faker 'pydantic[email]>=2.12.5' pydantic-settings pyyaml jsonschema rich structlog \
    colorama 'huggingface-hub>=0.34.4,<1' anyascii pycountry betterproto flashtext \
    cached-property category-encoders dython dateparser langchain-core json-repair \
    matplotlib 'outlines>=1.0.0' plotly prv-accountant 'smart-open==7.0.5' python-stdnum \
    'pandas>=2.1.3' ratelimit 'sqlfluff==3.2.0' 'range_regex>=0.1.0' 'tenacity==9.1.2' \
    'tiktoken>=0.7.0' tldextract 'wandb==0.23.1' python-dotenv patsy \
    pyarrow multiprocess onnxruntime opt_einsum dill==0.3.8 faiss-cpu

ENTRYPOINT ["/usr/bin/bash"]
```

docs/DGX_SPARK.md

Lines changed: 103 additions & 0 deletions
# NeMo Safe Synthesizer on DGX Spark

Generate synthetic tabular data with quality and privacy guarantees — train, generate, and evaluate in one command.

## Quick Start

### 1. Build and launch the container

```bash
git clone https://github.com/NVIDIA-NeMo/Safe-Synthesizer.git && cd Safe-Synthesizer
docker build -f containers/Dockerfile.spark -t nss-spark .
docker run --gpus all --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 -it nss-spark
```

### 2. Run

```bash
python -c "
import pandas as pd, numpy as np
from nemo_safe_synthesizer.sdk.library_builder import SafeSynthesizer

# Sample data — replace with your own CSV or DataFrame
np.random.seed(42)
df = pd.DataFrame({
    'age': np.random.randint(18, 85, 500),
    'income': np.random.lognormal(10.5, 0.8, 500).astype(int),
    'credit_score': np.random.randint(300, 850, 500),
    'default': np.random.choice(['yes', 'no'], 500, p=[0.15, 0.85]),
})

builder = (
    SafeSynthesizer()
    .with_data_source(df)
    .with_replace_pii()
    .with_generate(num_records=500)
    .with_evaluate()
)
builder.run()

s = builder.results.summary
print(f'Quality (SQS): {s.synthetic_data_quality_score}/10')
print(f'Privacy (DPS): {s.data_privacy_score}/10')
builder.save_results()
"
```

Expected: SQS ~8-9, DPS ~9-10.

> **First run is slower.** Model weights (~6 GB) download from HuggingFace and Triton
> JIT-compiles LoRA kernels for the GB10. Subsequent runs reuse cached weights and kernels.

## Use Your Own Data

```python
from nemo_safe_synthesizer.sdk.library_builder import SafeSynthesizer

builder = (
    SafeSynthesizer()
    .with_data_source("your_data.csv")  # or pass a DataFrame
    .with_replace_pii()
    .with_generate(num_records=1000)
    .with_evaluate()
)
builder.run()
builder.save_results()
```

Outputs are saved to `safe-synthesizer-artifacts/` — a synthetic CSV and an HTML evaluation report.

## Optional: Improve PII Detection

Set a NIM API key for LLM-based column classification (more accurate than NER-only):

```bash
export NIM_ENDPOINT_URL="https://integrate.api.nvidia.com/v1"
export NIM_API_KEY="your-key"  # get one at build.nvidia.com/settings/api-keys
```

## Optional: Differential Privacy

```python
builder = (
    SafeSynthesizer()
    .with_data_source(df)
    .with_replace_pii()
    .with_generate(num_records=1000)
    .with_differential_privacy(dp_enabled=True, epsilon=8.0)
    .with_evaluate()
)
```

## Troubleshooting

**Slow first generation batch?** Triton JIT-compiles LoRA kernels for the GB10 on first use. This is normal and only happens once per container session.

**Memory issues between runs?** Flush the page cache:

```bash
sudo sh -c 'sync; echo 3 > /proc/sys/vm/drop_caches'
```

**Why a container?** DGX Spark's CUDA 13 + aarch64 stack requires specific Triton, vLLM, and PyTorch versions. The container (`nvcr.io/nvidia/vllm:26.02-py3`) provides a tested stack where Unsloth training and vLLM generation work natively.

**Full documentation:** [Safe Synthesizer User Guide](https://github.com/NVIDIA-NeMo/Safe-Synthesizer/blob/main/docs/user-guide/getting-started.md)

pyproject.toml

Lines changed: 8 additions & 6 deletions
```diff
@@ -141,7 +141,8 @@ cpu = [
 cu128 = [
     "accelerate",
     "bitsandbytes==0.49.1",
-    "faiss-gpu-cu12==1.13.2; sys_platform == 'linux'",
+    "faiss-gpu-cu12==1.13.2; sys_platform == 'linux' and platform_machine == 'x86_64'",
+    "faiss-cpu==1.13.2; sys_platform == 'linux' and platform_machine == 'aarch64'",
     "flashinfer-python==0.6.1; sys_platform == 'linux'",
     "flashinfer-cubin==0.6.1; sys_platform == 'linux'",
     "flashinfer-jit-cache==0.6.1+cu128; sys_platform == 'linux'",
@@ -154,15 +155,16 @@ cu128 = [
     "sentence-transformers",
     "torch==2.9.1+cu128; sys_platform == 'linux'",
     "torch-c-dlpack-ext",
-    "torchvision==0.24.1+cu128; sys_platform == 'linux'",
-    "torchao==0.15.0; sys_platform == 'linux'",
+    "torchvision==0.24.1+cu128; sys_platform == 'linux' and platform_machine == 'x86_64'",
+    "torchvision==0.24.1; sys_platform == 'linux' and platform_machine == 'aarch64'",
+    "torchao==0.15.0; sys_platform == 'linux' and platform_machine == 'x86_64'",
     "transformers==4.57.3",
     "triton>=2.0.0",
     "trl>=0.23.0",
     "unsloth[cu128-torch291]==2025.12.4",
     "unsloth_zoo==2025.12.4",
     "vllm==0.15.0",
-    "xformers==v0.0.33.post2; sys_platform == 'linux'",
+    "xformers==v0.0.33.post2; sys_platform == 'linux' and platform_machine == 'x86_64'",
 ]

 # at some point, do per-subpackage dependencies
@@ -188,8 +190,8 @@ dependency-metadata = [

 override-dependencies = [
-    "flashinfer-python==0.6.1; sys_platform != 'darwin'",  # uv locking won't find the matching versions of flashinfer-python and -cubin without overriding
-    "flashinfer-cubin==0.6.1; sys_platform != 'darwin'",  # perhaps because the published wheels have some wrong metadata
+    "flashinfer-python==0.6.1; sys_platform != 'darwin' and platform_machine != 'aarch64'",  # uv locking won't find the matching versions of flashinfer-python and -cubin without overriding
+    "flashinfer-cubin==0.6.1; sys_platform != 'darwin' and platform_machine != 'aarch64'",  # perhaps because the published wheels have some wrong metadata
     "xgrammar>=0.1.32,<1.0.0",  # CVE-2026-25048: override vllm's pin on 0.1.29
 ]
```
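The platform markers above are standard PEP 508 environment markers, so their effect can be checked offline with the `packaging` library. A small sketch (not part of this commit) evaluating the faiss markers against a simulated DGX Spark environment:

```python
from packaging.markers import Marker

# Markers taken from the cu128 extras above
gpu_faiss = Marker("sys_platform == 'linux' and platform_machine == 'x86_64'")
cpu_faiss = Marker("sys_platform == 'linux' and platform_machine == 'aarch64'")

# Simulated DGX Spark environment: Linux on aarch64 (GB10)
spark = {"sys_platform": "linux", "platform_machine": "aarch64"}

print(gpu_faiss.evaluate(spark))  # False -> faiss-gpu-cu12 is skipped on Spark
print(cpu_faiss.evaluate(spark))  # True  -> faiss-cpu is installed instead
```

`Marker.evaluate` merges the supplied dict over the defaults for the running interpreter, so the same check works on any machine.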

src/nemo_safe_synthesizer/config/training.py

Lines changed: 10 additions & 0 deletions
```diff
@@ -3,13 +3,15 @@

 from __future__ import annotations

+import platform
 from typing import (
     Annotated,
     Literal,
 )

 from pydantic import (
     Field,
+    model_validator,
 )

 from ..configurator.parameters import (
@@ -265,3 +267,11 @@ class TrainingHyperparams(Parameters):
             ),
         ),
     ] = "kernels-community/vllm-flash-attn3"
+
+    @model_validator(mode="after")
+    def _resolve_platform_defaults(self) -> "TrainingHyperparams":
+        """Override defaults that are incompatible with the current platform."""
+        if platform.machine() == "aarch64":
+            if self.attn_implementation == "kernels-community/vllm-flash-attn3":
+                self.attn_implementation = "sdpa"
+        return self
```
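Because the validator only rewrites the Flash Attention 3 default, any other explicitly chosen backend survives on aarch64. A minimal standalone sketch of the same pattern, using a hypothetical `AttnConfig` in place of `TrainingHyperparams`:

```python
import platform

from pydantic import BaseModel, model_validator


class AttnConfig(BaseModel):
    # Same default as TrainingHyperparams.attn_implementation
    attn_implementation: str = "kernels-community/vllm-flash-attn3"

    @model_validator(mode="after")
    def _resolve_platform_defaults(self) -> "AttnConfig":
        # FA3 kernels are unavailable on aarch64; rewrite only the default,
        # any other explicitly chosen backend is left untouched
        if platform.machine() == "aarch64":
            if self.attn_implementation == "kernels-community/vllm-flash-attn3":
                self.attn_implementation = "sdpa"
        return self


print(AttnConfig().attn_implementation)  # "sdpa" on aarch64, the FA3 default elsewhere
print(AttnConfig(attn_implementation="eager").attn_implementation)  # "eager" everywhere
```

One limitation shared with the commit's validator: an aarch64 user who explicitly passes the FA3 string is also rewritten, since a mode="after" validator cannot distinguish an explicit value from the default.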

src/nemo_safe_synthesizer/generation/vllm_backend.py

Lines changed: 19 additions & 9 deletions
```diff
@@ -203,16 +203,26 @@ def initialize(self, **kwargs) -> None:
         # check this when updating unsloth in the future.
         enforce_eager = self.config.training.use_unsloth is True

+        vllm_kwargs = dict(
+            model=self.config.training.pretrained_model,
+            gpu_memory_utilization=max_vram,
+            enable_lora=True,
+            max_lora_rank=self.config.training.lora_r,
+            structured_outputs_config=structured_outputs_config,
+            enforce_eager=enforce_eager,
+        )
+        # attention_config was added in vLLM 0.12+ but removed in some builds.
+        # Fall back to VLLM_ATTENTION_BACKEND env var if the kwarg is not accepted.
         with heartbeat("Model loading", logger_name=__name__, model=self.config.training.pretrained_model):
-            self.llm = vLLM(
-                model=self.config.training.pretrained_model,
-                gpu_memory_utilization=max_vram,
-                enable_lora=True,
-                max_lora_rank=self.config.training.lora_r,
-                structured_outputs_config=structured_outputs_config,
-                enforce_eager=enforce_eager,
-                attention_config=attention_config,
-            )
+            if attention_config is not None:
+                try:
+                    self.llm = vLLM(**vllm_kwargs, attention_config=attention_config)
+                except TypeError:
+                    if attn_backend not in (None, "auto"):
+                        os.environ["VLLM_ATTENTION_BACKEND"] = attn_backend
+                    self.llm = vLLM(**vllm_kwargs)
+            else:
+                self.llm = vLLM(**vllm_kwargs)

     def _build_structured_output_params(self) -> StructuredOutputsParams | None:
         """Build structured output parameters based on generation config.
```
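The try/except fallback generalizes to any constructor whose keyword surface varies across versions. A self-contained sketch of the pattern, with a hypothetical `build_llm` helper and a stand-in class instead of the real vLLM:

```python
import os


def build_llm(ctor, base_kwargs, attention_config=None, attn_backend=None):
    """Pass attention_config if this build accepts it; otherwise fall back
    to the VLLM_ATTENTION_BACKEND env var. Hypothetical helper, not the
    project's API."""
    if attention_config is not None:
        try:
            return ctor(**base_kwargs, attention_config=attention_config)
        except TypeError:
            # This build's __init__ rejects the kwarg
            if attn_backend not in (None, "auto"):
                os.environ["VLLM_ATTENTION_BACKEND"] = attn_backend
    return ctor(**base_kwargs)


class OldLLM:
    """Stand-in for a vLLM build without the attention_config kwarg."""

    def __init__(self, model):
        self.model = model


llm = build_llm(OldLLM, {"model": "demo"},
                attention_config={"backend": "FLASHINFER"},
                attn_backend="FLASHINFER")
print(llm.model, os.environ.get("VLLM_ATTENTION_BACKEND"))  # demo FLASHINFER
```

One caveat of this shape: a `TypeError` raised for an unrelated reason inside a constructor that does accept the kwarg is silently retried without it, which is why the commit guards the retry behind the kwarg actually being set.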
