Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 15 additions & 1 deletion docs/concepts/person_sampling.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ Supported locales:
- `en_US`: United States
- `en_IN`: India (English)
- `en_SG`: Singapore (English)
- `fr_FR`: France (French)
- `hi_Deva_IN`: India (Devanagari script)
- `hi_Latn_IN`: India (Latin script)
- `ja_JP`: Japan
Expand Down Expand Up @@ -119,6 +120,9 @@ ngc registry resource download-version "nvidia/nemotron-personas/nemotron-person
ngc registry resource download-version "nvidia/nemotron-personas/nemotron-personas-dataset-hi_latn_in"
ngc registry resource download-version "nvidia/nemotron-personas/nemotron-personas-dataset-en_in"

# For Nemotron-Personas FR
ngc registry resource download-version "nvidia/nemotron-personas/nemotron-personas-dataset-fr_fr"

# For Nemotron-Personas JP
ngc registry resource download-version "nvidia/nemotron-personas/nemotron-personas-dataset-ja_jp"

Expand Down Expand Up @@ -183,6 +187,16 @@ For more details, see the documentation for [`SamplerColumnConfig`](../code_refe
| `email_address` | string | |
| `national_id` | string | |

**France-Specific Fields (`fr_FR`):**

- `commune` - Smallest administrative division (includes arrondissements)
- `departement` - Mid-level administrative division
- `household_type` - Household composition (e.g., single person, couple with/without children)
- `monthly_income_eur` - Estimated monthly income in euros
- `first_name_heritage` - Cultural origin of the first name
- `name_heritage` - Cultural, linguistic, or geographic origin of the surname
- `is_first_gen_immigrant` - Whether the individual is a first-generation immigrant to France

**Japan-Specific Fields (`ja_JP`):**

- `area`
Expand Down Expand Up @@ -234,7 +248,7 @@ For more details, see the documentation for [`SamplerColumnConfig`](../code_refe

| Parameter | Type | Description |
|-----------|------|-------------|
| `locale` | str | Language/region code - must be one of: "en_US", "en_IN", "en_SG", "hi_Deva_IN", "hi_Latn_IN", "ja_JP", "pt_BR" |
| `locale` | str | Language/region code - must be one of: "en_US", "en_IN", "en_SG", "fr_FR", "hi_Deva_IN", "hi_Latn_IN", "ja_JP", "pt_BR" |
| `sex` | str (optional) | Filter by "Male" or "Female" |
| `city` | str or list[str] (optional) | Filter by specific city or cities within locale |
| `age_range` | list[int] (optional) | Two-element list [min_age, max_age] (default: [18, 114]) |
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
AVAILABLE_LOCALES,
DEFAULT_AGE_RANGE,
LOCALES_WITH_MANAGED_DATASETS,
LOCALES_WITH_MANAGED_DATASETS_STR,
MAX_AGE,
MIN_AGE,
)
Expand Down Expand Up @@ -446,7 +447,7 @@ class PersonSamplerParams(ConfigBase):
"Locale that determines the language and geographic location "
"that a synthetic person will be sampled from. Must be a locale supported by "
"a managed Nemotron Personas dataset. Managed datasets exist for the following locales: "
f"{', '.join(LOCALES_WITH_MANAGED_DATASETS)}."
f"{LOCALES_WITH_MANAGED_DATASETS_STR}."
),
)
sex: SexT | None = Field(
Expand Down Expand Up @@ -518,7 +519,7 @@ def _validate_locale_with_managed_datasets(self) -> Self:
if self.locale not in LOCALES_WITH_MANAGED_DATASETS:
raise ValueError(
"Person sampling from managed datasets is only supported for the following "
f"locales: {', '.join(LOCALES_WITH_MANAGED_DATASETS)}."
f"locales: {LOCALES_WITH_MANAGED_DATASETS_STR}."
)
return self

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -365,13 +365,15 @@ class NordColor(Enum):
"en_US": "1.24 GB",
"en_IN": "2.39 GB",
"en_SG": "0.30 GB",
"fr_FR": "2.71 GB",
"hi_Deva_IN": "4.14 GB",
"hi_Latn_IN": "2.7 GB",
"ja_JP": "1.69 GB",
"pt_BR": "2.33 GB",
}

LOCALES_WITH_MANAGED_DATASETS = list[str](NEMOTRON_PERSONAS_DATASET_SIZES.keys())
LOCALES_WITH_MANAGED_DATASETS_STR = ", ".join(LOCALES_WITH_MANAGED_DATASETS)

NEMOTRON_PERSONAS_DATASET_PREFIX = "nemotron-personas-dataset-"

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,14 @@
"state",
"email_address",
"phone_number",
# France-specific fields
"first_name_heritage",
"name_heritage",
"is_first_gen_immigrant",
"household_type",
"monthly_income_eur",
"commune",
"departement",
# Brazil-specific fields
"race",
# Japan-specific fields
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,15 @@
import typer

from data_designer.cli.controllers.download_controller import DownloadController
from data_designer.config.utils.constants import DATA_DESIGNER_HOME
from data_designer.config.utils.constants import DATA_DESIGNER_HOME, LOCALES_WITH_MANAGED_DATASETS_STR


def personas_command(
locales: list[str] = typer.Option(
None,
"--locale",
"-l",
help="Locales to download (en_US, en_IN, hi_Deva_IN, hi_Latn_IN, ja_JP). Can be specified multiple times.",
help=f"Locales to download ({LOCALES_WITH_MANAGED_DATASETS_STR}). Can be specified multiple times.",
),
all_locales: bool = typer.Option(
False,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -85,8 +85,8 @@ def test_run_personas_with_all_flag(
# Verify NGC check was called
mock_check_ngc.assert_called_once()

# Verify all 7 locales were downloaded
assert mock_download.call_count == 7
# Verify all 8 locales were downloaded
assert mock_download.call_count == 8

# Verify each locale was downloaded
downloaded_locales = [call[0][0] for call in mock_download.call_args_list]
Expand Down Expand Up @@ -219,10 +219,11 @@ def test_determine_locales_with_all_flag(controller: DownloadController) -> None
"""Test _determine_locales returns all locales when all_locales=True."""
result = controller._determine_locales(locales=None, all_locales=True)

assert len(result) == 7
assert len(result) == 8
assert "en_US" in result
assert "en_IN" in result
assert "en_SG" in result
assert "fr_FR" in result
assert "hi_Deva_IN" in result
assert "hi_Latn_IN" in result
assert "ja_JP" in result
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ def repository() -> PersonaRepository:
def test_init(repository: PersonaRepository) -> None:
"""Test repository initialization creates registry."""
assert repository._registry is not None
assert len(repository._registry.locales) == 7
assert len(repository._registry.locales) == 8
assert repository._registry.dataset_prefix == "nemotron-personas-dataset-"


Expand All @@ -24,11 +24,11 @@ def test_list_all(repository: PersonaRepository) -> None:
locales = repository.list_all()

assert isinstance(locales, list)
assert len(locales) == 7
assert len(locales) == 8

# Verify all expected locales are present
locale_codes = {locale.code for locale in locales}
assert locale_codes == {"en_US", "en_IN", "en_SG", "hi_Deva_IN", "hi_Latn_IN", "ja_JP", "pt_BR"}
assert locale_codes == {"en_US", "en_IN", "en_SG", "fr_FR", "hi_Deva_IN", "hi_Latn_IN", "ja_JP", "pt_BR"}

# Verify each locale has required fields
for locale in locales:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,10 +51,11 @@ def test_get_available_locales(service: DownloadService) -> None:
locales = service.get_available_locales()

assert isinstance(locales, dict)
assert len(locales) == 7
assert len(locales) == 8
assert "en_US" in locales
assert "en_IN" in locales
assert "en_SG" in locales
assert "fr_FR" in locales
assert "hi_Deva_IN" in locales
assert "hi_Latn_IN" in locales
assert "ja_JP" in locales
Expand Down
Loading