diff --git a/docs/concepts/person_sampling.md b/docs/concepts/person_sampling.md index d7940fdfa..c49e4e9f0 100644 --- a/docs/concepts/person_sampling.md +++ b/docs/concepts/person_sampling.md @@ -56,6 +56,7 @@ Supported locales: - `en_US`: United States - `en_IN`: India (English) - `en_SG`: Singapore (English) +- `fr_FR`: France (French) - `hi_Deva_IN`: India (Devanagari script) - `hi_Latn_IN`: India (Latin script) - `ja_JP`: Japan @@ -119,6 +120,9 @@ ngc registry resource download-version "nvidia/nemotron-personas/nemotron-person ngc registry resource download-version "nvidia/nemotron-personas/nemotron-personas-dataset-hi_latn_in" ngc registry resource download-version "nvidia/nemotron-personas/nemotron-personas-dataset-en_in" +# For Nemotron-Personas FR +ngc registry resource download-version "nvidia/nemotron-personas/nemotron-personas-dataset-fr_fr" + # For Nemotron-Personas JP ngc registry resource download-version "nvidia/nemotron-personas/nemotron-personas-dataset-ja_jp" @@ -183,6 +187,16 @@ For more details, see the documentation for [`SamplerColumnConfig`](../code_refe | `email_address` | string | | | `national_id` | string | +**France-Specific Fields (`fr_FR`):** + +- `commune` - Smallest administrative division (includes arrondissements) +- `departement` - Mid-level administrative division +- `household_type` - Household composition (e.g., single person, couple with/without children) +- `monthly_income_eur` - Estimated monthly income in euros +- `first_name_heritage` - Cultural origin of the first name +- `name_heritage` - Cultural, linguistic, or geographic origin of the surname +- `is_first_gen_immigrant` - Whether the individual is a first-generation immigrant to France + **Japan-Specific Fields (`ja_JP`):** - `area` @@ -234,7 +248,7 @@ For more details, see the documentation for [`SamplerColumnConfig`](../code_refe | Parameter | Type | Description | |-----------|------|-------------| -| `locale` | str | Language/region code - must be one of: "en_US", "en_IN", "en_SG", "hi_Deva_IN", "hi_Latn_IN", "ja_JP", "pt_BR" | +| `locale` | str | Language/region code - must be one of: "en_US", "en_IN", "en_SG", "fr_FR", "hi_Deva_IN", "hi_Latn_IN", "ja_JP", "pt_BR" | | `sex` | str (optional) | Filter by "Male" or "Female" | | `city` | str or list[str] (optional) | Filter by specific city or cities within locale | | `age_range` | list[int] (optional) | Two-element list [min_age, max_age] (default: [18, 114]) | diff --git a/packages/data-designer-config/src/data_designer/config/sampler_params.py b/packages/data-designer-config/src/data_designer/config/sampler_params.py index 0be4817e6..c6f73f34b 100644 --- a/packages/data-designer-config/src/data_designer/config/sampler_params.py +++ b/packages/data-designer-config/src/data_designer/config/sampler_params.py @@ -15,6 +15,7 @@ AVAILABLE_LOCALES, DEFAULT_AGE_RANGE, LOCALES_WITH_MANAGED_DATASETS, + LOCALES_WITH_MANAGED_DATASETS_STR, MAX_AGE, MIN_AGE, ) @@ -446,7 +447,7 @@ class PersonSamplerParams(ConfigBase): "Locale that determines the language and geographic location " "that a synthetic person will be sampled from. Must be a locale supported by " "a managed Nemotron Personas dataset. Managed datasets exist for the following locales: " - f"{', '.join(LOCALES_WITH_MANAGED_DATASETS)}." + f"{LOCALES_WITH_MANAGED_DATASETS_STR}." ), ) sex: SexT | None = Field( @@ -518,7 +519,7 @@ def _validate_locale_with_managed_datasets(self) -> Self: if self.locale not in LOCALES_WITH_MANAGED_DATASETS: raise ValueError( "Person sampling from managed datasets is only supported for the following " - f"locales: {', '.join(LOCALES_WITH_MANAGED_DATASETS)}." + f"locales: {LOCALES_WITH_MANAGED_DATASETS_STR}." ) return self diff --git a/packages/data-designer-config/src/data_designer/config/utils/constants.py b/packages/data-designer-config/src/data_designer/config/utils/constants.py index b8dcf6b18..fb1b5a084 100644 --- a/packages/data-designer-config/src/data_designer/config/utils/constants.py +++ b/packages/data-designer-config/src/data_designer/config/utils/constants.py @@ -365,6 +365,7 @@ class NordColor(Enum): "en_US": "1.24 GB", "en_IN": "2.39 GB", "en_SG": "0.30 GB", + "fr_FR": "2.71 GB", "hi_Deva_IN": "4.14 GB", "hi_Latn_IN": "2.7 GB", "ja_JP": "1.69 GB", @@ -372,6 +373,7 @@ class NordColor(Enum): } LOCALES_WITH_MANAGED_DATASETS = list[str](NEMOTRON_PERSONAS_DATASET_SIZES.keys()) +LOCALES_WITH_MANAGED_DATASETS_STR = ", ".join(LOCALES_WITH_MANAGED_DATASETS) NEMOTRON_PERSONAS_DATASET_PREFIX = "nemotron-personas-dataset-" diff --git a/packages/data-designer-engine/src/data_designer/engine/sampling_gen/entities/dataset_based_person_fields.py b/packages/data-designer-engine/src/data_designer/engine/sampling_gen/entities/dataset_based_person_fields.py index 54d4c9545..1e07a4f2c 100644 --- a/packages/data-designer-engine/src/data_designer/engine/sampling_gen/entities/dataset_based_person_fields.py +++ b/packages/data-designer-engine/src/data_designer/engine/sampling_gen/entities/dataset_based_person_fields.py @@ -40,6 +40,14 @@ "state", "email_address", "phone_number", + # France-specific fields + "first_name_heritage", + "name_heritage", + "is_first_gen_immigrant", + "household_type", + "monthly_income_eur", + "commune", + "departement", # Brazil-specific fields "race", # Japan-specific fields diff --git a/packages/data-designer/src/data_designer/cli/commands/download.py b/packages/data-designer/src/data_designer/cli/commands/download.py index 0984e7004..7a093a19f 100644 --- a/packages/data-designer/src/data_designer/cli/commands/download.py +++ b/packages/data-designer/src/data_designer/cli/commands/download.py @@ -6,7 +6,7 @@ import typer from data_designer.cli.controllers.download_controller import DownloadController -from data_designer.config.utils.constants import DATA_DESIGNER_HOME +from data_designer.config.utils.constants import DATA_DESIGNER_HOME, LOCALES_WITH_MANAGED_DATASETS_STR def personas_command( @@ -14,7 +14,7 @@ def personas_command( None, "--locale", "-l", - help="Locales to download (en_US, en_IN, hi_Deva_IN, hi_Latn_IN, ja_JP). Can be specified multiple times.", + help=f"Locales to download ({LOCALES_WITH_MANAGED_DATASETS_STR}). Can be specified multiple times.", ), all_locales: bool = typer.Option( False, diff --git a/packages/data-designer/tests/cli/controllers/test_download_controller.py b/packages/data-designer/tests/cli/controllers/test_download_controller.py index 6b1d5c5e3..c130e9275 100644 --- a/packages/data-designer/tests/cli/controllers/test_download_controller.py +++ b/packages/data-designer/tests/cli/controllers/test_download_controller.py @@ -85,8 +85,8 @@ def test_run_personas_with_all_flag( # Verify NGC check was called mock_check_ngc.assert_called_once() - # Verify all 7 locales were downloaded - assert mock_download.call_count == 7 + # Verify all 8 locales were downloaded + assert mock_download.call_count == 8 # Verify each locale was downloaded downloaded_locales = [call[0][0] for call in mock_download.call_args_list] @@ -219,10 +219,11 @@ def test_determine_locales_with_all_flag(controller: DownloadController) -> None """Test _determine_locales returns all locales when all_locales=True.""" result = controller._determine_locales(locales=None, all_locales=True) - assert len(result) == 7 + assert len(result) == 8 assert "en_US" in result assert "en_IN" in result assert "en_SG" in result + assert "fr_FR" in result assert "hi_Deva_IN" in result assert "hi_Latn_IN" in result assert "ja_JP" in result diff --git a/packages/data-designer/tests/cli/repositories/test_persona_repository.py b/packages/data-designer/tests/cli/repositories/test_persona_repository.py index 3f4bdcc40..ce9a53304 100644 --- a/packages/data-designer/tests/cli/repositories/test_persona_repository.py +++ b/packages/data-designer/tests/cli/repositories/test_persona_repository.py @@ -15,7 +15,7 @@ def repository() -> PersonaRepository: def test_init(repository: PersonaRepository) -> None: """Test repository initialization creates registry.""" assert repository._registry is not None - assert len(repository._registry.locales) == 7 + assert len(repository._registry.locales) == 8 assert repository._registry.dataset_prefix == "nemotron-personas-dataset-" @@ -24,11 +24,11 @@ def test_list_all(repository: PersonaRepository) -> None: locales = repository.list_all() assert isinstance(locales, list) - assert len(locales) == 7 + assert len(locales) == 8 # Verify all expected locales are present locale_codes = {locale.code for locale in locales} - assert locale_codes == {"en_US", "en_IN", "en_SG", "hi_Deva_IN", "hi_Latn_IN", "ja_JP", "pt_BR"} + assert locale_codes == {"en_US", "en_IN", "en_SG", "fr_FR", "hi_Deva_IN", "hi_Latn_IN", "ja_JP", "pt_BR"} # Verify each locale has required fields for locale in locales: diff --git a/packages/data-designer/tests/cli/services/test_download_service.py b/packages/data-designer/tests/cli/services/test_download_service.py index fc06aa0bf..b452c0682 100644 --- a/packages/data-designer/tests/cli/services/test_download_service.py +++ b/packages/data-designer/tests/cli/services/test_download_service.py @@ -51,10 +51,11 @@ def test_get_available_locales(service: DownloadService) -> None: locales = service.get_available_locales() assert isinstance(locales, dict) - assert len(locales) == 7 + assert len(locales) == 8 assert "en_US" in locales assert "en_IN" in locales assert "en_SG" in locales + assert "fr_FR" in locales assert "hi_Deva_IN" in locales assert "hi_Latn_IN" in locales assert "ja_JP" in locales