
Add methods to create data generation specs from files #310


Draft — wants to merge 16 commits into base: master
Changes from 2 commits
61 changes: 61 additions & 0 deletions dbldatagen/data_generator.py
@@ -6,9 +6,11 @@
This file defines the `DataGenError` and `DataGenerator` classes
"""
import copy
import json
import logging
import re

import yaml
from pyspark.sql.types import LongType, IntegerType, StringType, StructType, StructField, DataType

from ._version import _get_spark_version
@@ -869,6 +871,17 @@ def withColumn(self, colName, colType=StringType(), minValue=None, maxValue=None
self._inferredSchemaFields.append(StructField(colName, newColumn.datatype, nullable))
return self

def withColumns(self, columns):
""" Adds a set of columns to the synthetic generation specification.

:param columns: A list of column generation specifications as dictionaries
:returns: A modified in-place instance of a data generator allowing for chaining of calls
following a builder pattern
"""
for column in columns:
self.withColumn(**column)
return self
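The loop above relies on Python keyword-argument unpacking: each spec dictionary's keys must match `withColumn`'s parameter names. A minimal, Spark-free sketch of the pattern (`TinyBuilder` is a hypothetical stand-in for illustration, not the dbldatagen API):

```python
# Hypothetical stand-in illustrating the keyword-unpacking pattern behind
# withColumns: each spec dict is expanded into keyword arguments.
class TinyBuilder:
    def __init__(self):
        self.columns = []

    def withColumn(self, colName, colType="string", **options):
        self.columns.append((colName, colType, options))
        return self  # returning self allows builder-style chaining

    def withColumns(self, columns):
        for column in columns:
            self.withColumn(**column)
        return self

specs = [
    {"colName": "col1", "colType": "int", "minValue": 0, "maxValue": 100},
    {"colName": "col2"},  # colType falls back to the default "string"
]
builder = TinyBuilder().withColumns(specs)
print([name for name, _, _ in builder.columns])  # ['col1', 'col2']
```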

def _mkSqlStructFromList(self, fields):
"""
Create a SQL struct expression from a list of fields
@@ -1604,3 +1617,51 @@ def scriptMerge(self, tgtName=None, srcName=None, updateExpr=None, delExpr=None,
result = HtmlUtils.formatCodeAsHtml(results)

return result

@staticmethod
def fromDict(options):
Review comment (Collaborator):
Make sure to have explicit tests for this covering the following use cases:
1 - with simple options
2 - with composite (object-valued) options
See the examples on the following page for object-valued options, i.e. DateRange and Distribution objects.

""" Creates a data generator from a dictionary of options.

:param options: Dictionary with data generator options (e.g. "name", "rows")
:return: A data generator with the specified options
"""
return DataGenerator(**options)

@staticmethod
def fromFile(path):
Review comment (Collaborator):
Don't add fromFile as a method, since open() does not support reading files from a Databricks workspace or DBFS.

""" Creates a data generator from options loaded from a JSON or YAML file.

:param path: File path to a JSON or YAML file containing data generation options
:return: A data generator with the specified options
"""
if path.endswith("yml") or path.endswith("yaml"):
return DataGenerator.fromYaml(path)
if path.endswith("json"):
return DataGenerator.fromJson(path)
raise ValueError("File type must be '.json', '.yml' or '.yaml'")
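Note that `str.endswith("yml")` also matches filenames such as `data.xyml`. A stricter dispatch (editor's sketch using `pathlib`, not part of this PR) compares the complete suffix:

```python
from pathlib import Path

def spec_format(path):
    # Compare the full file suffix instead of using str.endswith,
    # so e.g. "data.xyml" is rejected rather than treated as YAML.
    suffix = Path(path).suffix.lower()
    if suffix in (".yml", ".yaml"):
        return "yaml"
    if suffix == ".json":
        return "json"
    raise ValueError("File type must be '.json', '.yml' or '.yaml'")

print(spec_format("spec.YAML"))  # yaml
print(spec_format("spec.json"))  # json
```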

@staticmethod
def fromJson(path):
Review comment (Collaborator):
Rather than taking a path, pass a string containing the definition to the method. Calling code should be responsible for loading the string; it could come from DBFS, a database, or Unity Catalog.

""" Creates a data generator from options loaded from a JSON file.

:param path: File path to a JSON file containing data generation options
:return: A data generator with the specified options
"""
with open(path, "r", encoding="utf-8") as f:
options = json.load(f)
generator = options.get("generator")
columns = options.get("columns", [])
return DataGenerator.fromDict(generator).withColumns(columns)

@staticmethod
def fromYaml(path):
Review comment (Collaborator):
Rather than taking a path, pass a string containing the definition to the method. Calling code should be responsible for loading the string; it could come from DBFS, a database, or Unity Catalog.

""" Creates a data generator from options loaded from a YAML file.

:param path: File path to a YAML file containing data generation options
:return: A data generator with the specified options
"""
with open(path, "r", encoding="utf-8") as f:
options = yaml.safe_load(f)
generator = options.get("generator")
columns = options.get("columns", [])
return DataGenerator.fromDict(generator).withColumns(columns)
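The review comments above suggest string-based variants so that callers load the definition themselves (from DBFS, a database, or Unity Catalog). A sketch of what such an API could look like — `fromJsonString` is a hypothetical name, not part of this PR, and the real version would build a configured generator rather than return the parsed pieces:

```python
import json

def fromJsonString(definition):
    # Parse a JSON definition string supplied by the caller; this function
    # never touches the filesystem, per the reviewer's suggestion.
    options = json.loads(definition)
    generator = options.get("generator", {})
    columns = options.get("columns", [])
    # A real implementation would build and return a configured DataGenerator;
    # returning the parsed pieces keeps this sketch self-contained.
    return generator, columns

spec = '{"generator": {"name": "g", "rows": 10}, "columns": [{"colName": "c1"}]}'
gen_opts, col_specs = fromJsonString(spec)
print(gen_opts["rows"], col_specs[0]["colName"])  # 10 c1
```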
45 changes: 45 additions & 0 deletions docs/source/generating_column_data.rst
@@ -182,3 +182,48 @@ This has several implications:
SQL expression.
To enforce the dependency, you must use the `baseColumn` attribute to indicate the dependency.

Creating data generation specs from files
-----------------------------------------

Review comment (Collaborator):
This should be creating data specs from string-based YAML or JSON. We should also have the capability to write to JSON and YAML.

Review comment (Collaborator, Author):
@ronanstokes-db the code is done. I will update the docs.

``DataGenerator.fromFile("file_path")`` will return a ``DataGenerator`` with ``ColumnGenerationSpecs`` from definitions
in a JSON or YAML file. Use the ``"generator"`` key to specify ``DataGenerator`` options and the ``"columns"`` key to
specify ``ColumnGenerationSpec`` options.

**JSON Example:**

.. code-block:: json

   {
     "generator": {
       "name": "test_data_generator",
       "rows": 1000,
       "partitions": 10
     },
     "columns": [
       {"colName": "col1", "colType": "int", "minValue": 0, "maxValue": 100},
       {"colName": "col2", "colType": "float", "minValue": 0.0, "maxValue": 100.0},
       {"colName": "col3", "colType": "string", "values": ["a", "b", "c"], "random": true}
     ]
   }

**YAML Example:**

.. code-block:: yaml

   generator:
     name: test_data_generator
     rows: 1000
     partitions: 10
   columns:
     - colName: col1
       colType: int
       minValue: 0
       maxValue: 1000
     - colName: col2
       colType: float
       minValue: -10.0
       maxValue: 10.0
     - colName: col3
       colType: string
       values:
         - a
         - b
         - c
       random: true
6 changes: 6 additions & 0 deletions docs/source/options_and_features.rst
@@ -128,6 +128,12 @@ representing the column - for example "email_0", "email_1" etc.
If you specify the attribute ``structType="array"``, the multiple columns will be combined into a single array valued
column.

Generating columns from Python dictionaries
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

You can generate columns from Python dictionaries using ``withColumns(column_options)``. Each dictionary should contain
keys which match the ``withColumn`` arguments (e.g. ``"colName"``, ``"colType"``).
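Because each dictionary is unpacked into keyword arguments, a key that does not correspond to a ``withColumn`` parameter raises a ``TypeError``. A standalone sketch of that failure mode (generic stand-in function with a signature loosely modeled on the API, not dbldatagen itself):

```python
def withColumn(colName, colType="string", minValue=None, maxValue=None, **options):
    # Stand-in with a signature loosely modeled on dbldatagen's withColumn.
    return {"colName": colName, "colType": colType}

good = {"colName": "col1", "colType": "int", "minValue": 0}
print(withColumn(**good)["colName"])  # col1

bad = {"name": "col1"}  # wrong key: the parameter is 'colName', not 'name'
try:
    withColumn(**bad)
except TypeError:
    print("TypeError: missing required 'colName'")
```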

Generating random values
^^^^^^^^^^^^^^^^^^^^^^^^

15 changes: 15 additions & 0 deletions tests/files/test_generator_spec.json
@@ -0,0 +1,15 @@
{
"generator": {
"name": "test_data_generator",
"rows": 1000,
"partitions": 10,
"randomSeedMethod": "fixed",
"randomSeed": 42,
"random": true
},
"columns": [
{"colName": "col1", "colType": "int", "minValue": 0, "maxValue": 1000},
{"colName": "col2", "colType": "float", "minValue": -10.0, "maxValue": 10.0},
{"colName": "col3", "colType": "string", "values": ["a", "b", "c"], "random": true}
]
}
15 changes: 15 additions & 0 deletions tests/files/test_generator_spec.txt
@@ -0,0 +1,15 @@
{
"generator": {
"name": "test_data_generator",
"rows": 1000,
"partitions": 10,
"randomSeedMethod": "fixed",
"randomSeed": 42,
"random": true
},
"columns": [
{"colName": "col1", "colType": "int", "min": 0, "max": 100},
{"colName": "col2", "colType": "float", "min": 0.0, "max": 100.0},
{"colName": "col3", "colType": "string", "values": ["a", "b", "c"], "random": true}
]
}
23 changes: 23 additions & 0 deletions tests/files/test_generator_spec.yml
@@ -0,0 +1,23 @@
generator:
name: test_data_generator
rows: 1000
partitions: 10
randomSeedMethod: fixed
randomSeed: 42
random: true
columns:
- colName: col1
colType: int
minValue: 0
maxValue: 1000
- colName: col2
colType: float
minValue: -10.0
maxValue: 10.0
- colName: col3
colType: string
values:
- a
- b
- c
random: true
63 changes: 61 additions & 2 deletions tests/test_quick_tests.py
@@ -1,14 +1,15 @@
from datetime import timedelta, datetime

import json
import pytest
import yaml
from pyspark.sql.types import (
StructType, StructField, IntegerType, StringType, FloatType, DateType, DecimalType, DoubleType, ByteType,
ShortType, LongType
)


import dbldatagen as dg
from dbldatagen import DataGenerator
from dbldatagen import DataGenerator, ColumnGenerationSpec
from dbldatagen import NRange, DateRange

schema = StructType([
@@ -754,3 +755,61 @@ def test_random_generation_without_range_values(self, columnSpecOptions):
def test_version_info(self):
# test access to version info without explicit import
print("Data generator version", dg.__version__)

def test_multi_column_generation(self):
column_specs = [
{"colName": "col1", "colType": "int", "minValue": 0, "maxValue": 100},
{"colName": "col2", "colType": "float", "minValue": 0.0, "maxValue": 100.0},
{"colName": "col3", "colType": "string", "values": ["a", "b", "c"], "random": True}
]
df_from_dicts = dg.DataGenerator(rows=100, partitions=1).withColumns(column_specs).build()
assert df_from_dicts.columns == ["col1", "col2", "col3"]

def test_generation_from_dictionary(self):
dg_spec = {
"name": "test_data_generator",
"rows": 1000,
"partitions": 10,
"randomSeedMethod": "fixed",
"randomSeed": 42,
"random": True
}
gen_from_dict = DataGenerator.fromDict(dg_spec)
assert gen_from_dict.name == dg_spec.get("name")
assert gen_from_dict.rowCount == dg_spec.get("rows")
assert gen_from_dict.partitions == dg_spec.get("partitions")
assert gen_from_dict.random == dg_spec.get("random")
assert gen_from_dict.randomSeed == dg_spec.get("randomSeed")

def test_generation_from_file(self):
path = "tests/files/test_generator_spec.json"
Review comment (Collaborator):
If we use string-based APIs, they'll be more general; you can also simply define the definitions as multi-line strings rather than requiring separate data files.

with open(path, "r", encoding="utf-8") as f:
options = json.load(f)
gen_options = options.get("generator")
gen_from_json = DataGenerator.fromFile(path)
assert gen_from_json.name == gen_options.get("name")
assert gen_from_json.rowCount == gen_options.get("rows")
assert gen_from_json.partitions == gen_options.get("partitions")
assert gen_from_json.random == gen_options.get("random")
assert gen_from_json.randomSeed == gen_options.get("randomSeed")

df_from_json = gen_from_json.build()
assert df_from_json.columns == ["col1", "col2", "col3"]

path = "tests/files/test_generator_spec.yml"
with open(path, "r", encoding="utf-8") as f:
options = yaml.safe_load(f)
gen_options = options.get("generator")
gen_from_yaml = DataGenerator.fromFile(path)
assert gen_from_yaml.name == gen_options.get("name")
assert gen_from_yaml.rowCount == gen_options.get("rows")
assert gen_from_yaml.partitions == gen_options.get("partitions")
assert gen_from_yaml.random == gen_options.get("random")
assert gen_from_yaml.randomSeed == gen_options.get("randomSeed")

df_from_yaml = gen_from_yaml.build()
assert df_from_yaml.columns == ["col1", "col2", "col3"]

path = "tests/files/test_generator_spec.txt"
with pytest.raises(ValueError):
DataGenerator.fromFile(path) # Loading from .txt should raise a ValueError