-
Notifications
You must be signed in to change notification settings - Fork 74
Add methods to create data generation specs from files #310
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from 2 commits
d749476
c9c4e93
cb2f355
b1b1f1a
b9e8ea9
3c9b851
1dc8b13
039b768
b011c19
72e1b2c
900121f
07f4b7f
adcc9b2
83ce7ba
607c17f
0f585b3
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,9 +6,11 @@ | |
This file defines the `DataGenError` and `DataGenerator` classes | ||
""" | ||
import copy | ||
import json | ||
import logging | ||
import re | ||
|
||
import yaml | ||
from pyspark.sql.types import LongType, IntegerType, StringType, StructType, StructField, DataType | ||
|
||
from ._version import _get_spark_version | ||
|
@@ -869,6 +871,17 @@ def withColumn(self, colName, colType=StringType(), minValue=None, maxValue=None | |
self._inferredSchemaFields.append(StructField(colName, newColumn.datatype, nullable)) | ||
return self | ||
|
||
def withColumns(self, columns):
    """ Adds a set of columns to the synthetic generation specification.

    Each entry is expanded into the keyword arguments of a single ``withColumn`` call, so its keys
    must be ``withColumn`` parameter names (e.g. "colName", "colType", "minValue", "maxValue").

    :param columns: A list of column generation specifications as dictionaries. ``None`` is treated
                    as an empty list, so a spec without a column section can still be chained.
    :returns: A modified in-place instance of a data generator allowing for chaining of calls
              following a builder pattern
    """
    # Specs loaded from JSON / YAML may omit their "columns" section, in which case None arrives here.
    for column in columns or []:
        self.withColumn(**column)
    return self
|
||
def _mkSqlStructFromList(self, fields): | ||
""" | ||
Create a SQL struct expression from a list of fields | ||
|
@@ -1604,3 +1617,51 @@ def scriptMerge(self, tgtName=None, srcName=None, updateExpr=None, delExpr=None, | |
result = HtmlUtils.formatCodeAsHtml(results) | ||
|
||
return result | ||
|
||
@staticmethod
def fromDict(options):
    """ Builds a data generator from a dictionary of options.

    :param options: Dictionary with data generator options (e.g. "name", "rows")
    :return: A data generator with the specified options
    """
    # NOTE(review): the reviewer asked for explicit tests of this method covering simple options as
    # well as object valued options (i.e. DateRange and Distribution objects).
    # The dictionary keys are expanded into keyword arguments of the `DataGenerator` constructor.
    generator = DataGenerator(**options)
    return generator
|
||
@staticmethod
def fromFile(path):
    """ Creates a data generator from options loaded from a JSON or YAML file.

    The loader is chosen from the file extension, compared case-insensitively: ``.yml`` and ``.yaml``
    files are read by ``fromYaml`` and ``.json`` files are read by ``fromJson``.

    :param path: File path to a JSON or YAML file containing data generation options
    :return: A data generator with the specified options
    :raises ValueError: If the path does not end in ``.json``, ``.yml`` or ``.yaml``
    """
    # NOTE(review): the reviewer objected to exposing `fromFile` as a method; the review comment is
    # truncated in the source, so the stated reason is not available here.
    # Match on the whole extension (dot included) so that names such as "specyml" are rejected,
    # and lower-case first so that "SPEC.JSON" is accepted.
    lowered = path.lower()
    if lowered.endswith((".yml", ".yaml")):
        return DataGenerator.fromYaml(path)
    if lowered.endswith(".json"):
        return DataGenerator.fromJson(path)
    raise ValueError("File type must be '.json', '.yml' or '.yaml'")
|
||
@staticmethod
def fromJson(path):
    """ Creates a data generator from options loaded from a JSON file.

    The document is read as a mapping with a "generator" section (options handed to ``fromDict``)
    and a "columns" section (a list handed to ``withColumns``). A missing or empty section is
    treated as empty instead of passing ``None`` downstream.

    :param path: File path to a JSON file containing data generation options
    :return: A data generator with the specified options
    """
    # NOTE(review): the reviewer asked for this API to take a string containing the definition rather
    # than a path, so that calling code decides where it is loaded from (dbfs, a database, unity catalog).
    with open(path, "r", encoding="utf-8") as f:
        options = json.load(f) or {}  # a bare `null` document is treated as an empty spec
    generator_options = options.get("generator") or {}
    column_specs = options.get("columns") or []
    # NOTE(review): an omitted "generator" section relies on `DataGenerator` accepting no options - confirm.
    return DataGenerator.fromDict(generator_options).withColumns(column_specs)
|
||
@staticmethod
def fromYaml(path):
    """ Creates a data generator from options loaded from a YAML file.

    The document is read as a mapping with a "generator" section (options handed to ``fromDict``)
    and a "columns" section (a list handed to ``withColumns``). A missing or empty section, or an
    empty file, is treated as empty instead of passing ``None`` downstream.

    :param path: File path to a YAML file containing data generation options
    :return: A data generator with the specified options
    """
    # NOTE(review): the reviewer asked for this API to take a string containing the definition rather
    # than a path, so that calling code decides where it is loaded from (dbfs, a database, unity catalog).
    with open(path, "r", encoding="utf-8") as f:
        options = yaml.safe_load(f) or {}  # `safe_load` returns None for an empty document
    generator_options = options.get("generator") or {}
    column_specs = options.get("columns") or []
    # NOTE(review): an omitted "generator" section relies on `DataGenerator` accepting no options - confirm.
    return DataGenerator.fromDict(generator_options).withColumns(column_specs)
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -182,3 +182,48 @@ This has several implications: | |
SQL expression. | ||
To enforce the dependency, you must use the `baseColumn` attribute to indicate the dependency. | ||
|
||
Creating data generation specs from files | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This should be creating data specs from string-based YAML or JSON. Also, we should have the capability to write to JSON and YAML. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @ronanstokes-db the code is done. I will update the docs. |
||
----------------------------------------- | ||
|
||
``DataGenerator.fromFile("file_path")`` will return a ``DataGenerator`` with ``ColumnGenerationSpecs`` from definitions | ||
in a JSON or YAML file. Use the ``"generator"`` key to specify ``DataGenerator`` options and the ``"columns"`` key to | ||
specify ``ColumnGenerationSpec`` options. | ||
|
||
**JSON Example:** | ||
|
||
.. code-block:: JSON | ||
{ | ||
"generator": { | ||
"name": "test_data_generator", | ||
"rows": 1000, | ||
"partitions": 10 | ||
}, | ||
"columns": [ | ||
{"colName": "col1", "colType": "int", "minValue": 0, "maxValue": 100}, | ||
{"colName": "col2", "colType": "float", "minValue": 0.0, "maxValue": 100.0}, | ||
{"colName": "col3", "colType": "string", "values": ["a", "b", "c"], "random": true} | ||
] | ||
} | ||
|
||
**YAML Example:** | ||
|
||
.. code-block:: YAML | ||
generator: | ||
name: test_data_generator | ||
rows: 1000 | ||
partitions: 10 | ||
columns: | ||
- colName: col1 | ||
colType: int | ||
minValue: 0 | ||
maxValue: 1000 | ||
- colName: col2 | ||
colType: float | ||
minValue: -10.0 | ||
maxValue: 10.0 | ||
- colName: col3 | ||
colType: string | ||
values: | ||
- a | ||
- b | ||
- c | ||
random: true |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
{ | ||
"generator": { | ||
"name": "test_data_generator", | ||
"rows": 1000, | ||
"partitions": 10, | ||
"randomSeedMethod": "fixed", | ||
"randomSeed": 42, | ||
"random": true | ||
}, | ||
"columns": [ | ||
{"colName": "col1", "colType": "int", "minValue": 0, "maxValue": 1000}, | ||
{"colName": "col2", "colType": "float", "minValue": -10.0, "maxValue": 10.0}, | ||
{"colName": "col3", "colType": "string", "values": ["a", "b", "c"], "random": true} | ||
] | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
{ | ||
"generator": { | ||
"name": "test_data_generator", | ||
"rows": 1000, | ||
"partitions": 10, | ||
"randomSeedMethod": "fixed", | ||
"randomSeed": 42, | ||
"random": true | ||
}, | ||
"columns": [ | ||
{"colName": "col1", "colType": "int", "min": 0, "max": 100}, | ||
{"colName": "col2", "colType": "float", "min": 0.0, "max": 100.0}, | ||
{"colName": "col3", "colType": "string", "values": ["a", "b", "c"], "random": true} | ||
] | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
generator: | ||
name: test_data_generator | ||
rows: 1000 | ||
partitions: 10 | ||
randomSeedMethod: fixed | ||
randomSeed: 42 | ||
random: true | ||
columns: | ||
- colName: col1 | ||
colType: int | ||
minValue: 0 | ||
maxValue: 1000 | ||
- colName: col2 | ||
colType: float | ||
minValue: -10.0 | ||
maxValue: 10.0 | ||
- colName: col3 | ||
colType: string | ||
values: | ||
- a | ||
- b | ||
- c | ||
random: true |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,14 +1,15 @@ | ||
from datetime import timedelta, datetime | ||
|
||
import json | ||
import pytest | ||
import yaml | ||
from pyspark.sql.types import ( | ||
StructType, StructField, IntegerType, StringType, FloatType, DateType, DecimalType, DoubleType, ByteType, | ||
ShortType, LongType | ||
) | ||
|
||
|
||
import dbldatagen as dg | ||
from dbldatagen import DataGenerator | ||
from dbldatagen import DataGenerator, ColumnGenerationSpec | ||
from dbldatagen import NRange, DateRange | ||
|
||
schema = StructType([ | ||
|
@@ -754,3 +755,61 @@ def test_random_generation_without_range_values(self, columnSpecOptions): | |
def test_version_info(self):
    # test access to version info without explicit import:
    # the version is read straight off the `dg` package alias
    version = dg.__version__
    print("Data generator version", version)
|
||
def test_multi_column_generation(self):
    # Each dictionary holds the keyword arguments for one generated column.
    int_column = {"colName": "col1", "colType": "int", "minValue": 0, "maxValue": 100}
    float_column = {"colName": "col2", "colType": "float", "minValue": 0.0, "maxValue": 100.0}
    string_column = {"colName": "col3", "colType": "string", "values": ["a", "b", "c"], "random": True}

    generator = dg.DataGenerator(rows=100, partitions=1)
    generator = generator.withColumns([int_column, float_column, string_column])
    df_from_dicts = generator.build()

    # The built data frame exposes the columns in the order they were specified.
    assert df_from_dicts.columns == ["col1", "col2", "col3"]
|
||
def test_generation_from_dictionary(self):
    # Options handed to `DataGenerator.fromDict`
    dg_spec = {
        "name": "test_data_generator",
        "rows": 1000,
        "partitions": 10,
        "randomSeedMethod": "fixed",
        "randomSeed": 42,
        "random": True
    }
    gen_from_dict = DataGenerator.fromDict(dg_spec)

    # Generator attribute -> option key it is compared against
    attribute_to_option = (
        ("name", "name"),
        ("rowCount", "rows"),
        ("partitions", "partitions"),
        ("random", "random"),
        ("randomSeed", "randomSeed"),
    )
    for attribute, option in attribute_to_option:
        assert getattr(gen_from_dict, attribute) == dg_spec.get(option)
|
||
def test_generation_from_file(self):
    # NOTE(review): the reviewer pointed out that string based APIs would be more general and would
    # allow the definitions to be written as multi-line strings instead of separate data files.
    cases = [
        ("tests/files/test_generator_spec.json", json.load),
        ("tests/files/test_generator_spec.yml", yaml.safe_load),
    ]
    for path, load in cases:
        # Read the expected generator options with the parser matching the file type.
        with open(path, "r", encoding="utf-8") as f:
            gen_options = load(f).get("generator")

        gen_from_file = DataGenerator.fromFile(path)
        assert gen_from_file.name == gen_options.get("name")
        assert gen_from_file.rowCount == gen_options.get("rows")
        assert gen_from_file.partitions == gen_options.get("partitions")
        assert gen_from_file.random == gen_options.get("random")
        assert gen_from_file.randomSeed == gen_options.get("randomSeed")

        # Build the generator loaded from *this* file. The YAML half previously rebuilt the JSON
        # generator, so the YAML-loaded columns were never checked.
        df_from_file = gen_from_file.build()
        assert df_from_file.columns == ["col1", "col2", "col3"]

    path = "tests/files/test_generator_spec.txt"
    with pytest.raises(ValueError):
        DataGenerator.fromFile(path)  # Loading from .txt should raise a ValueError
Uh oh!
There was an error while loading. Please reload this page.