Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add mimesis support #10

Merged
merged 3 commits into from
Jun 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ jobs:
strategy:
matrix:
os: [ubuntu-latest, windows-latest, macos-latest]
python-version: ["3.9", "3.10", "3.11"]
python-version: ["3.10", "3.11", "3.12"]
runs-on: ${{ matrix.os }}
defaults:
run:
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/pypi.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ jobs:
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: 3.9
python-version: 3.12
- name: cache poetry install
uses: actions/cache@v3
with:
Expand Down
13 changes: 13 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,19 @@ fake -n 10 pyint,user_name,date_this_year -f json -c id,awesome_name,last_attent
{"id": 1967, "awesome_name": "jmendoza", "last_attention_at": "2023-01-23"}
```

### Providers (beta)

While [Faker](https://faker.readthedocs.io) is a sweet library, we all like options, don't we? [Mimesis](https://mimesis.name/en/master/) is _also_ awesome and can be quite a bit faster than Faker. 🤫 You can use a different provider by using `-p mimesis`.

> [!NOTE]
> Providers use their own syntax for data types, so you must change your column types (the positional argument) to match the provider you choose. Column names passed with `-c` can stay the same.

To generate the same dataset above with Mimesis for example:

```bash
fake -p mimesis -n 10 "numeric.integer_number(0),person.username,datetime.date(2024)" -f json -c id,awesome_name,last_attention_at
```

### Provider Arguments

Some [Faker providers](https://faker.readthedocs.io/en/master/providers/baseprovider.html) (like `pyint`) take arguments. You can also specify those if you like, separated by semi-colons (_because some arguments take a comma-separated string :)_)
Expand Down
32 changes: 19 additions & 13 deletions faker_cli/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,9 @@
from faker import Faker

from faker_cli.parser import infer_column_names, parse_column_types
from faker_cli.templates import (
CloudFrontLogs,
CloudFrontWriter,
S3AccessLogs,
S3AccessWriter,
)
from faker_cli.providers.faker import FakerProvider
from faker_cli.providers.mimesis import MimesisProvider
from faker_cli.templates import CloudFrontWriter, S3AccessWriter
from faker_cli.writer import CSVWriter, JSONWriter

KLAS_MAPPER = {
Expand All @@ -22,10 +19,6 @@
"cloudfront": [CloudFrontWriter, "cloudfront_log"],
}

fake = Faker()
fake.add_provider(S3AccessLogs)
fake.add_provider(CloudFrontLogs)


@click.command()
@click.option("--num-rows", "-n", default=1, help="Number of rows")
Expand All @@ -40,7 +33,8 @@
@click.option("--columns", "-c", help="Column names", default=None, required=False)
@click.option("--template", "-t", help="Template to use", type=click.Choice(["s3access", "cloudfront"]), default=None)
@click.argument("column_types", required=False)
def main(num_rows, format, output, columns, template, column_types):
@click.option("--provider", "-p", help="Fake data provider", type=click.Choice(["faker", "mimesis"]), default="faker")
def main(num_rows, format, output, columns, template, column_types, provider):
"""
Generate fake data, easily.

Expand All @@ -49,13 +43,24 @@ def main(num_rows, format, output, columns, template, column_types):

You can also use --template for real-world synthetic data.
"""
if provider == "faker":
fake = FakerProvider()
elif provider == "mimesis":
fake = MimesisProvider()
else:
pass

# Do some initial validation - we must have either template or column types
if not template and not column_types:
ctx = click.get_current_context()
click.echo(ctx.get_help())
ctx.exit()
raise click.BadArgumentUsage("either --template or a list of Faker property names must be provided.")

# Templates are only supported with Faker at the moment
if template and provider != "faker":
raise click.BadArgumentUsage('templates are only supported with the "faker" provider.')

# Parquet output requires a filename
if format in ["parquet", "deltalake"] and output is None:
raise click.BadArgumentUsage("parquet | deltalake formats requires --output/-o filename parameter.")
Expand Down Expand Up @@ -105,13 +110,14 @@ def main(num_rows, format, output, columns, template, column_types):
raise click.ClickException(f"Format {format} not supported.")
writer = format_klas(sys.stdout, headers, output)
for i in range(num_rows):
writer.write(generate_row(fake, col_types))
writer.write(fake.generate_row(col_types))
writer.close()


def generate_row(fake: Faker, column_types: list[tuple[str, list]]) -> list[object]:
    """Generate one row of fake values, one per entry in ``column_types``.

    Args:
        fake: Object exposing ``format(name, *args)`` and a ``unique``
            attribute with the same ``format`` method.
        column_types: ``(formatter_name, args)`` pairs. A name starting with
            ``unique.`` is routed through ``fake.unique`` with the prefix
            removed; every other name goes straight to ``fake.format``.

    Returns:
        The generated values in column order. Each value is whatever the
        formatter returns, so it is not necessarily a string.
    """
    return [
        fake.format(ctype, *args)
        if not ctype.startswith("unique.")
        else fake.unique.format(ctype.removeprefix("unique."), *args)
        for ctype, args in column_types
    ]
2 changes: 2 additions & 0 deletions faker_cli/providers/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
class BaseProvider:
    """Common interface for fake-data providers.

    Declares the two methods that ``FakerProvider`` and ``MimesisProvider``
    both implement. The class stays directly instantiable; the methods raise
    ``NotImplementedError`` until a subclass overrides them.
    """

    def generate_row(self, column_types: list[tuple[str, list]]) -> list[object]:
        """Return one row of values for ``(type_name, args)`` pairs."""
        raise NotImplementedError(f"{type(self).__name__} does not implement generate_row().")

    def format(self, log_entry) -> list[str]:
        """Return the formatted output for the ``log_entry`` formatter name."""
        raise NotImplementedError(f"{type(self).__name__} does not implement format().")
21 changes: 21 additions & 0 deletions faker_cli/providers/faker.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
from faker import Faker

from faker_cli.templates import CloudFrontLogs, S3AccessLogs


class FakerProvider:
    """Fake-data provider backed by a ``Faker`` instance.

    The wrapped instance also has the ``S3AccessLogs`` and ``CloudFrontLogs``
    providers registered on it.
    """

    def __init__(self) -> None:
        self.fake = Faker()
        self.fake.add_provider(S3AccessLogs)
        self.fake.add_provider(CloudFrontLogs)

    def generate_row(self, column_types: list[tuple[str, list]]) -> list[object]:
        """Generate one row of values, one per entry in ``column_types``.

        Args:
            column_types: ``(formatter_name, args)`` pairs. A name starting
                with ``unique.`` is routed through ``self.fake.unique`` with
                the prefix removed; every other name goes straight to
                ``self.fake.format``.

        Returns:
            The generated values in column order. Each value is whatever the
            formatter returns, so it is not necessarily a string.
        """
        return [
            self.fake.format(ctype, *args)
            if not ctype.startswith("unique.")
            else self.fake.unique.format(ctype.removeprefix("unique."), *args)
            for ctype, args in column_types
        ]

    def format(self, log_entry) -> list[str]:
        """Return ``self.fake.format(log_entry)`` for the given formatter name."""
        return self.fake.format(log_entry)
15 changes: 15 additions & 0 deletions faker_cli/providers/mimesis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from mimesis import Field


class MimesisProvider:
    """Fake-data provider backed by a Mimesis ``Field``."""

    def __init__(self) -> None:
        self.field = Field()

    def generate_row(self, column_types: list[tuple[str, list]]) -> list[object]:
        """Generate one row of values, one per entry in ``column_types``.

        Args:
            column_types: ``(method_name, args)`` pairs, e.g.
                ``("person.username", [])``. Each name is resolved on
                ``self.field`` and the result is called with ``*args``.

        Returns:
            The generated values in column order. Each value is whatever the
            resolved method returns, so it is not necessarily a string.
        """
        # NOTE(review): `_lookup_method` is a private Field helper. It is kept
        # because it returns a callable that accepts positional args; confirm
        # it is still present whenever the mimesis dependency is upgraded.
        return [self.field._lookup_method(ctype)(*args) for ctype, args in column_types]

    def format(self, log_entry) -> list[str]:
        """Always raise: this provider has no template support.

        Raises:
            NotImplementedError: on every call.
        """
        raise NotImplementedError('templates are only supported with the "faker" provider.')


# field("person.username", mask="U_d", drange=(100, 1000))
Loading
Loading