From cbde71f11261e93481bfde7733f6020f533f1529 Mon Sep 17 00:00:00 2001 From: "G. D. McBain" Date: Fri, 25 Oct 2024 20:26:53 +1100 Subject: [PATCH 1/3] Standardise `.parquet` suffix in docs and tests (#4254) Signed-off-by: G. D. McBain Signed-off-by: G. D. McBain Co-authored-by: Deepyaman Datta --- RELEASE.md | 1 + docs/source/data/kedro_dataset_factories.md | 26 ++++++++++----------- docs/source/integrations/mlflow.md | 2 +- docs/source/tutorial/create_a_pipeline.md | 6 ++--- tests/framework/cli/test_catalog.py | 10 ++++---- tests/io/test_data_catalog.py | 6 ++--- 6 files changed, 26 insertions(+), 25 deletions(-) diff --git a/RELEASE.md b/RELEASE.md index 2d267920ce..ac3c4d95d9 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -11,6 +11,7 @@ ## Breaking changes to the API ## Documentation changes * Updated CLI autocompletion docs with new Click syntax. +* Standardised `.parquet` suffix in docs and tests. ## Community contributions * [Hyewon Choi](https://github.com/hyew0nChoi) diff --git a/docs/source/data/kedro_dataset_factories.md b/docs/source/data/kedro_dataset_factories.md index d2fc40e5af..f75cdb2681 100644 --- a/docs/source/data/kedro_dataset_factories.md +++ b/docs/source/data/kedro_dataset_factories.md @@ -164,21 +164,21 @@ entries share `type`, `file_format` and `save_args`: ```yaml processing.factory_data: type: spark.SparkDataset - filepath: data/processing/factory_data.pq + filepath: data/processing/factory_data.parquet file_format: parquet save_args: mode: overwrite processing.process_data: type: spark.SparkDataset - filepath: data/processing/process_data.pq + filepath: data/processing/process_data.parquet file_format: parquet save_args: mode: overwrite modelling.metrics: type: spark.SparkDataset - filepath: data/modelling/factory_data.pq + filepath: data/modelling/factory_data.parquet file_format: parquet save_args: mode: overwrite @@ -189,7 +189,7 @@ This could be generalised to the following pattern: ```yaml "{layer}.{dataset_name}": type: spark.SparkDataset - filepath: data/{layer}/{dataset_name}.pq + filepath: data/{layer}/{dataset_name}.parquet file_format: parquet save_args: mode: overwrite @@ -202,7 +202,7 @@ You can have multiple dataset factories in your catalog. 
For example: ```yaml "{namespace}.{dataset_name}@spark": type: spark.SparkDataset - filepath: data/{namespace}/{dataset_name}.pq + filepath: data/{namespace}/{dataset_name}.parquet file_format: parquet "{dataset_name}@csv": @@ -255,11 +255,11 @@ Consider a catalog file with the following patterns: "preprocessed_{dataset_name}": type: pandas.ParquetDataset - filepath: data/02_intermediate/preprocessed_{dataset_name}.pq + filepath: data/02_intermediate/preprocessed_{dataset_name}.parquet "processed_{dataset_name}": type: pandas.ParquetDataset - filepath: data/03_primary/processed_{dataset_name}.pq + filepath: data/03_primary/processed_{dataset_name}.parquet "{dataset_name}_csv": type: pandas.CSVDataset @@ -267,7 +267,7 @@ Consider a catalog file with the following patterns: "{namespace}.{dataset_name}_pq": type: pandas.ParquetDataset - filepath: data/03_primary/{dataset_name}_{namespace}.pq + filepath: data/03_primary/{dataset_name}_{namespace}.parquet "{default_dataset}": type: pickle.PickleDataset @@ -315,11 +315,11 @@ shuttles: "preprocessed_{name}": type: pandas.ParquetDataset - filepath: data/02_intermediate/preprocessed_{name}.pq + filepath: data/02_intermediate/preprocessed_{name}.parquet "{default}": type: pandas.ParquetDataset - filepath: data/03_primary/{default}.pq + filepath: data/03_primary/{default}.parquet ``` @@ -365,13 +365,13 @@ companies: filepath: data/01_raw/companies.csv type: pandas.CSVDataset model_input_table: - filepath: data/03_primary/model_input_table.pq + filepath: data/03_primary/model_input_table.parquet type: pandas.ParquetDataset preprocessed_companies: - filepath: data/02_intermediate/preprocessed_companies.pq + filepath: data/02_intermediate/preprocessed_companies.parquet type: pandas.ParquetDataset preprocessed_shuttles: - filepath: data/02_intermediate/preprocessed_shuttles.pq + filepath: data/02_intermediate/preprocessed_shuttles.parquet type: pandas.ParquetDataset reviews: filepath: data/01_raw/reviews.csv diff --git a/docs/source/integrations/mlflow.md b/docs/source/integrations/mlflow.md index 78d3df6c69..eb39cc7480 100644 --- a/docs/source/integrations/mlflow.md +++ b/docs/source/integrations/mlflow.md @@ -195,7 +195,7 @@ For that, you can make use of {ref}`runtime parameters `: # Add the intermediate datasets to run only the inference X_test: type: pandas.ParquetDataset - filepath: data/05_model_input/X_test.pq + filepath: data/05_model_input/X_test.parquet y_test: type: pandas.CSVDataset # https://github.com/pandas-dev/pandas/issues/54638 diff --git a/docs/source/tutorial/create_a_pipeline.md b/docs/source/tutorial/create_a_pipeline.md index 94fc261ffb..92253c8215 100644 --- a/docs/source/tutorial/create_a_pipeline.md +++ b/docs/source/tutorial/create_a_pipeline.md @@ -200,11 +200,11 @@ Each of the nodes outputs a new dataset (`preprocessed_companies` and `preproces ```yaml preprocessed_companies: type: pandas.ParquetDataset - filepath: data/02_intermediate/preprocessed_companies.pq + filepath: data/02_intermediate/preprocessed_companies.parquet preprocessed_shuttles: type: pandas.ParquetDataset - filepath: data/02_intermediate/preprocessed_shuttles.pq + filepath: data/02_intermediate/preprocessed_shuttles.parquet ``` @@ -290,7 +290,7 @@ The following entry in `conf/base/catalog.yml` saves the model input table datas ```yaml model_input_table: type: pandas.ParquetDataset - filepath: data/03_primary/model_input_table.pq + filepath: data/03_primary/model_input_table.parquet ``` ## Test the example again diff --git 
a/tests/framework/cli/test_catalog.py b/tests/framework/cli/test_catalog.py index 8905da9c94..694f40091a 100644 --- a/tests/framework/cli/test_catalog.py +++ b/tests/framework/cli/test_catalog.py @@ -38,7 +38,7 @@ def fake_catalog_config(): config = { "parquet_{factory_pattern}": { "type": "pandas.ParquetDataset", - "filepath": "data/01_raw/{factory_pattern}.pq", + "filepath": "data/01_raw/{factory_pattern}.parquet", "credentials": "db_connection", }, "csv_{factory_pattern}": { @@ -55,7 +55,7 @@ def fake_catalog_config_resolved(): config = { "parquet_example": { "type": "pandas.ParquetDataset", - "filepath": "data/01_raw/example.pq", + "filepath": "data/01_raw/example.parquet", "credentials": {"con": "foo"}, }, "csv_example": { @@ -99,7 +99,7 @@ def fake_catalog_config_with_factories(fake_metadata): config = { "parquet_{factory_pattern}": { "type": "pandas.ParquetDataset", - "filepath": "data/01_raw/{factory_pattern}.pq", + "filepath": "data/01_raw/{factory_pattern}.parquet", }, "csv_{factory_pattern}": { "type": "pandas.CSVDataset", @@ -108,7 +108,7 @@ def fake_catalog_config_with_factories(fake_metadata): "explicit_ds": {"type": "pandas.CSVDataset", "filepath": "test.csv"}, "{factory_pattern}_ds": { "type": "pandas.ParquetDataset", - "filepath": "data/01_raw/{factory_pattern}_ds.pq", + "filepath": "data/01_raw/{factory_pattern}_ds.parquet", }, "partitioned_{factory_pattern}": { "type": "partitions.PartitionedDataset", @@ -129,7 +129,7 @@ def fake_catalog_config_with_factories_resolved(): config = { "parquet_example": { "type": "pandas.ParquetDataset", - "filepath": "data/01_raw/example.pq", + "filepath": "data/01_raw/example.parquet", }, "csv_example": { "type": "pandas.CSVDataset", diff --git a/tests/io/test_data_catalog.py b/tests/io/test_data_catalog.py index bbaf6e8c6b..386c0812db 100644 --- a/tests/io/test_data_catalog.py +++ b/tests/io/test_data_catalog.py @@ -38,7 +38,7 @@ def config_with_dataset_factories(): }, "audi_cars": { "type": "pandas.ParquetDataset", - "filepath": "data/01_raw/audi_cars.pq", + "filepath": "data/01_raw/audi_cars.parquet", }, "{type}_boats": { "type": "pandas.CSVDataset", @@ -84,7 +84,7 @@ def config_with_dataset_factories_with_default(config_with_dataset_factories): def config_with_dataset_factories_bad_pattern(config_with_dataset_factories): config_with_dataset_factories["catalog"]["{type}@planes"] = { "type": "pandas.ParquetDataset", - "filepath": "data/01_raw/{brand}_plane.pq", + "filepath": "data/01_raw/{brand}_plane.parquet", } return config_with_dataset_factories @@ -95,7 +95,7 @@ def config_with_dataset_factories_only_patterns(): "catalog": { "{namespace}_{dataset}": { "type": "pandas.CSVDataset", - "filepath": "data/01_raw/{namespace}_{dataset}.pq", + "filepath": "data/01_raw/{namespace}_{dataset}.parquet", }, "{country}_companies": { "type": "pandas.CSVDataset", From 9ec1796669e91278cd7691927aae7321f4a0ae9f Mon Sep 17 00:00:00 2001 From: ElenaKhaustova <157851531+ElenaKhaustova@users.noreply.github.com> Date: Fri, 25 Oct 2024 14:51:43 +0100 Subject: [PATCH 2/3] Add docs for `KedroDataCatalog` (#4249) * Added setting default catalog section Signed-off-by: Elena Khaustova * Draft page for KedroDataCatalog Signed-off-by: Elena Khaustova * Updated index page Signed-off-by: Elena Khaustova * Updated Kedro Data Catalog page Signed-off-by: Elena Khaustova * Added kedro_data_catalog to toctree Signed-off-by: Elena Khaustova * Updated docstrings Signed-off-by: Elena Khaustova * Updated setter and list docstrings Signed-off-by: Elena Khaustova * Improved 
wordings Signed-off-by: Elena Khaustova * Removed odd new line Signed-off-by: Elena Khaustova * Point Kedro version Signed-off-by: Elena Khaustova * Added a note on how to access datasets after _FrozenDatasets class was removed Signed-off-by: Elena Khaustova * Added a link to the old documentation Signed-off-by: Elena Khaustova * Added link to the Slack channel Signed-off-by: Elena Khaustova * Fixed typos Signed-off-by: Elena Khaustova * Added top links for how-to items Signed-off-by: Elena Khaustova * Fixed page reference Signed-off-by: Elena Khaustova * Fixed page reference Signed-off-by: Elena Khaustova * Updated reference to slack Signed-off-by: Elena Khaustova * Updates slack link Signed-off-by: Elena Khaustova * Quoted KedroDataCatalog in the title Signed-off-by: Elena Khaustova * Fixed typos Signed-off-by: Elena Khaustova * Added example of print output Signed-off-by: Elena Khaustova * Applied suggested changes Signed-off-by: Elena Khaustova --------- Signed-off-by: Elena Khaustova --- docs/source/data/index.md | 26 +++- docs/source/data/kedro_data_catalog.md | 102 ++++++++++++++ kedro/io/kedro_data_catalog.py | 175 +++++++++++++++++++++++-- 3 files changed, 290 insertions(+), 13 deletions(-) create mode 100644 docs/source/data/kedro_data_catalog.md diff --git a/docs/source/data/index.md b/docs/source/data/index.md index 18edfc1ab9..5efc0e5b6f 100644 --- a/docs/source/data/index.md +++ b/docs/source/data/index.md @@ -1,5 +1,5 @@ -# The Kedro Data Catalog +# Data Catalog In a Kedro project, the Data Catalog is a registry of all data sources available for use by the project. The catalog is stored in a YAML file (`catalog.yml`) that maps the names of node inputs and outputs as keys in the `DataCatalog` class. @@ -46,3 +46,27 @@ This section on handing data with Kedro concludes with an advanced use case, ill how_to_create_a_custom_dataset ``` + +## `KedroDataCatalog` (experimental feature) + +As of Kedro 0.19.9, you can explore a new experimental feature — the `KedroDataCatalog`, an enhanced alternative to `DataCatalog`. + +At present, `KedroDataCatalog` replicates the functionality of `DataCatalog` and is fully compatible with the Kedro `run` command. It introduces several API improvements: +* Simplified dataset access: `_FrozenDatasets` has been replaced with a public `get` method to retrieve datasets. +* Added dict-like interface: You can now use a dictionary-like syntax to retrieve, set, and iterate over datasets. + +For more details and examples of how to use `KedroDataCatalog`, see the Kedro Data Catalog page. + +```{toctree} +:maxdepth: 1 + +kedro_data_catalog +``` + +The [documentation](./data_catalog.md) for `DataCatalog` remains relevant as `KedroDataCatalog` retains its core functionality with some enhancements. + +```{note} +`KedroDataCatalog` is under active development and may undergo breaking changes in future releases. While we encourage you to try it out, please be aware of potential modifications as we continue to improve it. Additionally, all upcoming catalog-related features will be introduced through `KedroDataCatalog` before it replaces `DataCatalog`. +``` + +We value your feedback — let us know if you have any thoughts or suggestions regarding `KedroDataCatalog` or potential new features via our [Slack channel](https://kedro-org.slack.com). 
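As a quick orientation for reviewers, here is a minimal sketch of the access-pattern change summarised in the bullet points above. It assumes Kedro 0.19.9+ with both catalog classes available; the dataset name `reviews` is made up for illustration, and the first access line only shows the older attribute-style access that `_FrozenDatasets` provided, for contrast:

```python
from kedro.io import DataCatalog, KedroDataCatalog, MemoryDataset

reviews = MemoryDataset(data=[{"rating": 5}])

# Old style: datasets are reached through the read-only `datasets` container
# (attribute access backed by _FrozenDatasets).
old_catalog = DataCatalog(datasets={"reviews": reviews})
reviews_ds = old_catalog.datasets.reviews

# New style: dictionary-like access or the public `get` method.
new_catalog = KedroDataCatalog(datasets={"reviews": reviews})
reviews_ds = new_catalog["reviews"]
reviews_ds = new_catalog.get("reviews")
```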
diff --git a/docs/source/data/kedro_data_catalog.md b/docs/source/data/kedro_data_catalog.md
new file mode 100644
index 0000000000..1c10ffebfd
--- /dev/null
+++ b/docs/source/data/kedro_data_catalog.md
@@ -0,0 +1,102 @@
+# Kedro Data Catalog
+`KedroDataCatalog` retains the core functionality of `DataCatalog`, with a few API enhancements. For a comprehensive understanding, we recommend reviewing the existing `DataCatalog` [documentation](./data_catalog.md) before exploring the additional functionality of `KedroDataCatalog`.
+
+This page highlights the new features and provides usage examples:
+* [How to make KedroDataCatalog the default catalog for Kedro run](#how-to-make-kedrodatacatalog-the-default-catalog-for-kedro-run)
+* [How to access datasets in the catalog](#how-to-access-datasets-in-the-catalog)
+* [How to add datasets to the catalog](#how-to-add-datasets-to-the-catalog)
+* [How to iterate through datasets in the catalog](#how-to-iterate-through-datasets-in-the-catalog)
+* [How to get the number of datasets in the catalog](#how-to-get-the-number-of-datasets-in-the-catalog)
+* [How to print the full catalog and individual datasets](#how-to-print-the-full-catalog-and-individual-datasets)
+* [How to access dataset patterns](#how-to-access-dataset-patterns)
+
+## How to make `KedroDataCatalog` the default catalog for Kedro `run`
+
+To set `KedroDataCatalog` as the default catalog for the `kedro run` command and other CLI commands, update your `settings.py` as follows:
+
+```python
+from kedro.io import KedroDataCatalog
+
+DATA_CATALOG_CLASS = KedroDataCatalog
+```
+
+Once this change is made, you can run your Kedro project as usual.
+
+For more information on `settings.py`, refer to the [Project settings documentation](../kedro_project_setup/settings.md).
+
+## How to access datasets in the catalog
+
+You can retrieve a dataset from the catalog using either the dictionary-like syntax or the `get` method:
+
+```python
+reviews_ds = catalog["reviews"]
+reviews_ds = catalog.get("reviews", default=default_ds)
+```
+
+## How to add datasets to the catalog
+
+The new API allows you to add datasets as well as raw data directly to the catalog:
+
+```python
+from kedro_datasets.pandas import CSVDataset
+
+bikes_ds = CSVDataset(filepath="../data/01_raw/bikes.csv")
+catalog["bikes"] = bikes_ds  # Adding a dataset
+catalog["cars"] = ["Ferrari", "Audi"]  # Adding raw data
+```
+
+When you add raw data, it is automatically wrapped in a `MemoryDataset` under the hood.
+
+## How to iterate through datasets in the catalog
+
+`KedroDataCatalog` supports iteration over dataset names (keys), datasets (values), and both (items). Iteration defaults to dataset names, similar to standard Python dictionaries:
+
+```python
+for ds_name in catalog:  # __iter__ defaults to keys
+    pass
+
+for ds_name in catalog.keys():  # Iterate over dataset names
+    pass
+
+for ds in catalog.values():  # Iterate over datasets
+    pass
+
+for ds_name, ds in catalog.items():  # Iterate over (name, dataset) tuples
+    pass
+```
+
+## How to get the number of datasets in the catalog
+
+You can get the number of datasets in the catalog using the `len()` function:
+
+```python
+ds_count = len(catalog)
+```
+
+## How to print the full catalog and individual datasets
+
+To print the catalog or an individual dataset programmatically, use the `print()` function; alternatively, in an interactive environment such as IPython or JupyterLab, simply enter the variable:
+
+```bash
+In [1]: catalog
+Out[1]: {'shuttles': kedro_datasets.pandas.excel_dataset.ExcelDataset(filepath=PurePosixPath('/data/01_raw/shuttles.xlsx'), protocol='file', load_args={'engine': 'openpyxl'}, save_args={'index': False}, writer_args={'engine': 'openpyxl'}), 'preprocessed_companies': kedro_datasets.pandas.parquet_dataset.ParquetDataset(filepath=PurePosixPath('/data/02_intermediate/preprocessed_companies.parquet'), protocol='file', load_args={}, save_args={}), 'params:model_options.test_size': kedro.io.memory_dataset.MemoryDataset(data=''), 'params:model_options.features': kedro.io.memory_dataset.MemoryDataset(data='')}

+In [2]: catalog["shuttles"]
+Out[2]: kedro_datasets.pandas.excel_dataset.ExcelDataset(filepath=PurePosixPath('/data/01_raw/shuttles.xlsx'), protocol='file', load_args={'engine': 'openpyxl'}, save_args={'index': False}, writer_args={'engine': 'openpyxl'})
+```
+
+## How to access dataset patterns
+
+The pattern resolution logic in `KedroDataCatalog` is handled by the `config_resolver`, which can be accessed as a property of the catalog:
+
+```python
+config_resolver = catalog.config_resolver
+ds_config = catalog.config_resolver.resolve_pattern(ds_name)  # Resolving a dataset pattern
+patterns = catalog.config_resolver.list_patterns()  # Listing all available patterns
+```
+
+```{note}
+`KedroDataCatalog` does not support all dictionary-specific methods, such as `pop()`, `popitem()`, or deletion by key (`del`).
+```
+
+For a full list of supported methods, refer to the [KedroDataCatalog source code](https://github.com/kedro-org/kedro/blob/main/kedro/io/kedro_data_catalog.py). 
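The "How to access dataset patterns" section above only lists the `config_resolver` calls, so here is a short end-to-end sketch of how pattern resolution plays out. It is a sketch under assumptions: `kedro-datasets` with the pandas extras is installed, the `{name}_csv` pattern and file path are made up for illustration, and the exact shape of the resolved configuration dictionary may vary between releases:

```python
from kedro.io import KedroDataCatalog

# A single dataset factory pattern standing in for many similar CSV entries.
config = {
    "{name}_csv": {
        "type": "pandas.CSVDataset",
        "filepath": "data/01_raw/{name}.csv",
    }
}
catalog = KedroDataCatalog.from_config(config)

# Membership checks take patterns into account, not only materialised datasets.
assert "reviews_csv" in catalog

# The resolver substitutes the captured "{name}" into the dataset configuration.
resolved = catalog.config_resolver.resolve_pattern("reviews_csv")
print(resolved)  # e.g. {'type': 'pandas.CSVDataset', 'filepath': 'data/01_raw/reviews.csv'}

# Accessing the name materialises the dataset and caches it in the catalog.
reviews_csv_ds = catalog["reviews_csv"]
```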
diff --git a/kedro/io/kedro_data_catalog.py b/kedro/io/kedro_data_catalog.py index c3d216abcd..27dcf1a765 100644 --- a/kedro/io/kedro_data_catalog.py +++ b/kedro/io/kedro_data_catalog.py @@ -64,10 +64,12 @@ def __init__( Example: :: - >>> # settings.py - >>> from kedro.io import KedroDataCatalog + >>> from kedro_datasets.pandas import CSVDataset >>> - >>> DATA_CATALOG_CLASS = KedroDataCatalog + >>> cars = CSVDataset(filepath="cars.csv", + >>> load_args=None, + >>> save_args={"index": False}) + >>> catalog = KedroDataCatalog(datasets={"cars": cars}) """ self._config_resolver = config_resolver or CatalogConfigResolver() self._datasets = datasets or {} @@ -102,34 +104,85 @@ def __repr__(self) -> str: return repr(self._datasets) def __contains__(self, dataset_name: str) -> bool: - """Check if an item is in the catalog as a materialised dataset or pattern""" + """Check if an item is in the catalog as a materialised dataset or pattern.""" return ( dataset_name in self._datasets or self._config_resolver.match_pattern(dataset_name) is not None ) def __eq__(self, other) -> bool: # type: ignore[no-untyped-def] + """Compares two catalogs based on materialised datasets and datasets patterns.""" return (self._datasets, self._config_resolver.list_patterns()) == ( other._datasets, other.config_resolver.list_patterns(), ) def keys(self) -> List[str]: # noqa: UP006 + """List all dataset names registered in the catalog.""" return list(self.__iter__()) def values(self) -> List[AbstractDataset]: # noqa: UP006 + """List all datasets registered in the catalog.""" return [self._datasets[key] for key in self] def items(self) -> List[tuple[str, AbstractDataset]]: # noqa: UP006 + """List all dataset names and datasets registered in the catalog.""" return [(key, self._datasets[key]) for key in self] def __iter__(self) -> Iterator[str]: yield from self._datasets.keys() def __getitem__(self, ds_name: str) -> AbstractDataset: + """Get a dataset by name from an internal collection of datasets. + + If a dataset is not in the collection but matches any pattern + it is instantiated and added to the collection first, then returned. + + Args: + ds_name: A dataset name. + + Returns: + An instance of AbstractDataset. + + Raises: + DatasetNotFoundError: When a dataset with the given name + is not in the collection and does not match patterns. + """ return self.get_dataset(ds_name) def __setitem__(self, key: str, value: Any) -> None: + """Add dataset to the ``KedroDataCatalog`` using the given key as a datsets name + and the provided data as the value. + + The value can either be raw data or a Kedro dataset (i.e., an instance of a class + inheriting from ``AbstractDataset``). If raw data is provided, it will be automatically + wrapped in a ``MemoryDataset`` before being added to the catalog. + + Args: + key: Name of the dataset. + value: Raw data or an instance of a class inheriting from ``AbstractDataset``. 
+ + Example: + :: + + >>> from kedro_datasets.pandas import CSVDataset + >>> import pandas as pd + >>> + >>> df = pd.DataFrame({"col1": [1, 2], + >>> "col2": [4, 5], + >>> "col3": [5, 6]}) + >>> + >>> catalog = KedroDataCatalog() + >>> catalog["data_df"] = df # Add raw data as a MemoryDataset + >>> + >>> assert catalog.load("data_df").equals(df) + >>> + >>> csv_dataset = CSVDataset(filepath="test.csv") + >>> csv_dataset.save(df) + >>> catalog["data_csv_dataset"] = csv_dataset # Add a dataset instance + >>> + >>> assert catalog.load("data_csv_dataset").equals(df) + """ if key in self._datasets: self._logger.warning("Replacing dataset '%s'", key) if isinstance(value, AbstractDataset): @@ -144,7 +197,19 @@ def __len__(self) -> int: def get( self, key: str, default: AbstractDataset | None = None ) -> AbstractDataset | None: - """Get a dataset by name from an internal collection of datasets.""" + """Get a dataset by name from an internal collection of datasets. + + If a dataset is not in the collection but matches any pattern + it is instantiated and added to the collection first, then returned. + + Args: + key: A dataset name. + default: Optional argument for default dataset to return in case + requested dataset not in the catalog. + + Returns: + An instance of AbstractDataset. + """ if key not in self._datasets: ds_config = self._config_resolver.resolve_pattern(key) if ds_config: @@ -172,6 +237,69 @@ def from_config( """Create a ``KedroDataCatalog`` instance from configuration. This is a factory method used to provide developers with a way to instantiate ``KedroDataCatalog`` with configuration parsed from configuration files. + + Args: + catalog: A dictionary whose keys are the dataset names and + the values are dictionaries with the constructor arguments + for classes implementing ``AbstractDataset``. The dataset + class to be loaded is specified with the key ``type`` and their + fully qualified class name. All ``kedro.io`` dataset can be + specified by their class name only, i.e. their module name + can be omitted. + credentials: A dictionary containing credentials for different + datasets. Use the ``credentials`` key in a ``AbstractDataset`` + to refer to the appropriate credentials as shown in the example + below. + load_versions: A mapping between dataset names and versions + to load. Has no effect on datasets without enabled versioning. + save_version: Version string to be used for ``save`` operations + by all datasets with enabled versioning. It must: a) be a + case-insensitive string that conforms with operating system + filename limitations, b) always return the latest version when + sorted in lexicographical order. + + Returns: + An instantiated ``KedroDataCatalog`` containing all specified + datasets, created and ready to use. + + Raises: + DatasetNotFoundError: When `load_versions` refers to a dataset that doesn't + exist in the catalog. 
+ + Example: + :: + + >>> config = { + >>> "cars": { + >>> "type": "pandas.CSVDataset", + >>> "filepath": "cars.csv", + >>> "save_args": { + >>> "index": False + >>> } + >>> }, + >>> "boats": { + >>> "type": "pandas.CSVDataset", + >>> "filepath": "s3://aws-bucket-name/boats.csv", + >>> "credentials": "boats_credentials", + >>> "save_args": { + >>> "index": False + >>> } + >>> } + >>> } + >>> + >>> credentials = { + >>> "boats_credentials": { + >>> "client_kwargs": { + >>> "aws_access_key_id": "", + >>> "aws_secret_access_key": "" + >>> } + >>> } + >>> } + >>> + >>> catalog = KedroDataCatalog.from_config(config, credentials) + >>> + >>> df = catalog.load("cars") + >>> catalog.save("boats", df) """ catalog = catalog or {} config_resolver = CatalogConfigResolver(catalog, credentials) @@ -284,10 +412,32 @@ def list( self, regex_search: str | None = None, regex_flags: int | re.RegexFlag = 0 ) -> List[str]: # noqa: UP006 # TODO: rename depending on the solution for https://github.com/kedro-org/kedro/issues/3917 - """ - List of all dataset names registered in the catalog. - This can be filtered by providing an optional regular expression - which will only return matching keys. + # TODO: make regex_search mandatory argument as we have catalog.keys() for listing all the datasets. + """List all dataset names registered in the catalog, optionally filtered by a regex pattern. + + If a regex pattern is provided, only dataset names matching the pattern will be returned. + This method supports optional regex flags for customization + + Args: + regex_search: Optional regular expression to filter dataset names. + regex_flags: Optional regex flags. + Returns: + A list of dataset names that match the `regex_search` criteria. If no pattern is + provided, all dataset names are returned. + + Raises: + SyntaxError: If the provided regex pattern is invalid. + + Example: + :: + + >>> catalog = KedroDataCatalog() + >>> # get datasets where the substring 'raw' is present + >>> raw_data = catalog.list(regex_search='raw') + >>> # get datasets which start with 'prm' or 'feat' + >>> feat_eng_data = catalog.list(regex_search='^(prm|feat)') + >>> # get datasets which end with 'time_series' + >>> models = catalog.list(regex_search='.+time_series$') """ if regex_search is None: return self.keys() @@ -325,12 +475,13 @@ def save(self, name: str, data: Any) -> None: >>> import pandas as pd >>> + >>> from kedro.io import KedroDataCatalog >>> from kedro_datasets.pandas import CSVDataset >>> >>> cars = CSVDataset(filepath="cars.csv", >>> load_args=None, >>> save_args={"index": False}) - >>> catalog = DataCatalog(datasets={'cars': cars}) + >>> catalog = KedroDataCatalog(datasets={'cars': cars}) >>> >>> df = pd.DataFrame({'col1': [1, 2], >>> 'col2': [4, 5], @@ -368,13 +519,13 @@ def load(self, name: str, version: str | None = None) -> Any: Example: :: - >>> from kedro.io import DataCatalog + >>> from kedro.io import KedroDataCatalog >>> from kedro_datasets.pandas import CSVDataset >>> >>> cars = CSVDataset(filepath="cars.csv", >>> load_args=None, >>> save_args={"index": False}) - >>> catalog = DataCatalog(datasets={'cars': cars}) + >>> catalog = KedroDataCatalog(datasets={'cars': cars}) >>> >>> df = catalog.load("cars") """ From a5d9bb40380c598bf7d03cb16623026892844ed4 Mon Sep 17 00:00:00 2001 From: "L. R. 
Couto" <57910428+lrcouto@users.noreply.github.com> Date: Fri, 25 Oct 2024 12:57:40 -0300 Subject: [PATCH 3/3] Add Github Actions workflow to trigger pipeline performance test (#4231) * Add Github Actions workflow to trigger pipeline performance test Signed-off-by: Laura Couto * Lint Signed-off-by: Laura Couto * Lint Signed-off-by: Laura Couto * Change trigger to PR label Signed-off-by: Laura Couto * Run workflow on kedro repo Signed-off-by: Laura Couto * Change setup-python version to 5 Signed-off-by: L. R. Couto <57910428+lrcouto@users.noreply.github.com> --------- Signed-off-by: Laura Couto Signed-off-by: L. R. Couto <57910428+lrcouto@users.noreply.github.com> --- .../workflows/pipeline-performance-test.yml | 79 +++++++++++++++++++ 1 file changed, 79 insertions(+) create mode 100644 .github/workflows/pipeline-performance-test.yml diff --git a/.github/workflows/pipeline-performance-test.yml b/.github/workflows/pipeline-performance-test.yml new file mode 100644 index 0000000000..58fb582213 --- /dev/null +++ b/.github/workflows/pipeline-performance-test.yml @@ -0,0 +1,79 @@ +name: Trigger and Run Pipeline Performance Test + +on: + pull_request: + types: [labeled] + +jobs: + performance-test: + runs-on: ubuntu-latest + + steps: + - name: Check if 'performance' label was added + if: github.event.action == 'labeled' && contains(github.event.label.name, 'performance') + run: echo "Performance label detected. Running performance test." + + - name: Clone test repo + if: github.event.action == 'labeled' && contains(github.event.label.name, 'performance') + run: | + git clone https://x-access-token:${{ secrets.GH_TAGGING_TOKEN }}@github.com/kedro-org/pipeline-performance-test.git + + - name: Set up Python 3.11 + if: github.event.action == 'labeled' && contains(github.event.label.name, 'performance') + uses: actions/setup-python@v5 + with: + python-version: '3.11' + + - name: Install dependencies + if: github.event.action == 'labeled' && contains(github.event.label.name, 'performance') + run: | + pip install kedro + pip install uv + cd pipeline-performance-test/performance-test + pip install -r requirements.txt + + - name: Run performance test and capture time for latest Kedro release + if: github.event.action == 'labeled' && contains(github.event.label.name, 'performance') + run: | + cd pipeline-performance-test/performance-test + total_time_release=0.0 + for i in {1..10}; do + { time kedro run; } 2> release_time_output.txt + real_time_release=$(grep real release_time_output.txt | awk '{print $2}' | sed 's/[^0-9.]//g') + total_time_release=$(echo "$total_time_release + $real_time_release" | bc) + done + average_time_release=$(echo "scale=3; $total_time_release / 10" | bc) + echo "average_time_release=${average_time_release}" >> $GITHUB_ENV + + - name: Pull specific branch from Kedro and install it + if: github.event.action == 'labeled' && contains(github.event.label.name, 'performance') + run: | + git clone --branch ${{ github.event.pull_request.head.ref }} https://github.com/kedro-org/kedro.git + cd kedro + make install + + - name: Run performance test and capture time for specific Kedro branch + if: github.event.action == 'labeled' && contains(github.event.label.name, 'performance') + run: | + cd pipeline-performance-test/performance-test + total_time_branch=0.0 + for i in {1..10}; do + { time kedro run --params=hook_delay=0,dataset_load_delay=0,file_save_delay=0; } 2> branch_time_output.txt + real_time_branch=$(grep real branch_time_output.txt | awk '{print $2}' | sed 's/[^0-9.]//g') + 
total_time_branch=$(echo "$total_time_branch + $real_time_branch" | bc) + done + average_time_branch=$(echo "scale=3; $total_time_branch / 10" | bc) + echo "average_time_branch=${average_time_branch}" >> $GITHUB_ENV + + - name: Extract and format real time from release version + if: github.event.action == 'labeled' && contains(github.event.label.name, 'performance') + run: echo "Average elapsed time for Kedro release version test was ${average_time_release} seconds" + + - name: Extract and format real time from specific branch + if: github.event.action == 'labeled' && contains(github.event.label.name, 'performance') + run: echo "Average elapsed time for specific branch test was ${average_time_branch} seconds" + + - name: Clean up time output files + if: github.event.action == 'labeled' && contains(github.event.label.name, 'performance') + run: | + rm pipeline-performance-test/performance-test/release_time_output.txt pipeline-performance-test/performance-test/branch_time_output.txt
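
For contributors who want to sanity-check the numbers locally before labelling a pull request, a rough Python equivalent of the workflow's timing loop might look like the sketch below. It only approximates what the workflow does with the shell `time` builtin, and it assumes `kedro` is on the `PATH` and that the script is run from inside the performance-test project:

```python
import subprocess
import time

RUNS = 10
timings = []

for _ in range(RUNS):
    start = time.perf_counter()
    # Mirrors the workflow's `kedro run` invocation; append --params=... to
    # match the branch-specific step if needed.
    subprocess.run(["kedro", "run"], check=True, capture_output=True)
    timings.append(time.perf_counter() - start)

average = sum(timings) / RUNS
print(f"Average elapsed time over {RUNS} runs: {average:.3f} seconds")
```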