diff --git a/.github/workflows/ci-notebooks.yml b/.github/workflows/ci-notebooks.yml index d4089c9a940..4fe193c5803 100644 --- a/.github/workflows/ci-notebooks.yml +++ b/.github/workflows/ci-notebooks.yml @@ -7,7 +7,6 @@ on: - .github/workflows/ci-notebooks.yml - setup.cfg - setup.py - - requirements/env_hdk.yml - requirements/env_unidist_linux.yml concurrency: # Cancel other jobs in the same branch. We don't care whether CI passes @@ -25,16 +24,11 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - execution: [pandas_on_ray, pandas_on_dask, pandas_on_unidist, hdk_on_native] + execution: [pandas_on_ray, pandas_on_dask, pandas_on_unidist] steps: - uses: actions/checkout@v4 - uses: ./.github/actions/python-only - if: matrix.execution != 'hdk_on_native' && matrix.execution != 'pandas_on_unidist' - - uses: ./.github/actions/mamba-env - with: - environment-file: requirements/env_hdk.yml - activate-environment: modin_on_hdk - if: matrix.execution == 'hdk_on_native' + if: matrix.execution != 'pandas_on_unidist' - uses: ./.github/actions/mamba-env with: environment-file: requirements/env_unidist_linux.yml @@ -49,29 +43,29 @@ jobs: # replace modin with . in the tutorial requirements file for `pandas_on_ray` and # `pandas_on_dask` since we need Modin built from sources - run: sed -i 's/modin/./g' examples/tutorial/jupyter/execution/${{ matrix.execution }}/requirements.txt - if: matrix.execution != 'hdk_on_native' && matrix.execution != 'pandas_on_unidist' + if: matrix.execution != 'pandas_on_unidist' # install dependencies required for notebooks execution for `pandas_on_ray` and `pandas_on_dask` # Override modin-spreadsheet install for now - run: | pip install -r examples/tutorial/jupyter/execution/${{ matrix.execution }}/requirements.txt pip install git+https://github.com/modin-project/modin-spreadsheet.git@49ffd89f683f54c311867d602c55443fb11bf2a5 - if: matrix.execution != 'hdk_on_native' && matrix.execution != 'pandas_on_unidist' - # Build Modin from sources for `hdk_on_native` and `pandas_on_unidist` + if: matrix.execution != 'pandas_on_unidist' + # Build Modin from sources for `pandas_on_unidist` - run: pip install -e . - if: matrix.execution == 'hdk_on_native' || matrix.execution == 'pandas_on_unidist' + if: matrix.execution == 'pandas_on_unidist' # install test dependencies # NOTE: If you are changing the set of packages installed here, make sure that # the dev requirements match them. 
- run: pip install pytest pytest-cov black flake8 flake8-print flake8-no-implicit-concat - if: matrix.execution != 'hdk_on_native' && matrix.execution != 'pandas_on_unidist' + if: matrix.execution != 'pandas_on_unidist' - run: pip install flake8-print jupyter nbformat nbconvert - if: matrix.execution == 'hdk_on_native' || matrix.execution == 'pandas_on_unidist' + if: matrix.execution == 'pandas_on_unidist' - run: pip list - if: matrix.execution != 'hdk_on_native' && matrix.execution != 'pandas_on_unidist' + if: matrix.execution != 'pandas_on_unidist' - run: | conda info conda list - if: matrix.execution == 'hdk_on_native' || matrix.execution == 'pandas_on_unidist' + if: matrix.execution == 'pandas_on_unidist' # setup kernel configuration for `pandas_on_unidist` execution with mpi backend - run: python examples/tutorial/jupyter/execution/${{ matrix.execution }}/setup_kernel.py if: matrix.execution == 'pandas_on_unidist' diff --git a/.github/workflows/ci-required.yml b/.github/workflows/ci-required.yml index 7939afbf17f..f575945cd85 100644 --- a/.github/workflows/ci-required.yml +++ b/.github/workflows/ci-required.yml @@ -90,19 +90,6 @@ jobs: modin/experimental/pandas/__init__.py - run: python scripts/doc_checker.py modin/core/storage_formats/base - run: python scripts/doc_checker.py modin/core/storage_formats/pandas - - run: | - python scripts/doc_checker.py \ - modin/experimental/core/execution/native/implementations/hdk_on_native/dataframe \ - modin/experimental/core/execution/native/implementations/hdk_on_native/io \ - modin/experimental/core/execution/native/implementations/hdk_on_native/partitioning \ - modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_algebra.py \ - modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_builder.py \ - modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_serializer.py \ - modin/experimental/core/execution/native/implementations/hdk_on_native/df_algebra.py \ - modin/experimental/core/execution/native/implementations/hdk_on_native/expr.py \ - modin/experimental/core/execution/native/implementations/hdk_on_native/hdk_worker.py \ - - run: python scripts/doc_checker.py modin/experimental/core/storage_formats/hdk - - run: python scripts/doc_checker.py modin/experimental/core/execution/native/implementations/hdk_on_native/interchange/dataframe_protocol - run: python scripts/doc_checker.py modin/experimental/batch/pipeline.py - run: python scripts/doc_checker.py modin/logging diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 7cdf175cc7d..68ea8eaac3e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -150,62 +150,6 @@ jobs: runner: python -m pytest --execution=${{ matrix.execution }} - uses: ./.github/actions/upload-coverage - test-hdk: - needs: [lint-flake8] - runs-on: ubuntu-latest - defaults: - run: - shell: bash -l {0} - env: - MODIN_EXPERIMENTAL: "True" - MODIN_ENGINE: "native" - MODIN_STORAGE_FORMAT: "hdk" - name: Test HDK storage format, Python 3.9 - services: - moto: - image: motoserver/moto - ports: - - 5000:5000 - env: - AWS_ACCESS_KEY_ID: foobar_key - AWS_SECRET_ACCESS_KEY: foobar_secret - steps: - - uses: actions/checkout@v4 - - uses: ./.github/actions/mamba-env - with: - environment-file: requirements/env_hdk.yml - activate-environment: modin_on_hdk - - name: Install HDF5 - run: sudo apt update && sudo apt install -y libhdf5-dev - - run: python -m pytest modin/tests/core/storage_formats/hdk/test_internals.py - - run: python -m 
pytest modin/tests/experimental/hdk_on_native/test_init.py - - run: python -m pytest modin/tests/experimental/hdk_on_native/test_dataframe.py - - run: python -m pytest modin/tests/experimental/hdk_on_native/test_utils.py - - run: python -m pytest modin/tests/pandas/test_io.py --verbose - - run: python -m pytest modin/tests/interchange/dataframe_protocol/test_general.py - - run: python -m pytest modin/tests/interchange/dataframe_protocol/hdk - - run: python -m pytest modin/tests/experimental/test_sql.py - - run: python -m pytest modin/tests/pandas/test_concat.py - - run: python -m pytest modin/tests/pandas/dataframe/test_binary.py - - run: python -m pytest modin/tests/pandas/dataframe/test_reduce.py - - run: python -m pytest modin/tests/pandas/dataframe/test_join_sort.py - - run: python -m pytest modin/tests/pandas/test_general.py - - run: python -m pytest modin/tests/pandas/dataframe/test_indexing.py - - run: python -m pytest modin/tests/pandas/test_series.py - - run: python -m pytest modin/tests/pandas/dataframe/test_map_metadata.py - - run: python -m pytest modin/tests/pandas/dataframe/test_window.py - - run: python -m pytest modin/tests/pandas/dataframe/test_default.py - - run: python examples/docker/modin-hdk/census-hdk.py examples/data/census_1k.csv -no-ml - - run: python examples/docker/modin-hdk/nyc-taxi-hdk.py examples/data/nyc-taxi_1k.csv - - run: | - python examples/docker/modin-hdk/plasticc-hdk.py \ - examples/data/plasticc_training_set_1k.csv \ - examples/data/plasticc_test_set_1k.csv \ - examples/data/plasticc_training_set_metadata_1k.csv \ - examples/data/plasticc_test_set_metadata_1k.csv \ - -no-ml - - uses: ./.github/actions/upload-coverage - test-asv-benchmarks: if: github.event_name == 'pull_request' needs: [lint-flake8] @@ -249,18 +193,6 @@ jobs: # check pure pandas MODIN_ASV_USE_IMPL=pandas asv run --quick --dry-run --python=same --strict --show-stderr --launch-method=spawn \ -b ^benchmarks -b ^io | tee benchmarks.log - - # TODO: Remove manual environment creation after fix https://github.com/airspeed-velocity/asv/issues/1310 - conda deactivate - mamba env create -f ../requirements/env_hdk.yml - conda activate modin_on_hdk - pip install asv==0.5.1 - pip install .. 
- - # check Modin on HDK - MODIN_ENGINE=native MODIN_STORAGE_FORMAT=hdk MODIN_EXPERIMENTAL=true asv run --quick --dry-run --python=same --strict --show-stderr \ - --launch-method=forkserver --python=same --config asv.conf.hdk.json \ - -b ^hdk | tee benchmarks.log else echo "Benchmarks did not run, no changes detected" fi @@ -374,7 +306,6 @@ jobs: - run: | mpiexec -n 1 -genv AWS_ACCESS_KEY_ID foobar_key -genv AWS_SECRET_ACCESS_KEY foobar_secret \ python -m pytest modin/tests/experimental/test_io_exp.py - - run: mpiexec -n 1 python -m pytest modin/tests/experimental/test_sql.py - run: mpiexec -n 1 python -m pytest modin/tests/interchange/dataframe_protocol/test_general.py - run: mpiexec -n 1 python -m pytest modin/tests/interchange/dataframe_protocol/pandas/test_protocol.py - run: | @@ -495,8 +426,6 @@ jobs: if: matrix.engine == 'python' || matrix.test_task == 'group_4' - run: python -m pytest modin/tests/experimental/test_io_exp.py if: matrix.engine == 'python' || matrix.test_task == 'group_4' - - run: python -m pytest modin/tests/experimental/test_sql.py - if: matrix.os == 'ubuntu' && (matrix.engine == 'python' || matrix.test_task == 'group_4') - run: python -m pytest modin/tests/interchange/dataframe_protocol/test_general.py if: matrix.engine == 'python' || matrix.test_task == 'group_4' - run: python -m pytest modin/tests/interchange/dataframe_protocol/pandas/test_protocol.py @@ -703,7 +632,7 @@ jobs: - run: python -m pytest modin/tests/experimental/spreadsheet/test_general.py merge-coverage-artifacts: - needs: [test-internals, test-api-and-no-engine, test-defaults, test-hdk, test-all-unidist, test-all, test-experimental, test-sanity] + needs: [test-internals, test-api-and-no-engine, test-defaults, test-all-unidist, test-all, test-experimental, test-sanity] if: always() # we need to run it regardless of some job being skipped, like in PR runs-on: ubuntu-latest defaults: diff --git a/.github/workflows/codeql/codeql-config.yml b/.github/workflows/codeql/codeql-config.yml index 5232f85dbee..ccc4dcce1fc 100644 --- a/.github/workflows/codeql/codeql-config.yml +++ b/.github/workflows/codeql/codeql-config.yml @@ -2,5 +2,3 @@ name: "Modin CodeQL config" paths: - modin/** -paths-ignore: - - modin/tests/experimental/hdk_on_native/** # TODO: fix unhashable list error, see #5227 diff --git a/CODEOWNERS b/CODEOWNERS index 4d640a8ecff..cd14349b949 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -1,8 +1,3 @@ # These owners will be the default owners for everything in # the repo unless a later match takes precedence, * @modin-project/modin-core @devin-petersohn @mvashishtha @RehanSD @YarShev @vnlitvinov @anmyachev @dchigarev - -# These owners will review everything in the HDK engine component -# of Modin. 
-/modin/experimental/core/storage_formats/hdk/** @modin-project/modin-hdk @aregm @gshimansky @ienkovich @Garra1980 @YarShev @vnlitvinov @anmyachev @dchigarev @AndreyPavlenko -/modin/experimental/core/execution/native/implementations/hdk_on_native/** @modin-project/modin-hdk @aregm @gshimansky @ienkovich @Garra1980 @YarShev @vnlitvinov @anmyachev @dchigarev @AndreyPavlenko diff --git a/README.md b/README.md index d6d17e90ba0..120504b882e 100644 --- a/README.md +++ b/README.md @@ -85,8 +85,8 @@ Modin automatically detects which engine(s) you have installed and uses that for #### From conda-forge Installing from [conda forge](https://github.com/conda-forge/modin-feedstock) using `modin-all` -will install Modin and four engines: [Ray](https://github.com/ray-project/ray), [Dask](https://github.com/dask/dask), -[MPI through unidist](https://github.com/modin-project/unidist) and [HDK](https://github.com/intel-ai/hdk). +will install Modin and three engines: [Ray](https://github.com/ray-project/ray), [Dask](https://github.com/dask/dask) and +[MPI through unidist](https://github.com/modin-project/unidist). ```bash conda install -c conda-forge modin-all @@ -98,7 +98,6 @@ Each engine can also be installed individually (and also as a combination of sev conda install -c conda-forge modin-ray # Install Modin dependencies and Ray. conda install -c conda-forge modin-dask # Install Modin dependencies and Dask. conda install -c conda-forge modin-mpi # Install Modin dependencies and MPI through unidist. -conda install -c conda-forge modin-hdk # Install Modin dependencies and HDK. ``` **Note:** Since Modin 0.30.0 we use a reduced set of Ray dependencies: `ray-core` instead of `ray-default`. @@ -118,13 +117,13 @@ conda install -n base conda-libmamba-solver and then use it during istallation either like: ```bash -conda install -c conda-forge modin-ray modin-hdk --experimental-solver=libmamba +conda install -c conda-forge modin-ray --experimental-solver=libmamba ``` or starting from conda 22.11 and libmamba solver 22.12 versions: ```bash -conda install -c conda-forge modin-ray modin-hdk --solver=libmamba +conda install -c conda-forge modin-ray --solver=libmamba ``` #### Choosing a Compute Engine @@ -158,8 +157,6 @@ modin_cfg.Engine.put('unidist') # Modin will use Unidist unidist_cfg.Backend.put('mpi') # Unidist will use MPI backend ``` -Check [this Modin docs section](https://modin.readthedocs.io/en/latest/development/using_hdk.html) for HDK engine setup. - _Note: You should not change the engine after your first operation with Modin as it will result in undefined behavior._ #### Which engine should I use? @@ -168,11 +165,6 @@ On Linux, MacOS, and Windows you can install and use either Ray, Dask or MPI thr to use either of these engines as Modin abstracts away all of the complexity, so feel free to pick either! -On Linux you also can choose [HDK](https://modin.readthedocs.io/en/latest/development/using_hdk.html), which is an experimental -engine based on [HDK](https://github.com/intel-ai/hdk) and included in the -[Intel® Distribution of Modin](https://software.intel.com/content/www/us/en/develop/tools/oneapi/components/distribution-of-modin.html), -which is a part of [Intel® oneAPI AI Analytics Toolkit (AI Kit)](https://www.intel.com/content/www/us/en/developer/tools/oneapi/ai-analytics-toolkit.html). - ### Pandas API Coverage

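For reference (not part of the patch itself), a minimal sketch of how an engine is now selected with the remaining backends, mirroring the `modin.config` calls shown in the README hunk above; the engine names and the unidist MPI backend setting come from that hunk, while the sample DataFrame and printed result are purely illustrative:

```python
# Hypothetical usage sketch: pick one of the remaining engines (Ray, Dask,
# or MPI through unidist) before the first Modin operation, since the README
# notes the engine must not be changed after the first operation.
import modin.config as modin_cfg

modin_cfg.Engine.put("dask")  # or "ray"; use "unidist" together with the MPI backend below

# For the unidist engine only, the MPI backend must also be selected:
# import unidist.config as unidist_cfg
# unidist_cfg.Backend.put("mpi")

import modin.pandas as pd

df = pd.DataFrame({"a": [1, 2, 3]})  # illustrative data, not taken from the patch
print(df.sum())
```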
diff --git a/asv_bench/asv.conf.hdk.json b/asv_bench/asv.conf.hdk.json deleted file mode 100644 index 32745c30bf0..00000000000 --- a/asv_bench/asv.conf.hdk.json +++ /dev/null @@ -1,60 +0,0 @@ -{ - // The version of the config file format. Do not change, unless - // you know what you are doing. - "version": 1, - - // The name of the project being benchmarked - "project": "modin", - - // The project's homepage - "project_url": "https://modin.readthedocs.io/", - - // The URL or local path of the source code repository for the - // project being benchmarked - "repo": "..", - - // List of branches to benchmark. If not provided, defaults to "master" - // (for git) or "default" (for mercurial). - "branches": ["main"], - - // Customizable commands for building, installing, and - // uninstalling the project. See asv.conf.json documentation. - // - "install_command": ["in-dir={env_dir} python -mpip install {wheel_file}"], - - // The tool to use to create environments. May be "conda", - // "virtualenv" or other value depending on the plugins in use. - // If missing or the empty string, the tool will be automatically - // determined by looking for tools on the PATH environment - // variable. - "environment_type": "conda", - - // timeout in seconds for installing any dependencies in environment - // defaults to 10 min - "install_timeout": 6000, - - // the base URL to show a commit for the project. - "show_commit_url": "https://github.com/modin-project/modin/commit/", - - // The Pythons you'd like to test against. If not provided, defaults - // to the current version of Python used to run `asv`. - "pythons": ["3.9"], - - // The list of conda channel names to be searched for benchmark - // dependency packages in the specified order - "conda_channels": ["conda-forge", "defaults"], - - "conda_environment_file": "../requirements/env_hdk.yml", - - // The directory (relative to the current directory) to cache the Python - // environments in. If not provided, defaults to "env" - "env_dir": ".asv/env", - - // The directory (relative to the current directory) that raw benchmark - // results are stored in. If not provided, defaults to "results". - "results_dir": ".asv/results", - - // The directory (relative to the current directory) that the html tree - // should be written to. If not provided, defaults to "html". - "html_dir": ".asv/html", -} diff --git a/asv_bench/benchmarks/benchmarks.py b/asv_bench/benchmarks/benchmarks.py index 8c5e73be214..8e1dac27296 100644 --- a/asv_bench/benchmarks/benchmarks.py +++ b/asv_bench/benchmarks/benchmarks.py @@ -36,7 +36,6 @@ random_columns, random_string, translator_groupby_ngroups, - trigger_import, ) @@ -675,7 +674,6 @@ class TimeIndexing: def setup(self, shape, indexer_type): self.df = generate_dataframe("int", *shape, RAND_LOW, RAND_HIGH) - trigger_import(self.df) self.indexer = self.indexer_getters[indexer_type](self.df) if isinstance(self.indexer, (IMPL.Series, IMPL.DataFrame)): @@ -701,7 +699,6 @@ class TimeIndexingColumns: def setup(self, shape): self.df = generate_dataframe("int", *shape, RAND_LOW, RAND_HIGH) - trigger_import(self.df) self.numeric_indexer = [0, 1] self.labels_indexer = self.df.columns[self.numeric_indexer].tolist() diff --git a/asv_bench/benchmarks/hdk/__init__.py b/asv_bench/benchmarks/hdk/__init__.py deleted file mode 100644 index 6173c498dc6..00000000000 --- a/asv_bench/benchmarks/hdk/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -# Licensed to Modin Development Team under one or more contributor license agreements. 
-# See the NOTICE file distributed with this work for additional information regarding -# copyright ownership. The Modin Development Team licenses this file to you under the -# Apache License, Version 2.0 (the "License"); you may not use this file except in -# compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under -# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific language -# governing permissions and limitations under the License. - -"""Modin on HDK storage format benchmarks.""" diff --git a/asv_bench/benchmarks/hdk/benchmarks.py b/asv_bench/benchmarks/hdk/benchmarks.py deleted file mode 100644 index d7868fe5a27..00000000000 --- a/asv_bench/benchmarks/hdk/benchmarks.py +++ /dev/null @@ -1,473 +0,0 @@ -# Licensed to Modin Development Team under one or more contributor license agreements. -# See the NOTICE file distributed with this work for additional information regarding -# copyright ownership. The Modin Development Team licenses this file to you under the -# Apache License, Version 2.0 (the "License"); you may not use this file except in -# compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under -# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific language -# governing permissions and limitations under the License. - -"""General Modin on HDK storage format benchmarks.""" - -import numpy as np -import pandas - -from ..benchmarks import TimeIndexing as TimeIndexingPandasExecution -from ..benchmarks import TimeIndexingColumns as TimeIndexingColumnsPandasExecution -from ..utils import ( - GROUPBY_NGROUPS, - IMPL, - RAND_HIGH, - RAND_LOW, - execute, - gen_nan_data, - generate_dataframe, - get_benchmark_shapes, - random_booleans, - random_columns, - translator_groupby_ngroups, - trigger_import, -) - - -class TimeJoin: - param_names = ["shape", "how", "is_equal_keys"] - params = [ - get_benchmark_shapes("hdk.TimeJoin"), - ["left", "inner"], - [True, False], - ] - - def setup(self, shape, how, is_equal_keys): - self.df1, self.df2 = ( - generate_dataframe( - "int", - *frame_shape, - RAND_LOW, - RAND_HIGH, - cache_prefix=f"{i}-th_frame_to_join", - ) - for i, frame_shape in enumerate((shape, shape)) - ) - - if is_equal_keys: - # When the frames have default indices to join on: RangeIndex(frame_length), - # HDK backend performs join on the internal meta-column called 'rowid'. - # There is a bug in the engine that makes such joins fail. To avoid joining - # on the meta-column we explicitly specify a non-default index to join on. - # https://github.com/modin-project/modin/issues/3740 - # Generating a new object for every index to avoid shared index objects: - self.df1.index = pandas.RangeIndex(1, len(self.df1) + 1) - self.df2.index = pandas.RangeIndex(1, len(self.df2) + 1) - else: - # Intersection rate indicates how many common join-keys `self.df1` - # and `self.df2` have in terms of percentage. 
- indices_intersection_rate = 0.5 - - frame_length = len(self.df1) - intersect_size = int(frame_length * indices_intersection_rate) - - intersect_part = np.random.choice( - self.df1.index, size=intersect_size, replace=False - ) - non_intersect_part = np.arange( - start=frame_length, stop=frame_length + (frame_length - intersect_size) - ) - new_index = np.concatenate([intersect_part, non_intersect_part]) - - np.random.shuffle(new_index) - self.df1.index = new_index - - trigger_import(self.df1, self.df2) - - def time_join(self, shape, how, is_equal_keys): - # join dataframes on index to get the predictable shape - execute(self.df1.join(self.df2, how=how, lsuffix="left_")) - - -class TimeMerge: - param_names = ["shapes", "how"] - params = [ - get_benchmark_shapes("hdk.TimeMerge"), - ["left", "inner"], - ] - - def setup(self, shapes, how): - gen_unique_key = how == "inner" - self.dfs = [] - for i, shape in enumerate(shapes): - self.dfs.append( - generate_dataframe( - "int", - *shape, - RAND_LOW, - RAND_HIGH, - gen_unique_key=gen_unique_key, - cache_prefix=f"{i}-th_frame_to_merge", - ) - ) - trigger_import(*self.dfs) - - def time_merge(self, shapes, how): - # merging dataframes by index is not supported, therefore we merge by column - # with arbitrary values, which leads to an unpredictable form of the operation result; - # it's need to get the predictable shape to get consistent performance results - execute( - self.dfs[0].merge( - self.dfs[1], on="col1", how=how, suffixes=("left_", "right_") - ) - ) - - -class TimeBinaryOpDataFrame: - param_names = ["shape", "binary_op"] - params = [ - get_benchmark_shapes("hdk.TimeBinaryOpDataFrame"), - ["mul"], - ] - - def setup(self, shape, binary_op): - self.df1 = generate_dataframe("int", *shape, RAND_LOW, RAND_HIGH) - trigger_import(self.df1) - self.op = getattr(self.df1, binary_op) - - def time_mul_scalar(self, shape, binary_op): - execute(self.op(2)) - - def time_mul_dataframes(self, shape, binary_op): - execute(self.op(self.df1)) - - -class TimeBinaryOpSeries: - param_names = ["shape", "binary_op"] - params = [ - get_benchmark_shapes("hdk.TimeBinaryOpSeries"), - ["mul"], - ] - - def setup(self, shape, binary_op): - self.series = generate_dataframe("int", *shape, RAND_LOW, RAND_HIGH)["col0"] - trigger_import(self.series) - self.op = getattr(self.series, binary_op) - - def time_mul_series(self, shape, binary_op): - execute(self.op(self.series)) - - -class TimeArithmetic: - param_names = ["shape"] - params = [get_benchmark_shapes("hdk.TimeArithmetic")] - - def setup(self, shape): - self.df = generate_dataframe("int", *shape, RAND_LOW, RAND_HIGH) - trigger_import(self.df) - - def time_sum(self, shape): - execute(self.df.sum()) - - def time_median(self, shape): - execute(self.df.median()) - - def time_nunique(self, shape): - execute(self.df.nunique()) - - def time_apply(self, shape): - execute(self.df.apply(lambda df: df.sum())) - - def time_mean(self, shape): - execute(self.df.mean()) - - -class TimeSortValues: - param_names = ["shape", "columns_number", "ascending_list"] - params = [ - get_benchmark_shapes("hdk.TimeSortValues"), - [1, 5], - [False, True], - ] - - def setup(self, shape, columns_number, ascending_list): - self.df = generate_dataframe("int", *shape, RAND_LOW, RAND_HIGH) - trigger_import(self.df) - self.columns = random_columns(self.df.columns, columns_number) - self.ascending = ( - random_booleans(columns_number) - if ascending_list - else bool(random_booleans(1)[0]) - ) - - def time_sort_values(self, shape, columns_number, 
ascending_list): - execute(self.df.sort_values(self.columns, ascending=self.ascending)) - - -class TimeDrop: - param_names = ["shape", "drop_ncols"] - params = [ - get_benchmark_shapes("hdk.TimeDrop"), - [1, 0.8], - ] - - def setup(self, shape, drop_ncols): - self.df = generate_dataframe("int", *shape, RAND_LOW, RAND_HIGH) - trigger_import(self.df) - drop_count = ( - int(len(self.df.axes[1]) * drop_ncols) - if isinstance(drop_ncols, float) - else drop_ncols - ) - self.labels = self.df.axes[1][:drop_count] - - def time_drop(self, shape, drop_ncols): - execute(self.df.drop(self.labels, axis=1)) - - -class TimeHead: - param_names = ["shape", "head_count"] - params = [ - get_benchmark_shapes("hdk.TimeHead"), - [5, 0.8], - ] - - def setup(self, shape, head_count): - self.df = generate_dataframe("int", *shape, RAND_LOW, RAND_HIGH) - trigger_import(self.df) - self.head_count = ( - int(head_count * len(self.df.index)) - if isinstance(head_count, float) - else head_count - ) - - def time_head(self, shape, head_count): - execute(self.df.head(self.head_count)) - - -class TimeFillna: - param_names = ["value_type", "shape", "limit"] - params = [ - ["scalar", "dict"], - get_benchmark_shapes("hdk.TimeFillna"), - [None], - ] - - def setup(self, value_type, shape, limit): - self.df = gen_nan_data(*shape) - columns = self.df.columns - trigger_import(self.df) - - value = self.create_fillna_value(value_type, columns) - limit = int(limit * shape[0]) if limit else None - self.kw = {"value": value, "limit": limit} - - def time_fillna(self, value_type, shape, limit): - execute(self.df.fillna(**self.kw)) - - @staticmethod - def create_fillna_value(value_type: str, columns: list): - if value_type == "scalar": - value = 18.19 - elif value_type == "dict": - value = {k: i * 1.23 for i, k in enumerate(columns)} - else: - assert False - return value - - -class BaseTimeValueCounts: - def setup(self, shape, ngroups=5, subset=1): - ngroups = translator_groupby_ngroups(ngroups, shape) - self.df, self.subset = generate_dataframe( - "int", - *shape, - RAND_LOW, - RAND_HIGH, - groupby_ncols=subset, - count_groups=ngroups, - ) - trigger_import(self.df) - - -class TimeValueCountsDataFrame(BaseTimeValueCounts): - param_names = ["shape", "ngroups", "subset"] - params = [ - get_benchmark_shapes("hdk.TimeValueCountsDataFrame"), - GROUPBY_NGROUPS, - [2, 10], - ] - - def time_value_counts(self, *args, **kwargs): - execute(self.df.value_counts(subset=self.subset)) - - -class TimeValueCountsSeries(BaseTimeValueCounts): - param_names = ["shape", "ngroups"] - params = [ - get_benchmark_shapes("hdk.TimeValueCountsSeries"), - GROUPBY_NGROUPS, - ] - - def setup(self, shape, ngroups): - super().setup(shape, ngroups, subset=1) - self.series = self.df[self.subset[0]] - trigger_import(self.series) - - def time_value_counts(self, shape, ngroups): - execute(self.series.value_counts()) - - -class TimeIndexing(TimeIndexingPandasExecution): - params = [ - get_benchmark_shapes("hdk.TimeIndexing"), - *TimeIndexingPandasExecution.params[1:], - ] - - -class TimeIndexingColumns(TimeIndexingColumnsPandasExecution): - params = [ - get_benchmark_shapes("hdk.TimeIndexing"), - *TimeIndexingColumnsPandasExecution.params[1:], - ] - - -class TimeResetIndex: - param_names = ["shape", "drop", "level"] - params = [ - get_benchmark_shapes("hdk.TimeResetIndex"), - [False, True], - [None, "level_1"], - ] - - def setup(self, shape, drop, level): - if not drop or level == "level_1": - raise NotImplementedError - - self.df = generate_dataframe("int", *shape, RAND_LOW, 
RAND_HIGH) - if level: - index = IMPL.MultiIndex.from_product( - [self.df.index[: shape[0] // 2], ["bar", "foo"]], - names=["level_1", "level_2"], - ) - self.df.index = index - trigger_import(self.df) - - def time_reset_index(self, shape, drop, level): - execute(self.df.reset_index(drop=drop, level=level)) - - -class TimeAstype: - param_names = ["shape", "dtype", "astype_ncolumns"] - params = [ - get_benchmark_shapes("hdk.TimeAstype"), - ["float64"], - ["one", "all"], - ] - - def setup(self, shape, dtype, astype_ncolumns): - self.df = generate_dataframe("int", *shape, RAND_LOW, RAND_HIGH) - trigger_import(self.df) - self.astype_arg = self.create_astype_arg(dtype, astype_ncolumns) - - def time_astype(self, shape, dtype, astype_ncolumns): - execute(self.df.astype(self.astype_arg)) - - @staticmethod - def create_astype_arg(dtype, astype_ncolumns): - if astype_ncolumns == "all": - astype_arg = dtype - elif astype_ncolumns == "one": - astype_arg = {"col1": dtype} - else: - assert False - return astype_arg - - -class TimeDescribe: - param_names = ["shape"] - params = [get_benchmark_shapes("hdk.TimeDescribe")] - - def setup(self, shape): - self.df = generate_dataframe("int", *shape, RAND_LOW, RAND_HIGH) - trigger_import(self.df) - - def time_describe(self, shape): - execute(self.df.describe()) - - -class TimeProperties: - param_names = ["shape"] - params = [get_benchmark_shapes("hdk.TimeProperties")] - - def setup(self, shape): - self.df = generate_dataframe("int", *shape, RAND_LOW, RAND_HIGH) - trigger_import(self.df) - - def time_shape(self, shape): - return self.df.shape - - def time_columns(self, shape): - return self.df.columns - - def time_index(self, shape): - return self.df.index - - -class BaseTimeGroupBy: - def setup(self, shape, ngroups=5, groupby_ncols=1): - ngroups = translator_groupby_ngroups(ngroups, shape) - self.df, self.groupby_columns = generate_dataframe( - "int", - *shape, - RAND_LOW, - RAND_HIGH, - groupby_ncols, - count_groups=ngroups, - ) - # correct while we use 'col*' like name for non-groupby columns - # and 'groupby_col*' like name for groupby columns - self.non_groupby_columns = self.df.columns[:-groupby_ncols] - trigger_import(self.df) - - -class TimeGroupByDefaultAggregations(BaseTimeGroupBy): - param_names = ["shape", "ngroups"] - params = [ - get_benchmark_shapes("hdk.TimeGroupByDefaultAggregations"), - GROUPBY_NGROUPS, - ] - - def time_groupby_count(self, *args, **kwargs): - execute(self.df.groupby(by=self.groupby_columns).count()) - - def time_groupby_sum(self, *args, **kwargs): - execute(self.df.groupby(by=self.groupby_columns).sum()) - - -class TimeGroupByMultiColumn(BaseTimeGroupBy): - param_names = ["shape", "ngroups", "groupby_ncols"] - params = [ - get_benchmark_shapes("hdk.TimeGroupByMultiColumn"), - GROUPBY_NGROUPS, - [6], - ] - - def time_groupby_sum(self, *args, **kwargs): - execute(self.df.groupby(by=self.groupby_columns).sum()) - - def time_groupby_agg_mean(self, *args, **kwargs): - execute(self.df.groupby(by=self.groupby_columns).agg("mean")) - - def time_groupby_agg_nunique(self, *args, **kwargs): - execute(self.df.groupby(by=self.groupby_columns).agg("nunique")) - - def time_groupby_agg_mean_dict(self, *args, **kwargs): - execute( - self.df.groupby(by=self.groupby_columns).agg( - {col: "mean" for col in self.non_groupby_columns} - ) - ) - - -from ..utils import setup # noqa: E402, F401 diff --git a/asv_bench/benchmarks/hdk/io.py b/asv_bench/benchmarks/hdk/io.py deleted file mode 100644 index 6a5fd22926d..00000000000 --- 
a/asv_bench/benchmarks/hdk/io.py +++ /dev/null @@ -1,65 +0,0 @@ -# Licensed to Modin Development Team under one or more contributor license agreements. -# See the NOTICE file distributed with this work for additional information regarding -# copyright ownership. The Modin Development Team licenses this file to you under the -# Apache License, Version 2.0 (the "License"); you may not use this file except in -# compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under -# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific language -# governing permissions and limitations under the License. - -"""IO Modin on HDK storage format benchmarks.""" - -from ..io.csv import TimeReadCsvTrueFalseValues # noqa: F401 -from ..utils import ( - ASV_USE_IMPL, - IMPL, - RAND_HIGH, - RAND_LOW, - generate_dataframe, - get_benchmark_shapes, - get_shape_id, - trigger_import, -) - - -class TimeReadCsvNames: - shapes = get_benchmark_shapes("hdk.TimeReadCsvNames") - param_names = ["shape"] - params = [shapes] - - def setup_cache(self, test_filename="io_test_file_csv_names"): - # filenames with a metadata of saved dataframes - cache = {} - for shape in self.shapes: - df = generate_dataframe("int", *shape, RAND_LOW, RAND_HIGH, impl="pandas") - file_id = get_shape_id(shape) - cache[file_id] = ( - f"{test_filename}_{file_id}.csv", - df.columns.to_list(), - df.dtypes.to_dict(), - ) - df.to_csv(cache[file_id][0], index=False) - return cache - - def setup(self, cache, shape): - # ray init - if ASV_USE_IMPL == "modin": - IMPL.DataFrame([]) - file_id = get_shape_id(shape) - self.filename, self.names, self.dtype = cache[file_id] - - def time_read_csv_names(self, cache, shape): - df = IMPL.read_csv( - self.filename, - names=self.names, - header=0, - dtype=self.dtype, - ) - trigger_import(df) - - -from ..utils import setup # noqa: E402, F401 diff --git a/asv_bench/benchmarks/hdk/utils.py b/asv_bench/benchmarks/hdk/utils.py deleted file mode 100644 index 2f1ab799139..00000000000 --- a/asv_bench/benchmarks/hdk/utils.py +++ /dev/null @@ -1,14 +0,0 @@ -# Licensed to Modin Development Team under one or more contributor license agreements. -# See the NOTICE file distributed with this work for additional information regarding -# copyright ownership. The Modin Development Team licenses this file to you under the -# Apache License, Version 2.0 (the "License"); you may not use this file except in -# compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under -# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific language -# governing permissions and limitations under the License. 
- -"""The module contains the functionality that is used when benchmarking Modin commits on HDK storage format.""" diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index 65752ba6b70..a672ee37cc7 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -15,7 +15,6 @@ from ..utils import ( ASV_USE_IMPL, - ASV_USE_STORAGE_FORMAT, IMPL, RAND_HIGH, RAND_LOW, @@ -78,7 +77,6 @@ def time_true_false_values(self, test_filenames, shape): true_values=["Yes", "true"], false_values=["No", "false"], ), - trigger_hdk_import=ASV_USE_STORAGE_FORMAT == "hdk", ) diff --git a/asv_bench/benchmarks/utils/__init__.py b/asv_bench/benchmarks/utils/__init__.py index 9a8fa7bcf11..a59971f2d6c 100644 --- a/asv_bench/benchmarks/utils/__init__.py +++ b/asv_bench/benchmarks/utils/__init__.py @@ -27,7 +27,6 @@ random_string, setup, translator_groupby_ngroups, - trigger_import, ) from .compatibility import ASV_USE_IMPL, ASV_USE_STORAGE_FORMAT from .data_shapes import GROUPBY_NGROUPS, RAND_HIGH, RAND_LOW, get_benchmark_shapes @@ -51,6 +50,5 @@ "random_columns", "random_booleans", "translator_groupby_ngroups", - "trigger_import", "setup", ] diff --git a/asv_bench/benchmarks/utils/common.py b/asv_bench/benchmarks/utils/common.py index e6b02bb8192..15967cc176b 100644 --- a/asv_bench/benchmarks/utils/common.py +++ b/asv_bench/benchmarks/utils/common.py @@ -28,12 +28,7 @@ import modin.pandas -from .compatibility import ( - ASV_DATASET_SIZE, - ASV_USE_ENGINE, - ASV_USE_IMPL, - ASV_USE_STORAGE_FORMAT, -) +from .compatibility import ASV_DATASET_SIZE, ASV_USE_ENGINE, ASV_USE_IMPL from .data_shapes import RAND_HIGH, RAND_LOW POSSIBLE_IMPL = { @@ -417,26 +412,7 @@ def random_booleans(number: int) -> list: return list(np.random.choice([True, False], size=number)) -def trigger_import(*dfs): - """ - Trigger import execution for DataFrames obtained by HDK engine. - - Parameters - ---------- - *dfs : iterable - DataFrames to trigger import. - """ - if ASV_USE_STORAGE_FORMAT != "hdk" or ASV_USE_IMPL == "pandas": - return - - for df in dfs: - df._query_compiler._modin_frame.force_import() - - -def execute( - df: Union[modin.pandas.DataFrame, pandas.DataFrame], - trigger_hdk_import: bool = False, -): +def execute(df: Union[modin.pandas.DataFrame, pandas.DataFrame]): """ Make sure the calculations are finished. @@ -444,16 +420,8 @@ def execute( ---------- df : modin.pandas.DataFrame or pandas.Datarame DataFrame to be executed. - trigger_hdk_import : bool, default: False - Whether `df` are obtained by import with HDK engine. 
""" - if trigger_hdk_import: - trigger_import(df) - return if ASV_USE_IMPL == "modin": - if ASV_USE_STORAGE_FORMAT == "hdk": - df._query_compiler._modin_frame._execute() - return partitions = df._query_compiler._modin_frame._partitions.flatten() mgr_cls = df._query_compiler._modin_frame._partition_mgr_cls if len(partitions) and hasattr(mgr_cls, "wait_partitions"): diff --git a/asv_bench/benchmarks/utils/compatibility.py b/asv_bench/benchmarks/utils/compatibility.py index 0fa4bf93e68..a581ff88e65 100644 --- a/asv_bench/benchmarks/utils/compatibility.py +++ b/asv_bench/benchmarks/utils/compatibility.py @@ -46,5 +46,5 @@ assert ASV_USE_IMPL in ("modin", "pandas") assert ASV_DATASET_SIZE in ("big", "small") -assert ASV_USE_ENGINE in ("ray", "dask", "python", "native", "unidist") -assert ASV_USE_STORAGE_FORMAT in ("pandas", "hdk") +assert ASV_USE_ENGINE in ("ray", "dask", "python", "unidist") +assert ASV_USE_STORAGE_FORMAT in ("pandas") diff --git a/asv_bench/benchmarks/utils/data_shapes.py b/asv_bench/benchmarks/utils/data_shapes.py index 989ae80f50f..9aa5fccb8fc 100644 --- a/asv_bench/benchmarks/utils/data_shapes.py +++ b/asv_bench/benchmarks/utils/data_shapes.py @@ -16,15 +16,10 @@ import json import os -from .compatibility import ASV_DATASET_SIZE, ASV_USE_STORAGE_FORMAT +from .compatibility import ASV_DATASET_SIZE RAND_LOW = 0 -# use a small number of unique values in Github actions to avoid OOM (mostly related to HDK) -RAND_HIGH = ( - 1_000_000_000 - if ASV_USE_STORAGE_FORMAT == "hdk" and ASV_DATASET_SIZE == "Big" - else 100 -) +RAND_HIGH = 100 BINARY_OP_DATA_SIZE = { "big": [ @@ -57,19 +52,6 @@ } -HDK_BINARY_OP_DATA_SIZE = { - "big": [[[500_000, 20], [1_000_000, 10]]], - "small": [[[10_000, 20], [25_000, 10]]], -} -HDK_UNARY_OP_DATA_SIZE = { - "big": [[1_000_000, 10]], - "small": [[10_000, 10]], -} -HDK_SERIES_DATA_SIZE = { - "big": [[10_000_000, 1]], - "small": [[100_000, 1]], -} - DEFAULT_GROUPBY_NGROUPS = { "big": [100, "huge_amount_groups"], "small": [5], @@ -160,37 +142,6 @@ ), ] -_DEFAULT_HDK_CONFIG_T = [ - ( - HDK_UNARY_OP_DATA_SIZE[ASV_DATASET_SIZE], - [ - "hdk.TimeJoin", - "hdk.TimeBinaryOpDataFrame", - "hdk.TimeArithmetic", - "hdk.TimeSortValues", - "hdk.TimeDrop", - "hdk.TimeHead", - "hdk.TimeFillna", - "hdk.TimeIndexing", - "hdk.TimeResetIndex", - "hdk.TimeAstype", - "hdk.TimeDescribe", - "hdk.TimeProperties", - "hdk.TimeGroupByDefaultAggregations", - "hdk.TimeGroupByMultiColumn", - "hdk.TimeValueCountsDataFrame", - "hdk.TimeReadCsvNames", - ], - ), - ( - HDK_BINARY_OP_DATA_SIZE[ASV_DATASET_SIZE], - ["hdk.TimeMerge", "hdk.TimeAppend"], - ), - ( - HDK_SERIES_DATA_SIZE[ASV_DATASET_SIZE], - ["hdk.TimeBinaryOpSeries", "hdk.TimeValueCountsSeries"], - ), -] DEFAULT_CONFIG = {} DEFAULT_CONFIG["MergeCategoricals"] = ( [[10_000, 2]] if ASV_DATASET_SIZE == "big" else [[1_000, 2]] @@ -201,7 +152,7 @@ DEFAULT_CONFIG["TimeReplace"] = ( [[10_000, 2]] if ASV_DATASET_SIZE == "big" else [[1_000, 2]] ) -for config in (_DEFAULT_CONFIG_T, _DEFAULT_HDK_CONFIG_T): +for config in (_DEFAULT_CONFIG_T,): for _shape, _names in config: DEFAULT_CONFIG.update({_name: _shape for _name in _names}) @@ -255,6 +206,5 @@ def get_benchmark_shapes(bench_id: str): CONFIG_FROM_FILE = json.load(_f) if CONFIG_FROM_FILE and bench_id in CONFIG_FROM_FILE: - # example: "hdk.TimeReadCsvNames": [[5555, 55], [3333, 33]] return CONFIG_FROM_FILE[bench_id] return DEFAULT_CONFIG[bench_id] diff --git a/asv_bench/test/test_utils.py b/asv_bench/test/test_utils.py index 5f5c89917ca..3c2bee6cf5c 100644 --- 
a/asv_bench/test/test_utils.py +++ b/asv_bench/test/test_utils.py @@ -31,12 +31,12 @@ [ # binary shapes [[10, 10], [15, 15]], - [[11, 11], [13, 13]] + [[11, 11], [13, 13]], ], [ # unary shapes [11, 11], - [13, 13] + [13, 13], ], ], ), diff --git a/docker/hdk-engine-dev.dockerfile b/docker/hdk-engine-dev.dockerfile deleted file mode 100644 index cb811c2b9bd..00000000000 --- a/docker/hdk-engine-dev.dockerfile +++ /dev/null @@ -1,61 +0,0 @@ -# Licensed to Modin Development Team under one or more contributor license agreements. -# See the NOTICE file distributed with this work for additional information regarding -# copyright ownership. The Modin Development Team licenses this file to you under the -# Apache License, Version 2.0 (the "License"); you may not use this file except in -# compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under -# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific language -# governing permissions and limitations under the License. - -FROM ubuntu:18.04 -ENV http_proxy ${http_proxy} -ENV https_proxy ${https_proxy} - -RUN apt-get update --yes \ - && apt-get install wget git --yes \ - # - # cleanup - && apt-get autoremove -y \ - && apt-get clean -y \ - && rm -rf /var/lib/apt/lists/* - -ENV CONDA_DIR ${HOME}/miniconda - -SHELL ["/bin/bash", "--login", "-c"] - -RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O /tmp/miniconda3.sh \ - && bash /tmp/miniconda3.sh -b -p "${CONDA_DIR}" -f -u \ - && "${CONDA_DIR}/bin/conda" init bash \ - && rm -f /tmp/miniconda3.sh \ - && echo ". '${CONDA_DIR}/etc/profile.d/conda.sh'" >> "${HOME}/.profile" - -# define `gh_username` can be useful in case of using modin fork -ARG gh_username=modin-project -ARG modin_dir="${HOME}/modin" - -# Clone modin repo -RUN mkdir "$modin_dir" \ - && git clone "https://github.com/$gh_username/modin.git" "$modin_dir" \ - && cd "$modin_dir" \ - && git remote add upstream "https://github.com/modin-project/modin.git" - -# install modin dependencies -RUN conda env create -n modin -f "$modin_dir/requirements/env_hdk.yml" - -# install modin -RUN cd "$modin_dir" \ - && conda activate modin \ - && pip install -e . 
--no-deps - -# setup environments for modin on hdk engine work -ENV MODIN_ENGINE "native" -ENV MODIN_STORAGE_FORMAT "hdk" -ENV MODIN_EXPERIMENTAL "true" - -# To work properly, run the following command in the container: -# conda activate modin -WORKDIR $modin_dir diff --git a/docs/conf.py b/docs/conf.py index b32f68b7d84..99355183995 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -29,8 +29,6 @@ def noop_decorator(*args, **kwargs): for mod_name in ( "cudf", "cupy", - "pyhdk", - "pyhdk.hdk", "xgboost", "unidist", "unidist.config", @@ -45,19 +43,6 @@ def noop_decorator(*args, **kwargs): sys.modules["cudf"].DataFrame = type("DataFrame", (object,), {}) if not hasattr(sys.modules["cupy"], "ndarray"): sys.modules["cupy"].ndarray = type("ndarray", (object,), {}) -if not hasattr(sys.modules["pyhdk"], "PyDbEngine"): - sys.modules["pyhdk"].PyDbEngine = type("PyDbEngine", (object,), {}) -if not hasattr(sys.modules["pyhdk.hdk"], "HDK"): - sys.modules["pyhdk.hdk"].HDK = type("HDK", (object,), {}) -if not hasattr(sys.modules["pyhdk.hdk"], "QueryNode"): - sys.modules["pyhdk.hdk"].QueryNode = type("QueryNode", (object,), {}) -if not hasattr(sys.modules["pyhdk.hdk"], "ExecutionResult"): - sys.modules["pyhdk.hdk"].ExecutionResult = type("ExecutionResult", (object,), {}) -if not hasattr(sys.modules["pyhdk.hdk"], "RelAlgExecutor"): - sys.modules["pyhdk.hdk"].RelAlgExecutor = type("RelAlgExecutor", (object,), {}) -if not hasattr(sys.modules["pyhdk"], "__version__"): - # Show all known pyhdk config options in documentation - sys.modules["pyhdk"].__version__ = "999" if not hasattr(sys.modules["xgboost"], "Booster"): sys.modules["xgboost"].Booster = type("Booster", (object,), {}) if not hasattr(sys.modules["unidist"], "remote"): diff --git a/docs/development/architecture.rst b/docs/development/architecture.rst index 96ca8554743..7e3d49c8032 100644 --- a/docs/development/architecture.rst +++ b/docs/development/architecture.rst @@ -56,7 +56,7 @@ For the simplicity the other execution systems - Dask and MPI are omitted and on on a selected storage format and mapping or compiling the Dataframe Algebra DAG to and actual execution sequence. * Storage formats module is responsible for mapping the abstract operation to an actual executor call, e.g. pandas, - HDK, custom format. + custom format. * Orchestration subsystem is responsible for spawning and controlling the actual execution environment for the selected execution. It spawns the actual nodes, fires up the execution environment, e.g. Ray, monitors the state of executors and provides telemetry @@ -65,8 +65,7 @@ Component View -------------- User queries which perform data transformation, data ingress or data egress pass through the Modin components -detailed below. The path the query takes is mostly similar across execution systems, with some minor exceptions like -:doc:`HdkOnNative `. +detailed below. The path the query takes is mostly similar across execution systems. Data Transformation ''''''''''''''''''' @@ -224,10 +223,6 @@ documentation page on :doc:`contributing `. - Uses native python execution - mainly used for debugging. - The storage format is `pandas` and the in-memory partition type is a pandas DataFrame. - For more information on the execution path, see the :doc:`pandas on Python ` page. -- :doc:`HDK on Native ` (experimental) - - Uses HDK as an engine. - - The storage format is `hdk` and the in-memory partition type is a pyarrow Table. When defaulting to pandas, the pandas DataFrame is used. 
- - For more information on the execution path, see the :doc:`HDK on Native ` page. - cuDF on Ray (experimental) - Uses the Ray_ execution framework. - The storage format is `cudf` and the in-memory partition type is a cuDF DataFrame. @@ -247,10 +242,8 @@ following figure illustrates this concept. .. image:: /img/block_partitions_diagram.png :align: center -Currently, the main in-memory format of each partition is a `pandas DataFrame`_ (:doc:`pandas storage format `). -:doc:`HDK ` -and cuDF are also supported as experimental in-memory formats in Modin. - +Currently, the main in-memory format of each partition is a +`pandas DataFrame`_ (:doc:`pandas storage format `). Index ----- @@ -324,17 +317,10 @@ details. The documentation covers most modules, with more docs being added every │ │ │ └─── :doc:`pandas ` │ ├─── :doc:`experimental ` │ │ ├───core - │ │ │ ├───execution - │ │ │ │ └───native - │ │ │ │ └───implementations - │ │ │ │ └─── :doc:`hdk_on_native ` - │ │ │ ├─── :doc:`storage_formats ` - | │ │ | └───:doc:`hdk ` | | | └─── :doc:`io ` │ │ ├─── :doc:`pandas ` │ │ ├─── :doc:`sklearn ` │ │ ├───spreadsheet - │ │ ├───sql │ │ ├─── :doc:`xgboost ` │ │ └─── :doc:`batch ` │ └───pandas diff --git a/docs/development/index.rst b/docs/development/index.rst index 5e257501857..bfcc1e9d8b6 100644 --- a/docs/development/index.rst +++ b/docs/development/index.rst @@ -11,7 +11,6 @@ Development using_pandas_on_dask using_pandas_on_python using_pandas_on_mpi - using_hdk .. meta:: :description lang=en: diff --git a/docs/development/using_hdk.rst b/docs/development/using_hdk.rst deleted file mode 100644 index 86a6b39ffc5..00000000000 --- a/docs/development/using_hdk.rst +++ /dev/null @@ -1,55 +0,0 @@ -HDK -=== - -This section describes usage related documents for the HDK-based engine of Modin. - -This engine uses the HDK_ library to obtain high single-node scalability for -specific set of dataframe operations. -To enable this engine you can set the following environment variable: - -.. code-block:: bash - - export MODIN_STORAGE_FORMAT=hdk - -or use it in your code: - -.. code-block:: python - - import modin.config as cfg - cfg.StorageFormat.put('hdk') - -Since HDK is run through its native engine, Modin automatically sets ``MODIN_ENGINE=Native`` and you might not specify it explicitly. -If for some reasons ``Native`` engine is explicitly set using ``modin.config`` or -``MODIN_ENGINE`` environment variable, make sure you also tell Modin that -``Experimental`` mode is turned on (``export MODIN_EXPERIMENTAL=true`` or -``cfg.IsExperimental.put(True)``) otherwise the following error occurs: - -.. code-block:: bash - - FactoryNotFoundError: HDK on Native is only accessible through the experimental API. - Run `import modin.experimental.pandas as pd` to use HDK on Native. - - -.. note:: - If you encounter ``LLVM ERROR: inconsistency in registered CommandLine options`` error when using HDK, - please refer to the respective section in :doc:`Troubleshooting ` page to avoid the issue. - - -Running on a GPU ----------------- - -Prerequisites: - -* HDK's GPU mode is currently supported on Linux and Intel GPU only. -* HDK supports Gen9 architecture and higher (including Xe & Arc). -* HDK's GPU mode requires proper driver installation. Follow this guide_ to set up your system. Make sure to install the compute runtime packages: ``intel-opencl-icd``, ``intel-level-zero-gpu``, ``level-zero``. -* Make sure your GPU is visible and accessible. - -.. 
note:: - You can use ``hwinfo`` and ``clinfo`` utilities to verify the driver installation and device accessibility. - -HDK supports a heterogeneous execution mode (experimental) that is disabled by default in Modin. Starting with pyHDK version 0.7 Modin can run the workload on Intel GPU. -Run on a GPU via ``MODIN_HDK_LAUNCH_PARAMETERS="cpu_only=0" python ``. - -.. _HDK: https://github.com/intel-ai/hdk -.. _guide: https://dgpu-docs.intel.com/driver/installation.html \ No newline at end of file diff --git a/docs/flow/modin/config.rst b/docs/flow/modin/config.rst index 78220506e4b..d060d9498a3 100644 --- a/docs/flow/modin/config.rst +++ b/docs/flow/modin/config.rst @@ -38,17 +38,17 @@ API. import os - # Setting `MODIN_STORAGE_FORMAT` environment variable. + # Setting `MODIN_ENGINE` environment variable. # Also can be set outside the script. - os.environ["MODIN_STORAGE_FORMAT"] = "Hdk" + os.environ["MODIN_ENGINE"] = "Dask" import modin.config import modin.pandas as pd - # Checking initially set `StorageFormat` config, - # which corresponds to `MODIN_STORAGE_FORMAT` environment + # Checking initially set `Engine` config, + # which corresponds to `MODIN_ENGINE` environment # variable - print(modin.config.StorageFormat.get()) # prints 'Hdk' + print(modin.config.Engine.get()) # prints 'Dask' # Checking default value of `NPartitions` print(modin.config.NPartitions.get()) # prints '8' diff --git a/docs/flow/modin/core/storage_formats/index.rst b/docs/flow/modin/core/storage_formats/index.rst index 1d98af0d8dc..782721dc591 100644 --- a/docs/flow/modin/core/storage_formats/index.rst +++ b/docs/flow/modin/core/storage_formats/index.rst @@ -7,10 +7,7 @@ of objects that are stored in the partitions of the selected Core Modin Datafram The base storage format in Modin is pandas. In that format, Modin Dataframe operates with partitions that hold ``pandas.DataFrame`` objects. Pandas is the most natural storage format -since high-level DataFrame objects mirror its API, however, Modin's storage formats are not -limited to the objects that conform to pandas API. There is format that are able to store -even instances of SQL-like databases (:doc:`HDK storage format `) -inside Modin Dataframe's partitions. +since high-level DataFrame objects mirror its API. The storage format + execution engine (Ray, Dask, etc.) form the execution backend. The Query Compiler (QC) converts high-level pandas API calls to queries that are understood @@ -73,5 +70,3 @@ This module houses submodules of all of the stable storage formats: - :doc:`Base module ` contains an abstract query compiler class which defines common API. - :doc:`Pandas module ` contains query compiler and text parsers for pandas storage format. - cuDF module contains query compiler and text parsers for cuDF storage format. - -You can find more in the :doc:`experimental section `. diff --git a/docs/flow/modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_algebra.rst b/docs/flow/modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_algebra.rst deleted file mode 100644 index 8eca8961853..00000000000 --- a/docs/flow/modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_algebra.rst +++ /dev/null @@ -1,98 +0,0 @@ -CalciteBaseNode -""""""""""""""" - -Public API ----------- - -.. autoclass:: modin.experimental.core.execution.native.implementations.hdk_on_native.calcite_algebra.CalciteBaseNode - :members: - -CalciteScanNode -""""""""""""""" - -Public API ----------- - -.. 
autoclass:: modin.experimental.core.execution.native.implementations.hdk_on_native.calcite_algebra.CalciteScanNode - :members: - -CalciteProjectionNode -""""""""""""""""""""" - -Public API ----------- - -.. autoclass:: modin.experimental.core.execution.native.implementations.hdk_on_native.calcite_algebra.CalciteProjectionNode - :members: - -CalciteFilterNode -""""""""""""""""" - -Public API ----------- - -.. autoclass:: modin.experimental.core.execution.native.implementations.hdk_on_native.calcite_algebra.CalciteFilterNode - :members: - -CalciteAggregateNode -"""""""""""""""""""" - -Public API ----------- - -.. autoclass:: modin.experimental.core.execution.native.implementations.hdk_on_native.calcite_algebra.CalciteAggregateNode - :members: - -CalciteCollation -"""""""""""""""" - -Public API ----------- - -.. autoclass:: modin.experimental.core.execution.native.implementations.hdk_on_native.calcite_algebra.CalciteCollation - :members: - -CalciteSortNode -""""""""""""""" - -Public API ----------- - -.. autoclass:: modin.experimental.core.execution.native.implementations.hdk_on_native.calcite_algebra.CalciteSortNode - :members: - -CalciteJoinNode -""""""""""""""" - -Public API ----------- - -.. autoclass:: modin.experimental.core.execution.native.implementations.hdk_on_native.calcite_algebra.CalciteJoinNode - :members: - -CalciteUnionNode -"""""""""""""""" - -Public API ----------- - -.. autoclass:: modin.experimental.core.execution.native.implementations.hdk_on_native.calcite_algebra.CalciteUnionNode - :members: - -CalciteInputRefExpr -""""""""""""""""""" - -Public API ----------- - -.. autoclass:: modin.experimental.core.execution.native.implementations.hdk_on_native.calcite_algebra.CalciteInputRefExpr - :members: - -CalciteInputIdxExpr -""""""""""""""""""" - -Public API ----------- - -.. autoclass:: modin.experimental.core.execution.native.implementations.hdk_on_native.calcite_algebra.CalciteInputIdxExpr - :members: diff --git a/docs/flow/modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_builder.rst b/docs/flow/modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_builder.rst deleted file mode 100644 index ee27d090741..00000000000 --- a/docs/flow/modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_builder.rst +++ /dev/null @@ -1,8 +0,0 @@ -CalciteBuilder -"""""""""""""" - -Public API ----------- - -.. autoclass:: modin.experimental.core.execution.native.implementations.hdk_on_native.calcite_builder.CalciteBuilder - :members: diff --git a/docs/flow/modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_serializer.rst b/docs/flow/modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_serializer.rst deleted file mode 100644 index dbe8d1425aa..00000000000 --- a/docs/flow/modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_serializer.rst +++ /dev/null @@ -1,8 +0,0 @@ -CalciteSerializer -""""""""""""""""" - -Public API ----------- - -.. 
autoclass:: modin.experimental.core.execution.native.implementations.hdk_on_native.calcite_serializer.CalciteSerializer - :members: diff --git a/docs/flow/modin/experimental/core/execution/native/implementations/hdk_on_native/dataframe.rst b/docs/flow/modin/experimental/core/execution/native/implementations/hdk_on_native/dataframe.rst deleted file mode 100644 index b379b83ed1a..00000000000 --- a/docs/flow/modin/experimental/core/execution/native/implementations/hdk_on_native/dataframe.rst +++ /dev/null @@ -1,8 +0,0 @@ -HdkOnNativeDataframe -"""""""""""""""""""" - -Public API ----------- - -.. autoclass:: modin.experimental.core.execution.native.implementations.hdk_on_native.dataframe.dataframe.HdkOnNativeDataframe - :members: diff --git a/docs/flow/modin/experimental/core/execution/native/implementations/hdk_on_native/df_algebra.rst b/docs/flow/modin/experimental/core/execution/native/implementations/hdk_on_native/df_algebra.rst deleted file mode 100644 index bf66d28ae99..00000000000 --- a/docs/flow/modin/experimental/core/execution/native/implementations/hdk_on_native/df_algebra.rst +++ /dev/null @@ -1,116 +0,0 @@ -TransformMapper -""""""""""""""" - -Public API ----------- - -.. autoclass:: modin.experimental.core.execution.native.implementations.hdk_on_native.df_algebra.TransformMapper - :members: - -FrameMapper -""""""""""" - -Public API ----------- - -.. autoclass:: modin.experimental.core.execution.native.implementations.hdk_on_native.df_algebra.FrameMapper - :members: - -InputMapper -""""""""""" - -Public API ----------- - -.. autoclass:: modin.experimental.core.execution.native.implementations.hdk_on_native.df_algebra.InputMapper - :members: - -DFAlgNode -""""""""" - -Public API ----------- - -.. autoclass:: modin.experimental.core.execution.native.implementations.hdk_on_native.df_algebra.DFAlgNode - :members: - -FrameNode -""""""""" - -Public API ----------- - -.. autoclass:: modin.experimental.core.execution.native.implementations.hdk_on_native.df_algebra.FrameNode - :members: - -MaskNode -"""""""" - -Public API ----------- - -.. autoclass:: modin.experimental.core.execution.native.implementations.hdk_on_native.df_algebra.MaskNode - :members: - -GroupbyAggNode -"""""""""""""" - -Public API ----------- - -.. autoclass:: modin.experimental.core.execution.native.implementations.hdk_on_native.df_algebra.GroupbyAggNode - :members: - -TransformNode -""""""""""""" - -Public API ----------- - -.. autoclass:: modin.experimental.core.execution.native.implementations.hdk_on_native.df_algebra.TransformNode - :members: - -JoinNode -"""""""" - -Public API ----------- - -.. autoclass:: modin.experimental.core.execution.native.implementations.hdk_on_native.df_algebra.JoinNode - :members: - -UnionNode -""""""""" - -Public API ----------- - -.. autoclass:: modin.experimental.core.execution.native.implementations.hdk_on_native.df_algebra.UnionNode - :members: - -SortNode -"""""""" - -Public API ----------- - -.. autoclass:: modin.experimental.core.execution.native.implementations.hdk_on_native.df_algebra.SortNode - :members: - -FilterNode -"""""""""" - -Public API ----------- - -.. autoclass:: modin.experimental.core.execution.native.implementations.hdk_on_native.df_algebra.FilterNode - :members: - -Utilities -""""""""" - -Public API ----------- - -.. autofunction:: modin.experimental.core.execution.native.implementations.hdk_on_native.df_algebra.translate_exprs_to_base -.. 
autofunction:: modin.experimental.core.execution.native.implementations.hdk_on_native.df_algebra.replace_frame_in_exprs diff --git a/docs/flow/modin/experimental/core/execution/native/implementations/hdk_on_native/expr.rst b/docs/flow/modin/experimental/core/execution/native/implementations/hdk_on_native/expr.rst deleted file mode 100644 index 8ded974e0a3..00000000000 --- a/docs/flow/modin/experimental/core/execution/native/implementations/hdk_on_native/expr.rst +++ /dev/null @@ -1,55 +0,0 @@ -BaseExpr -"""""""" - -Public API ----------- - -.. autoclass:: modin.experimental.core.execution.native.implementations.hdk_on_native.expr.BaseExpr - :members: - -InputRefExpr -"""""""""""" - -Public API ----------- - -.. autoclass:: modin.experimental.core.execution.native.implementations.hdk_on_native.expr.InputRefExpr - :members: - -LiteralExpr -""""""""""" - -Public API ----------- - -.. autoclass:: modin.experimental.core.execution.native.implementations.hdk_on_native.expr.LiteralExpr - :members: - -OpExpr -"""""" - -Public API ----------- - -.. autoclass:: modin.experimental.core.execution.native.implementations.hdk_on_native.expr.OpExpr - :members: - -AggregateExpr -""""""""""""" - -Public API ----------- - -.. autoclass:: modin.experimental.core.execution.native.implementations.hdk_on_native.expr.AggregateExpr - :members: - -Utilities -""""""""" - -Public API ----------- - -.. autofunction:: modin.experimental.core.execution.native.implementations.hdk_on_native.expr.is_cmp_op -.. autofunction:: modin.experimental.core.execution.native.implementations.hdk_on_native.expr.build_row_idx_filter_expr -.. autofunction:: modin.experimental.core.execution.native.implementations.hdk_on_native.expr.build_if_then_else -.. autofunction:: modin.experimental.core.execution.native.implementations.hdk_on_native.expr.build_dt_expr diff --git a/docs/flow/modin/experimental/core/execution/native/implementations/hdk_on_native/hdk_worker.rst b/docs/flow/modin/experimental/core/execution/native/implementations/hdk_on_native/hdk_worker.rst deleted file mode 100644 index c118ea49b7c..00000000000 --- a/docs/flow/modin/experimental/core/execution/native/implementations/hdk_on_native/hdk_worker.rst +++ /dev/null @@ -1,8 +0,0 @@ -HdkWorker -""""""""" - -Public API ----------- - -.. autoclass:: modin.experimental.core.execution.native.implementations.hdk_on_native.hdk_worker.HdkWorker - :members: diff --git a/docs/flow/modin/experimental/core/execution/native/implementations/hdk_on_native/index.rst b/docs/flow/modin/experimental/core/execution/native/implementations/hdk_on_native/index.rst deleted file mode 100644 index e3911d36bd0..00000000000 --- a/docs/flow/modin/experimental/core/execution/native/implementations/hdk_on_native/index.rst +++ /dev/null @@ -1,286 +0,0 @@ -:orphan: - -HdkOnNative execution -===================== - -HDK is a low-level execution library for data analytics processing. -HDK is used as a fast execution backend in Modin. The HDK library provides -a set of components for federating analytic queries to an execution backend -based on OmniSciDB. - -OmniSciDB is an open-source SQL-based relational database designed for the -massive parallelism of modern CPU and GPU hardware. Its execution engine -is built on LLVM JIT compiler. - -HDK can be embedded into an application as a python module - ``pyhdk``. This module -provides Python APIs to the HDK library. A specialized in-memory storage layer -provides an efficient way to import data in Arrow table format. 
- -`HdkOnNative` execution uses HDK for both as a storage format and for -actual data transformation. - -Relational engine limitations ------------------------------ - -Using a relational database engine implies a set of restrictions on -operations we can execute on a dataframe. - -1. We cannot handle frames that use data types not supported by OmniSciDB. - Currently, we allow only integer, float, string, and categorical data types. - -2. Column data should be homogeneous. - -3. Can only support operations that map to relational algebra. This means - most operations are supported over a single axis (axis=0) only. Non-relational - operations like transposition and pivot are not supported. - -When the unsupported data type is detected or unsupported operations is requested -it falls back to the original pandas framework. - -Partitions ----------- - -In Modin, partitioning is used to achieve high parallelism. In the case of -HDK-based execution, parallelism is provided by HDK execution -engine itself and we don't need to manage multiple partitions. -:py:class:`~modin.experimental.core.execution.native.implementations.hdk_on_native.dataframe.dataframe.HdkOnNativeDataframe` -always has a single partition. - -A partition holds data in either ``pandas.DataFrame``, ``pyarrow.Table`` or ``DbTable`` -format. ``pandas.DataFrame`` is preferred only when we detect unsupported -data type and therefore have to use ``pandas`` framework for processing. -The ``pyarrow.Table`` format is used when a ``DataFrame`` is created and until the -table is imported into HDK. When it's imported, the partition data is replaced with -a ``DbTable``. ``DbTable`` represents a table in the HDK database and provides basic -information about the table: table name, column names, shape. It also allows -exporting the data into the ``pyarrow.Table`` format. Depending on the data types, -a ``pyarrow.Table`` import/export could be performed zero-copy. A query execution -result is also returned as a ``DbTable``. - -Data Ingress ------------- - -When users import data in Modin DataFrame (from a file or from some python -object like array or dictionary) they invoke one of the ``modin.pandas.io`` -functions (to read data from a file) or use :py:class:`~modin.pandas.dataframe.DataFrame` constructor -(to create a DataFrame from an iterable object). Both of the paths lead to the -:py:class:`~modin.core.execution.dispatching.factories.dispatcher.FactoryDispatcher` -that defines a factory that handles the import query. For `HdkOnNative` -execution, the factory is accordingly -:py:class:`~modin.core.execution.dispatching.factories.factories.ExperimentalHdkOnNativeFactory`. -The factory dispatches the import query: if the data needs to be read from a file -- the query is routed to the -:py:class:`~modin.experimental.core.execution.native.implementations.hdk_on_native.io.HdkOnNativeIO` -class, that uses Arrow Framework to read the file into a PyArrow Table, the resulted -table is passed to the -:py:class:`~modin.experimental.core.storage_formats.hdk.query_compiler.DFAlgQueryCompiler`. -If the factory deals with importing a Python's iterable object, the query goes straight -into the -:py:class:`~modin.experimental.core.storage_formats.hdk.query_compiler.DFAlgQueryCompiler`. -The Query Compiler sanitizes an input object and passes it to one of the -:py:class:`~modin.experimental.core.execution.native.implementations.hdk_on_native.dataframe.dataframe.HdkOnNativeDataframe` -factory methods (``.from_*``). 
The Dataframe's build method stores the passed object into a new Dataframe's partition -and returns the resulted Dataframe, which is then wrapped into a Query Compiler, which is -wrapped into a high-level Modin DataFrame, which is returned to the user. - -.. figure:: /img/hdk/hdk_ingress.svg - :align: center - -Note that during this ingress flow, no data is actually imported to HDK. The need for -importing to HDK is decided later at the execution stage by the Modin Core Dataframe layer. -If the query requires for the data to be placed in HDK, the import is triggered. -:py:class:`~modin.experimental.core.execution.native.implementations.hdk_on_native.dataframe.dataframe.HdkOnNativeDataframe` -passes partition to import to the -:py:class:`~modin.experimental.core.execution.native.implementations.hdk_on_native.partitioning.partition_manager.HdkOnNativeDataframePartitionManager` -that extracts a partition's underlying object and sends a request to import it to HDK. -The response for the request is a unique identifier for the just imported table -at HDK, this identifier is placed in the partition. After that, the partition has -a reference to the concrete table in HDK to query, and the data is considered to be -fully imported. - -.. figure:: /img/hdk/hdk_import.svg - :align: center - -Data Transformation -------------------- - -.. figure:: /img/hdk/hdk_query_flow.svg - :align: center - -When a user calls any :py:class:`~modin.pandas.dataframe.DataFrame` API, a query -starts forming at the `API` layer to be executed at the `Execution` layer. The `API` -layer is responsible for processing the query appropriately, for example, determining -whether the final result should be a ``DataFrame`` or ``Series`` object, and -sanitizing the input to the -:py:class:`~modin.experimental.core.storage_formats.hdk.query_compiler.DFAlgQueryCompiler`, -e.g. validating a parameter from the query and defining specific intermediate values -to provide more context to the query compiler. - -The :py:class:`~modin.experimental.core.storage_formats.hdk.query_compiler.DFAlgQueryCompiler` -is responsible for reducing the query to the pre-defined Dataframe algebra operators -and triggering execution on the -:py:class:`~modin.experimental.core.execution.native.implementations.hdk_on_native.dataframe.dataframe.HdkOnNativeDataframe`. - -When the :py:class:`~modin.experimental.core.execution.native.implementations.hdk_on_native.dataframe.dataframe.HdkOnNativeDataframe` -receives a query, it determines whether the operation requires data materialization -or whether it can be performed lazily. The operation is then either appended to a -lazy computation tree or executed immediately. - -Lazy execution -"""""""""""""" - -HDK has a powerful query optimizer and an execution engine that -combines multiple operations into a single execution module. E.g. join, -filter and aggregation can be executed in a single data scan. - -To utilize this feature and reduce data transformation and transfer -overheads, all of the operations that don't require data materialization -are performed lazily. - -Lazy operations on a frame build a tree which is later translated into -a query executed by HDK. Each of the tree nodes has its input node(s) -- a frame argument(s) of the operation. When a new node is appended to the -tree, it becomes its root. The leaves of the tree are always a special node -type, whose input is an actual materialized frame to execute operations -from the tree on. - -.. 
figure:: /img/hdk/hdk_lazy_tree_example.svg - :align: center - -There are two types of trees. The first one describes operations on frames that -map to relational operations like projection, union, etc. Nodes in this tree are -derived from -:py:class:`~modin.experimental.core.execution.native.implementations.hdk_on_native.df_algebra.DFAlgNode` -class. Leaf nodes are instances of the -:py:class:`~modin.experimental.core.execution.native.implementations.hdk_on_native.df_algebra.FrameNode` -class. The second type of tree is used to describe operations on columns, including -arithmetic operations, type casts, datetime operations, etc. Nodes of this tree are derived from -:py:class:`~modin.experimental.core.execution.native.implementations.hdk_on_native.expr.BaseExpr` -class. Leaf nodes are instances of the -:py:class:`~modin.experimental.core.execution.native.implementations.hdk_on_native.expr.InputRefExpr` -class. - -Visit the corresponding sections to go through all of the types of nodes: - -* :doc:`Frame nodes ` -* :doc:`Expression nodes ` - -Execution of a computation tree -""""""""""""""""""""""""""""""" - -Frames are materialized (executed) when their data is accessed. E.g. it -happens when we try to access the frame's index or shape. There are two ways -to execute required operations: through Arrow or through HDK. - -Arrow execution -''''''''''''''' - -For simple operations which don't include actual computations, execution can use -Arrow API. We can use it to rename columns, drop columns and concatenate -frames. Arrow execution is performed if we have an arrow table in the partition -and it's preferable since it doesn't require actual data import into HDK. - -HDK execution -''''''''''''' - -To execute a query in the HDK engine we need to import data first. We should -find all leaves of an operation tree and import their Arrow tables. Partitions -with ``DbTable`` hold corresponding table names used to refer to them in -queries. - -HDK executes queries expressed in HDK-specific intermediate representation (IR) format. -It also provides components to translate SQL queries to relational algebra JSON format -which can be later optimized and translated to HDK IR. Modin generates queries in relational -algebra JSON format. These queries are optionally optimized with Apache Calcite -based optimizer provided by HDK (:py:class:`~pyhdk.sql.Calcite`) and then executed. - -Operations used by Calcite in its intermediate representation are implemented -in classes derived from -:py:class:`~modin.experimental.core.execution.native.implementations.hdk_on_native.calcite_algebra.CalciteBaseNode`. -:py:class:`~modin.experimental.core.execution.native.implementations.hdk_on_native.calcite_builder.CalciteBuilder` is used to -translate :py:class:`~modin.experimental.core.execution.native.implementations.hdk_on_native.df_algebra.DFAlgNode`-based -trees into :py:class:`~modin.experimental.core.execution.native.implementations.hdk_on_native.calcite_algebra.CalciteBaseNode`-based sequences. -It also translates :py:class:`~modin.experimental.core.execution.native.implementations.hdk_on_native.expr.BaseExpr`-based -trees by replacing :py:class:`~modin.experimental.core.execution.native.implementations.hdk_on_native.expr.InputRefExpr` -nodes with either :py:class:`~modin.experimental.core.execution.native.implementations.hdk_on_native.calcite_algebra.CalciteInputRefExpr` -or :py:class:`~modin.experimental.core.execution.native.implementations.hdk_on_native.calcite_algebra.CalciteInputIdxExpr` -depending on context. 
- -:py:class:`~modin.experimental.core.execution.native.implementations.hdk_on_native.calcite_serializer.CalciteSerializer` -is used to serialize the resulting sequence into -JSON format. This JSON becomes a query by simply adding 'execute relalg' -or 'execute calcite' prefix (the latter is used if we want to use Calcite -for additional query optimization). - -.. figure:: /img/hdk/hdk_calcite_serialization_flow.svg - :align: center - -The building of Calcite query (starting from the conversion to the Calcite Algebra and up to -the forming JSON query) is orchestrated by -:py:class:`~modin.experimental.core.execution.native.implementations.hdk_on_native.partitioning.partition_manager.HdkOnNativeDataframePartitionManager`. - -An execution result is a new table in the HDK database, that is represented by ``DbTable``, -which is used to form a new partition. This partition is assigned to the executed frame. -The frame's operation tree is replaced with -:py:class:`~modin.experimental.core.execution.native.implementations.hdk_on_native.df_algebra.FrameNode` operation. - -Rowid column and sub-queries -'''''''''''''''''''''''''''' - -A special case of an index is the default index - 0-based numeric sequence. -In our representation, such an index is represented by the absence of index columns. -If we need to access the index value we can use the virtual ``rowid`` column provided -by HDK. Unfortunately, this special column is available for physical -tables only. That means we cannot access it for a node that is not a tree leaf. -That makes us execute trees with such nodes in several steps. First, we -materialize all frames that require ``rowid`` column and only after that we can -materialize the root of the tree. - -HdkOnNative Dataframe Implementation ------------------------------------- - -Modin implements ``Dataframe``, ``PartitionManager`` and ``Partition`` classes -specific for ``HdkOnNative`` execution: - -* :doc:`HdkOnNativeDataframe ` -* :doc:`HdkOnNativeDataframePartition ` -* :doc:`HdkOnNativeDataframePartitionManager ` - -To support lazy execution Modin uses two types of trees. Operations on frames are described -by ``DFAlgNode`` based trees. Scalar computations are described by ``BaseExpr`` based tree. - -* :doc:`Frame nodes ` -* :doc:`Expression nodes ` - -Interactions with HDK engine are done using ``HdkWorker`` class. Queries use serialized -Calcite relational algebra format. Calcite algebra nodes are based on ``CalciteBaseNode`` class. -Translation is done by ``CalciteBuilder`` class. Serialization is performed by ``CalciteSerializer`` -class. - -* :doc:`CalciteBaseNode ` -* :doc:`CalciteBuilder ` -* :doc:`CalciteSerializer ` -* :doc:`HdkWorker ` - -Column name mangling -"""""""""""""""""""" - -In ``pandas.DataFrame`` columns might have names of non-string types or not allowed -in SQL (e. g. an empty string). To handle this we use an internal encoder, that -makes the names SQL-compatible. Index labels are more tricky because they might be -non-unique. Indexes are represented as regular columns, and we have to perform a -special mangling to get valid and unique column names. Demangling is done when we -transform our frame (i.e. its Arrow table) into ``pandas.DataFrame`` format. - -.. 
toctree:: - :hidden: - - dataframe - partitioning/partition - partitioning/partition_manager - df_algebra - expr - calcite_algebra - calcite_builder - calcite_serializer - hdk_worker diff --git a/docs/flow/modin/experimental/core/execution/native/implementations/hdk_on_native/partitioning/partition.rst b/docs/flow/modin/experimental/core/execution/native/implementations/hdk_on_native/partitioning/partition.rst deleted file mode 100644 index 5e6241361c9..00000000000 --- a/docs/flow/modin/experimental/core/execution/native/implementations/hdk_on_native/partitioning/partition.rst +++ /dev/null @@ -1,8 +0,0 @@ -HdkOnNativeDataframePartition -""""""""""""""""""""""""""""" - -Public API ----------- - -.. autoclass:: modin.experimental.core.execution.native.implementations.hdk_on_native.partitioning.partition.HdkOnNativeDataframePartition - :members: diff --git a/docs/flow/modin/experimental/core/execution/native/implementations/hdk_on_native/partitioning/partition_manager.rst b/docs/flow/modin/experimental/core/execution/native/implementations/hdk_on_native/partitioning/partition_manager.rst deleted file mode 100644 index 8671ff2c429..00000000000 --- a/docs/flow/modin/experimental/core/execution/native/implementations/hdk_on_native/partitioning/partition_manager.rst +++ /dev/null @@ -1,8 +0,0 @@ -HdkOnNativeDataframePartitionManager -"""""""""""""""""""""""""""""""""""" - -Public API ----------- - -.. autoclass:: modin.experimental.core.execution.native.implementations.hdk_on_native.partitioning.partition_manager.HdkOnNativeDataframePartitionManager - :members: diff --git a/docs/flow/modin/experimental/core/storage_formats/hdk/index.rst b/docs/flow/modin/experimental/core/storage_formats/hdk/index.rst deleted file mode 100644 index cc8fbd30e79..00000000000 --- a/docs/flow/modin/experimental/core/storage_formats/hdk/index.rst +++ /dev/null @@ -1,15 +0,0 @@ -HDK storage format -"""""""""""""""""" - -.. toctree:: - :hidden: - - query_compiler - -High-Level Module Overview -'''''''''''''''''''''''''' - -This module contains :py:class:`~modin.experimental.core.storage_formats.hdk.query_compiler.DFAlgQueryCompiler` -class used for lazy Dataframe based execution implementations. - -For more information about the specific of this format please visit the :doc:`implementation page `. diff --git a/docs/flow/modin/experimental/core/storage_formats/hdk/query_compiler.rst b/docs/flow/modin/experimental/core/storage_formats/hdk/query_compiler.rst deleted file mode 100644 index ce3566a1ac2..00000000000 --- a/docs/flow/modin/experimental/core/storage_formats/hdk/query_compiler.rst +++ /dev/null @@ -1,13 +0,0 @@ -DFAlgQueryCompiler -"""""""""""""""""" - -:py:class:`~modin.experimental.core.storage_formats.hdk.query_compiler.DFAlgQueryCompiler` implements -a query compiler for lazy frame. Each compiler instance holds an instance of -:py:class:`~modin.experimental.core.execution.native.implementations.hdk_on_native.dataframe.dataframe.HdkOnNativeDataframe` -which is used to build a lazy execution tree. - -Public API -'''''''''' - -.. 
autoclass:: modin.experimental.core.storage_formats.hdk.query_compiler.DFAlgQueryCompiler - :members: diff --git a/docs/flow/modin/experimental/core/storage_formats/index.rst b/docs/flow/modin/experimental/core/storage_formats/index.rst deleted file mode 100644 index 8a5213c1ea8..00000000000 --- a/docs/flow/modin/experimental/core/storage_formats/index.rst +++ /dev/null @@ -1,15 +0,0 @@ -:orphan: - -Experimental storage formats -"""""""""""""""""""""""""""" - -``modin.experimental.storage_formats`` holds experimental storage formats that are under development right now -and provides a limited set of functionality: - -* :doc:`hdk ` - - -.. toctree:: - :hidden: - - hdk/index diff --git a/docs/getting_started/examples.rst b/docs/getting_started/examples.rst index 8c06baaf65e..3eb4591aa97 100644 --- a/docs/getting_started/examples.rst +++ b/docs/getting_started/examples.rst @@ -18,9 +18,9 @@ Tutorials The following tutorials cover the basic usage of Modin. `Here `__ is a one hour video tutorial that walks through these basic exercises. -- Exercise 1: Introduction to Modin [`Source PandasOnRay `__, `Source PandasOnDask `__, `Source HdkOnNative `__] -- Exercise 2: Speed Improvements with Modin [`Source PandasOnRay `__, `Source PandasOnDask `__, `Source HdkOnNative `__] -- Exercise 3: Defaulting to pandas with Modin [`Source PandasOnRay `__, `Source PandasOnDask `__, `Source HdkOnNative `__] +- Exercise 1: Introduction to Modin [`Source PandasOnRay `__, `Source PandasOnDask `__] +- Exercise 2: Speed Improvements with Modin [`Source PandasOnRay `__, `Source PandasOnDask `__] +- Exercise 3: Defaulting to pandas with Modin [`Source PandasOnRay `__, `Source PandasOnDask `__] The following tutorials covers more advanced features in Modin: diff --git a/docs/getting_started/faq.rst b/docs/getting_started/faq.rst index aa8b8190f23..8338a77cca2 100644 --- a/docs/getting_started/faq.rst +++ b/docs/getting_started/faq.rst @@ -136,8 +136,7 @@ This can also be done with: modin_cfg.Engine.put('unidist') # Modin will use Unidist unidist_cfg.Backend.put('mpi') # Unidist will use MPI backend -We also have an experimental HDK-based engine of Modin, which you can read about on :doc:`Using HDK ` -page. We plan to support more execution engines in future. If you have a specific request, +We plan to support more execution engines in future. If you have a specific request, please post on the #feature-requests channel on our Slack_ community. How do I connect Modin to a database via `read_sql`? diff --git a/docs/getting_started/installation.rst b/docs/getting_started/installation.rst index 16c34d87791..100e8b120dc 100644 --- a/docs/getting_started/installation.rst +++ b/docs/getting_started/installation.rst @@ -25,7 +25,7 @@ To install the most recent stable release run the following: pip install -U modin # -U for upgrade in case you have an older version Modin can be used with :doc:`Ray`, :doc:`Dask`, -:doc:`Unidist`, or :doc:`HDK` engines. +:doc:`Unidist` engines. If you don't have Ray_, Dask_ or Unidist_ installed, you will need to install Modin with one of the targets: .. code-block:: bash @@ -45,7 +45,7 @@ This means that the dashboard and cluster launcher are no longer installed by de If you need those, consider installing ``ray[default]`` along with ``modin[ray]``. Modin will automatically detect which engine you have installed and use that for -scheduling computation! See below for HDK engine installation. +scheduling computation! 
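For reference, here is a minimal sketch of confirming which engine Modin picked up; it only uses the ``modin.config`` API already shown in these docs (``Engine.get()`` / ``Engine.put()`` and the ``MODIN_ENGINE`` variable), and the printed value depends on which engines are installed in your environment:

.. code-block:: python

    import modin.config as cfg

    # If the engine was not set explicitly (via MODIN_ENGINE or cfg.Engine.put),
    # Engine.get() resolves it from the installed backends, e.g. 'Ray' or 'Dask'.
    print(cfg.Engine.get())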
Release candidates """""""""""""""""" @@ -128,9 +128,7 @@ it is possible to install modin with chosen engine(s) alongside. Current options +---------------------------------+---------------------------+-----------------------------+ | modin-mpi | MPI_ through unidist_ | Linux, Windows, MacOS | +---------------------------------+---------------------------+-----------------------------+ -| modin-hdk | HDK_ | Linux | -+---------------------------------+---------------------------+-----------------------------+ -| modin-all | Dask, Ray, Unidist, HDK | Linux | +| modin-all | Dask, Ray, Unidist | Linux | +---------------------------------+---------------------------+-----------------------------+ **Note:** Since Modin 0.30.0 we use a reduced set of Ray dependencies: ``ray-core`` instead of ``ray-default``. @@ -153,7 +151,7 @@ or explicitly: .. code-block:: bash - conda install -c conda-forge modin-ray modin-dask modin-mpi modin-hdk + conda install -c conda-forge modin-ray modin-dask modin-mpi Refer to `Installing with conda`_ section of the unidist documentation for more details on how to install a specific MPI implementation to run on. @@ -169,21 +167,14 @@ Then it can be used during installation either like .. code-block:: bash - conda install -c conda-forge modin-ray modin-hdk --experimental-solver=libmamba + conda install -c conda-forge modin-ray --experimental-solver=libmamba or starting from conda 22.11 and libmamba solver 22.12 versions .. code-block:: bash - conda install -c conda-forge modin-ray modin-hdk --solver=libmamba - - -Using Intel\ |reg| Distribution of Modin -"""""""""""""""""""""""""""""""""""""""" + conda install -c conda-forge modin-ray --solver=libmamba -With ``conda`` it is also possible to install `Intel Distribution of Modin`_, a special version of Modin -that is part of Intel\ |reg| oneAPI AI Analytics Toolkit. This version of Modin is powered by :doc:`HDK` -engine that contains a bunch of optimizations for Intel hardware. More details to get started can be found in the `Intel Distribution of Modin Getting Started`_ guide. Installing from the GitHub main branch -------------------------------------- @@ -203,7 +194,7 @@ If you would like to install Modin with a specific engine, you can use ``modin[r Windows ------- -All Modin engines except :doc:`HDK` are available both on Windows and Linux as mentioned above. +All Modin engines are available both on Windows and Linux as mentioned above. Default engine on Windows is :doc:`Ray`. It is also possible to use Windows Subsystem For Linux (WSL_), but this is generally not recommended due to the limitations and poor performance of Ray on WSL, a roughly @@ -245,7 +236,6 @@ Once cloned, ``cd`` into the ``modin`` directory and use ``pip`` to install: .. _Unidist: https://github.com/modin-project/unidist .. _`Installing with pip`: https://unidist.readthedocs.io/en/latest/installation.html#installing-with-pip .. _`Installing with conda`: https://unidist.readthedocs.io/en/latest/installation.html#installing-with-conda -.. _HDK: https://github.com/intel-ai/hdk .. _`Intel Distribution of Modin`: https://software.intel.com/content/www/us/en/develop/tools/oneapi/components/distribution-of-modin.html#gs.86stqv .. _`Intel Distribution of Modin Getting Started`: https://www.intel.com/content/www/us/en/developer/articles/technical/intel-distribution-of-modin-getting-started-guide.html .. |reg| unicode:: U+000AE ..
REGISTERED SIGN diff --git a/docs/getting_started/troubleshooting.rst b/docs/getting_started/troubleshooting.rst index 75f4fc17b6f..1d98dcfd6d6 100644 --- a/docs/getting_started/troubleshooting.rst +++ b/docs/getting_started/troubleshooting.rst @@ -267,27 +267,6 @@ either by excluding the time of the first iteration from your measurements or ex Common errors ------------- -Error when using HDK engine along with ``pyarrow.gandiva``: ``LLVM ERROR: inconsistency in registered CommandLine options`` -""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" - -This can happen when you use HDK engine along with ``pyarrow.gandiva``: - -.. code-block:: python - - import modin.config as cfg - cfg.Engine.put("Native") # The engine would be imported with dlopen flags - cfg.StorageFormat.put("Hdk") - cfg.IsExperimental.put(True) - import modin.pandas as pd - import pyarrow.gandiva as gandiva # Error - # CommandLine Error: Option 'enable-vfe' registered more than once! - # LLVM ERROR: inconsistency in registered CommandLine options - # Aborted (core dumped) - -**Solution** - -Do not use HDK engine along with ``pyarrow.gandiva``. - Error when using Dask engine: ``RuntimeError: if __name__ == '__main__':`` """""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" diff --git a/docs/getting_started/why_modin/modin_vs_dask_vs_koalas.rst b/docs/getting_started/why_modin/modin_vs_dask_vs_koalas.rst index 5fb95a09223..c819db91859 100644 --- a/docs/getting_started/why_modin/modin_vs_dask_vs_koalas.rst +++ b/docs/getting_started/why_modin/modin_vs_dask_vs_koalas.rst @@ -46,7 +46,7 @@ Execution Semantics **DaskDF and Koalas make use of lazy evaluation, which means that the computation is delayed until users explicitly evaluate the results.** This mode of evaluation places a lot of optimization responsibility on the user, forcing them to think about when it would be useful to inspect the intermediate results or delay doing so. Specifically, DaskDF's API differs from pandas in that it requires users to explicitly call ``.compute()`` to materialize the result of the computation. Often if that computation corresponds to a long chain of operators, this call can take a very long time to execute. Overall, the need to explicitly trigger computation makes the API less convenient to work with, but gives DaskDF and Koalas the opportunity to perform holistic optimizations over the entire dataflow graph. However, to the best of our knowledge, neither DaskDF nor Koalas actually leverage holistic optimizations. -**Modin employs eager evaluation, like pandas.** Eager evaluation is the default mode of operation for data scientists when working with pandas in an interactive environment, such as Jupyter Notebooks. Modin reproduces this familiar behavior by performing all computations eagerly as soon as it is issued, so that users can inspect intermediate results and quickly see the results of their computations without having to wait or explicitly trigger computation. This is especially useful during interactive data analysis, where users often iterate on their dataframe workflows or build up their dataframe queries in an incremental fashion. Modin also supports lazy evaluation via the HDK engine, you can learn more about it on :doc:`HDK ` page. We also have developed techniques for `opportunistic evaluation `_ that bridges the gap between lazy and eager evaluation that will be incorporated in Modin in the future. 
+**Modin employs eager evaluation, like pandas.** Eager evaluation is the default mode of operation for data scientists when working with pandas in an interactive environment, such as Jupyter Notebooks. Modin reproduces this familiar behavior by performing all computations eagerly as soon as it is issued, so that users can inspect intermediate results and quickly see the results of their computations without having to wait or explicitly trigger computation. This is especially useful during interactive data analysis, where users often iterate on their dataframe workflows or build up their dataframe queries in an incremental fashion. We also have developed techniques for `opportunistic evaluation `_ that bridges the gap between lazy and eager evaluation that will be incorporated in Modin in the future. Ordering Semantics ------------------ diff --git a/docs/img/factory_dispatching.svg b/docs/img/factory_dispatching.svg index 17cffe10bfa..5bc51cd24fc 100644 --- a/docs/img/factory_dispatching.svg +++ b/docs/img/factory_dispatching.svg @@ -1,3 +1,4 @@ + -
[flattened SVG text from docs/img/factory_dispatching.svg: the old diagram's labels were "Modin user", "pd.read_csv", "FactoryDispatcher", "DataFrame", "QueryCompiler", "PandasOnRayFactory", "HdkOnNativeFactory", "CudfOnRayFactory", "PandasOnRayIO"; the updated diagram drops "HdkOnNativeFactory" and "CudfOnRayFactory" in favor of "PandasOnDaskFactory" and "PandasOnUnidistFactory".]
\ No newline at end of file diff --git a/docs/img/hdk/hdk_calcite_serialization_flow.svg b/docs/img/hdk/hdk_calcite_serialization_flow.svg deleted file mode 100644 index 6914ef81755..00000000000 --- a/docs/img/hdk/hdk_calcite_serialization_flow.svg +++ /dev/null @@ -1,4 +0,0 @@ - - - -
[flattened SVG text from the deleted docs/img/hdk/hdk_calcite_serialization_flow.svg: HdkOnNativeDataframe -> PartitionManager.run_execution_plan() -> CalciteBuilder.build() -> CalciteSerializer.serialize() -> JSON query for DB -> HDK Engine -> DbTable -> partition holding the resulting DbTable.]
\ No newline at end of file diff --git a/docs/img/hdk/hdk_import.svg b/docs/img/hdk/hdk_import.svg deleted file mode 100644 index 6350102ca58..00000000000 --- a/docs/img/hdk/hdk_import.svg +++ /dev/null @@ -1,4 +0,0 @@ - - - -
[flattened SVG text from the deleted docs/img/hdk/hdk_import.svg: Partition (PyArrow Table / pandas DataFrame) -> PartitionManager -> request to import a table -> HDK Engine -> DbTable -> HdkOnNativeDataframe.]
\ No newline at end of file diff --git a/docs/img/hdk/hdk_ingress.svg b/docs/img/hdk/hdk_ingress.svg deleted file mode 100644 index 9d6fdeff920..00000000000 --- a/docs/img/hdk/hdk_ingress.svg +++ /dev/null @@ -1,4 +0,0 @@ - - - -
[flattened SVG text from the deleted docs/img/hdk/hdk_ingress.svg: pandas API -> FactoryDispatcher -> ExperimentalHdkOnNativeFactory -> HdkOnNativeIO (file reads via PyArrow.IO) or DFAlgQueryCompiler -> HdkOnNativeDataframe -> Core Modin Dataframe with the passed object stored in a partition -> Modin DataFrame.]
\ No newline at end of file diff --git a/docs/img/hdk/hdk_lazy_tree_example.svg b/docs/img/hdk/hdk_lazy_tree_example.svg deleted file mode 100644 index 0c8536d3a3b..00000000000 --- a/docs/img/hdk/hdk_lazy_tree_example.svg +++ /dev/null @@ -1,4 +0,0 @@ - - - -
[flattened SVG text from the deleted docs/img/hdk/hdk_lazy_tree_example.svg: an example of appending a SortNode for df.sort_values(by="col1") to a computation tree built from frame nodes (SortNode, GroupbyAggNode, JoinNode, FrameNode) and expression nodes (AggregateExpr, OpExpr, InputRefExpr).]
\ No newline at end of file diff --git a/docs/img/hdk/hdk_query_flow.svg b/docs/img/hdk/hdk_query_flow.svg deleted file mode 100644 index 988bbffe3bc..00000000000 --- a/docs/img/hdk/hdk_query_flow.svg +++ /dev/null @@ -1,4 +0,0 @@ - - - -
[flattened SVG text from the deleted docs/img/hdk/hdk_query_flow.svg: query flow from the pandas API through DFAlgQueryCompiler, HdkOnNativeDataframe, computation tree nodes (DFAlgNode/BaseExpr subclasses), CalciteBuilder and CalciteSerializer to the HDK Engine, shown alongside the Modin PandasDataframe and base Modin Dataframe class hierarchies.]
\ No newline at end of file diff --git a/docs/supported_apis/dataframe_supported.rst b/docs/supported_apis/dataframe_supported.rst index 11a1415aa7f..2f72814fa2e 100644 --- a/docs/supported_apis/dataframe_supported.rst +++ b/docs/supported_apis/dataframe_supported.rst @@ -13,11 +13,6 @@ the method in the left column. ``Y`` stands for yes, ``N`` stands for no, ``P`` for partial (meaning some parameters may not be supported yet), and ``D`` stands for default to pandas. -.. note:: - Currently third column reflects implementation status for Ray and Dask engines. By default, support for a method - in the HDK engine could be treated as ``D`` unless ``Notes`` column contains additional information. Similarly, - by default ``Notes`` contains information about ``Ray`` and ``Dask`` engines unless ``Hdk`` is explicitly mentioned. - +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | DataFrame method | pandas Doc link | Implemented? (Y/N/P/D) | Notes for Current implementation | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ @@ -27,9 +22,6 @@ default to pandas. +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``add`` | `add`_ | Y | **Ray** and **Dask**: Shuffles data in operations | | | | | between DataFrames. | -| | | | **Hdk**: ``P``, support binary operations on | -| | | | scalars and projections of the same frame, | -| | | | otherwise ``D`` | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``add_prefix`` | `add_prefix`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ @@ -55,7 +47,7 @@ default to pandas. +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``assign`` | `assign`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ -| ``astype`` | `astype`_ | Y | **Hdk**: ``P``, ``int``<-> ``float`` supported | +| ``astype`` | `astype`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``at`` | `at`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ @@ -88,8 +80,7 @@ default to pandas. +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``corrwith`` | `corrwith`_ | D | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ -| ``count`` | `count`_ | Y | **Hdk**: ``P``, only default params supported, | -| | | | otherwise ``D`` | +| ``count`` | `count`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``cov`` | `cov`_ | P | Covariance floating point precision may slightly | | | | | differ from pandas. For ``numeric_only`` | @@ -113,16 +104,15 @@ default to pandas. 
+----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``dot`` | `dot`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ -| ``drop`` | `drop`_ | Y | **Hdk**: ``P`` since row drop unsupported | +| ``drop`` | `drop`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``droplevel`` | `droplevel`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``drop_duplicates`` | `drop_duplicates`_ | D | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ -| ``dropna`` | `dropna`_ | Y | **Hdk**: ``P`` since ``thresh`` and ``axis`` | -| | | | params unsupported | +| ``dropna`` | `dropna`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ -| ``dtypes`` | `dtypes`_ | Y | **Hdk**: ``Y`` | +| ``dtypes`` | `dtypes`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``duplicated`` | `duplicated`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ @@ -143,9 +133,7 @@ default to pandas. | ``ffill`` | `ffill`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``fillna`` | `fillna`_ | P | ``value`` parameter of type DataFrame defaults to | -| | | | pandas. **Hdk**: ``P``, params ``limit``, | -| | | | ``downcast`` and ``method`` unsupported. Also | -| | | | only ``axis = 0`` supported for now | +| | | | pandas. | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``filter`` | `filter`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ @@ -164,9 +152,6 @@ default to pandas. | ``get`` | `get`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``groupby`` | `groupby`_ | Y | Not yet optimized for all operations. | -| | | | **Hdk**: ``P``. ``count``, ``sum``, ``size``, | -| | | | ``mean``, ``nunique``, ``std``, ``skew`` | -| | | | supported, otherwise ``D`` | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``gt`` | `gt`_ | Y | See ``add`` | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ @@ -180,10 +165,9 @@ default to pandas. 
+----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``idxmin`` | `idxmin`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ -| ``iloc`` | `iloc`_ | Y | **Hdk**: ``P``, read access fully supported, | -| | | | write access: no row and 2D assignments support | +| ``iloc`` | `iloc`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ -| ``infer_objects`` | `infer_objects`_ | Y | **Hdk**: ``D`` | +| ``infer_objects`` | `infer_objects`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``info`` | `info`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ @@ -221,20 +205,15 @@ default to pandas. | ``le`` | `le`_ | Y | See ``add`` | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``loc`` | `loc`_ | P | We do not support: boolean array, callable. | -| | | | **Hdk**: ``P``, read access fully supported, | -| | | | write access: no row and 2D assignments support | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``lt`` | `lt`_ | Y | See ``add`` | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``mask`` | `mask`_ | D | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ -| ``max`` | `max`_ | Y | **Hdk**: ``P``, only default params supported, | -| | | | otherwise ``D`` | +| ``max`` | `max`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``mean`` | `mean`_ | P | Modin defaults to pandas if given the ``level`` | | | | | param. | -| | | | **Hdk**: ``P``. ``D`` for ``level``, ``axis``, | -| | | | ``skipna`` and ``numeric_only`` params | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``median`` | `median`_ | P | Modin defaults to pandas if given the ``level`` | | | | | param. | @@ -250,12 +229,8 @@ default to pandas. | | | | ``right_index=False`` or ``left_index=False`` | | | | | and ``right_index=True``. | | | | | Defaults to pandas otherwise. | -| | | | **Hdk**: ``P``, only non-index joins for | -| | | | ``how=left`` and ``how=inner`` with | -| | | | explicit `on` are supported | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ -| ``min`` | `min`_ | Y | **Hdk**: ``P``, only default params supported, | -| | | | otherwise ``D`` | +| ``min`` | `min`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``mod`` | `mod`_ | Y | See ``add`` | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ @@ -277,8 +252,7 @@ default to pandas. 
+----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``nsmallest`` | `nsmallest`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ -| ``nunique`` | `nunique`_ | Y | **Hdk**: ``P``, no support for ``axis!=0`` and | -| | | | ``dropna=False`` | +| ``nunique`` | `nunique`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``pct_change`` | `pct_change`_ | D | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ @@ -292,7 +266,7 @@ default to pandas. +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``pop`` | `pop`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ -| ``pow`` | `pow`_ | Y | See ``add``; **Hdk**: ``D`` | +| ``pow`` | `pow`_ | Y | See ``add`` | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``prod`` | `prod`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ @@ -306,7 +280,7 @@ default to pandas. +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``rank`` | `rank`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ -| ``rdiv`` | `rdiv`_ | Y | See ``add``; **Hdk**: ``D`` | +| ``rdiv`` | `rdiv`_ | Y | See ``add`` | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``reindex`` | `reindex`_ | Y | Shuffles data | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ @@ -322,13 +296,12 @@ default to pandas. +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``resample`` | `resample`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ -| ``reset_index`` | `reset_index`_ | P | **Hdk**: ``P``. 
``D`` for ``level`` parameter | -| | | | **Ray** and **Dask**: ``D`` when ``names`` or | +| ``reset_index`` | `reset_index`_ | P | **Ray** and **Dask**: ``D`` when ``names`` or | | | | | ``allow_duplicates`` is non-default | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ -| ``rfloordiv`` | `rfloordiv`_ | Y | See ``add``; **Hdk**: ``D`` | +| ``rfloordiv`` | `rfloordiv`_ | Y | See ``add`` | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ -| ``rmod`` | `rmod`_ | Y | See ``add``; **Hdk**: ``D`` | +| ``rmod`` | `rmod`_ | Y | See ``add`` | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``rmul`` | `rmul`_ | Y | See ``add`` | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ @@ -336,11 +309,11 @@ default to pandas. +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``round`` | `round`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ -| ``rpow`` | `rpow`_ | Y | See ``add``; **Hdk**: ``D`` | +| ``rpow`` | `rpow`_ | Y | See ``add`` | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ -| ``rsub`` | `rsub`_ | Y | See ``add``; **Hdk**: ``D`` | +| ``rsub`` | `rsub`_ | Y | See ``add`` | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ -| ``rtruediv`` | `rtruediv`_ | Y | See ``add``; **Hdk**: ``D`` | +| ``rtruediv`` | `rtruediv`_ | Y | See ``add`` | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``sample`` | `sample`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ @@ -353,7 +326,7 @@ default to pandas. +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``set_index`` | `set_index`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ -| ``shape`` | `shape`_ | Y | **Hdk**: ``Y`` | +| ``shape`` | `shape`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``shift`` | `shift`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ @@ -366,7 +339,7 @@ default to pandas. +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``sort_values`` | `sort_values`_ | Y | Shuffles data. 
Order of indexes that have the | | | | | same sort key is not guaranteed to be the same | -| | | | across sorts; **Hdk**: ``Y`` | +| | | | across sorts | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``sparse`` | `sparse`_ | N | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ @@ -381,10 +354,9 @@ default to pandas. +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``sub`` | `sub`_ | Y | See ``add`` | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ -| ``subtract`` | `subtract`_ | Y | See ``add``; **Hdk**: ``D`` | +| ``subtract`` | `subtract`_ | Y | See ``add`` | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ -| ``sum`` | `sum`_ | Y | **Hdk**: ``P``, only default params supported, | -| | | | otherwise ``D`` | +| ``sum`` | `sum`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``swapaxes`` | `swapaxes`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ diff --git a/docs/supported_apis/io_supported.rst b/docs/supported_apis/io_supported.rst index a44adb71abc..9fbf38b473c 100644 --- a/docs/supported_apis/io_supported.rst +++ b/docs/supported_apis/io_supported.rst @@ -13,10 +13,6 @@ the method in the left column. ``Y`` stands for yes, ``N`` stands for no, ``P`` for partial (meaning some parameters may not be supported yet), and ``D`` stands for default to pandas. -.. note:: - Currently, the second column reflects implementation status for ``Ray`` and ``Dask`` engines. By default, support for a method - in the ``Hdk`` engine could be treated as ``D`` unless ``Notes`` column contains additional information. - .. note:: Support for fully asynchronous reading has been added for the following functions: ``read_csv``, ``read_fwf``, ``read_table``, ``read_custom_text``. @@ -27,16 +23,7 @@ default to pandas. +-------------------+---------------------------------+--------------------------------------------------------+ | IO method | Modin Implementation? 
(Y/N/P/D) | Notes for Current implementation | +-------------------+---------------------------------+--------------------------------------------------------+ -| `read_csv`_ | Y | **Hdk**: ``P``, only basic cases and parameters | -| | | supported: ``filepath_or_buffer`` can be local file | -| | | only, ``sep``, ``delimiter``, ``header`` (partly) | -| | | ``names``, ``usecols``, ``dtype``, | -| | | ``true/false_values``, ``skiprows`` (partly) | -| | | ``skip_blank_lines`` (partly), ``parse_dates`` | -| | | (partly), ``compression`` (inferred automatically, | -| | | should not be specified), ``quotechar``, | -| | | ``escapechar``, ``doublequote``, | -| | | ``delim_whitespace`` | +| `read_csv`_ | Y | | +-------------------+---------------------------------+--------------------------------------------------------+ | `read_fwf`_ | Y | | +-------------------+---------------------------------+--------------------------------------------------------+ diff --git a/docs/supported_apis/series_supported.rst b/docs/supported_apis/series_supported.rst index e392dfd3043..6084b208ace 100644 --- a/docs/supported_apis/series_supported.rst +++ b/docs/supported_apis/series_supported.rst @@ -13,19 +13,12 @@ for partial (meaning some parameters may not be supported yet), and ``D`` stands default to pandas. To learn more about the implementations that default to pandas, see the related section on :doc:`Defaulting to pandas `. -.. note:: - Currently, the second column reflects implementation status for ``Ray`` and ``Dask`` engines. By default, support for a method - in the ``HDK`` engine could be treated as ``D`` unless ``Notes`` column contains additional information. Similarly, - by default ``Notes`` contains information about ``Ray`` and ``Dask`` engines unless ``Hdk`` is explicitly mentioned. - +-----------------------------+---------------------------------+----------------------------------------------------+ | Series method | Modin Implementation? (Y/N/P/D) | Notes for Current implementation | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``abs`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ -| ``add`` | Y | **Hdk**: ``P``, support binary operations on | -| | | scalars and projections of the same frame, | -| | | otherwise ``D`` | +| ``add`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``add_prefix`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ @@ -57,7 +50,7 @@ the related section on :doc:`Defaulting to pandas `. +-----------------------------+---------------------------------+----------------------------------------------------+ | ``asof`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ -| ``astype`` | Y | **Hdk**: ``P``, ``int``<->``float`` supported | +| ``astype`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``at`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ @@ -96,8 +89,7 @@ the related section on :doc:`Defaulting to pandas `. | | | available only. For other methods defaults to | | | | pandas. 
| +-----------------------------+---------------------------------+----------------------------------------------------+ -| ``count`` | Y | **Hdk**: ``P``, only default params supported, | -| | | otherwise ``D`` | +| ``count`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``cov`` | Y | Covariance floating point precision may slightly | | | | differ from pandas. | @@ -124,21 +116,19 @@ the related section on :doc:`Defaulting to pandas `. +-----------------------------+---------------------------------+----------------------------------------------------+ | ``dot`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ -| ``drop`` | Y | **Hdk**: ``D`` | +| ``drop`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``drop_duplicates`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``droplevel`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ -| ``dropna`` | Y | **Hdk**: ``P`` since ``thresh`` and ``axis`` | -| | | params unsupported | +| ``dropna`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ -| ``dt`` | Y | **Hdk**: ``P``, only ``year``, ``month``, | -| | | ``day`` and ``hour`` supported, otherwise ``D`` | +| ``dt`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``dtype`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ -| ``dtypes`` | Y | **Hdk**: ``Y`` | +| ``dtypes`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``duplicated`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ @@ -158,9 +148,7 @@ the related section on :doc:`Defaulting to pandas `. +-----------------------------+---------------------------------+----------------------------------------------------+ | ``ffill`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ -| ``fillna`` | Y | **Hdk**: ``P``, params ``limit``, | -| | | ``downcast`` and ``method`` unsupported. Also | -| | | only ``axis = 0`` supported for now | +| ``fillna`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``filter`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ @@ -188,8 +176,7 @@ the related section on :doc:`Defaulting to pandas `. +-----------------------------+---------------------------------+----------------------------------------------------+ | ``get_values`` | D | | +-----------------------------+---------------------------------+----------------------------------------------------+ -| ``groupby`` | D | **Hdk**: ``P``. 
``count``, ``sum``, ``size`` | -| | | supported, otherwise ``D`` | +| ``groupby`` | D | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``gt`` | Y | See ``add`` | +-----------------------------+---------------------------------+----------------------------------------------------+ @@ -205,14 +192,13 @@ the related section on :doc:`Defaulting to pandas `. +-----------------------------+---------------------------------+----------------------------------------------------+ | ``idxmin`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ -| ``iloc`` | Y | **Hdk**: ``P``, read access fully supported, | -| | | write access: no row and 2D assignments support | +| ``iloc`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``imag`` | D | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``index`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ -| ``infer_objects`` | Y | **Hdk**: ``D`` | +| ``infer_objects`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``interpolate`` | D | | +-----------------------------+---------------------------------+----------------------------------------------------+ @@ -246,8 +232,7 @@ the related section on :doc:`Defaulting to pandas `. +-----------------------------+---------------------------------+----------------------------------------------------+ | ``le`` | Y | See ``add`` | +-----------------------------+---------------------------------+----------------------------------------------------+ -| ``loc`` | Y | **Hdk**: ``P``, read access fully supported, | -| | | write access: no row and 2D assignments support | +| ``loc`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``lt`` | Y | See ``add`` | +-----------------------------+---------------------------------+----------------------------------------------------+ @@ -255,21 +240,17 @@ the related section on :doc:`Defaulting to pandas `. +-----------------------------+---------------------------------+----------------------------------------------------+ | ``mask`` | D | | +-----------------------------+---------------------------------+----------------------------------------------------+ -| ``max`` | Y | **Hdk**: ``P``, only default params supported, | -| | | otherwise ``D`` | +| ``max`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``mean`` | P | Modin defaults to pandas if given the ``level`` | | | | param. | -| | | **Hdk**: ``P``. ``D`` for ``level``, ``axis``, | -| | | `skipna` and `numeric_only` params | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``median`` | P | Modin defaults to pandas if given the ``level`` | | | | param. 
| +-----------------------------+---------------------------------+----------------------------------------------------+ | ``memory_usage`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ -| ``min`` | Y | **Hdk**: ``P``, only default params supported, | -| | | otherwise ``D`` | +| ``min`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``mod`` | Y | See ``add`` | +-----------------------------+---------------------------------+----------------------------------------------------+ @@ -297,8 +278,7 @@ the related section on :doc:`Defaulting to pandas `. +-----------------------------+---------------------------------+----------------------------------------------------+ | ``nsmallest`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ -| ``nunique`` | Y | **Hdk**: ``P``, no support for ``axis!=0`` and | -| | | ``dropna=False`` | +| ``nunique`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``pct_change`` | D | | +-----------------------------+---------------------------------+----------------------------------------------------+ @@ -308,7 +288,7 @@ the related section on :doc:`Defaulting to pandas `. +-----------------------------+---------------------------------+----------------------------------------------------+ | ``pop`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ -| ``pow`` | Y | See ``add``; **Hdk**: ``D`` | +| ``pow`` | Y | See ``add`` | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``prod`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ @@ -326,7 +306,7 @@ the related section on :doc:`Defaulting to pandas `. +-----------------------------+---------------------------------+----------------------------------------------------+ | ``ravel`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ -| ``rdiv`` | Y | See ``add``; **Hdk**: ``D`` | +| ``rdiv`` | Y | See ``add`` | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``rdivmod`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ @@ -348,13 +328,12 @@ the related section on :doc:`Defaulting to pandas `. +-----------------------------+---------------------------------+----------------------------------------------------+ | ``resample`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ -| ``reset_index`` | P | **Hdk**: ``P``. 
``D`` for ``level`` parameter | -| | | **Ray** and **Dask**: ``D`` when ``names`` or | +| ``reset_index`` | P | **Ray** and **Dask**: ``D`` when ``names`` or | | | | ``allow_duplicates`` is non-default | +-----------------------------+---------------------------------+----------------------------------------------------+ -| ``rfloordiv`` | Y | See ``add``; **Hdk**: ``D`` | +| ``rfloordiv`` | Y | See ``add`` | +-----------------------------+---------------------------------+----------------------------------------------------+ -| ``rmod`` | Y | See ``add``; **Hdk**: ``D`` | +| ``rmod`` | Y | See ``add`` | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``rmul`` | Y | See ``add`` | +-----------------------------+---------------------------------+----------------------------------------------------+ @@ -362,11 +341,11 @@ the related section on :doc:`Defaulting to pandas `. +-----------------------------+---------------------------------+----------------------------------------------------+ | ``round`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ -| ``rpow`` | Y | See ``add``; **Hdk**: ``D`` | +| ``rpow`` | Y | See ``add`` | +-----------------------------+---------------------------------+----------------------------------------------------+ -| ``rsub`` | Y | See ``add``; **-Hdk**: ``D`` | +| ``rsub`` | Y | See ``add`` | +-----------------------------+---------------------------------+----------------------------------------------------+ -| ``rtruediv`` | Y | See ``add``; **Hdk**: ``D`` | +| ``rtruediv`` | Y | See ``add`` | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``sample`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ @@ -379,7 +358,7 @@ the related section on :doc:`Defaulting to pandas `. +-----------------------------+---------------------------------+----------------------------------------------------+ | ``set_value`` | D | | +-----------------------------+---------------------------------+----------------------------------------------------+ -| ``shape`` | Y | **Hdk**: ``Y`` | +| ``shape`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``shift`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ @@ -392,7 +371,6 @@ the related section on :doc:`Defaulting to pandas `. +-----------------------------+---------------------------------+----------------------------------------------------+ | ``sort_values`` | Y | Order of indexes that have the same sort key | | | | is not guaranteed to be the same across sorts; | -| | | **Hdk**: ``Y`` | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``sparse`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ @@ -407,10 +385,9 @@ the related section on :doc:`Defaulting to pandas `. 
+-----------------------------+---------------------------------+----------------------------------------------------+ | ``sub`` | Y | See ``add`` | +-----------------------------+---------------------------------+----------------------------------------------------+ -| ``subtract`` | Y | See ``add``; **Hdk**: ``D`` | +| ``subtract`` | Y | See ``add``; | +-----------------------------+---------------------------------+----------------------------------------------------+ -| ``sum`` | Y | **Hdk**: ``P``, only default params supported, | -| | | otherwise ``D`` | +| ``sum`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``swapaxes`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ @@ -476,7 +453,6 @@ the related section on :doc:`Defaulting to pandas `. +-----------------------------+---------------------------------+----------------------------------------------------+ | ``value_counts`` | Y | The indices order of resulting object may differ | | | | from pandas. | -| | | **Hdk**: ``Y`` except ``dropna`` param support | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``values`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ diff --git a/docs/supported_apis/utilities_supported.rst b/docs/supported_apis/utilities_supported.rst index 6fe3dd442bc..f4661ceb2b7 100644 --- a/docs/supported_apis/utilities_supported.rst +++ b/docs/supported_apis/utilities_supported.rst @@ -12,16 +12,10 @@ the method in the left column. ``Y`` stands for yes, ``N`` stands for no, ``P`` for partial (meaning some parameters may not be supported yet), and ``D`` stands for default to pandas. -.. note:: - Currently, the second column reflects implementation status for Ray and Dask engines. By default, support for a method - in the HDK engine could be treated as ``D`` unless ``Notes`` column contains additional information. Similarly, - by default ``Notes`` contains information about ``Ray`` and ``Dask`` engines unless ``Hdk`` is explicitly mentioned. - +---------------------------+---------------------------------+----------------------------------------------------+ | Utility method | Modin Implementation? (Y/N/P/D) | Notes for Current implementation | +---------------------------+---------------------------------+----------------------------------------------------+ -| `pd.concat`_ | Y | **Hdk**: ``Y`` but ``sort`` and | -| | | `ignore_index`` parameters ignored | +| `pd.concat`_ | Y | | +---------------------------+---------------------------------+----------------------------------------------------+ | `pd.eval`_ | Y | | +---------------------------+---------------------------------+----------------------------------------------------+ diff --git a/docs/usage_guide/advanced_usage/index.rst b/docs/usage_guide/advanced_usage/index.rst index 02211d1033f..6ecf2a9d54a 100644 --- a/docs/usage_guide/advanced_usage/index.rst +++ b/docs/usage_guide/advanced_usage/index.rst @@ -8,7 +8,6 @@ Advanced Usage /flow/modin/distributed/dataframe/pandas spreadsheets_api progress_bar - modin_sql modin_xgboost modin_logging batch @@ -26,7 +25,7 @@ on DataFrames, and more. Join us on `Slack`_ for the latest updates! 
Modin engines ------------- -Modin supports a series of execution engines such as Ray_, Dask_, `MPI through unidist`_, `HDK`_, +Modin supports a series of execution engines such as Ray_, Dask_, `MPI through unidist`_, each of which might be a more beneficial choice for a specific scenario. When doing the first operation with Modin it automatically initializes one of the engines to further perform distributed/parallel computation. If you are familiar with a concrete execution engine, it is possible to initialize the engine on your own and @@ -95,11 +94,6 @@ Dataframe Algebra A minimal set of operators that can be composed to express any dataframe query for use in query planning and optimization. See our `paper`_ for more information, and full documentation is coming soon! -SQL on Modin Dataframes ------------------------- - -Read about Modin Dataframe support for SQL queries in this recent `blog post`_. Check out the `Modin SQL documentation`_ as well! - Distributed XGBoost on Modin ---------------------------- @@ -129,8 +123,6 @@ An experimental GitHub Action on pull request has been added to Modin, which aut `fuzzydata`, a random dataframe workflow generator. The resulting workflow that was used to test Modin codebase can be downloaded as an artifact from the GitHub Actions tab for further inspection. See `fuzzydata`_ for more details. -.. _`blog post`: https://medium.com/riselab/why-every-data-scientist-using-pandas-needs-modin-bringing-sql-to-dataframes-3b216b29a7c0 -.. _`Modin SQL documentation`: modin_sql.html .. _`Modin Spreadsheet API documentation`: spreadsheets_api.html .. _`Progress Bar documentation`: progress_bar.html .. _`Paper`: https://arxiv.org/pdf/2001.00888.pdf @@ -141,4 +133,3 @@ downloaded as an artifact from the GitHub Actions tab for further inspection. Se .. _Ray: https://github.com/ray-project/ray .. _Dask: https://github.com/dask/distributed .. _`MPI through unidist`: https://github.com/modin-project/unidist -.. _HDK: https://github.com/intel-ai/hdk diff --git a/docs/usage_guide/advanced_usage/modin_engines.rst b/docs/usage_guide/advanced_usage/modin_engines.rst index 53079c67e6c..dea384df452 100644 --- a/docs/usage_guide/advanced_usage/modin_engines.rst +++ b/docs/usage_guide/advanced_usage/modin_engines.rst @@ -60,17 +60,6 @@ You can initialize MPI through unidist engine with a specific number of CPUs (wo To get more details on all possible parameters for initialization refer to `unidist documentation`_. -HDK ---- - -For now it is not possible to initialize HDK beforehand. Modin itself initializes it with the required configuration. - -.. code-block:: python - - import modin.config as modin_cfg - - modin_cfg.StorageFormat.put("hdk") # # Modin will use HDK engine - .. _`Ray documentation`: https://docs.ray.io/en/latest .. _Dask Distributed documentation: https://distributed.dask.org/en/latest .. _`unidist documentation`: https://unidist.readthedocs.io/en/latest diff --git a/docs/usage_guide/advanced_usage/modin_sql.rst b/docs/usage_guide/advanced_usage/modin_sql.rst deleted file mode 100644 index 1b76bb2df3a..00000000000 --- a/docs/usage_guide/advanced_usage/modin_sql.rst +++ /dev/null @@ -1,142 +0,0 @@ -SQL on Modin Dataframes -======================= - -Modin provides a SQL API that allows you to intermix SQL and pandas operations -without copying the entire dataset into a new structure between the two. This is possible -due to the architecture of Modin. Currently, Modin has a query compiler that acts as an -intermediate layer between the query language (e.g. 
SQL, pandas) and the execution -(See :doc:`architecture ` documentation for details). - -To execute SQL queries, Modin uses HDK engine -(See :doc:`Using HDK ` documentation for details) -Thus, to execute SQL queries, pyhdk module must be installed. - - -A Short Example Using the Google Play Store -"""""""""""""""""""""""""""""""""""""""""""" - -.. code-block:: python - - import modin.pandas as pd - import modin.experimental.sql as sql - - # read google play app store list from csv - gstore_apps_df = pd.read_csv("https://tinyurl.com/googleplaystorecsv") - -.. figure:: /img/modin_sql_google_play_table.png - :align: center - -Imagine that you want to quickly select from ‘gstore_apps_df’ the columns -App, Category, and Rating, where Price is ‘0’. - -.. code-block:: python - - # You can then define the query that you want to perform - query_str = "SELECT App, Category, Rating FROM gstore_apps WHERE Price = '0'" - - # And simply apply that query to a dataframe - result_df = sql.query(query_str, gstore_apps=gstore_apps_df) - - # Or, in this case, where the query only requires one table, - # you can also ignore the FROM part in the query string: - sql_str = "SELECT App, Category, Rating WHERE Price = '0' " - -Writing Complex Queries -""""""""""""""""""""""" - -For complex queries, it's recommended to use the HDK engine because it's much more -powerful, comparing to dfsql. Especially, if multiple data frames are involved. - -Let's explore a more complicated example. - -.. code-block:: python - - gstore_reviews_df = pd.read_csv("https://tinyurl.com/googleplaystoreurcsv") - - -.. figure:: /img/modin_sql_google_play_ur_table.png - :align: center - - -Say we want to retrieve the top 10 app categories ranked by best average ‘sentiment_polarity’ where the -average ‘sentiment_subjectivity’ is less than 0.5. - -Since ‘Category’ is on the **gstore_apps_df** and sentiment_polarity is on **gstore_reviews_df**, -we need to join the two tables, and operate averages on that join. - -.. code-block:: python - - # Single query with join and group by - sql_str = """ - SELECT - category, - AVG(sentiment_polarity) AS avg_sentiment_polarity, - AVG(sentiment_subjectivity) AS avg_sentiment_subjectivity - FROM ( - SELECT - category, - CAST(sentiment as float) AS sentiment, - CAST(sentiment_polarity AS float) AS sentiment_polarity, - CAST(sentiment_subjectivity AS float) AS sentiment_subjectivity - FROM gstore_apps_df - INNER JOIN gstore_reviews_df - ON gstore_apps_df.app = gstore_reviews_df.app - ) sub - GROUP BY category - HAVING avg_sentiment_subjectivity < 0.5 - ORDER BY avg_sentiment_polarity DESC - LIMIT 10 - """ - - # Run query using apps and reviews dataframes, - # NOTE: that you simply pass the names of the tables in the query as arguments - - result_df = sql.query( sql_str, - gstore_apps_df = gstore_apps_df, - gstore_reviews_df = gstore_reviews_df) - - -Or, you can bring the best of doing this in python and run the query in multiple parts (it’s up to you). - -.. 
code-block:: python - - # join the items and reviews - - result_df = sql.query(""" - SELECT - category, - sentiment, - sentiment_polarity, - sentiment_subjectivity - FROM gstore_apps_df INNER JOIN gstore_reviews_df - ON gstore_apps_df.app = gstore_reviews_df.app""", - gstore_apps_df=gstore_apps_df, - gstore_reviews_df=gstore_reviews_df) - - # group by category and calculate averages - - result_df = sql.query(""" - SELECT - category, - AVG(sentiment_polarity) AS avg_sentiment_polarity, - AVG(sentiment_subjectivity) AS avg_sentiment_subjectivity - FROM result_df - GROUP BY category - HAVING CAST(avg_sentiment_subjectivity AS float) < 0.5 - ORDER BY avg_sentiment_polarity DESC - LIMIT 10""", - result_df=result_df) - - -If you have a cluster or even a computer with more than one CPU core, -you can write SQL and Modin will run those queries in a distributed and optimized way. - -Further Examples and Full Documentation -""""""""""""""""""""""""""""""""""""""" -In the meantime, you can check out our `Example Notebook`_ that contains more -examples and ideas, as well as this blog_ explaining Modin SQL usage. - - -.. _MindsDB: https://mindsdb.com/ -.. _Example Notebook: https://github.com/mindsdb/dfsql/blob/stable/testdrive.ipynb -.. _blog: https://medium.com/riselab/why-every-data-scientist-using-pandas-needs-modin-bringing-sql-to-dataframes-3b216b29a7c0 diff --git a/docs/usage_guide/examples/index.rst b/docs/usage_guide/examples/index.rst index 51a0b8dd311..0b23b2c838c 100644 --- a/docs/usage_guide/examples/index.rst +++ b/docs/usage_guide/examples/index.rst @@ -9,9 +9,9 @@ Tutorials The following tutorials cover the basic usage of Modin. `Here `_ is a one hour video tutorial that walks through these basic exercises. -- Exercise 1: Introduction to Modin [`Source PandasOnRay `__, `Source PandasOnDask `__, `Source HdkOnNative `__] -- Exercise 2: Speed Improvements with Modin [`Source PandasOnRay `__, `Source PandasOnDask `__, `Source HdkOnNative `__] -- Exercise 3: Defaulting to pandas with Modin [`Source PandasOnRay `__, `Source PandasOnDask `__, `Source HdkOnNative `__] +- Exercise 1: Introduction to Modin [`Source PandasOnRay `__, `Source PandasOnDask `__] +- Exercise 2: Speed Improvements with Modin [`Source PandasOnRay `__, `Source PandasOnDask `__] +- Exercise 3: Defaulting to pandas with Modin [`Source PandasOnRay `__, `Source PandasOnDask `__] The following tutorials covers more advanced features in Modin: diff --git a/examples/docker/modin-hdk/Dockerfile b/examples/docker/modin-hdk/Dockerfile deleted file mode 100644 index 7316a3e7f0b..00000000000 --- a/examples/docker/modin-hdk/Dockerfile +++ /dev/null @@ -1,82 +0,0 @@ -# Licensed to Modin Development Team under one or more contributor license agreements. -# See the NOTICE file distributed with this work for additional information regarding -# copyright ownership. The Modin Development Team licenses this file to you under the -# Apache License, Version 2.0 (the "License"); you may not use this file except in -# compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under -# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific language -# governing permissions and limitations under the License. - -# Build image from this dockerfile like this: -# docker build -t modin-hdk:latest . 
- -FROM ubuntu:20.04 - -# Proxy settings -ENV http_proxy=${http_proxy} -ENV https_proxy=${https_proxy} -ENV no_proxy=${no_proxy} - -RUN apt-get update --yes \ - && apt-get install wget --yes \ - && rm -rf /var/lib/apt/lists/* - -# Modin settings -ENV MODIN_STORAGE_FORMAT="hdk" -ENV MODIN_EXPERIMENTAL="true" -ENV MODIN_ENGINE="native" - -ENV USER modin -ENV UID 1000 -ENV HOME /home/$USER - -RUN adduser --disabled-password \ - --gecos "Non-root user" \ - --uid $UID \ - --home $HOME \ - $USER - -# Conda settings -ENV CONDA_DIR=${HOME}/miniconda -ENV CONDA_ENV_NAME=modin-hdk -ENV PATH="${CONDA_DIR}/bin:${PATH}" - -RUN wget -nv https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O /tmp/miniconda3.sh \ - && bash /tmp/miniconda3.sh -b -p "${CONDA_DIR}" -f -u \ - && "${CONDA_DIR}/bin/conda" init bash \ - && rm -f /tmp/miniconda3.sh - -RUN conda update -n base -c defaults conda -y \ - && conda create -n ${CONDA_ENV_NAME} --yes -c conda-forge --strict-channel-priority \ - modin-hdk \ - scikit-learn \ - scikit-learn-intelex \ - xgboost \ - && conda clean --all --yes - -# Activate ${CONDA_ENV_NAME} for interactive shells -RUN echo "source ${CONDA_DIR}/bin/activate ${CONDA_ENV_NAME}" >> "${HOME}/.bashrc" -# Activate ${CONDA_ENV_NAME} for non-interactive shells -# The following line comments out line that prevents ~/.bashrc execution in -# non-interactive mode. -RUN sed -e 's,\(^[[:space:]]\+[*]) return;;$\),# \1,' -i "${HOME}/.bashrc" -ENV BASH_ENV="${HOME}/.bashrc" - -# Set up benchmark scripts -COPY nyc-taxi-hdk.py "${HOME}" -COPY census-hdk.py "${HOME}" -COPY plasticc-hdk.py "${HOME}" -RUN mkdir /dataset -WORKDIR ${HOME} - -# Clean up proxy settings to publish on Docker Hub -ENV http_proxy= -ENV https_proxy= -ENV no_proxy= - -# Set entrypoint with arguments expansion -ENTRYPOINT ["/bin/bash", "-c", "exec $0 $*"] diff --git a/examples/docker/modin-hdk/build-docker-image.sh b/examples/docker/modin-hdk/build-docker-image.sh deleted file mode 100755 index 968591f6144..00000000000 --- a/examples/docker/modin-hdk/build-docker-image.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/bin/bash -e - -# Licensed to Modin Development Team under one or more contributor license agreements. -# See the NOTICE file distributed with this work for additional information regarding -# copyright ownership. The Modin Development Team licenses this file to you under the -# Apache License, Version 2.0 (the "License"); you may not use this file except in -# compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under -# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific language -# governing permissions and limitations under the License. - -cd "`dirname \"$0\"`" - -docker build -t modin-hdk . - -echo -e '\nNYC TAXI BENCHMARK -User is responsible for preparing the dataset. -It Can be generated by following the instructions on the link: -https://github.com/toddwschneider/nyc-taxi-data#instructions -To run the benchmark execute: -\tdocker run --rm -v /path/to/dataset:/dataset modin-hdk python nyc-taxi-hdk.py - -CENSUS BENCHMARK -User is responsible for preparing the dataset. 
-It can be downloaded from the following link: -https://rapidsai-data.s3.us-east-2.amazonaws.com/datasets/ipums_education2income_1970-2010.csv.gz -To run the benchmark execute: -\tdocker run --rm -v /path/to/dataset:/dataset modin-hdk python census-hdk.py - -PLASTICC BENCHMARK -User is responsible for preparing the datasets. -The datasets must include four files: training set, test set, -training set metadata and test set metadata. -To run the benchmark execute: -\tdocker run --rm -v /path/to/dataset:/dataset modin-hdk python plasticc-hdk.py \n' diff --git a/examples/docker/modin-hdk/census-hdk.py b/examples/docker/modin-hdk/census-hdk.py deleted file mode 100644 index 71a4f9a20ef..00000000000 --- a/examples/docker/modin-hdk/census-hdk.py +++ /dev/null @@ -1,270 +0,0 @@ -# Licensed to Modin Development Team under one or more contributor license agreements. -# See the NOTICE file distributed with this work for additional information regarding -# copyright ownership. The Modin Development Team licenses this file to you under the -# Apache License, Version 2.0 (the "License"); you may not use this file except in -# compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under -# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific language -# governing permissions and limitations under the License. - -import sys - -import numpy as np -from utils import measure - -import modin.pandas as pd - - -def read(filename): - columns_names = [ - "YEAR0", - "DATANUM", - "SERIAL", - "CBSERIAL", - "HHWT", - "CPI99", - "GQ", - "QGQ", - "PERNUM", - "PERWT", - "SEX", - "AGE", - "EDUC", - "EDUCD", - "INCTOT", - "SEX_HEAD", - "SEX_MOM", - "SEX_POP", - "SEX_SP", - "SEX_MOM2", - "SEX_POP2", - "AGE_HEAD", - "AGE_MOM", - "AGE_POP", - "AGE_SP", - "AGE_MOM2", - "AGE_POP2", - "EDUC_HEAD", - "EDUC_MOM", - "EDUC_POP", - "EDUC_SP", - "EDUC_MOM2", - "EDUC_POP2", - "EDUCD_HEAD", - "EDUCD_MOM", - "EDUCD_POP", - "EDUCD_SP", - "EDUCD_MOM2", - "EDUCD_POP2", - "INCTOT_HEAD", - "INCTOT_MOM", - "INCTOT_POP", - "INCTOT_SP", - "INCTOT_MOM2", - "INCTOT_POP2", - ] - columns_types = [ - "int64", - "int64", - "int64", - "float64", - "int64", - "float64", - "int64", - "float64", - "int64", - "int64", - "int64", - "int64", - "int64", - "int64", - "int64", - "float64", - "float64", - "float64", - "float64", - "float64", - "float64", - "float64", - "float64", - "float64", - "float64", - "float64", - "float64", - "float64", - "float64", - "float64", - "float64", - "float64", - "float64", - "float64", - "float64", - "float64", - "float64", - "float64", - "float64", - "float64", - "float64", - "float64", - "float64", - "float64", - "float64", - ] - dtypes = {columns_names[i]: columns_types[i] for i in range(len(columns_names))} - - df = pd.read_csv( - filename, - names=columns_names, - dtype=dtypes, - skiprows=1, - ) - - # to trigger real execution and table import - df._query_compiler._modin_frame.force_import() - return df - - -def etl(df): - keep_cols = [ - "YEAR0", - "DATANUM", - "SERIAL", - "CBSERIAL", - "HHWT", - "CPI99", - "GQ", - "PERNUM", - "SEX", - "AGE", - "INCTOT", - "EDUC", - "EDUCD", - "EDUC_HEAD", - "EDUC_POP", - "EDUC_MOM", - "EDUCD_MOM2", - "EDUCD_POP2", - "INCTOT_MOM", - "INCTOT_POP", - "INCTOT_MOM2", - "INCTOT_POP2", - "INCTOT_HEAD", - "SEX_HEAD", - ] - df = df[keep_cols] - - 
df = df[df["INCTOT"] != 9999999] - df = df[df["EDUC"] != -1] - df = df[df["EDUCD"] != -1] - - df["INCTOT"] = df["INCTOT"] * df["CPI99"] - - for column in keep_cols: - df[column] = df[column].fillna(-1) - - df[column] = df[column].astype("float64") - - y = df["EDUC"] - X = df.drop(columns=["EDUC", "CPI99"]) - - # to trigger real execution - df.shape - y.shape - X.shape - - return (df, X, y) - - -def mse(y_test, y_pred): - return ((y_test - y_pred) ** 2).mean() - - -def cod(y_test, y_pred): - y_bar = y_test.mean() - total = ((y_test - y_bar) ** 2).sum() - residuals = ((y_test - y_pred) ** 2).sum() - return 1 - (residuals / total) - - -def ml(X, y, random_state, n_runs, test_size): - # to not install ML dependencies unless required - import sklearnex - from sklearn import config_context - - sklearnex.patch_sklearn() - import sklearn.linear_model as lm - from sklearn.model_selection import train_test_split - - clf = lm.Ridge() - - X = np.ascontiguousarray(X, dtype=np.float64) - y = np.ascontiguousarray(y, dtype=np.float64) - - mse_values, cod_values = [], [] - ml_scores = {} - - print("ML runs: ", n_runs) - for i in range(n_runs): - (X_train, X_test, y_train, y_test) = train_test_split( - X, y, test_size=test_size, random_state=random_state - ) - random_state += 777 - - with config_context(assume_finite=True): - model = clf.fit(X_train, y_train) - - y_pred = model.predict(X_test) - - mse_values.append(mse(y_test, y_pred)) - cod_values.append(cod(y_test, y_pred)) - - ml_scores["mse_mean"] = sum(mse_values) / len(mse_values) - ml_scores["cod_mean"] = sum(cod_values) / len(cod_values) - ml_scores["mse_dev"] = pow( - sum([(mse_value - ml_scores["mse_mean"]) ** 2 for mse_value in mse_values]) - / (len(mse_values) - 1), - 0.5, - ) - ml_scores["cod_dev"] = pow( - sum([(cod_value - ml_scores["cod_mean"]) ** 2 for cod_value in cod_values]) - / (len(cod_values) - 1), - 0.5, - ) - - return ml_scores - - -def main(): - if len(sys.argv) < 2: - print( - "USAGE: docker run --rm -v /path/to/dataset:/dataset python census-hdk.py" - + " " - + " [-no-ml]" - ) - return - # ML specific - N_RUNS = 50 - TEST_SIZE = 0.1 - RANDOM_STATE = 777 - - df = measure("Reading", read, sys.argv[1]) - _, X, y = measure("ETL", etl, df) - - if "-no-ml" not in sys.argv[2:]: - measure( - "ML", - ml, - X, - y, - random_state=RANDOM_STATE, - n_runs=N_RUNS, - test_size=TEST_SIZE, - ) - - -if __name__ == "__main__": - main() diff --git a/examples/docker/modin-hdk/nyc-taxi-hdk.py b/examples/docker/modin-hdk/nyc-taxi-hdk.py deleted file mode 100644 index de159100318..00000000000 --- a/examples/docker/modin-hdk/nyc-taxi-hdk.py +++ /dev/null @@ -1,285 +0,0 @@ -# Licensed to Modin Development Team under one or more contributor license agreements. -# See the NOTICE file distributed with this work for additional information regarding -# copyright ownership. The Modin Development Team licenses this file to you under the -# Apache License, Version 2.0 (the "License"); you may not use this file except in -# compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under -# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific language -# governing permissions and limitations under the License. 
- -import sys - -from utils import measure - -import modin.pandas as pd -from modin.experimental.sql import query -from modin.tests.pandas.utils import df_equals - - -def read(filename): - columns_names = [ - "trip_id", - "vendor_id", - "pickup_datetime", - "dropoff_datetime", - "store_and_fwd_flag", - "rate_code_id", - "pickup_longitude", - "pickup_latitude", - "dropoff_longitude", - "dropoff_latitude", - "passenger_count", - "trip_distance", - "fare_amount", - "extra", - "mta_tax", - "tip_amount", - "tolls_amount", - "ehail_fee", - "improvement_surcharge", - "total_amount", - "payment_type", - "trip_type", - "pickup", - "dropoff", - "cab_type", - "precipitation", - "snow_depth", - "snowfall", - "max_temperature", - "min_temperature", - "average_wind_speed", - "pickup_nyct2010_gid", - "pickup_ctlabel", - "pickup_borocode", - "pickup_boroname", - "pickup_ct2010", - "pickup_boroct2010", - "pickup_cdeligibil", - "pickup_ntacode", - "pickup_ntaname", - "pickup_puma", - "dropoff_nyct2010_gid", - "dropoff_ctlabel", - "dropoff_borocode", - "dropoff_boroname", - "dropoff_ct2010", - "dropoff_boroct2010", - "dropoff_cdeligibil", - "dropoff_ntacode", - "dropoff_ntaname", - "dropoff_puma", - ] - # use string instead of category - columns_types = [ - "int64", - "string", - "timestamp", - "timestamp", - "string", - "int64", - "float64", - "float64", - "float64", - "float64", - "int64", - "float64", - "float64", - "float64", - "float64", - "float64", - "float64", - "float64", - "float64", - "float64", - "string", - "float64", - "string", - "string", - "string", - "float64", - "int64", - "float64", - "int64", - "int64", - "float64", - "float64", - "float64", - "float64", - "string", - "float64", - "float64", - "string", - "string", - "string", - "float64", - "float64", - "float64", - "float64", - "string", - "float64", - "float64", - "string", - "string", - "string", - "float64", - ] - - dtypes = {columns_names[i]: columns_types[i] for i in range(len(columns_names))} - all_but_dates = { - col: valtype - for (col, valtype) in dtypes.items() - if valtype not in ["timestamp"] - } - dates_only = [col for (col, valtype) in dtypes.items() if valtype in ["timestamp"]] - - df = pd.read_csv( - filename, - names=columns_names, - dtype=all_but_dates, - parse_dates=dates_only, - ) - - # to trigger real execution and table import - df._query_compiler._modin_frame.force_import() - return df - - -def q1_hdk(df): - q1_pandas_output = df.groupby("cab_type").size() - q1_pandas_output.shape # to trigger real execution - return q1_pandas_output - - -def q1_sql(df): - sql = """ - SELECT - cab_type, - COUNT(*) AS 'count' - FROM trips - GROUP BY - cab_type - """ - return query(sql, trips=df) - - -def q2_hdk(df): - q2_pandas_output = df.groupby("passenger_count").agg({"total_amount": "mean"}) - q2_pandas_output.shape # to trigger real execution - return q2_pandas_output - - -def q2_sql(df): - sql = """ - SELECT - passenger_count, - AVG(total_amount) AS 'total_amount' - FROM trips - GROUP BY - passenger_count - """ - return query(sql, trips=df) - - -def q3_hdk(df): - df["pickup_datetime"] = df["pickup_datetime"].dt.year - q3_pandas_output = df.groupby(["passenger_count", "pickup_datetime"]).size() - q3_pandas_output.shape # to trigger real execution - return q3_pandas_output - - -def q3_sql(df): - sql = """ - SELECT - passenger_count, - pickup_datetime, - COUNT(*) AS 'count' - FROM trips - GROUP BY - passenger_count, - pickup_datetime - """ - df["pickup_datetime"] = df["pickup_datetime"].dt.year - return query(sql, trips=df) - - 
-def q4_hdk(df): - df["pickup_datetime"] = df["pickup_datetime"].dt.year - df["trip_distance"] = df["trip_distance"].astype("int64") - q4_pandas_output = ( - df.groupby(["passenger_count", "pickup_datetime", "trip_distance"], sort=False) - .size() - .reset_index() - .sort_values( - by=["pickup_datetime", 0], ignore_index=True, ascending=[True, False] - ) - ) - q4_pandas_output.shape # to trigger real execution - return q4_pandas_output - - -def q4_sql(df): - sql = """ - SELECT - passenger_count, - pickup_datetime, - CAST(trip_distance AS int) AS trip_distance, - COUNT(*) AS the_count - FROM trips - GROUP BY - passenger_count, - pickup_datetime, - trip_distance - ORDER BY - pickup_datetime, - the_count desc - """ - df["pickup_datetime"] = df["pickup_datetime"].dt.year - df["trip_distance"] = df["trip_distance"].astype("int64") - return query(sql, trips=df) - - -def validate(df, hdk_func, sql_func, copy_df=False, reset_index=True, sort_by=None): - hdk_result = hdk_func(df.copy() if copy_df else df) - sql_result = sql_func(df.copy() if copy_df else df) - if reset_index: - hdk_result = hdk_result.reset_index() - hdk_result.columns = sql_result.columns - if sort_by is not None: - hdk_result = hdk_result.sort_values(by=sort_by) - sql_result = hdk_result.sort_values(by=sort_by) - df_equals(hdk_result, sql_result) - - -def main(): - if len(sys.argv) != 2: - print( - f"USAGE: docker run --rm -v /path/to/dataset:/dataset python nyc-taxi-hdk.py " - ) - return - df = measure("Reading", read, sys.argv[1]) - measure("Q1H", q1_hdk, df) - measure("Q1S", q1_sql, df) - measure("Q2H", q2_hdk, df) - measure("Q2S", q2_sql, df) - # The data frame is modified by some tests, therefore a copy should be used for these tests. - measure("Q3H", q3_hdk, df.copy()) - measure("Q3S", q3_sql, df.copy()) - measure("Q4H", q4_hdk, df.copy()) - measure("Q4S", q4_sql, df.copy()) - - validate(df, q1_hdk, q1_sql) - validate(df, q2_hdk, q2_sql) - validate(df, q3_hdk, q3_sql, copy_df=True) - # Additional sorting is required here to make the results identical - validate( - df, q4_hdk, q4_sql, copy_df=True, reset_index=False, sort_by=["trip_distance"] - ) - - -if __name__ == "__main__": - main() diff --git a/examples/docker/modin-hdk/plasticc-hdk.py b/examples/docker/modin-hdk/plasticc-hdk.py deleted file mode 100644 index 704e0fe49cd..00000000000 --- a/examples/docker/modin-hdk/plasticc-hdk.py +++ /dev/null @@ -1,279 +0,0 @@ -# Licensed to Modin Development Team under one or more contributor license agreements. -# See the NOTICE file distributed with this work for additional information regarding -# copyright ownership. The Modin Development Team licenses this file to you under the -# Apache License, Version 2.0 (the "License"); you may not use this file except in -# compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under -# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific language -# governing permissions and limitations under the License. 
- -import sys -from functools import partial - -import numpy as np -from utils import measure - -import modin.pandas as pd - - -################ helper functions ############################### -def create_dtypes(): - dtypes = dict( - [ - ("object_id", "int32"), - ("mjd", "float32"), - ("passband", "int32"), - ("flux", "float32"), - ("flux_err", "float32"), - ("detected", "int32"), - ] - ) - - # load metadata - columns_names = [ - "object_id", - "ra", - "decl", - "gal_l", - "gal_b", - "ddf", - "hostgal_specz", - "hostgal_photoz", - "hostgal_photoz_err", - "distmod", - "mwebv", - "target", - ] - meta_dtypes = ["int32"] + ["float32"] * 4 + ["int32"] + ["float32"] * 5 + ["int32"] - meta_dtypes = dict( - [(columns_names[i], meta_dtypes[i]) for i in range(len(meta_dtypes))] - ) - return dtypes, meta_dtypes - - -def trigger_read_op(dfs: tuple): - for df in dfs: - df.shape # to trigger real execution - return dfs - - -def ravel_column_names(cols): - d0 = cols.get_level_values(0) - d1 = cols.get_level_values(1) - return ["%s_%s" % (i, j) for i, j in zip(d0, d1)] - - -def all_etl(train, train_meta, test, test_meta): - train_final = etl(train, train_meta) - test_final = etl(test, test_meta) - return (train_final, test_final) - - -def split_step(train_final, test_final): - from sklearn.model_selection import train_test_split - from sklearn.preprocessing import LabelEncoder - - X = train_final.drop(["object_id", "target"], axis=1).values - Xt = test_final.drop(["object_id"], axis=1).values - - y = train_final["target"] - assert X.shape[1] == Xt.shape[1] - classes = sorted(y.unique()) - - class_weights = {c: 1 for c in classes} - class_weights.update({c: 2 for c in [64, 15]}) - - lbl = LabelEncoder() - y = lbl.fit_transform(y) - - X_train, X_test, y_train, y_test = train_test_split( - X, y, test_size=0.1, stratify=y, random_state=126 - ) - - return X_train, y_train, X_test, y_test, Xt, classes, class_weights - - -def multi_weighted_logloss(y_true, y_preds, classes, class_weights): - """ - refactor from - @author olivier https://www.kaggle.com/ogrellier - multi logloss for PLAsTiCC challenge - """ - y_p = y_preds.reshape(y_true.shape[0], len(classes), order="F") - y_ohe = pd.get_dummies(y_true) - y_p = np.clip(a=y_p, a_min=1e-15, a_max=1 - 1e-15) - y_p_log = np.log(y_p) - y_log_ones = np.sum(y_ohe.values * y_p_log, axis=0) - nb_pos = y_ohe.sum(axis=0).values.astype(float) - class_arr = np.array([class_weights[k] for k in sorted(class_weights.keys())]) - y_w = y_log_ones * class_arr / nb_pos - - loss = -np.sum(y_w) / np.sum(class_arr) - return loss - - -def xgb_multi_weighted_logloss(y_predicted, y_true, classes, class_weights): - loss = multi_weighted_logloss( - y_true.get_label(), y_predicted, classes, class_weights - ) - return "wloss", loss - - -################ helper functions ############################### - - -def read( - training_set_filename, - test_set_filename, - training_set_metadata_filename, - test_set_metadata_filename, - dtypes, - meta_dtypes, -): - train = pd.read_csv(training_set_filename, dtype=dtypes) - test = pd.read_csv( - test_set_filename, - names=list(dtypes.keys()), - dtype=dtypes, - header=0, - ) - - train_meta = pd.read_csv(training_set_metadata_filename, dtype=meta_dtypes) - target = meta_dtypes.pop("target") - test_meta = pd.read_csv(test_set_metadata_filename, dtype=meta_dtypes) - meta_dtypes["target"] = target - - dfs = (train, train_meta, test, test_meta) - trigger_read_op(dfs) - return dfs - - -def etl(df, df_meta): - # workaround for both Modin_on_ray and Modin_on_hdk 
modes. Eventually this should be fixed - df["flux_ratio_sq"] = (df["flux"] / df["flux_err"]) * ( - df["flux"] / df["flux_err"] - ) # np.power(df["flux"] / df["flux_err"], 2.0) - df["flux_by_flux_ratio_sq"] = df["flux"] * df["flux_ratio_sq"] - - aggs = { - "passband": ["mean"], - "flux": ["min", "max", "mean", "skew"], - "flux_err": ["min", "max", "mean"], - "detected": ["mean"], - "mjd": ["max", "min"], - "flux_ratio_sq": ["sum"], - "flux_by_flux_ratio_sq": ["sum"], - } - agg_df = df.groupby("object_id", sort=False).agg(aggs) - - agg_df.columns = ravel_column_names(agg_df.columns) - - agg_df["flux_diff"] = agg_df["flux_max"] - agg_df["flux_min"] - agg_df["flux_dif2"] = agg_df["flux_diff"] / agg_df["flux_mean"] - agg_df["flux_w_mean"] = ( - agg_df["flux_by_flux_ratio_sq_sum"] / agg_df["flux_ratio_sq_sum"] - ) - agg_df["flux_dif3"] = agg_df["flux_diff"] / agg_df["flux_w_mean"] - agg_df["mjd_diff"] = agg_df["mjd_max"] - agg_df["mjd_min"] - - agg_df = agg_df.drop(["mjd_max", "mjd_min"], axis=1) - - agg_df = agg_df.reset_index() - - df_meta = df_meta.drop(["ra", "decl", "gal_l", "gal_b"], axis=1) - - df_meta = df_meta.merge(agg_df, on="object_id", how="left") - - df_meta.shape # to trigger real execution - return df_meta - - -def ml(train_final, test_final): - # to not install ML dependencies unless required - import sklearnex - import xgboost as xgb - - sklearnex.patch_sklearn() - - X_train, y_train, X_test, y_test, Xt, classes, class_weights = split_step( - train_final, test_final - ) - - cpu_params = { - "objective": "multi:softprob", - "eval_metric": "merror", - "tree_method": "hist", - "nthread": 16, - "num_class": 14, - "max_depth": 7, - "verbosity": 1, - "subsample": 0.7, - "colsample_bytree": 0.7, - } - - func_loss = partial( - xgb_multi_weighted_logloss, classes=classes, class_weights=class_weights - ) - - dtrain = xgb.DMatrix(data=X_train, label=y_train) - dvalid = xgb.DMatrix(data=X_test, label=y_test) - dtest = xgb.DMatrix(data=Xt) - - watchlist = [(dvalid, "eval"), (dtrain, "train")] - - clf = xgb.train( - cpu_params, - dtrain=dtrain, - num_boost_round=60, - evals=watchlist, - feval=func_loss, - early_stopping_rounds=10, - verbose_eval=None, - ) - - yp = clf.predict(dvalid) - cpu_loss = multi_weighted_logloss(y_test, yp, classes, class_weights) - ysub = clf.predict(dtest) # noqa: F841 (unused variable) - - return cpu_loss - - -def main(): - if len(sys.argv) < 5: - print( - "USAGE: docker run --rm -v /path/to/dataset:/dataset python plasticc-hdk.py" - + " " - + " " - + " " - + " " - + " [-no-ml]" - ) - return - - dtypes, meta_dtypes = create_dtypes() - - train, train_meta, test, test_meta = measure( - "Reading", - read, - sys.argv[1], - sys.argv[2], - sys.argv[3], - sys.argv[4], - dtypes, - meta_dtypes, - ) - train_final, test_final = measure( - "ETL", all_etl, train, train_meta, test, test_meta - ) - - if "-no-ml" not in sys.argv[5:]: - cpu_loss = measure("ML", ml, train_final, test_final) - print("validation cpu_loss:", cpu_loss) - - -if __name__ == "__main__": - main() diff --git a/examples/docker/modin-hdk/utils.py b/examples/docker/modin-hdk/utils.py deleted file mode 100644 index 08dbe330ab1..00000000000 --- a/examples/docker/modin-hdk/utils.py +++ /dev/null @@ -1,28 +0,0 @@ -# Licensed to Modin Development Team under one or more contributor license agreements. -# See the NOTICE file distributed with this work for additional information regarding -# copyright ownership. 
The Modin Development Team licenses this file to you under the -# Apache License, Version 2.0 (the "License"); you may not use this file except in -# compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under -# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific language -# governing permissions and limitations under the License. - -import sys -import time -from os.path import abspath, dirname, join - -MODIN_DIR = abspath(join(dirname(__file__), *[".." for _ in range(3)])) -if MODIN_DIR not in sys.path: - sys.path.insert(0, MODIN_DIR) - - -def measure(name, func, *args, **kwargs): - t0 = time.time() - res = func(*args, **kwargs) - t1 = time.time() - print(f"{name}: {t1 - t0} sec") - return res diff --git a/examples/docker/modin-ray/plasticc.py b/examples/docker/modin-ray/plasticc.py index 1c0cddadd1b..e798849e181 100644 --- a/examples/docker/modin-ray/plasticc.py +++ b/examples/docker/modin-ray/plasticc.py @@ -157,7 +157,7 @@ def read( def etl(df, df_meta): - # workaround for both Modin_on_ray and Modin_on_hdk modes. Eventually this should be fixed + # workaround for Modin_on_ray. Eventually this should be fixed df["flux_ratio_sq"] = (df["flux"] / df["flux_err"]) * ( df["flux"] / df["flux_err"] ) # np.power(df["flux"] / df["flux_err"], 2.0) diff --git a/examples/tutorial/jupyter/README.md b/examples/tutorial/jupyter/README.md index f5f07259531..a08bb81c42e 100644 --- a/examples/tutorial/jupyter/README.md +++ b/examples/tutorial/jupyter/README.md @@ -5,7 +5,6 @@ Currently we provide tutorial notebooks for the following execution backends: - [PandasOnRay](https://modin.readthedocs.io/en/latest/development/using_pandas_on_ray.html) - [PandasOnDask](https://modin.readthedocs.io/en/latest/development/using_pandas_on_dask.html) - [PandasOnMPI through unidist](https://modin.readthedocs.io/en/latest/development/using_pandas_on_mpi.html) -- [HdkOnNative](https://modin.readthedocs.io/en/latest/development/using_hdk.html) ## Creating a development environment @@ -34,23 +33,6 @@ to install dependencies needed to run notebooks with Modin on `PandasOnUnidist` **Note:** Sometimes pip is installing every version of a package. If you encounter that issue, please install every package listed in `requirements.txt` file individually with `pip install `. -To get required dependencies for `HdkOnNative` Jupyter Notebooks -you should create a development environment with `conda` -using `jupyter_hdk_env.yml` file located in the respective directory: - -```bash -conda config --set channel_priority strict -conda env create -f execution/hdk_on_native/jupyter_hdk_env.yml -``` - -After the environment is created it needs to be activated: - -```bash -conda activate jupyter_modin_on_hdk -``` - -**Note:** `HDK` engine is available on Linux only for now. 
- ## Run Jupyter Notebooks A Jupyter Notebook server can be run from the current directory as follows: diff --git a/examples/tutorial/jupyter/execution/hdk_on_native/jupyter_hdk_env.yml b/examples/tutorial/jupyter/execution/hdk_on_native/jupyter_hdk_env.yml deleted file mode 100644 index 27c7ffab17e..00000000000 --- a/examples/tutorial/jupyter/execution/hdk_on_native/jupyter_hdk_env.yml +++ /dev/null @@ -1,6 +0,0 @@ -name: jupyter_modin_on_hdk -channels: - - conda-forge -dependencies: - - modin-hdk - - jupyter diff --git a/examples/tutorial/jupyter/execution/hdk_on_native/local/exercise_1.ipynb b/examples/tutorial/jupyter/execution/hdk_on_native/local/exercise_1.ipynb deleted file mode 100644 index 546611412f7..00000000000 --- a/examples/tutorial/jupyter/execution/hdk_on_native/local/exercise_1.ipynb +++ /dev/null @@ -1,234 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![LOGO](../../../img/MODIN_ver2_hrz.png)\n", - "\n", - "

Scale your pandas workflows by changing one line of code

\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Exercise 1: How to use Modin\n", - "\n", - "**GOAL**: Learn how to import Modin to accelerate and scale pandas workflows." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Modin is a drop-in replacement for pandas that distributes the computation \n", - "across all of the cores in your machine or in a cluster.\n", - "In practical terms, this means that you can continue using the same pandas scripts\n", - "as before and expect the behavior and results to be the same. The only thing that needs\n", - "to change is the import statement. Normally, you would change:\n", - "\n", - "```python\n", - "import pandas as pd\n", - "```\n", - "\n", - "to:\n", - "\n", - "```python\n", - "import modin.pandas as pd\n", - "```\n", - "\n", - "Changing this line of code will allow you to use all of the cores in your machine to do computation on your data. One of the major performance bottlenecks of pandas is that it only uses a single core for any given computation. Modin exposes an API that is identical to pandas, allowing you to continue interacting with your data as you would with pandas. There are no additional commands required to use Modin locally. Partitioning, scheduling, data transfer, and other related concerns are all handled by Modin under the hood." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "

pandas on a multicore laptop  |  Modin on a multicore laptop\n
" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Concept for exercise: Dataframe constructor\n", - "\n", - "Often when playing around in pandas, it is useful to create a DataFrame with the constructor. That is where we will start.\n", - "\n", - "```python\n", - "import numpy as np\n", - "import pandas as pd\n", - "\n", - "frame_data = np.random.randint(0, 100, size=(2**10, 2**5))\n", - "df = pd.DataFrame(frame_data)\n", - "```\n", - "\n", - "When creating a dataframe from a non-distributed object, it will take extra time to partition the data for Modin. When this is happening, you will see this message:\n", - "\n", - "```\n", - "UserWarning: Distributing object. This may take some time.\n", - "```\n", - "\n", - "Modin uses Ray as an execution engine by default. Since this notebook is related to HDK, let's run examples on the HDK engine. For reaching this, we need to activate HDK either via Modin config or Modin environment variable. See more in [HDK usage](https://github.com/modin-project/modin/blob/main/docs/development/using_hdk.rst) section.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import modin.config as cfg\n", - "cfg.StorageFormat.put('hdk')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Note: Importing notebooks dependencies. Do not change this code!\n", - "import numpy as np\n", - "import pandas\n", - "import sys\n", - "import modin" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "pandas.__version__" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "modin.__version__" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Implement your answer here. You are also free to play with the size\n", - "# and shape of the DataFrame, but beware of exceeding your memory!\n", - "\n", - "import pandas as pd\n", - "\n", - "frame_data = np.random.randint(0, 100, size=(2**10, 2**5))\n", - "df = pd.DataFrame(frame_data)\n", - "\n", - "# ***** Do not change the code below! It verifies that \n", - "# ***** the exercise has been done correctly. *****\n", - "\n", - "try:\n", - " assert df is not None\n", - " assert frame_data is not None\n", - " assert isinstance(frame_data, np.ndarray)\n", - "except:\n", - " raise AssertionError(\"Don't change too much of the original code!\")\n", - "assert \"modin.pandas\" in sys.modules, \"Not quite correct. Remember the single line of code change (See above)\"\n", - "\n", - "import modin.pandas\n", - "assert pd == modin.pandas, \"Remember the single line of code change (See above)\"\n", - "assert hasattr(df, \"_query_compiler\"), \"Make sure that `df` is a modin.pandas DataFrame.\"\n", - "\n", - "print(\"Success! You only need to change one line of code!\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now that we have created a toy example for playing around with the DataFrame, let's print it out in different ways." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Concept for Exercise: Data Interaction and Printing\n", - "\n", - "When interacting with data, it is very imporant to look at different parts of the data (e.g. `df.head()`). Here we will show that you can print the modin.pandas DataFrame in the same ways you would pandas." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# When working with non-string column labels it could happen that some backend logic would try to insert a column \n", - "# with a string name to the frame, so we do add_prefix()\n", - "df = df.add_prefix(\"col\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Print the first 10 lines.\n", - "df.head(10)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "df.count()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Please move on to [Exercise 2](./exercise_2.ipynb) when you are ready**" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.10" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/examples/tutorial/jupyter/execution/hdk_on_native/local/exercise_2.ipynb b/examples/tutorial/jupyter/execution/hdk_on_native/local/exercise_2.ipynb deleted file mode 100644 index 1ee31a5099e..00000000000 --- a/examples/tutorial/jupyter/execution/hdk_on_native/local/exercise_2.ipynb +++ /dev/null @@ -1,256 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![LOGO](../../../img/MODIN_ver2_hrz.png)\n", - "\n", - "

Scale your pandas workflows by changing one line of code

\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Exercise 2: Speed improvements\n", - "\n", - "**GOAL**: Learn about common functionality that Modin speeds up by using all of your machine's cores." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Concept for Exercise: `read_csv` speedups\n", - "\n", - "The most commonly used data ingestion method used in pandas is CSV files (link to pandas survey). This concept is designed to give an idea of the kinds of speedups possible, even on a non-distributed filesystem. Modin also supports other file formats for parallel and distributed reads, which can be found in the documentation.\n", - "\n", - "We will import both Modin and pandas so that the speedups are evident.\n", - "\n", - "**Note: Rerunning the `read_csv` cells many times may result in degraded performance, depending on the memory of the machine**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import modin.pandas as pd\n", - "import pandas\n", - "import time\n", - "import modin.config as cfg\n", - "cfg.StorageFormat.put(\"hdk\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Dataset: 2015 NYC taxi trip data\n", - "\n", - "We will be using a version of this data already in S3, originally posted in this blog post: https://matthewrocklin.com/blog/work/2017/01/12/dask-dataframes\n", - "\n", - "**Size: ~200MB**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# We download data locally because currently `HdkOnNative` doesn't support read files from s3 storage.\n", - "# Note that this may take a few minutes to download.\n", - "\n", - "import urllib.request\n", - "url_path = \"https://modin-datasets.intel.com/testing/yellow_tripdata_2015-01.csv\"\n", - "urllib.request.urlretrieve(url_path, \"taxi.csv\")\n", - "path = \"taxi.csv\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## `pandas.read_csv`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "start = time.time()\n", - "\n", - "pandas_df = pandas.read_csv(path, parse_dates=[\"tpep_pickup_datetime\", \"tpep_dropoff_datetime\"])\n", - "\n", - "end = time.time()\n", - "pandas_duration = end - start\n", - "print(\"Time to read with pandas: {} seconds\".format(round(pandas_duration, 3)))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Expect pandas to take >3 minutes on EC2, longer locally\n", - "\n", - "This is a good time to chat with your neighbor\n", - "Dicussion topics\n", - "- Do you work with a large amount of data daily?\n", - "- How big is your data?\n", - "- What’s the common use case of your data?\n", - "- Do you use any big data analytics tools?\n", - "- Do you use any interactive analytics tool?\n", - "- What’s are some drawbacks of your current interative analytic tools today?" 
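The cells above time `pandas.read_csv` by hand with `time.time()`; the same wall-clock pattern can be wrapped in a small helper, much like the `measure` utility removed from `examples/docker/modin-hdk/utils.py` earlier in this diff. A minimal sketch (illustrative only; `path`, `pandas`, and `pd` are assumed to be defined as in the cells above):

```python
import time


def timed(label, func, *args, **kwargs):
    # Plain wall-clock timer mirroring the pattern used in these notebook cells.
    start = time.time()
    result = func(*args, **kwargs)
    print(f"{label}: {time.time() - start:.3f} s")
    return result


# Hypothetical usage with the objects defined in the notebook:
# pandas_df = timed("pandas.read_csv", pandas.read_csv, path,
#                   parse_dates=["tpep_pickup_datetime", "tpep_dropoff_datetime"])
# modin_df = timed("modin.read_csv", pd.read_csv, path,
#                  parse_dates=["tpep_pickup_datetime", "tpep_dropoff_datetime"])
```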
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## `modin.pandas.read_csv`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "start = time.time()\n", - "\n", - "modin_df = pd.read_csv(path, parse_dates=[\"tpep_pickup_datetime\", \"tpep_dropoff_datetime\"])\n", - "\n", - "end = time.time()\n", - "modin_duration = end - start\n", - "print(\"Time to read with Modin: {} seconds\".format(round(modin_duration, 3)))\n", - "\n", - "print(\"Modin is {}x faster than pandas at `read_csv`!\".format(round(pandas_duration / modin_duration, 2)))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Are they equals?" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "modin_df" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "pandas_df" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Concept for exercise: Groupby and aggregate\n", - "\n", - "In pandas, you can groupby and aggregate. We will groupby a column in the dataset and use count for our aggregate." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "start = time.time()\n", - "\n", - "pandas_groupby = pandas_df.groupby(by=\"total_amount\").count()\n", - "\n", - "end = time.time()\n", - "pandas_duration = end - start\n", - "\n", - "print(\"Time to groupby with pandas: {} seconds\".format(round(pandas_duration, 3)))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "start = time.time()\n", - "\n", - "modin_groupby = modin_df.groupby(by=\"total_amount\").count()\n", - "\n", - "end = time.time()\n", - "modin_duration = end - start\n", - "print(\"Time to groupby with Modin: {} seconds\".format(round(modin_duration, 3)))\n", - "\n", - "print(\"Modin is {}x faster than pandas at `groupby`!\".format(round(pandas_duration / modin_duration, 2)))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Are they equal?" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "pandas_groupby" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "modin_groupby" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Please move on to [Exercise 3](./exercise_3.ipynb)**" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.10" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/examples/tutorial/jupyter/execution/hdk_on_native/local/exercise_3.ipynb b/examples/tutorial/jupyter/execution/hdk_on_native/local/exercise_3.ipynb deleted file mode 100644 index c1d1203fd84..00000000000 --- a/examples/tutorial/jupyter/execution/hdk_on_native/local/exercise_3.ipynb +++ /dev/null @@ -1,146 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![LOGO](../../../img/MODIN_ver2_hrz.png)\n", - "\n", - "

Scale your pandas workflows by changing one line of code

\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Exercise 3: Not Implemented\n", - "\n", - "**GOAL**: Learn what happens when a function is not yet supported in Modin." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "When functionality has not yet been implemented for `HdkOnNative` execution, we default to pandas as follows\n", - "\n", - "![](../../../img/hdk_convert_to_pandas.png)\n", - "\n", - "We convert the Modin dataframe to a pyarrow.Table, perform a lazy tree execution in HDK, render it as a pyarrow.Table, convert it to pandas to perform the operation, and then convert it back to Modin when complete. These operations will have a large overhead due to the communication involved and will take longer than pandas.\n", - "\n", - "When this is happening, a warning will be given to the user to inform them that this operation will take longer than usual. For example, `DataFrame.mask` is not supported. In this case, when a user tries to use it, they will see this warning:\n", - "\n", - "```\n", - "UserWarning: `DataFrame.mask` defaulting to pandas implementation.\n", - "```\n", - "\n", - "#### Relation engine limitations\n", - "As the `HdkOnNative` execution is backed by relation algebra based DB engine, there is a certain set of limitations on operations that could be used in Modin with such an execution. For example arbitrary functions in `DataFrame.apply` are not supported as the HDK engine can't execute python callables against its tables, this means that `DataFrame.apply(python_callable)` will **always** be defaulting to pandas. \n", - "\n", - "For more info about `HdkOnNative` limitations visit the appropriate section on read-the-docs: [relation algebra limitations](https://modin.readthedocs.io/en/stable/flow/modin/experimental/core/execution/native/implementations/hdk_on_native/index.html#relational-engine-limitations).\n", - "\n", - "If your flow mainly operates with non-relational algebra operations, you should better choose non-HDK execution (for example, `PandasOnRay`)." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Concept for exercise: Default to pandas\n", - "\n", - "In this section of the exercise we will see first-hand how the runtime is affected by operations that are not implemented." 
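Modin reports the fallback described above through the standard `warnings` machinery, so it can also be captured programmatically. A minimal sketch (assuming an execution where the chosen operation really does default to pandas; under other engines `mask` may be handled natively and emit nothing):

```python
import warnings

import modin.pandas as pd

df = pd.DataFrame({"a": [1, 2, 3]})

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    df.mask(df < 2)  # the operation the notebook uses to demonstrate the fallback
    fallback_msgs = [
        str(w.message) for w in caught if "defaulting to pandas" in str(w.message)
    ]

print(fallback_msgs)
```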
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import modin.pandas as pd\n", - "import pandas\n", - "import numpy as np\n", - "import time\n", - "import modin.config as cfg\n", - "cfg.StorageFormat.put(\"hdk\")\n", - "\n", - "frame_data = np.random.randint(0, 100, size=(2**18, 2**8))\n", - "df = pd.DataFrame(frame_data)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "pandas_df = pandas.DataFrame(frame_data)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "modin_start = time.time()\n", - "\n", - "print(df.mask(df < 50))\n", - "\n", - "modin_end = time.time()\n", - "print(\"Modin mask took {} seconds.\".format(round(modin_end - modin_start, 4)))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "pandas_start = time.time()\n", - "\n", - "print(pandas_df.mask(pandas_df < 50))\n", - "\n", - "pandas_end = time.time()\n", - "print(\"pandas mask took {} seconds.\".format(round(pandas_end - pandas_start, 4)))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## To request a feature please open an issue: https://github.com/modin-project/modin/issues\n", - "\n", - "For a complete list of what is implemented, see the [Supported APIs](https://modin.readthedocs.io/en/latest/supported_apis/index.html) section." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - " " - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.12" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/examples/tutorial/jupyter/execution/hdk_on_native/test/test_notebooks.py b/examples/tutorial/jupyter/execution/hdk_on_native/test/test_notebooks.py deleted file mode 100644 index c406998ec43..00000000000 --- a/examples/tutorial/jupyter/execution/hdk_on_native/test/test_notebooks.py +++ /dev/null @@ -1,67 +0,0 @@ -# Licensed to Modin Development Team under one or more contributor license agreements. -# See the NOTICE file distributed with this work for additional information regarding -# copyright ownership. The Modin Development Team licenses this file to you under the -# Apache License, Version 2.0 (the "License"); you may not use this file except in -# compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under -# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific language -# governing permissions and limitations under the License. - -import os -import sys - -import nbformat - -MODIN_DIR = os.path.abspath( - os.path.join(os.path.dirname(__file__), *[".." 
for _ in range(6)]) -) -sys.path.insert(0, MODIN_DIR) -from examples.tutorial.jupyter.execution.test.utils import ( # noqa: E402 - _execute_notebook, - _replace_str, -) - -local_notebooks_dir = "examples/tutorial/jupyter/execution/hdk_on_native/local" - - -# in this notebook user should replace 'import pandas as pd' with -# 'import modin.pandas as pd' to make notebook work -def test_exercise_1(): - modified_notebook_path = os.path.join(local_notebooks_dir, "exercise_1_test.ipynb") - nb = nbformat.read( - os.path.join(local_notebooks_dir, "exercise_1.ipynb"), - as_version=nbformat.NO_CONVERT, - ) - - _replace_str(nb, "import pandas as pd", "import modin.pandas as pd") - - nbformat.write(nb, modified_notebook_path) - _execute_notebook(modified_notebook_path) - - -# this notebook works "as is" -def test_exercise_2(): - modified_notebook_path = os.path.join(local_notebooks_dir, "exercise_2_test.ipynb") - nb = nbformat.read( - os.path.join(local_notebooks_dir, "exercise_2.ipynb"), - as_version=nbformat.NO_CONVERT, - ) - - nbformat.write(nb, modified_notebook_path) - _execute_notebook(modified_notebook_path) - - -# this notebook works "as is" -def test_exercise_3(): - modified_notebook_path = os.path.join(local_notebooks_dir, "exercise_3_test.ipynb") - nb = nbformat.read( - os.path.join(local_notebooks_dir, "exercise_3.ipynb"), - as_version=nbformat.NO_CONVERT, - ) - - nbformat.write(nb, modified_notebook_path) - _execute_notebook(modified_notebook_path) diff --git a/modin/config/__init__.py b/modin/config/__init__.py index d2f590549c3..86ae3abcb17 100644 --- a/modin/config/__init__.py +++ b/modin/config/__init__.py @@ -23,15 +23,12 @@ CpuCount, DaskThreadsPerWorker, DocModule, - DoUseCalcite, Engine, EnvironmentVariable, ExperimentalGroupbyImpl, ExperimentalNumPyAPI, GithubCI, GpuCount, - HdkFragmentSize, - HdkLaunchParameters, IsDebug, IsExperimental, IsRayCluster, @@ -85,10 +82,6 @@ # Partitioning "NPartitions", "MinPartitionSize", - # HDK specific - "HdkFragmentSize", - "DoUseCalcite", - "HdkLaunchParameters", # ASV specific "TestDatasetSize", "AsvImplementation", diff --git a/modin/config/envvars.py b/modin/config/envvars.py index d303d516b31..59f1ca265ed 100644 --- a/modin/config/envvars.py +++ b/modin/config/envvars.py @@ -175,7 +175,7 @@ class Engine(EnvironmentVariable, type=str): """Distribution engine to run queries by.""" varname = "MODIN_ENGINE" - choices = ("Ray", "Dask", "Python", "Native", "Unidist") + choices = ("Ray", "Dask", "Python", "Unidist") NOINIT_ENGINES = { "Python", @@ -226,16 +226,6 @@ def _get_default(cls) -> str: f'Please `pip install "modin[dask]"` to install compatible Dask version (>={MIN_DASK_VERSION}).' ) return "Dask" - try: - # We import ``DbWorker`` from this module since correct import of ``DbWorker`` itself - # from HDK is located in it with all the necessary options for dlopen. - from modin.experimental.core.execution.native.implementations.hdk_on_native.db_worker import ( # noqa - DbWorker, - ) - except ImportError: - pass - else: - return "Native" try: import unidist @@ -253,26 +243,6 @@ def _get_default(cls) -> str: "Please refer to installation documentation page to install an engine" ) - @classmethod - def get(cls) -> str: - """ - Get value of the Engine. - - Returns - ------- - str - """ - value = super().get() - if value == "Native": - from modin.error_message import ErrorMessage - - ErrorMessage.single_warning( - "HDK engine is deprecated and will be removed in a future version. 
" - + "Consider switching to Ray, Dask or MPI engine.", - FutureWarning, - ) - return value - @classmethod @doc(Parameter.add_option.__doc__) def add_option(cls, choice: Any) -> Any: @@ -287,7 +257,7 @@ class StorageFormat(EnvironmentVariable, type=str): varname = "MODIN_STORAGE_FORMAT" default = "Pandas" - choices = ("Pandas", "Hdk", "Cudf") + choices = ("Pandas", "Cudf") class IsExperimental(EnvironmentVariable, type=bool): @@ -462,19 +432,6 @@ def get(cls) -> int: return nparts -class HdkFragmentSize(EnvironmentVariable, type=int): - """How big a fragment in HDK should be when creating a table (in rows).""" - - varname = "MODIN_HDK_FRAGMENT_SIZE" - - -class DoUseCalcite(EnvironmentVariable, type=bool): - """Whether to use Calcite for HDK queries execution.""" - - varname = "MODIN_USE_CALCITE" - default = True - - class TestDatasetSize(EnvironmentVariable, type=str): """Dataset size for running some tests.""" @@ -663,69 +620,6 @@ class PersistentPickle(EnvironmentVariable, type=bool): default = False -class HdkLaunchParameters(EnvironmentVariable, type=dict): - """ - Additional command line options for the HDK engine. - - Please visit OmniSci documentation for the description of available parameters: - https://docs.omnisci.com/installation-and-configuration/config-parameters#configuration-parameters-for-omniscidb - """ - - varname = "MODIN_HDK_LAUNCH_PARAMETERS" - - @classmethod - def get(cls) -> dict: - """ - Get the resulted command-line options. - - Decode and merge specified command-line options with the default one. - - Returns - ------- - dict - Decoded and verified config value. - """ - custom_parameters = super().get() - result = cls._get_default().copy() - result.update( - {key.replace("-", "_"): value for key, value in custom_parameters.items()} - ) - return result - - @classmethod - def _get_default(cls) -> Any: - """ - Get default value of the config. Checks the pyhdk version and omits variables unsupported in prior versions. - - Returns - ------- - dict - Config keys and corresponding values. - """ - if (default := getattr(cls, "default", None)) is None: - cls.default = default = { - "enable_union": 1, - "enable_columnar_output": 1, - "enable_lazy_fetch": 0, - "null_div_by_zero": 1, - "enable_watchdog": 0, - "enable_thrift_logs": 0, - "enable_multifrag_execution_result": 1, - "cpu_only": 1, - } - - try: - import pyhdk - - if version.parse(pyhdk.__version__) >= version.parse("0.6.1"): - default["enable_lazy_dict_materialization"] = 0 - default["log_dir"] = "pyhdk_log" - except ImportError: - # if pyhdk is not available, do not show any additional options - pass - return default - - class MinPartitionSize(EnvironmentVariable, type=int): """ Minimum number of rows/columns in a single pandas partition split. diff --git a/modin/core/dataframe/pandas/metadata/index.py b/modin/core/dataframe/pandas/metadata/index.py index b731a99bc73..303e07ea334 100644 --- a/modin/core/dataframe/pandas/metadata/index.py +++ b/modin/core/dataframe/pandas/metadata/index.py @@ -13,7 +13,6 @@ """Module contains class ModinIndex.""" -import functools import uuid from typing import Optional @@ -104,15 +103,7 @@ def _get_default_callable(dataframe_obj, axis): ------- callable() -> tuple(pandas.Index, list[ints]) """ - # HACK: for an unknown reason, the 'lambda' approach seems to trigger some strange - # race conditions in HDK on certain versions of python, causing the tests to fail - # (python 3.9.* and 3.10.* are the versions where we saw the problem). 
That's - # really strange, but practically the same code that uses 'functools.partial' - # instead of a lambda works absolutely fine. - # return lambda: dataframe_obj._compute_axis_labels_and_lengths(axis) - return functools.partial( - type(dataframe_obj)._compute_axis_labels_and_lengths, dataframe_obj, axis - ) + return lambda: dataframe_obj._compute_axis_labels_and_lengths(axis) def maybe_specify_new_frame_ref(self, value, axis) -> "ModinIndex": """ diff --git a/modin/core/dataframe/pandas/utils.py b/modin/core/dataframe/pandas/utils.py index 5081331e986..358becd8ba2 100644 --- a/modin/core/dataframe/pandas/utils.py +++ b/modin/core/dataframe/pandas/utils.py @@ -99,7 +99,7 @@ def create_pandas_df_from_partitions( objs = iter(partition_data) partition_data = [[next(objs) for _ in range(width)] for __ in range(height)] else: - # Partitions do not always contain pandas objects, for example, hdk uses pyarrow tables. + # Partitions do not always contain pandas objects. # This implementation comes from the fact that calling `partition.get` # function is not always equivalent to `partition.to_pandas`. partition_data = [[obj.to_pandas() for obj in part] for part in partition_data] diff --git a/modin/core/execution/dispatching/factories/factories.py b/modin/core/execution/dispatching/factories/factories.py index aa94688e3df..448e693508d 100644 --- a/modin/core/execution/dispatching/factories/factories.py +++ b/modin/core/execution/dispatching/factories/factories.py @@ -783,23 +783,6 @@ def prepare(cls): # EXPERIMENTAL FACTORIES # Factories that operate only in experimental mode. They provide access to executions # that have little coverage of implemented functionality or are not stable enough. - - -@doc(_doc_factory_class, execution_name="experimental HdkOnNative") -class ExperimentalHdkOnNativeFactory(BaseFactory): - @classmethod - @doc(_doc_factory_prepare_method, io_module_name="experimental ``HdkOnNativeIO``") - def prepare(cls): - from modin.experimental.core.execution.native.implementations.hdk_on_native.io import ( - HdkOnNativeIO, - ) - - if not IsExperimental.get(): - raise ValueError("'HdkOnNative' only works in experimental mode.") - - cls.io_cls = HdkOnNativeIO - - @doc(_doc_factory_class, execution_name="cuDFOnRay") class ExperimentalCudfOnRayFactory(BaseFactory): @classmethod diff --git a/modin/core/storage_formats/base/query_compiler.py b/modin/core/storage_formats/base/query_compiler.py index 50008f261a4..f3051b07574 100644 --- a/modin/core/storage_formats/base/query_compiler.py +++ b/modin/core/storage_formats/base/query_compiler.py @@ -30,7 +30,6 @@ from pandas._typing import DtypeBackend, IndexLabel, Suffixes from pandas.core.dtypes.common import is_number, is_scalar -from modin.config import StorageFormat from modin.core.dataframe.algebra.default2pandas import ( BinaryDefault, CatDefault, @@ -6786,11 +6785,6 @@ def repartition(self, axis=None): BaseQueryCompiler The repartitioned BaseQueryCompiler. """ - if StorageFormat.get() == "Hdk": - # Hdk uses only one partition, it makes - # no sense for it to repartition the dataframe. - return self - axes = [0, 1] if axis is None else [axis] new_query_compiler = self diff --git a/modin/experimental/core/execution/native/__init__.py b/modin/experimental/core/execution/native/__init__.py deleted file mode 100644 index 75ff034bb35..00000000000 --- a/modin/experimental/core/execution/native/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -# Licensed to Modin Development Team under one or more contributor license agreements. 
-# See the NOTICE file distributed with this work for additional information regarding -# copyright ownership. The Modin Development Team licenses this file to you under the -# Apache License, Version 2.0 (the "License"); you may not use this file except in -# compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under -# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific language -# governing permissions and limitations under the License. - -"""Experimental Modin's functionality related to Native (HDK) execution engine.""" diff --git a/modin/experimental/core/execution/native/implementations/__init__.py b/modin/experimental/core/execution/native/implementations/__init__.py deleted file mode 100644 index eb987c05364..00000000000 --- a/modin/experimental/core/execution/native/implementations/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -# Licensed to Modin Development Team under one or more contributor license agreements. -# See the NOTICE file distributed with this work for additional information regarding -# copyright ownership. The Modin Development Team licenses this file to you under the -# Apache License, Version 2.0 (the "License"); you may not use this file except in -# compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under -# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific language -# governing permissions and limitations under the License. - -"""Experimental Modin's functionality related to Native execution engine and optimized for specific storage formats.""" diff --git a/modin/experimental/core/execution/native/implementations/hdk_on_native/__init__.py b/modin/experimental/core/execution/native/implementations/hdk_on_native/__init__.py deleted file mode 100644 index fee34f8554e..00000000000 --- a/modin/experimental/core/execution/native/implementations/hdk_on_native/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -# Licensed to Modin Development Team under one or more contributor license agreements. -# See the NOTICE file distributed with this work for additional information regarding -# copyright ownership. The Modin Development Team licenses this file to you under the -# Apache License, Version 2.0 (the "License"); you may not use this file except in -# compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under -# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific language -# governing permissions and limitations under the License. 
- -"""Experimental Modin's functionality related to Native execution engine and optimized for HDK storage format.""" diff --git a/modin/experimental/core/execution/native/implementations/hdk_on_native/base_worker.py b/modin/experimental/core/execution/native/implementations/hdk_on_native/base_worker.py deleted file mode 100644 index 87b91adf896..00000000000 --- a/modin/experimental/core/execution/native/implementations/hdk_on_native/base_worker.py +++ /dev/null @@ -1,312 +0,0 @@ -# Licensed to Modin Development Team under one or more contributor license agreements. -# See the NOTICE file distributed with this work for additional information regarding -# copyright ownership. The Modin Development Team licenses this file to you under the -# Apache License, Version 2.0 (the "License"); you may not use this file except in -# compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under -# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific language -# governing permissions and limitations under the License. - -"""Module provides ``BaseDbWorker`` class.""" - -import abc -import uuid -from typing import List, Tuple - -import numpy as np -import pyarrow as pa - -from modin.error_message import ErrorMessage - -_UINT_TO_INT_MAP = { - pa.uint8(): pa.int16(), - pa.uint16(): pa.int32(), - pa.uint32(): pa.int64(), - pa.uint64(): pa.int64(), # May cause overflow -} - - -class DbTable(abc.ABC): - """ - Base class, representing a table in the HDK database. - - Attributes - ---------- - name : str - Table name. - """ - - @property - @abc.abstractmethod - def shape(self) -> Tuple[int, int]: - """ - Return a tuple with the number of rows and columns. - - Returns - ------- - tuple of int - """ - pass - - @property - @abc.abstractmethod - def column_names(self) -> List[str]: - """ - Return a list of the table column names. - - Returns - ------- - tuple of str - """ - pass - - @abc.abstractmethod - def to_arrow(self) -> pa.Table: - """ - Convert this table to arrow. - - Returns - ------- - pyarrow.Table - """ - pass - - def __len__(self): - """ - Return the number of rows in the table. - - Returns - ------- - int - """ - return self.shape[0] - - -class BaseDbWorker(abc.ABC): - """Base class for HDK storage format based execution engine .""" - - @classmethod - @abc.abstractmethod - def dropTable(cls, name): - """ - Drops table with the specified name. - - Parameters - ---------- - name : str - A table to drop. - """ - pass - - @classmethod - @abc.abstractmethod - def executeDML(cls, query): - """ - Execute DML SQL query. - - Parameters - ---------- - query : str - SQL query. - - Returns - ------- - DbTable - Execution result. - """ - pass - - @classmethod - @abc.abstractmethod - def executeRA(cls, query): - """ - Execute calcite query. - - Parameters - ---------- - query : str - Serialized calcite query. - - Returns - ------- - DbTable - Execution result. - """ - pass - - @classmethod - def _genName(cls, name): - """ - Generate or mangle a table name. - - Parameters - ---------- - name : str or None - Table name to mangle or None to generate a unique - table name. - - Returns - ------- - str - Table name. 
- """ - if not name: - name = "frame_" + str(uuid.uuid4()).replace("-", "") - # TODO: reword name in case of caller's mistake - return name - - @classmethod - def cast_to_compatible_types(cls, table, cast_dict): - """ - Cast PyArrow table to be fully compatible with HDK. - - Parameters - ---------- - table : pyarrow.Table - Source table. - cast_dict : bool - Cast dictionary columns to string. - - Returns - ------- - pyarrow.Table - Table with fully compatible types with HDK. - """ - schema = table.schema - new_schema = schema - need_cast = False - uint_to_int_cast = False - - for i, field in enumerate(schema): - if pa.types.is_dictionary(field.type): - value_type = field.type.value_type - # Conversion for dictionary of null type to string is not supported - # in Arrow. Build new column for this case for now. - if pa.types.is_null(value_type): - mask = np.full(table.num_rows, True, dtype=bool) - new_col_data = np.empty(table.num_rows, dtype=str) - new_col = pa.array(new_col_data, pa.string(), mask) - new_field = pa.field( - field.name, pa.string(), field.nullable, field.metadata - ) - table = table.set_column(i, new_field, new_col) - elif pa.types.is_string(value_type): - if cast_dict: - need_cast = True - new_field = pa.field( - field.name, pa.string(), field.nullable, field.metadata - ) - else: - new_field = field - else: - new_field, int_cast = cls._convert_field(field, value_type) - need_cast = True - uint_to_int_cast = uint_to_int_cast or int_cast - if new_field == field: - new_field = pa.field( - field.name, - value_type, - field.nullable, - field.metadata, - ) - new_schema = new_schema.set(i, new_field) - else: - new_field, int_cast = cls._convert_field(field, field.type) - need_cast = need_cast or new_field is not field - uint_to_int_cast = uint_to_int_cast or int_cast - new_schema = new_schema.set(i, new_field) - - # Such cast may affect the data, so we have to raise a warning about it - if uint_to_int_cast: - ErrorMessage.single_warning( - "HDK does not support unsigned integer types, such types will be rounded up to the signed equivalent." - ) - - if need_cast: - try: - table = table.cast(new_schema) - except pa.lib.ArrowInvalid as err: - raise (OverflowError if uint_to_int_cast else RuntimeError)( - "An error occurred when trying to convert unsupported by HDK 'dtypes' " - + f"to the supported ones, the schema to cast was: \n{new_schema}." - ) from err - - return table - - @staticmethod - def _convert_field(field, field_type): - """ - Convert the specified arrow field, if required. - - Parameters - ---------- - field : pyarrow.Field - field_type : pyarrow.DataType - - Returns - ------- - Tuple[pyarrow.Field, boolean] - A tuple, containing (new_field, uint_to_int_cast) - """ - if pa.types.is_date(field_type): - # Arrow's date is the number of days since the UNIX-epoch, so we can convert it - # to a timestamp[s] (number of seconds since the UNIX-epoch) without losing precision - return ( - pa.field(field.name, pa.timestamp("s"), field.nullable, field.metadata), - False, - ) - elif pa.types.is_unsigned_integer(field_type): - # HDK doesn't support unsigned types - return ( - pa.field( - field.name, - _UINT_TO_INT_MAP[field_type], - field.nullable, - field.metadata, - ), - True, - ) - return field, False - - @classmethod - @abc.abstractmethod - def import_arrow_table(cls, table, name=None): - """ - Import Arrow table to the worker. - - Parameters - ---------- - table : pyarrow.Table - A table to import. - name : str, optional - A table name to use. None to generate a unique name. 
- - Returns - ------- - DbTable - Imported table. - """ - pass - - @classmethod - def import_pandas_dataframe(cls, df, name=None): - """ - Import ``pandas.DataFrame`` to the worker. - - Parameters - ---------- - df : pandas.DataFrame - A frame to import. - name : str, optional - A table name to use. None to generate a unique name. - - Returns - ------- - DbTable - Imported table. - """ - return cls.import_arrow_table(pa.Table.from_pandas(df), name=name) diff --git a/modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_algebra.py b/modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_algebra.py deleted file mode 100644 index 99ff2afdf54..00000000000 --- a/modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_algebra.py +++ /dev/null @@ -1,381 +0,0 @@ -# Licensed to Modin Development Team under one or more contributor license agreements. -# See the NOTICE file distributed with this work for additional information regarding -# copyright ownership. The Modin Development Team licenses this file to you under the -# Apache License, Version 2.0 (the "License"); you may not use this file except in -# compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under -# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific language -# governing permissions and limitations under the License. - -""" -Module provides classes for relational algebra expressions. - -Provided classes reflect relational algebra format used by -HDK storage format. -""" - -import abc - -from .dataframe.utils import ColNameCodec -from .db_worker import DbTable -from .expr import BaseExpr - - -class CalciteInputRefExpr(BaseExpr): - """ - Calcite version of input column reference. - - Calcite translation should replace all ``InputRefExpr``. - - Calcite references columns by their indexes (positions in input table). - If there are multiple input tables for Calcite node, then a position - in a concatenated list of all columns is used. - - Parameters - ---------- - idx : int - Input column index. - - Attributes - ---------- - input : int - Input column index. - """ - - def __init__(self, idx): - self.input = idx - - def copy(self): - """ - Make a shallow copy of the expression. - - Returns - ------- - CalciteInputRefExpr - """ - return CalciteInputRefExpr(self.input) - - def __repr__(self): - """ - Return a string representation of the expression. - - Returns - ------- - str - """ - return f"(input {self.input})" - - -class CalciteInputIdxExpr(BaseExpr): - """ - Basically the same as ``CalciteInputRefExpr`` but with a different serialization. - - Parameters - ---------- - idx : int - Input column index. - - Attributes - ---------- - input : int - Input column index. - """ - - def __init__(self, idx): - self.input = idx - - def copy(self): - """ - Make a shallow copy of the expression. - - Returns - ------- - CalciteInputIdxExpr - """ - return CalciteInputIdxExpr(self.input) - - def __repr__(self): - """ - Return a string representation of the expression. - - Returns - ------- - str - """ - return f"(input_idx {self.input})" - - -class CalciteBaseNode(abc.ABC): - """ - A base class for a Calcite computation sequence node. 
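The `cast_to_compatible_types` helper removed above widens unsigned integer columns to the next signed Arrow type (per `_UINT_TO_INT_MAP`, with `uint64` clamped to `int64` and therefore able to overflow). A standalone PyArrow sketch of that widening cast, not taken from the deleted module:

```python
import pyarrow as pa

# uint8 values survive the widening to int16 without loss.
table = pa.table({"u8": pa.array([0, 200, 255], type=pa.uint8())})
widened = table.cast(pa.schema([pa.field("u8", pa.int16())]))

print(widened.schema)                     # u8: int16
print(widened.column("u8").to_pylist())   # [0, 200, 255]
```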
- - Calcite nodes are not combined into a tree but usually stored - in a sequence which works similar to a stack machine: the result - of the previous operation is an implicit operand of the current - one. Input nodes also can be referenced directly via its unique - ID number. - - Calcite nodes structure is based on a JSON representation used by - HDK for parsed queries serialization/deserialization for - interactions with a Calcite server. Currently, this format is - internal and is not a part of public API. It's not documented - and can be modified in an incompatible way in the future. - - Parameters - ---------- - relOp : str - An operation name. - - Attributes - ---------- - id : int - Id of the node. Should be unique within a single query. - relOp : str - Operation name. - """ - - _next_id = [0] - - def __init__(self, relOp): - self.id = str(type(self)._next_id[0]) - type(self)._next_id[0] += 1 - self.relOp = relOp - - @classmethod - def reset_id(cls, next_id=0): - """ - Reset ID to be used for the next new node to `next_id`. - - Can be used to have a zero-based numbering for each - generated query. - - Parameters - ---------- - next_id : int, default: 0 - Next node id. - """ - cls._next_id[0] = next_id - - -class CalciteScanNode(CalciteBaseNode): - """ - A node to represent a scan operation. - - Scan operation can only be applied to physical tables. - - Parameters - ---------- - modin_frame : HdkOnNativeDataframe - A frame to scan. The frame should have a materialized table - in HDK. - - Attributes - ---------- - table : list of str - A list holding a database name and a table name. - fieldNames : list of str - A list of columns to include into the scan. - inputs : list - An empty list existing for the sake of serialization - simplicity. Has no meaning but is expected by HDK - deserializer. - """ - - def __init__(self, modin_frame): - assert modin_frame._partitions is not None - table = modin_frame._partitions[0][0].get() - assert isinstance(table, DbTable) - super(CalciteScanNode, self).__init__("EnumerableTableScan") - self.table = ["hdk", table.name] - self.fieldNames = [ - ColNameCodec.encode(col) for col in modin_frame._table_cols - ] + ["rowid"] - # HDK expects from scan node to have 'inputs' field - # holding empty list - self.inputs = [] - - -class CalciteProjectionNode(CalciteBaseNode): - """ - A node to represent a projection operation. - - Parameters - ---------- - fields : list of str - Output column names. - exprs : list of BaseExpr - Output column expressions. - - Attributes - ---------- - fields : list of str - A list of output columns. - exprs : list of BaseExpr - A list of expressions describing how output columns are computed. - Order of expression follows `fields` order. - """ - - def __init__(self, fields, exprs): - super(CalciteProjectionNode, self).__init__("LogicalProject") - self.fields = [ColNameCodec.encode(field) for field in fields] - self.exprs = exprs - - -class CalciteFilterNode(CalciteBaseNode): - """ - A node to represent a filter operation. - - Parameters - ---------- - condition : BaseExpr - A filtering condition. - - Attributes - ---------- - condition : BaseExpr - A filter to apply. - """ - - def __init__(self, condition): - super(CalciteFilterNode, self).__init__("LogicalFilter") - self.condition = condition - - -class CalciteAggregateNode(CalciteBaseNode): - """ - A node to represent an aggregate operation. - - Parameters - ---------- - fields : list of str - Output field names. - group : list of CalciteInputIdxExpr - Group key columns. 
- aggs : list of BaseExpr - Aggregates to compute. - - Attributes - ---------- - fields : list of str - Output field names. - group : list of CalciteInputIdxExpr - Group key columns. - aggs : list of BaseExpr - Aggregates to compute. - """ - - def __init__(self, fields, group, aggs): - super(CalciteAggregateNode, self).__init__("LogicalAggregate") - self.fields = [ColNameCodec.encode(field) for field in fields] - self.group = group - self.aggs = aggs - - -class CalciteCollation: - """ - A structure to describe sorting order. - - Parameters - ---------- - field : CalciteInputIdxExpr - A column to sort by. - dir : {"ASCENDING", "DESCENDING"}, default: "ASCENDING" - A sort order. - nulls : {"LAST", "FIRST"}, default: "LAST" - NULLs position after the sort. - - Attributes - ---------- - field : CalciteInputIdxExpr - A column to sort by. - dir : {"ASCENDING", "DESCENDING"} - A sort order. - nulls : {"LAST", "FIRST"} - NULLs position after the sort. - """ - - def __init__(self, field, dir="ASCENDING", nulls="LAST"): - self.field = field - self.direction = dir - self.nulls = nulls - - -class CalciteSortNode(CalciteBaseNode): - """ - A node to represent a sort operation. - - Parameters - ---------- - collation : list of CalciteCollation - Sort keys. - - Attributes - ---------- - collation : list of CalciteCollation - Sort keys. - """ - - def __init__(self, collation): - super(CalciteSortNode, self).__init__("LogicalSort") - self.collation = collation - - -class CalciteJoinNode(CalciteBaseNode): - """ - A node to represent a join operation. - - Parameters - ---------- - left_id : int - ID of the left join operand. - right_id : int - ID of the right join operand. - how : str - Type of the join. - condition : BaseExpr - Join condition. - - Attributes - ---------- - inputs : list of int - IDs of the left and the right operands of the join. - joinType : str - Type of the join. - condition : BaseExpr - Join condition. - """ - - def __init__(self, left_id, right_id, how, condition): - super(CalciteJoinNode, self).__init__("LogicalJoin") - self.inputs = [left_id, right_id] - self.joinType = how - self.condition = condition - - -class CalciteUnionNode(CalciteBaseNode): - """ - A node to represent a union operation. - - Parameters - ---------- - inputs : list of int - Input frame IDs. - all : bool - True for UNION ALL operation. - - Attributes - ---------- - inputs : list of int - Input frame IDs. - all : bool - True for UNION ALL operation. - """ - - def __init__(self, inputs, all): - super(CalciteUnionNode, self).__init__("LogicalUnion") - self.inputs = inputs - self.all = all diff --git a/modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_builder.py b/modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_builder.py deleted file mode 100644 index 08ed41e6104..00000000000 --- a/modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_builder.py +++ /dev/null @@ -1,1168 +0,0 @@ -# Licensed to Modin Development Team under one or more contributor license agreements. -# See the NOTICE file distributed with this work for additional information regarding -# copyright ownership. The Modin Development Team licenses this file to you under the -# Apache License, Version 2.0 (the "License"); you may not use this file except in -# compliance with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under -# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific language -# governing permissions and limitations under the License. - -"""Module provides ``CalciteBuilder`` class.""" -from collections import abc - -import pandas -from pandas.core.dtypes.common import _get_dtype, is_bool_dtype - -from .calcite_algebra import ( - CalciteAggregateNode, - CalciteBaseNode, - CalciteCollation, - CalciteFilterNode, - CalciteInputIdxExpr, - CalciteInputRefExpr, - CalciteJoinNode, - CalciteProjectionNode, - CalciteScanNode, - CalciteSortNode, - CalciteUnionNode, -) -from .dataframe.utils import ColNameCodec -from .df_algebra import ( - FilterNode, - FrameNode, - GroupbyAggNode, - JoinNode, - MaskNode, - SortNode, - TransformNode, - UnionNode, -) -from .expr import ( - AggregateExpr, - InputRefExpr, - LiteralExpr, - OpExpr, - _quantile_agg_dtype, - build_if_then_else, - build_row_idx_filter_expr, -) - - -class CalciteBuilder: - """Translator used to transform ``DFAlgNode`` tree into a calcite node sequence.""" - - class CompoundAggregate: - """ - A base class for a compound aggregate translation. - - Translation is done in three steps. Step 1 is an additional - values generation using a projection. Step 2 is a generation - of aggregates that will be later used for a compound aggregate - value computation. Step 3 is a final aggregate value generation - using another projection. - - Parameters - ---------- - builder : CalciteBuilder - A builder to use for translation. - arg : BaseExpr or List of BaseExpr - An aggregated values. - """ - - def __init__(self, builder, arg): - self._builder = builder - self._arg = arg - - def gen_proj_exprs(self): - """ - Generate values required for intermediate aggregates computation. - - Returns - ------- - dict - New column expressions mapped to their names. - """ - return [] - - def gen_agg_exprs(self): - """ - Generate intermediate aggregates required for a compound aggregate computation. - - Returns - ------- - dict - New aggregate expressions mapped to their names. - """ - pass - - def gen_reduce_expr(self): - """ - Generate an expression for a compound aggregate. - - Returns - ------- - BaseExpr - A final compound aggregate expression. - """ - pass - - class CompoundAggregateWithColArg(CompoundAggregate): - """ - A base class for a compound aggregate that require a `LiteralExpr` column argument. - - This aggregate requires 2 arguments. The first argument is an `InputRefExpr`, - refering to the aggregation column. The second argument is a `LiteralExpr`, - this expression is added into the frame as a new column. - - Parameters - ---------- - agg : str - Aggregate name. - builder : CalciteBuilder - A builder to use for translation. - arg : List of BaseExpr - Aggregate arguments. - dtype : dtype, optional - Aggregate data type. If not specified, `_dtype` from the first argument is used. 
- """ - - def __init__(self, agg, builder, arg, dtype=None): - assert isinstance(arg[0], InputRefExpr) - assert isinstance(arg[1], LiteralExpr) - super().__init__(builder, arg) - self._agg = agg - self._agg_column = f"{arg[0].column}__{agg}__" - self._dtype = dtype or arg[0]._dtype - - def gen_proj_exprs(self): - return {self._agg_column: self._arg[1]} - - def gen_agg_exprs(self): - frame = self._arg[0].modin_frame - return { - self._agg_column: AggregateExpr( - self._agg, - [ - self._builder._ref_idx(frame, self._arg[0].column), - self._builder._ref_idx(frame, self._agg_column), - ], - dtype=self._dtype, - ) - } - - def gen_reduce_expr(self): - return self._builder._ref(self._arg[0].modin_frame, self._agg_column) - - class StdAggregate(CompoundAggregate): - """ - A sample standard deviation aggregate generator. - - Parameters - ---------- - builder : CalciteBuilder - A builder to use for translation. - arg : list of BaseExpr - An aggregated value. - """ - - def __init__(self, builder, arg): - assert isinstance(arg[0], InputRefExpr) - super().__init__(builder, arg[0]) - - self._quad_name = self._arg.column + "__quad__" - self._sum_name = self._arg.column + "__sum__" - self._quad_sum_name = self._arg.column + "__quad_sum__" - self._count_name = self._arg.column + "__count__" - - def gen_proj_exprs(self): - """ - Generate values required for intermediate aggregates computation. - - Returns - ------- - dict - New column expressions mapped to their names. - """ - expr = self._builder._translate(self._arg.mul(self._arg)) - return {self._quad_name: expr} - - def gen_agg_exprs(self): - """ - Generate intermediate aggregates required for a compound aggregate computation. - - Returns - ------- - dict - New aggregate expressions mapped to their names. - """ - count_expr = self._builder._translate(AggregateExpr("count", self._arg)) - sum_expr = self._builder._translate(AggregateExpr("sum", self._arg)) - self._sum_dtype = sum_expr._dtype - qsum_expr = AggregateExpr( - "SUM", - self._builder._ref_idx(self._arg.modin_frame, self._quad_name), - dtype=sum_expr._dtype, - ) - - return { - self._sum_name: sum_expr, - self._quad_sum_name: qsum_expr, - self._count_name: count_expr, - } - - def gen_reduce_expr(self): - """ - Generate an expression for a compound aggregate. - - Returns - ------- - BaseExpr - A final compound aggregate expression. - """ - count_expr = self._builder._ref(self._arg.modin_frame, self._count_name) - count_expr._dtype = _get_dtype(int) - sum_expr = self._builder._ref(self._arg.modin_frame, self._sum_name) - sum_expr._dtype = self._sum_dtype - qsum_expr = self._builder._ref(self._arg.modin_frame, self._quad_sum_name) - qsum_expr._dtype = self._sum_dtype - - null_expr = LiteralExpr(None) - count_or_null = build_if_then_else( - count_expr.eq(LiteralExpr(0)), null_expr, count_expr, count_expr._dtype - ) - count_m_1_or_null = build_if_then_else( - count_expr.eq(LiteralExpr(1)), - null_expr, - count_expr.sub(LiteralExpr(1)), - count_expr._dtype, - ) - - # sqrt((sum(x * x) - sum(x) * sum(x) / n) / (n - 1)) - return ( - qsum_expr.sub(sum_expr.mul(sum_expr).truediv(count_or_null)) - .truediv(count_m_1_or_null) - .pow(LiteralExpr(0.5)) - ) - - class SkewAggregate(CompoundAggregate): - """ - An unbiased skew aggregate generator. - - Parameters - ---------- - builder : CalciteBuilder - A builder to use for translation. - arg : list of BaseExpr - An aggregated value. 
- """ - - def __init__(self, builder, arg): - assert isinstance(arg[0], InputRefExpr) - super().__init__(builder, arg[0]) - - self._quad_name = self._arg.column + "__quad__" - self._cube_name = self._arg.column + "__cube__" - self._sum_name = self._arg.column + "__sum__" - self._quad_sum_name = self._arg.column + "__quad_sum__" - self._cube_sum_name = self._arg.column + "__cube_sum__" - self._count_name = self._arg.column + "__count__" - - def gen_proj_exprs(self): - """ - Generate values required for intermediate aggregates computation. - - Returns - ------- - dict - New column expressions mapped to their names. - """ - quad_expr = self._builder._translate(self._arg.mul(self._arg)) - cube_expr = self._builder._translate( - self._arg.mul(self._arg).mul(self._arg) - ) - return {self._quad_name: quad_expr, self._cube_name: cube_expr} - - def gen_agg_exprs(self): - """ - Generate intermediate aggregates required for a compound aggregate computation. - - Returns - ------- - dict - New aggregate expressions mapped to their names. - """ - count_expr = self._builder._translate(AggregateExpr("count", self._arg)) - sum_expr = self._builder._translate(AggregateExpr("sum", self._arg)) - self._sum_dtype = sum_expr._dtype - qsum_expr = AggregateExpr( - "SUM", - self._builder._ref_idx(self._arg.modin_frame, self._quad_name), - dtype=sum_expr._dtype, - ) - csum_expr = AggregateExpr( - "SUM", - self._builder._ref_idx(self._arg.modin_frame, self._cube_name), - dtype=sum_expr._dtype, - ) - - return { - self._sum_name: sum_expr, - self._quad_sum_name: qsum_expr, - self._cube_sum_name: csum_expr, - self._count_name: count_expr, - } - - def gen_reduce_expr(self): - """ - Generate an expression for a compound aggregate. - - Returns - ------- - BaseExpr - A final compound aggregate expression. - """ - count_expr = self._builder._ref(self._arg.modin_frame, self._count_name) - count_expr._dtype = _get_dtype(int) - sum_expr = self._builder._ref(self._arg.modin_frame, self._sum_name) - sum_expr._dtype = self._sum_dtype - qsum_expr = self._builder._ref(self._arg.modin_frame, self._quad_sum_name) - qsum_expr._dtype = self._sum_dtype - csum_expr = self._builder._ref(self._arg.modin_frame, self._cube_sum_name) - csum_expr._dtype = self._sum_dtype - - mean_expr = sum_expr.truediv(count_expr) - - # n * sqrt(n - 1) / (n - 2) - # * (sum(x ** 3) - 3 * mean * sum(x * x) + 2 * mean * mean * sum(x)) - # / (sum(x * x) - mean * sum(x)) ** 1.5 - part1 = count_expr.mul( - count_expr.sub(LiteralExpr(1)).pow(LiteralExpr(0.5)) - ).truediv(count_expr.sub(LiteralExpr(2))) - part2 = csum_expr.sub(mean_expr.mul(qsum_expr).mul(LiteralExpr(3.0))).add( - mean_expr.mul(mean_expr).mul(sum_expr).mul(LiteralExpr(2.0)) - ) - part3 = qsum_expr.sub(mean_expr.mul(sum_expr)).pow(LiteralExpr(1.5)) - skew_expr = part1.mul(part2).truediv(part3) - - # The result is NULL if n <= 2 - return build_if_then_else( - count_expr.le(LiteralExpr(2)), - LiteralExpr(None), - skew_expr, - skew_expr._dtype, - ) - - class TopkAggregate(CompoundAggregateWithColArg): - """ - A TOP_K aggregate generator. - - Parameters - ---------- - builder : CalciteBuilder - A builder to use for translation. - arg : List of BaseExpr - An aggregated values. - """ - - def __init__(self, builder, arg): - super().__init__("TOP_K", builder, arg) - - def gen_reduce_expr(self): - return OpExpr( - "PG_UNNEST", - [super().gen_reduce_expr()], - self._dtype, - ) - - class QuantileAggregate(CompoundAggregateWithColArg): - """ - A QUANTILE aggregate generator. 
- - Parameters - ---------- - builder : CalciteBuilder - A builder to use for translation. - arg : List of BaseExpr - A list of 3 values: - 0. InputRefExpr - the column to compute the quantiles for. - 1. LiteralExpr - the quantile value. - 2. str - the interpolation method to use. - """ - - def __init__(self, builder, arg): - super().__init__( - "QUANTILE", - builder, - arg, - _quantile_agg_dtype(arg[0]._dtype), - ) - self._interpolation = arg[2].val.upper() - - def gen_agg_exprs(self): - exprs = super().gen_agg_exprs() - for expr in exprs.values(): - expr.interpolation = self._interpolation - return exprs - - _compound_aggregates = { - "std": StdAggregate, - "skew": SkewAggregate, - "nlargest": TopkAggregate, - "nsmallest": TopkAggregate, - "quantile": QuantileAggregate, - } - - class InputContext: - """ - A class to track current input frames and corresponding nodes. - - Used to translate input column references to numeric indices. - - Parameters - ---------- - input_frames : list of DFAlgNode - Input nodes of the currently translated node. - input_nodes : list of CalciteBaseNode - Translated input nodes. - - Attributes - ---------- - input_nodes : list of CalciteBaseNode - Input nodes of the currently translated node. - frame_to_node : dict - Maps input frames to corresponding calcite nodes. - input_offsets : dict - Maps input frame to an input index used for its first column. - replacements : dict - Maps input frame to a new list of columns to use. Used when - a single `DFAlgNode` is lowered into multiple computation - steps, e.g. for compound aggregates requiring additional - projections. - """ - - _simple_aggregates = { - "sum": "SUM", - "mean": "AVG", - "max": "MAX", - "min": "MIN", - "size": "COUNT", - "count": "COUNT", - } - _no_arg_aggregates = {"size"} - - def __init__(self, input_frames, input_nodes): - self.input_nodes = input_nodes - self.frame_to_node = {x: y for x, y in zip(input_frames, input_nodes)} - self.input_offsets = {} - self.replacements = {} - offs = 0 - for frame in input_frames: - self.input_offsets[frame] = offs - offs += len(frame._table_cols) - # Materialized frames have additional 'rowid' column - if isinstance(frame._op, FrameNode): - offs += 1 - - def replace_input_node(self, frame, node, new_cols): - """ - Use `node` as an input node for references to columns of `frame`. - - Parameters - ---------- - frame : DFAlgNode - Replaced input frame. - node : CalciteBaseNode - A new node to use. - new_cols : list of str - A new columns list to use. - """ - self.replacements[frame] = new_cols - - def _idx(self, frame, col): - """ - Get a numeric input index for an input column. - - Parameters - ---------- - frame : DFAlgNode - An input frame. - col : str - An input column. - - Returns - ------- - int - """ - assert ( - frame in self.input_offsets - ), f"unexpected reference to {frame.id_str()}" - - offs = self.input_offsets[frame] - - if frame in self.replacements: - return self.replacements[frame].index(col) + offs - - if col == ColNameCodec.ROWID_COL_NAME: - if not isinstance(self.frame_to_node[frame], CalciteScanNode): - raise NotImplementedError( - "rowid can be accessed in materialized frames only" - ) - return len(frame._table_cols) + offs - - assert ( - col in frame._table_cols - ), f"unexpected reference to '{col}' in {frame.id_str()}" - return frame._table_cols.index(col) + offs - - def ref(self, frame, col): - """ - Translate input column into ``CalciteInputRefExpr``. - - Parameters - ---------- - frame : DFAlgNode - An input frame. 
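
The offset bookkeeping in InputContext.__init__ above assigns every input frame a contiguous range of column indices, with one extra slot reserved for the virtual 'rowid' column of materialized (FrameNode) inputs. A toy sketch of that computation, with frames reduced to (columns, is_materialized) pairs purely for illustration:

def column_offsets(frames):
    """Map each frame to the index of its first column in the combined input."""
    offsets, offs = {}, 0
    for name, (cols, materialized) in frames.items():
        offsets[name] = offs
        offs += len(cols)
        if materialized:        # materialized frames expose an extra 'rowid' column
            offs += 1
    return offsets

frames = {
    "lhs": (["a", "b"], True),   # scan node: 2 columns + rowid
    "rhs": (["c"], False),       # derived node: 1 column, no rowid
}
assert column_offsets(frames) == {"lhs": 0, "rhs": 3}
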
- col : str - An input column. - - Returns - ------- - CalciteInputRefExpr - """ - return CalciteInputRefExpr(self._idx(frame, col)) - - def ref_idx(self, frame, col): - """ - Translate input column into ``CalciteInputIdxExpr``. - - Parameters - ---------- - frame : DFAlgNode - An input frame. - col : str - An input column. - - Returns - ------- - CalciteInputIdxExpr - """ - return CalciteInputIdxExpr(self._idx(frame, col)) - - def input_ids(self): - """ - Get ids of all input nodes. - - Returns - ------- - list of int - """ - return [x.id for x in self.input_nodes] - - def translate(self, expr): - """ - Translate an expression. - - Translation is done by replacing ``InputRefExpr`` with - ``CalciteInputRefExpr`` and ``CalciteInputIdxExpr``. - - Parameters - ---------- - expr : BaseExpr - An expression to translate. - - Returns - ------- - BaseExpr - Translated expression. - """ - return self._maybe_copy_and_translate_expr(expr) - - def _maybe_copy_and_translate_expr(self, expr, ref_idx=False): - """ - Translate an expression. - - Translate an expression replacing ``InputRefExpr`` with ``CalciteInputRefExpr`` - and ``CalciteInputIdxExpr``. An expression tree branches with input columns - are copied into a new tree, other branches are used as is. - - Parameters - ---------- - expr : BaseExpr - An expression to translate. - ref_idx : bool, default: False - If True then translate ``InputRefExpr`` to ``CalciteInputIdxExpr``, - use ``CalciteInputRefExr`` otherwise. - - Returns - ------- - BaseExpr - Translated expression. - """ - if isinstance(expr, InputRefExpr): - if ref_idx: - return self.ref_idx(expr.modin_frame, expr.column) - else: - return self.ref(expr.modin_frame, expr.column) - - if isinstance(expr, AggregateExpr): - expr = expr.copy() - if expr.agg in self._no_arg_aggregates: - expr.operands = [] - else: - expr.operands[0] = self._maybe_copy_and_translate_expr( - expr.operands[0], True - ) - expr.agg = self._simple_aggregates[expr.agg] - return expr - - gen = expr.nested_expressions() - for op in gen: - expr = gen.send(self._maybe_copy_and_translate_expr(op)) - return expr - - class InputContextMgr: - """ - A helper class to manage an input context stack. - - The class is designed to be used in a recursion with nested - 'with' statements. - - Parameters - ---------- - builder : CalciteBuilder - An outer builder. - input_frames : list of DFAlgNode - Input nodes for the new context. - input_nodes : list of CalciteBaseNode - Translated input nodes. - - Attributes - ---------- - builder : CalciteBuilder - An outer builder. - input_frames : list of DFAlgNode - Input nodes for the new context. - input_nodes : list of CalciteBaseNode - Translated input nodes. - """ - - def __init__(self, builder, input_frames, input_nodes): - self.builder = builder - self.input_frames = input_frames - self.input_nodes = input_nodes - - def __enter__(self): - """ - Push new input context into the input context stack. - - Returns - ------- - InputContext - New input context. - """ - self.builder._input_ctx_stack.append( - self.builder.InputContext(self.input_frames, self.input_nodes) - ) - return self.builder._input_ctx_stack[-1] - - def __exit__(self, type, value, traceback): - """ - Pop current input context. - - Parameters - ---------- - type : Any - An exception type. - value : Any - An exception value. - traceback : Any - A traceback. - """ - self.builder._input_ctx_stack.pop() - - type_strings = { - int: "INTEGER", - bool: "BOOLEAN", - } - - # The following aggregates require boolean columns to be cast. 
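
InputContextMgr above is the usual stack-via-context-manager pattern: each nested 'with' pushes a new input context and pops it on exit, so _input_ctx() always returns the innermost one during recursive translation. A minimal standalone sketch of the pattern (not Modin code):

class StackFrameMgr:
    """Push a value onto a shared stack on enter, pop it on exit."""

    def __init__(self, stack, value):
        self.stack = stack
        self.value = value

    def __enter__(self):
        self.stack.append(self.value)
        return self.value

    def __exit__(self, exc_type, exc_value, traceback):
        self.stack.pop()


stack = []
with StackFrameMgr(stack, "outer"):
    with StackFrameMgr(stack, "inner"):
        assert stack[-1] == "inner"   # the innermost context is current
    assert stack[-1] == "outer"       # restored when the inner 'with' exits
assert not stack
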
- _bool_cast_aggregates = { - "sum": _get_dtype(int), - "mean": _get_dtype(float), - "quantile": _get_dtype(float), - } - - def __init__(self): - self._input_ctx_stack = [] - self.has_join = False - self.has_groupby = False - - def build(self, op): - """ - Translate a ``DFAlgNode`` tree into a calcite nodes sequence. - - Parameters - ---------- - op : DFAlgNode - A tree to translate. - - Returns - ------- - list of CalciteBaseNode - The resulting calcite nodes sequence. - """ - CalciteBaseNode.reset_id() - self.res = [] - self._to_calcite(op) - return self.res - - def _add_projection(self, frame): - """ - Add a projection node to the resulting sequence. - - Added node simply selects all frame's columns. This method can be used - to discard a virtual 'rowid' column provided by all scan nodes. - - Parameters - ---------- - frame : HdkOnNativeDataframe - An input frame for a projection. - - Returns - ------- - CalciteProjectionNode - Created projection node. - """ - proj = CalciteProjectionNode( - frame._table_cols, [self._ref(frame, col) for col in frame._table_cols] - ) - self._push(proj) - return proj - - def _input_ctx(self): - """ - Get current input context. - - Returns - ------- - InputContext - """ - return self._input_ctx_stack[-1] - - def _set_input_ctx(self, op): - """ - Create input context manager for a node translation. - - Parameters - ---------- - op : DFAlgNode - A translated node. - - Returns - ------- - InputContextMgr - Created input context manager. - """ - input_frames = getattr(op, "input", []) - input_nodes = [self._to_calcite(x._op) for x in input_frames] - return self.InputContextMgr(self, input_frames, input_nodes) - - def _set_tmp_ctx(self, input_frames, input_nodes): - """ - Create a temporary input context manager. - - This method is deprecated. - - Parameters - ---------- - input_frames : list of DFAlgNode - Input nodes of the currently translated node. - input_nodes : list of CalciteBaseNode - Translated input nodes. - - Returns - ------- - InputContextMgr - Created input context manager. - """ - return self.InputContextMgr(self, input_frames, input_nodes) - - def _ref(self, frame, col): - """ - Translate input column into ``CalciteInputRefExpr``. - - Parameters - ---------- - frame : DFAlgNode - An input frame. - col : str - An input column. - - Returns - ------- - CalciteInputRefExpr - """ - return self._input_ctx().ref(frame, col) - - def _ref_idx(self, frame, col): - """ - Translate input column into ``CalciteInputIdxExpr``. - - Parameters - ---------- - frame : DFAlgNode - An input frame. - col : str - An input column. - - Returns - ------- - CalciteInputIdxExpr - """ - return self._input_ctx().ref_idx(frame, col) - - def _translate(self, exprs): - """ - Translate expressions. - - Translate expressions replacing ``InputRefExpr`` with ``CalciteInputRefExpr`` and - ``CalciteInputIdxExpr``. - - Parameters - ---------- - exprs : BaseExpr or list-like of BaseExpr - Expressions to translate. - - Returns - ------- - BaseExpr or list of BaseExpr - Translated expression. - """ - if isinstance(exprs, abc.Iterable): - return [self._input_ctx().translate(x) for x in exprs] - return self._input_ctx().translate(exprs) - - def _push(self, node): - """ - Append node to the resulting sequence. - - Parameters - ---------- - node : CalciteBaseNode - A node to add. 
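
The _bool_cast_aggregates table mirrors pandas semantics: sum over a boolean column behaves like an integer column, while mean (and quantile) behave like float columns, which is why boolean inputs are cast before those aggregates run. A quick pandas illustration:

import pandas as pd

s = pd.Series([True, False, True, True])
assert s.sum() == 3                        # sum treats booleans as integers
assert s.mean() == 0.75                    # mean yields a float
assert s.astype(int).sum() == s.sum()      # casting first gives the same results
assert s.astype(float).mean() == s.mean()
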
- """ - if ( - len(self.res) != 0 - and isinstance(node, CalciteProjectionNode) - and isinstance(self.res[-1], CalciteProjectionNode) - and all(isinstance(expr, CalciteInputRefExpr) for expr in node.exprs) - ): - # Replace the last CalciteProjectionNode with this one and - # translate the input refs. The `id` attribute is preserved. - last = self.res.pop() - exprs = last.exprs - last.reset_id(int(last.id)) - node = CalciteProjectionNode( - node.fields, [exprs[expr.input] for expr in node.exprs] - ) - self.res.append(node) - - def _last(self): - """ - Get the last node of the resulting calcite node sequence. - - Returns - ------- - CalciteBaseNode - """ - return self.res[-1] - - def _input_nodes(self): - """ - Get current input calcite nodes. - - Returns - ------- - list if CalciteBaseNode - """ - return self._input_ctx().input_nodes - - def _input_node(self, idx): - """ - Get an input calcite node by index. - - Parameters - ---------- - idx : int - An input node's index. - - Returns - ------- - CalciteBaseNode - """ - return self._input_nodes()[idx] - - def _input_ids(self): - """ - Get ids of the current input nodes. - - Returns - ------- - list of int - """ - return self._input_ctx().input_ids() - - def _to_calcite(self, op): - """ - Translate tree to a calcite node sequence. - - Parameters - ---------- - op : DFAlgNode - A tree to translate. - - Returns - ------- - CalciteBaseNode - The last node of the generated sequence. - """ - # This context translates input operands and setup current - # input context to translate input references (recursion - # over tree happens here). - with self._set_input_ctx(op): - if isinstance(op, FrameNode): - self._process_frame(op) - elif isinstance(op, MaskNode): - self._process_mask(op) - elif isinstance(op, GroupbyAggNode): - self._process_groupby(op) - elif isinstance(op, TransformNode): - self._process_transform(op) - elif isinstance(op, JoinNode): - self._process_join(op) - elif isinstance(op, UnionNode): - self._process_union(op) - elif isinstance(op, SortNode): - self._process_sort(op) - elif isinstance(op, FilterNode): - self._process_filter(op) - else: - raise NotImplementedError( - f"CalciteBuilder doesn't support {type(op).__name__}" - ) - return self.res[-1] - - def _process_frame(self, op): - """ - Translate ``FrameNode`` node. - - Parameters - ---------- - op : FrameNode - A frame to translate. - """ - self._push(CalciteScanNode(op.modin_frame)) - - def _process_mask(self, op): - """ - Translate ``MaskNode`` node. - - Parameters - ---------- - op : MaskNode - An operation to translate. - """ - if op.row_labels is not None: - raise NotImplementedError("row indices masking is not yet supported") - - frame = op.input[0] - - # select rows by rowid - rowid_col = self._ref(frame, ColNameCodec.ROWID_COL_NAME) - condition = build_row_idx_filter_expr(op.row_positions, rowid_col) - self._push(CalciteFilterNode(condition)) - - # mask is currently always applied over scan, it means - # we need additional projection to remove rowid column - self._add_projection(frame) - - def _process_groupby(self, op): - """ - Translate ``GroupbyAggNode`` node. - - Parameters - ---------- - op : GroupbyAggNode - An operation to translate. 
- """ - self.has_groupby = True - frame = op.input[0] - - # Aggregation's input should always be a projection and - # group key columns should always go first - proj_cols = op.by.copy() - for col in frame._table_cols: - if col not in op.by: - proj_cols.append(col) - - # Cast boolean columns, if required - agg_exprs = op.agg_exprs - cast_agg = self._bool_cast_aggregates - if any(v.agg in cast_agg for v in agg_exprs.values()) and ( - bool_cols := { - c: cast_agg[agg_exprs[c].agg] - for c, t in frame.dtypes.items() - # Do not call is_bool_dtype() for categorical since it checks all the categories - if not isinstance(t, pandas.CategoricalDtype) - and is_bool_dtype(t) - and agg_exprs[c].agg in cast_agg - } - ): - trans = self._input_ctx()._maybe_copy_and_translate_expr - proj_exprs = [ - ( - trans(frame.ref(c).cast(bool_cols[c])) - if c in bool_cols - else self._ref(frame, c) - ) - for c in proj_cols - ] - else: - proj_exprs = [self._ref(frame, col) for col in proj_cols] - # Add expressions required for compound aggregates - compound_aggs = {} - for agg, expr in agg_exprs.items(): - if expr.agg in self._compound_aggregates: - compound_aggs[agg] = self._compound_aggregates[expr.agg]( - self, expr.operands - ) - extra_exprs = compound_aggs[agg].gen_proj_exprs() - proj_cols.extend(extra_exprs.keys()) - proj_exprs.extend(extra_exprs.values()) - proj = CalciteProjectionNode(proj_cols, proj_exprs) - self._push(proj) - - self._input_ctx().replace_input_node(frame, proj, proj_cols) - - group = [self._ref_idx(frame, col) for col in op.by] - fields = op.by.copy() - aggs = [] - for agg, expr in agg_exprs.items(): - if agg in compound_aggs: - extra_aggs = compound_aggs[agg].gen_agg_exprs() - fields.extend(extra_aggs.keys()) - aggs.extend(extra_aggs.values()) - else: - fields.append(agg) - aggs.append(self._translate(expr)) - node = CalciteAggregateNode(fields, group, aggs) - self._push(node) - - if compound_aggs: - self._input_ctx().replace_input_node(frame, node, fields) - proj_cols = op.by.copy() - proj_exprs = [self._ref(frame, col) for col in proj_cols] - proj_cols.extend(agg_exprs.keys()) - for agg in agg_exprs: - if agg in compound_aggs: - proj_exprs.append(compound_aggs[agg].gen_reduce_expr()) - else: - proj_exprs.append(self._ref(frame, agg)) - proj = CalciteProjectionNode(proj_cols, proj_exprs) - self._push(proj) - - if op.groupby_opts["sort"]: - collation = [CalciteCollation(col) for col in group] - self._push(CalciteSortNode(collation)) - - def _process_transform(self, op): - """ - Translate ``TransformNode`` node. - - Parameters - ---------- - op : TransformNode - An operation to translate. - """ - fields = list(op.exprs.keys()) - exprs = self._translate(op.exprs.values()) - self._push(CalciteProjectionNode(fields, exprs)) - - def _process_join(self, op): - """ - Translate ``JoinNode`` node. - - Parameters - ---------- - op : JoinNode - An operation to translate. - """ - self.has_join = True - node = CalciteJoinNode( - left_id=self._input_node(0).id, - right_id=self._input_node(1).id, - how=op.how, - condition=self._translate(op.condition), - ) - self._push(node) - - self._push( - CalciteProjectionNode( - op.exprs.keys(), [self._translate(val) for val in op.exprs.values()] - ) - ) - - def _process_union(self, op): - """ - Translate ``UnionNode`` node. - - Parameters - ---------- - op : UnionNode - An operation to translate. - """ - self._push(CalciteUnionNode(self._input_ids(), True)) - - def _process_sort(self, op): - """ - Translate ``SortNode`` node. 
- - Parameters - ---------- - op : SortNode - An operation to translate. - """ - frame = op.input[0] - if not isinstance(self._input_node(0), CalciteProjectionNode): - proj = self._add_projection(frame) - self._input_ctx().replace_input_node(frame, proj, frame._table_cols) - - nulls = op.na_position.upper() - collations = [] - for col, asc in zip(op.columns, op.ascending): - ascending = "ASCENDING" if asc else "DESCENDING" - collations.append( - CalciteCollation(self._ref_idx(frame, col), ascending, nulls) - ) - self._push(CalciteSortNode(collations)) - - def _process_filter(self, op): - """ - Translate ``FilterNode`` node. - - Parameters - ---------- - op : FilterNode - An operation to translate. - """ - condition = self._translate(op.condition) - self._push(CalciteFilterNode(condition)) - - if isinstance(self._input_node(0), CalciteScanNode): - # if filter was applied over scan, then we need additional - # projection to remove rowid column - self._add_projection(op.input[0]) diff --git a/modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_serializer.py b/modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_serializer.py deleted file mode 100644 index 7099751dafe..00000000000 --- a/modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_serializer.py +++ /dev/null @@ -1,416 +0,0 @@ -# Licensed to Modin Development Team under one or more contributor license agreements. -# See the NOTICE file distributed with this work for additional information regarding -# copyright ownership. The Modin Development Team licenses this file to you under the -# Apache License, Version 2.0 (the "License"); you may not use this file except in -# compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under -# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific language -# governing permissions and limitations under the License. - -"""Module provides ``CalciteSerializer`` class.""" - -import json - -import numpy as np -from pandas.core.dtypes.common import is_datetime64_dtype - -from modin.error_message import ErrorMessage - -from .calcite_algebra import ( - CalciteAggregateNode, - CalciteBaseNode, - CalciteCollation, - CalciteFilterNode, - CalciteInputIdxExpr, - CalciteInputRefExpr, - CalciteJoinNode, - CalciteProjectionNode, - CalciteScanNode, - CalciteSortNode, - CalciteUnionNode, -) -from .expr import AggregateExpr, BaseExpr, LiteralExpr, OpExpr - - -def _warn_if_unsigned(dtype): # noqa: GL08 - if np.issubdtype(dtype, np.unsignedinteger): - ErrorMessage.single_warning( - "HDK does not support unsigned integer types, such types will be rounded up to the signed equivalent." - ) - - -class CalciteSerializer: - """ - Serializer for calcite node sequence. - - ``CalciteSerializer`` is used to serialize a sequence of ``CalciteBaseNode`` - based nodes including nested ``BaseExpr`` based expression trees into - a request in JSON format which can be fed to HDK. 
- """ - - _DTYPE_STRINGS = { - "int8": "TINYINT", - "int16": "SMALLINT", - "int32": "INTEGER", - "int64": "BIGINT", - "uint8": "SMALLINT", - "uint16": "INTEGER", - "uint32": "BIGINT", - "uint64": "BIGINT", - "bool": "BOOLEAN", - "float32": "FLOAT", - "float64": "DOUBLE", - "datetime64": "TIMESTAMP", - } - - _INT_OPTS = { - np.int8: ("TINYINT", 3), - np.int16: ("SMALLINT", 5), - np.int32: ("INTEGER", 10), - np.int64: ("BIGINT", 19), - np.uint8: ("SMALLINT", 5), - np.uint16: ("INTEGER", 10), - np.uint32: ("BIGINT", 19), - np.uint64: ("BIGINT", 19), - int: ("BIGINT", 19), - } - - _TIMESTAMP_PRECISION = { - "s": 0, - "ms": 3, - "us": 6, - "ns": 9, - } - _DTYPE_STRINGS.update( - {f"datetime64[{u}]": "TIMESTAMP" for u in _TIMESTAMP_PRECISION} - ) - - def serialize(self, plan): - """ - Serialize a sequence of Calcite nodes into JSON format. - - Parameters - ---------- - plan : list of CalciteBaseNode - A sequence to serialize. - - Returns - ------- - str - A query in JSON format. - """ - return json.dumps({"rels": [self.serialize_item(node) for node in plan]}) - - def expect_one_of(self, val, *types): - """ - Raise an error if values doesn't belong to any of specified types. - - Parameters - ---------- - val : Any - Value to check. - *types : list of type - Allowed value types. - """ - for t in types: - if isinstance(val, t): - return - raise TypeError("Can not serialize {}".format(type(val).__name__)) - - def serialize_item(self, item): - """ - Serialize a single expression item. - - Parameters - ---------- - item : Any - Item to serialize. - - Returns - ------- - str, int, None, dict or list of dict - Serialized item. - """ - if isinstance(item, CalciteBaseNode): - return self.serialize_node(item) - elif isinstance(item, BaseExpr): - return self.serialize_expr(item) - elif isinstance(item, CalciteCollation): - return self.serialize_obj(item) - elif isinstance(item, list): - return [self.serialize_item(v) for v in item] - elif isinstance(item, dict): - return {k: self.serialize_item(v) for k, v in item.items()} - - self.expect_one_of(item, str, int, type(None)) - return item - - def serialize_node(self, node): - """ - Serialize a frame operation. - - Parameters - ---------- - node : CalciteBaseNode - A node to serialize. - - Returns - ------- - dict - Serialized object. - """ - if isinstance( - node, - ( - CalciteScanNode, - CalciteProjectionNode, - CalciteFilterNode, - CalciteAggregateNode, - CalciteSortNode, - CalciteJoinNode, - CalciteUnionNode, - ), - ): - return self.serialize_obj(node) - else: - raise NotImplementedError( - "Can not serialize {}".format(type(node).__name__) - ) - - def serialize_obj(self, obj): - """ - Serialize an object into a dictionary. - - Add all non-hidden attributes (not starting with '_') of the object - to the output dictionary. - - Parameters - ---------- - obj : object - An object to serialize. - - Returns - ------- - dict - Serialized object. - """ - res = {} - for k, v in obj.__dict__.items(): - if k[0] != "_": - if k == "op" and isinstance(obj, OpExpr) and v == "//": - res[k] = "/" - else: - res[k] = self.serialize_item(v) - return res - - def serialize_typed_obj(self, obj): - """ - Serialize an object and its dtype into a dictionary. - - Similar to `serialize_obj` but also include '_dtype' field - of the object under 'type' key. - - Parameters - ---------- - obj : object - An object to serialize. - - Returns - ------- - dict - Serialized object. 
- """ - res = self.serialize_obj(obj) - res["type"] = self.serialize_dtype(obj._dtype) - return res - - def serialize_expr(self, expr): - """ - Serialize ``BaseExpr`` based expression into a dictionary. - - Parameters - ---------- - expr : BaseExpr - An expression to serialize. - - Returns - ------- - dict - Serialized expression. - """ - if isinstance(expr, LiteralExpr): - return self.serialize_literal(expr) - elif isinstance(expr, CalciteInputRefExpr): - return self.serialize_obj(expr) - elif isinstance(expr, CalciteInputIdxExpr): - return self.serialize_input_idx(expr) - elif isinstance(expr, OpExpr): - return self.serialize_typed_obj(expr) - elif isinstance(expr, AggregateExpr): - return self.serialize_typed_obj(expr) - else: - raise NotImplementedError( - "Can not serialize {}".format(type(expr).__name__) - ) - - def serialize_literal(self, literal): - """ - Serialize ``LiteralExpr`` into a dictionary. - - Parameters - ---------- - literal : LiteralExpr - A literal to serialize. - - Returns - ------- - dict - Serialized literal. - """ - val = literal.val - if val is None: - return { - "literal": None, - "type": "BIGINT", - "target_type": "BIGINT", - "scale": 0, - "precision": 19, - "type_scale": 0, - "type_precision": 19, - } - if type(val) is str: - return { - "literal": val, - "type": "CHAR", - "target_type": "CHAR", - "scale": -2147483648, - "precision": len(val), - "type_scale": -2147483648, - "type_precision": len(val), - } - if type(val) in self._INT_OPTS.keys(): - target_type, precision = self.opts_for_int_type(type(val)) - return { - "literal": int(val), - "type": "DECIMAL", - "target_type": target_type, - "scale": 0, - "precision": len(str(val)), - "type_scale": 0, - "type_precision": precision, - } - if type(val) in (float, np.float64): - if np.isnan(val): - return { - "literal": None, - "type": "DOUBLE", - "target_type": "DOUBLE", - "scale": 0, - "precision": 19, - "type_scale": 0, - "type_precision": 19, - } - - str_val = f"{val:f}" - precision = len(str_val) - 1 - scale = precision - str_val.index(".") - return { - "literal": int(str_val.replace(".", "")), - "type": "DECIMAL", - "target_type": "DOUBLE", - "scale": scale, - "precision": precision, - "type_scale": -2147483648, - "type_precision": 15, - } - if type(val) is bool: - return { - "literal": val, - "type": "BOOLEAN", - "target_type": "BOOLEAN", - "scale": -2147483648, - "precision": 1, - "type_scale": -2147483648, - "type_precision": 1, - } - if isinstance(val, np.datetime64): - unit = np.datetime_data(val)[0] - precision = self._TIMESTAMP_PRECISION.get(unit, None) - if precision is not None: - return { - "literal": int(val.astype(np.int64)), - "type": "TIMESTAMP", - "target_type": "TIMESTAMP", - "scale": -2147483648, - "precision": precision, - "type_scale": -2147483648, - "type_precision": precision, - } - - raise NotImplementedError(f"Can not serialize {type(val).__name__}") - - def opts_for_int_type(self, int_type): - """ - Get serialization params for an integer type. - - Return a SQL type name and a number of meaningful decimal digits - for an integer type. - - Parameters - ---------- - int_type : type - An integer type to describe. - - Returns - ------- - tuple - """ - try: - _warn_if_unsigned(int_type) - return self._INT_OPTS[int_type] - except KeyError: - raise NotImplementedError(f"Unsupported integer type {int_type.__name__}") - - def serialize_dtype(self, dtype): - """ - Serialize data type to a dictionary. - - Parameters - ---------- - dtype : dtype - Data type to serialize. 
- - Returns - ------- - dict - Serialized data type. - """ - _warn_if_unsigned(dtype) - try: - type_info = {"type": self._DTYPE_STRINGS[dtype.name], "nullable": True} - if is_datetime64_dtype(dtype): - unit = np.datetime_data(dtype)[0] - type_info["precision"] = self._TIMESTAMP_PRECISION[unit] - return type_info - except KeyError: - raise TypeError(f"Unsupported dtype: {dtype}") - - def serialize_input_idx(self, expr): - """ - Serialize ``CalciteInputIdxExpr`` expression. - - Parameters - ---------- - expr : CalciteInputIdxExpr - An expression to serialize. - - Returns - ------- - int - Serialized expression. - """ - return expr.input diff --git a/modin/experimental/core/execution/native/implementations/hdk_on_native/dataframe/__init__.py b/modin/experimental/core/execution/native/implementations/hdk_on_native/dataframe/__init__.py deleted file mode 100644 index 5b91bc2b403..00000000000 --- a/modin/experimental/core/execution/native/implementations/hdk_on_native/dataframe/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -# Licensed to Modin Development Team under one or more contributor license agreements. -# See the NOTICE file distributed with this work for additional information regarding -# copyright ownership. The Modin Development Team licenses this file to you under the -# Apache License, Version 2.0 (the "License"); you may not use this file except in -# compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under -# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific language -# governing permissions and limitations under the License. - -"""Module holds classes implementing HDK storage format based lazy dataframe.""" diff --git a/modin/experimental/core/execution/native/implementations/hdk_on_native/dataframe/dataframe.py b/modin/experimental/core/execution/native/implementations/hdk_on_native/dataframe/dataframe.py deleted file mode 100644 index 1c8d3a7134b..00000000000 --- a/modin/experimental/core/execution/native/implementations/hdk_on_native/dataframe/dataframe.py +++ /dev/null @@ -1,3109 +0,0 @@ -# Licensed to Modin Development Team under one or more contributor license agreements. -# See the NOTICE file distributed with this work for additional information regarding -# copyright ownership. The Modin Development Team licenses this file to you under the -# Apache License, Version 2.0 (the "License"); you may not use this file except in -# compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under -# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific language -# governing permissions and limitations under the License. 
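
serialize_dtype reads the datetime64 unit via numpy's datetime_data and maps it to a decimal precision (s -> 0, ms -> 3, us -> 6, ns -> 9), mirroring the _TIMESTAMP_PRECISION table defined earlier in the serializer. For example:

import numpy as np

_TIMESTAMP_PRECISION = {"s": 0, "ms": 3, "us": 6, "ns": 9}

dtype = np.dtype("datetime64[ns]")
unit = np.datetime_data(dtype)[0]          # "ns"
assert _TIMESTAMP_PRECISION[unit] == 9

assert np.datetime_data(np.dtype("datetime64[ms]"))[0] == "ms"
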
- -"""Module provides ``HdkOnNativeDataframe`` class implementing lazy frame.""" - -import re -from typing import Hashable, Iterable, List, Optional, Tuple, Union - -import numpy as np -import pandas as pd -import pyarrow -from pandas._libs.lib import no_default -from pandas.core.dtypes.common import ( - _get_dtype, - is_bool_dtype, - is_datetime64_dtype, - is_integer_dtype, - is_list_like, - is_string_dtype, -) -from pandas.core.indexes.api import Index, MultiIndex, RangeIndex -from pyarrow.types import is_dictionary - -from modin.core.dataframe.base.dataframe.utils import ( - Axis, - JoinType, - is_trivial_index, - join_columns, -) -from modin.core.dataframe.base.interchange.dataframe_protocol.dataframe import ( - ProtocolDataframe, -) -from modin.core.dataframe.pandas.dataframe.dataframe import PandasDataframe -from modin.core.dataframe.pandas.metadata import LazyProxyCategoricalDtype -from modin.core.dataframe.pandas.metadata.dtypes import get_categories_dtype -from modin.core.dataframe.pandas.utils import concatenate -from modin.error_message import ErrorMessage -from modin.experimental.core.storage_formats.hdk.query_compiler import ( - DFAlgQueryCompiler, -) -from modin.pandas.indexing import is_range_like -from modin.pandas.utils import check_both_not_none -from modin.utils import MODIN_UNNAMED_SERIES_LABEL, _inherit_docstrings - -from ..db_worker import DbTable -from ..df_algebra import ( - FilterNode, - FrameNode, - GroupbyAggNode, - JoinNode, - MaskNode, - SortNode, - TransformNode, - UnionNode, - replace_frame_in_exprs, - translate_exprs_to_base, -) -from ..expr import ( - AggregateExpr, - InputRefExpr, - LiteralExpr, - OpExpr, - _get_common_dtype, - build_dt_expr, - build_if_then_else, - is_cmp_op, -) -from ..partitioning.partition_manager import HdkOnNativeDataframePartitionManager -from .utils import ( - ColNameCodec, - arrow_to_pandas, - arrow_type_to_pandas, - build_categorical_from_at, - check_cols_to_join, - check_join_supported, - ensure_supported_dtype, - get_data_for_join_by_index, - maybe_range, -) - -IDX_COL_NAME = ColNameCodec.IDX_COL_NAME -ROWID_COL_NAME = ColNameCodec.ROWID_COL_NAME -UNNAMED_IDX_COL_NAME = ColNameCodec.UNNAMED_IDX_COL_NAME -encode_col_name = ColNameCodec.encode -decode_col_name = ColNameCodec.decode -mangle_index_names = ColNameCodec.mangle_index_names -demangle_index_names = ColNameCodec.demangle_index_names - - -class HdkOnNativeDataframe(PandasDataframe): - """ - Lazy dataframe based on Arrow table representation and embedded HDK storage format. - - Currently, materialized dataframe always has a single partition. This partition - can hold either Arrow table or pandas dataframe. - - Operations on a dataframe are not instantly executed and build an operations - tree instead. When frame's data is accessed this tree is transformed into - a query which is executed in HDK storage format. In case of simple transformations - Arrow API can be used instead of HDK storage format. - - Since frames are used as an input for other frames, all operations produce - new frames and are not executed in-place. - - Parameters - ---------- - partitions : np.ndarray, optional - Partitions of the frame. - index : pandas.Index, optional - Index of the frame to be used as an index cache. If None then will be - computed on demand. - columns : pandas.Index, optional - Columns of the frame. - row_lengths : np.ndarray, optional - Partition lengths. Should be None if lengths are unknown. - column_widths : np.ndarray, optional - Partition widths. 
Should be None if widths are unknown. - dtypes : pandas.Index, optional - Column data types. - op : DFAlgNode, optional - A tree describing how frame is computed. For materialized frames it - is always ``FrameNode``. - index_cols : list of str, optional - A list of columns included into the frame's index. None value means - a default index (row id is used as an index). - uses_rowid : bool, default: False - True for frames which require access to the virtual 'rowid' column - for its execution. - force_execution_mode : str or None - Used by tests to control frame's execution process. - has_unsupported_data : bool - True for frames holding data not supported by Arrow or HDK storage format. - - Attributes - ---------- - id : int - ID of the frame. Used for debug prints only. - _op : DFAlgNode - A tree to be used to compute the frame. For materialized frames it is - always ``FrameNode``. - _partitions : numpy.ndarray or None - Partitions of the frame. For materialized dataframes it holds a single - partition. None for frames requiring execution. - _index_cols : list of str or None - Names of index columns. None for default index. Index columns have mangled - names to handle labels which cannot be directly used as an HDK table - column name (e.g. non-string labels, SQL keywords etc.). - _table_cols : list of str - A list of all frame's columns. It includes index columns if any. Index - columns are always in the head of the list. - _index_cache : pandas.Index, callable or None - Materialized index of the frame or None when index is not materialized. - If ``callable() -> (pandas.Index, list of row lengths or None)`` type, - then the calculation will be done in `__init__`. - _has_unsupported_data : bool - True for frames holding data not supported by Arrow or HDK storage format. - Operations on such frames are not allowed and should be defaulted - to pandas instead. - _dtypes : pandas.Series - Column types. - _uses_rowid : bool - True for frames which require access to the virtual 'rowid' column - for its execution. - _force_execution_mode : str or None - Used by tests to control frame's execution process. Value "lazy" - is used to raise RuntimeError if execution is triggered for the frame. - The values "arrow" and "hdk" are used to force the corresponding - execution mode. - """ - - _query_compiler_cls = DFAlgQueryCompiler - _partition_mgr_cls = HdkOnNativeDataframePartitionManager - - _next_id = [1] - - def __init__( - self, - partitions=None, - index=None, - columns=None, - row_lengths=None, - column_widths=None, - dtypes=None, - op=None, - index_cols=None, - uses_rowid=False, - force_execution_mode=None, - has_unsupported_data=False, - ): - assert dtypes is not None - assert partitions is None or ( - partitions.size == 1 and partitions[0][0] is not None - ) - - self.id = str(type(self)._next_id[0]) - type(self)._next_id[0] += 1 - - if op is None and partitions is not None: - op = FrameNode(self) - - self._op = op - self._index_cols = index_cols - self._partitions = partitions - self.set_index_cache(index) - self.set_columns_cache(columns) - # The following code assumes that the type of `columns` is pandas.Index. - # The initial type of `columns` might be callable. 
- columns = self._columns_cache.get() - self._row_lengths_cache = row_lengths - self._column_widths_cache = column_widths - self._has_unsupported_data = has_unsupported_data - if self._op is None: - self._op = FrameNode(self) - - if self._index_cols is not None: - self._table_cols = self._index_cols + columns.tolist() - else: - self._table_cols = columns.tolist() - - assert len(dtypes) == len( - self._table_cols - ), f"unaligned dtypes ({dtypes}) and table columns ({self._table_cols})" - - if isinstance(dtypes, list): - if self._index_cols is not None: - # Table stores both index and data columns but those are accessed - # differently if we have a MultiIndex for columns. To unify access - # to dtype we extend index column names to tuples to have a MultiIndex - # of dtypes. - if isinstance(columns, MultiIndex): - tail = [""] * (columns.nlevels - 1) - index_tuples = [(col, *tail) for col in self._index_cols] - dtype_index = MultiIndex.from_tuples(index_tuples).append(columns) - self.set_dtypes_cache(pd.Series(dtypes, index=dtype_index)) - else: - self.set_dtypes_cache(pd.Series(dtypes, index=self._table_cols)) - else: - self.set_dtypes_cache(pd.Series(dtypes, index=columns)) - else: - self.set_dtypes_cache(dtypes) - - self._uses_rowid = uses_rowid - self._force_execution_mode = force_execution_mode - - def copy( - self, - partitions=no_default, - index=no_default, - columns=no_default, - dtypes=no_default, - op=no_default, - index_cols=no_default, - uses_rowid=no_default, - has_unsupported_data=no_default, - ): - """ - Copy this DataFrame. - - Parameters - ---------- - partitions : np.ndarray, optional - Partitions of the frame. - index : pandas.Index or list, optional - Index of the frame to be used as an index cache. If None then will be - computed on demand. - columns : pandas.Index or list, optional - Columns of the frame. - dtypes : pandas.Index or list, optional - Column data types. - op : DFAlgNode, optional - A tree describing how frame is computed. For materialized frames it - is always ``FrameNode``. - index_cols : list of str, optional - A list of columns included into the frame's index. None value means - a default index (row id is used as an index). - uses_rowid : bool, optional - True for frames which require access to the virtual 'rowid' column - for its execution. - has_unsupported_data : bool, optional - True for frames holding data not supported by Arrow or HDK storage format. - - Returns - ------- - HdkOnNativeDataframe - A copy of this DataFrame. - """ - if partitions is no_default: - partitions = self._partitions - if index is no_default: - index = self.copy_index_cache() - if columns is no_default: - columns = self.copy_columns_cache() - if op is no_default: - op = self._op - if dtypes is no_default: - dtypes = self.copy_dtypes_cache() - if index_cols is no_default: - index_cols = self._index_cols - if uses_rowid is no_default: - uses_rowid = self._uses_rowid - if has_unsupported_data is no_default: - has_unsupported_data = self._has_unsupported_data - return self.__constructor__( - partitions=partitions, - index=index, - columns=columns, - row_lengths=self._row_lengths_cache, - column_widths=self._column_widths_cache, - dtypes=dtypes, - op=op, - index_cols=index_cols, - uses_rowid=uses_rowid, - force_execution_mode=self._force_execution_mode, - has_unsupported_data=has_unsupported_data, - ) - - def id_str(self): - """ - Return string identifier of the frame. - - Used for debug dumps. 
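
The MultiIndex handling in __init__ above pads index-column names with empty strings so they can live in the same dtype index as multi-level data columns. In isolation (the index column name below is arbitrary, not the actual mangled name ColNameCodec produces):

import pandas as pd

columns = pd.MultiIndex.from_tuples([("x", "sum"), ("x", "mean")])
index_cols = ["__index__0"]

tail = [""] * (columns.nlevels - 1)
index_tuples = [(col, *tail) for col in index_cols]
dtype_index = pd.MultiIndex.from_tuples(index_tuples).append(columns)

dtypes = pd.Series(["int64", "int64", "float64"], index=dtype_index)
assert dtypes[("__index__0", "")] == "int64"   # index column padded to a tuple
assert dtypes[("x", "mean")] == "float64"
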
- - Returns - ------- - str - """ - return f"frame${self.id}" - - def get_dtype(self, col): - """ - Get data type for a column. - - Parameters - ---------- - col : str - Column name. - - Returns - ------- - dtype - """ - # If we search for an index column type in a MultiIndex then we need to - # extend index column names to tuples. - if isinstance(self._dtypes.index, MultiIndex) and not isinstance(col, tuple): - return self._dtypes[(col, *([""] * (self._dtypes.index.nlevels - 1)))] - return self._dtypes[col] - - def ref(self, col): - """ - Return an expression referencing a frame's column. - - Parameters - ---------- - col : str - Column name. - - Returns - ------- - InputRefExpr - """ - if col == ROWID_COL_NAME: - return InputRefExpr(self, col, _get_dtype(int)) - return InputRefExpr(self, col, self.get_dtype(col)) - - def take_2d_labels_or_positional( - self, - row_labels: Optional[List[Hashable]] = None, - row_positions: Optional[List[int]] = None, - col_labels: Optional[List[Hashable]] = None, - col_positions: Optional[List[int]] = None, - ) -> "HdkOnNativeDataframe": - """ - Mask rows and columns in the dataframe. - - Allow users to perform selection and projection on the row and column labels (named notation), - in addition to the row and column number (positional notation). - - Parameters - ---------- - row_labels : list of hashable, optional - The row labels to extract. - row_positions : list of int, optional - The row positions to extract. - col_labels : list of hashable, optional - The column labels to extract. - col_positions : list of int, optional - The column positions to extract. - - Returns - ------- - HdkOnNativeDataframe - The new frame. - - Notes - ----- - If both `row_labels` and `row_positions` are provided, a ValueError is raised. - The same rule applies for `col_labels` and `col_positions`. - """ - if check_both_not_none(row_labels, row_positions): - raise ValueError( - "Both row_labels and row_positions were provided - please provide only one of row_labels and row_positions." - ) - if check_both_not_none(col_labels, col_positions): - raise ValueError( - "Both col_labels and col_positions were provided - please provide only one of col_labels and col_positions." - ) - base = self - - if col_labels is not None or col_positions is not None: - if col_labels is not None: - new_columns = col_labels - elif col_positions is not None: - new_columns = base.columns[col_positions] - exprs = self._index_exprs() - for col in new_columns: - expr = base.ref(col) - if exprs.setdefault(col, expr) is not expr: - raise NotImplementedError( - "duplicate column names are not supported" - ) - dtypes = self._dtypes_for_exprs(exprs) - base = self.__constructor__( - columns=new_columns, - dtypes=dtypes, - op=TransformNode(base, exprs), - index_cols=self._index_cols, - force_execution_mode=self._force_execution_mode, - ) - - if row_labels is not None: - raise NotImplementedError("Row labels masking is not yet supported") - - if row_positions is None: - return base - - row_positions = maybe_range(row_positions) - - # If row_positions is not a range, then MaskNode will generate a filter, - # containing enumeration of all the positions. Filtering rows in this - # way is not efficient and, in case of too many values in row_positions, - # may result in a huge JSON query. To workaround this issue, creating an - # empty frame with row_positions index and inner joining with this one. - # If row_positions has less than 10 values, MaskNode is used. 
- if ( - not is_range_like(row_positions) - and is_list_like(row_positions) - and len(row_positions) > 10 - ): - lhs = base._maybe_materialize_rowid() - if len(lhs._index_cols) == 1 and is_integer_dtype(lhs._dtypes[0]): - pdf = pd.DataFrame(index=row_positions) - rhs = self.from_pandas(pdf) - exprs = lhs._index_exprs() - for col in lhs.columns: - exprs[col] = lhs.ref(col) - condition = lhs._build_equi_join_condition( - rhs, lhs._index_cols, rhs._index_cols - ) - op = JoinNode( - lhs, - rhs, - exprs=exprs, - condition=condition, - ) - return lhs.copy(op=op, index=pdf.index, partitions=None) - - base = base._maybe_materialize_rowid() - op = MaskNode(base, row_labels=row_labels, row_positions=row_positions) - base = self.__constructor__( - columns=base.columns, - dtypes=base.copy_dtypes_cache(), - op=op, - index_cols=base._index_cols, - force_execution_mode=base._force_execution_mode, - ) - - # Reverse the frame rows, if performing a reverse order selection via HDK. - if ( - is_range_like(row_positions) and row_positions.step < 0 - ) and not base._can_execute_arrow(): - cols = base.columns - table_cols = base._table_cols - # Add the rowid column - rowid_col = "__tmp_rowid__" - while rowid_col in table_cols: - rowid_col += "1" - exprs = base._index_exprs() - exprs[rowid_col] = base.ref(ROWID_COL_NAME) - for col in cols: - exprs[col] = base.ref(col) - base = base.copy( - columns=[rowid_col] + base.columns.tolist(), - dtypes=base._dtypes_for_exprs(exprs), - op=TransformNode(base, exprs), - uses_rowid=True, - ) - # Sort by the rowid column - base = base.copy(op=SortNode(base, [rowid_col], [False], "last")) - # Remove the rowid column - exprs = dict() - for col in table_cols: - exprs[col] = base.ref(col) - base = base.copy( - columns=cols, - dtypes=base._dtypes_for_exprs(exprs), - op=TransformNode(base, exprs), - ) - - return base - - def _has_arrow_table(self): - """ - Return True for materialized frame with Arrow table. - - Returns - ------- - bool - """ - return self._partitions is not None and self._partitions[0][0].raw - - def _dtypes_for_exprs(self, exprs): - """ - Return dtypes for expressions. - - Parameters - ---------- - exprs : dict - Expression to get types for. - - Returns - ------- - list of dtype - """ - return [expr._dtype for expr in exprs.values()] - - @_inherit_docstrings(PandasDataframe._maybe_update_proxies) - def _maybe_update_proxies(self, dtypes, new_parent=None): - if new_parent is not None: - return super()._maybe_update_proxies(dtypes, new_parent) - if self._partitions is None: - return dtypes - table = self._partitions[0][0].get() - if isinstance(table, pyarrow.Table): - return super()._maybe_update_proxies(dtypes, new_parent=table) - return dtypes - - def groupby_agg(self, by, axis, agg, groupby_args, **kwargs): - """ - Groupby with aggregation operation. - - Parameters - ---------- - by : DFAlgQueryCompiler or list-like of str - Grouping keys. - axis : {0, 1} - Only rows groupby is supported, so should be 0. - agg : str or dict - Aggregates to compute. - groupby_args : dict - Additional groupby args. - **kwargs : dict - Keyword args. Currently ignored. - - Returns - ------- - HdkOnNativeDataframe - The new frame. - """ - # Currently we only expect 'by' to be a projection of the same frame. - # If 'by' holds a list of columns/series, then we create such projection - # to re-use code. 
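
The join-based workaround above can be pictured in plain pandas: instead of a filter that enumerates every wanted position, build an empty frame whose index holds the positions and inner-join it against a materialized row number. A rough analogy only (pandas here, not the HDK plan):

import pandas as pd

df = pd.DataFrame({"a": list("abcdefgh")})
row_positions = [1, 3, 5, 6]

# Materialize a 'rowid'-like index, then inner-join with an empty
# frame indexed by the wanted positions.
keys = pd.DataFrame(index=pd.Index(row_positions, name="rowid"))
selected = df.rename_axis("rowid").join(keys, how="inner")

assert selected["a"].tolist() == ["b", "d", "f", "g"]
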
- if not isinstance(by, DFAlgQueryCompiler): - if is_list_like(by): - by_cols = [] - by_frames = [] - for obj in by: - if isinstance(obj, str): - by_cols.append(obj) - elif hasattr(obj, "_modin_frame"): - by_frames.append(obj._modin_frame) - else: - raise NotImplementedError("unsupported groupby args") - by_cols = Index.__new__(Index, data=by_cols, dtype=self.columns.dtype) - by_frame = self.take_2d_labels_or_positional(col_labels=by_cols) - if by_frames: - by_frame = by_frame.concat( - axis=1, other_modin_frames=by_frames, ignore_index=True - ) - else: - raise NotImplementedError("unsupported groupby args") - else: - by_frame = by._modin_frame - - if axis != 0: - raise NotImplementedError("groupby is supported for axis = 0 only") - - base = by_frame._find_common_projections_base(self) - if base is None: - raise NotImplementedError("unsupported groupby args") - - if groupby_args["level"] is not None: - raise NotImplementedError("levels are not supported for groupby") - - drop = kwargs.get("drop", True) - as_index = groupby_args.get("as_index", True) - groupby_cols = by_frame.columns - if isinstance(agg, dict): - agg_cols = agg.keys() - elif not drop: - # If 'by' data came from a different frame then 'self-aggregation' - # columns are more prioritized. - agg_cols = self.columns - else: - agg_cols = [col for col in self.columns if col not in groupby_cols] - - # Mimic pandas behavior: pandas does not allow for aggregation to be empty - # in case of multi-column 'by'. - if not as_index and len(agg_cols) == 0 and len(groupby_cols) > 1: - agg_cols = self.columns - - # Create new base where all required columns are computed. We don't allow - # complex expressions to be a group key or an aggeregate operand. - allowed_nodes = (FrameNode, TransformNode) - if not isinstance(by_frame._op, allowed_nodes): - raise NotImplementedError( - "HDK doesn't allow complex expression to be a group key. " - + f"The only allowed frame nodes are: {tuple(o.__name__ for o in allowed_nodes)}, " - + f"met '{type(by_frame._op).__name__}'." - ) - - if agg in ("head", "tail"): - n = kwargs["agg_kwargs"]["n"] - return self._groupby_head_tail(agg, n, groupby_cols) - - col_to_delete_template = "__delete_me_{name}" - - def generate_by_name(by): - """Generate unuqie name for `by` column in the resulted frame.""" - if as_index: - return f"{IDX_COL_NAME}0_{by}" - elif by in agg_cols: - # Aggregation columns are more prioritized than the 'by' cols, - # so in case of naming conflicts, we drop 'by' cols. - return col_to_delete_template.format(name=by) - else: - return by - - exprs = dict( - ((generate_by_name(col), by_frame.ref(col)) for col in groupby_cols) - ) - groupby_cols = list(exprs.keys()) - exprs.update(((col, self.ref(col)) for col in agg_cols)) - exprs = translate_exprs_to_base(exprs, base) - base_cols = Index.__new__(Index, data=exprs.keys(), dtype=self.columns.dtype) - base = self.__constructor__( - columns=base_cols, - dtypes=self._dtypes_for_exprs(exprs), - op=TransformNode(base, exprs), - index_cols=None, - force_execution_mode=self._force_execution_mode, - ) - - new_columns = [] - index_cols = None - - # TODO: check performance changes after enabling 'dropna' and decide - # is it worth it or not. 
- if groupby_args["dropna"]: - ErrorMessage.single_warning( - "'dropna' is temporary disabled due to https://github.com/modin-project/modin/issues/2896" - ) - # base = base.dropna(subset=groupby_cols, how="any") - - if as_index: - index_cols = groupby_cols.copy() - else: - new_columns = groupby_cols.copy() - - new_dtypes = base._dtypes[groupby_cols].tolist() - - agg_exprs = dict() - if isinstance(agg, str): - col_to_ref = {col: base.ref(col) for col in agg_cols} - self._add_agg_exprs(agg, col_to_ref, kwargs, agg_exprs) - elif isinstance(agg, (dict, list)): - if isinstance(agg, list): - agg = {col: agg for col in agg_cols} - multiindex = any(isinstance(v, list) for v in agg.values()) - for col, aggs in agg.items(): - if isinstance(aggs, list): - for a in aggs: - col_to_ref = {(col, a): base.ref(col)} - self._add_agg_exprs(a, col_to_ref, kwargs, agg_exprs) - else: - col_to_ref = {((col, aggs) if multiindex else col): base.ref(col)} - self._add_agg_exprs(aggs, col_to_ref, kwargs, agg_exprs) - else: - raise NotImplementedError(f"aggregate type {type(agg)}") - - new_columns.extend(agg_exprs.keys()) - new_dtypes.extend((x._dtype for x in agg_exprs.values())) - new_columns = Index.__new__(Index, data=new_columns, dtype=self.columns.dtype) - - new_op = GroupbyAggNode(base, groupby_cols, agg_exprs, groupby_args) - new_frame = self.__constructor__( - columns=new_columns, - dtypes=new_dtypes, - op=new_op, - index_cols=index_cols, - force_execution_mode=self._force_execution_mode, - ) - - if not as_index: - col_to_delete = col_to_delete_template.format(name=".*") - filtered_columns = [ - col - for col in new_frame.columns - if not (isinstance(col, str) and re.match(col_to_delete, col)) - ] - if len(filtered_columns) != len(new_frame.columns): - new_frame = new_frame.take_2d_labels_or_positional( - col_labels=filtered_columns - ) - return new_frame - - def _add_agg_exprs(self, agg, col_to_ref, kwargs, agg_exprs): - """ - Add `AggregateExpr`s for each column to `agg_exprs`. - - Parameters - ---------- - agg : str - col_to_ref : dict - kwargs : dict - agg_exprs : dict - """ - if agg == "nlargest" or agg == "nsmallest": - n = kwargs["agg_kwargs"]["n"] - if agg == "nsmallest": - n = -n - n = LiteralExpr(n) - for col, ref in col_to_ref.items(): - agg_exprs[col] = AggregateExpr(agg, [ref, n]) - elif agg == "median" or agg == "quantile": - agg_kwargs = kwargs["agg_kwargs"] - q = agg_kwargs.get("q", 0.5) - if not isinstance(q, float): - raise NotImplementedError("Non-float quantile") - q = LiteralExpr(q) - interpolation = LiteralExpr(agg_kwargs.get("interpolation", "linear")) - for col, ref in col_to_ref.items(): - agg_exprs[col] = AggregateExpr("quantile", [ref, q, interpolation]) - else: - for col, ref in col_to_ref.items(): - agg_exprs[col] = AggregateExpr(agg, ref) - - def _groupby_head_tail( - self, agg: str, n: int, cols: Iterable[str] - ) -> "HdkOnNativeDataframe": - """ - Return first/last n rows of each group. - - Parameters - ---------- - agg : {"head", "tail"} - n : int - If positive: number of entries to include from start/end of each group. - If negative: number of entries to exclude from start/end of each group. - cols : Iterable[str] - Group by column names. - - Returns - ------- - HdkOnNativeDataframe - The new frame. 
- """ - if isinstance(self._op, SortNode): - base = self._op.input[0] - order_keys = self._op.columns - ascending = self._op.ascending - na_pos = self._op.na_position.upper() - fold = True # Fold TransformNodes - else: - base = self._maybe_materialize_rowid() - order_keys = base._index_cols[0:1] - ascending = [True] - na_pos = "FIRST" - fold = base is self # Do not fold if rowid is added - if (n < 0) == (agg == "head"): # Invert sorting - ascending = [not a for a in ascending] - na_pos = "FIRST" if na_pos == "LAST" else "LAST" - partition_keys = [base.ref(col) for col in cols] - order_keys = [base.ref(col) for col in order_keys] - - row_num_name = "__HDK_ROW_NUMBER__" - row_num_op = OpExpr("ROW_NUMBER", [], _get_dtype(int)) - row_num_op.set_window_opts(partition_keys, order_keys, ascending, na_pos) - exprs = base._index_exprs() - exprs.update((col, base.ref(col)) for col in base.columns) - exprs[row_num_name] = row_num_op - transform = base.copy( - columns=list(base.columns) + [row_num_name], - dtypes=self._dtypes_for_exprs(exprs), - op=TransformNode(base, exprs, fold), - ) - - if n < 0: - cond = transform.ref(row_num_name).ge(-n + 1) - else: - cond = transform.ref(row_num_name).le(n) - - filter = transform.copy(op=FilterNode(transform, cond)) - exprs = filter._index_exprs() - exprs.update((col, filter.ref(col)) for col in base.columns) - return base.copy(op=TransformNode(filter, exprs), partitions=None, index=None) - - def agg(self, agg): - """ - Perform specified aggregation along columns. - - Parameters - ---------- - agg : str - Name of the aggregation function to perform. - - Returns - ------- - HdkOnNativeDataframe - New frame containing the result of aggregation. - """ - assert isinstance(agg, str) - - agg_exprs = dict() - for col in self.columns: - agg_exprs[col] = AggregateExpr(agg, self.ref(col)) - - return self.__constructor__( - columns=self.columns, - dtypes=self._dtypes_for_exprs(agg_exprs), - op=GroupbyAggNode(self, [], agg_exprs, {"sort": False}), - index_cols=None, - force_execution_mode=self._force_execution_mode, - ) - - def fillna(self, value=None, method=None, axis=None, limit=None, downcast=None): - """ - Replace NULLs operation. - - Parameters - ---------- - value : dict or scalar, optional - A value to replace NULLs with. Can be a dictionary to assign - different values to columns. - method : None, optional - Should be None. - axis : {0, 1}, optional - Should be 0. - limit : None, optional - Should be None. - downcast : None, optional - Should be None. - - Returns - ------- - HdkOnNativeDataframe - The new frame. 
- """ - if axis != 0: - raise NotImplementedError("fillna is supported for axis = 0 only") - - if limit is not None: - raise NotImplementedError("fillna doesn't support limit yet") - - if downcast is not None: - raise NotImplementedError("fillna doesn't support downcast yet") - - if method is not None: - raise NotImplementedError("fillna doesn't support method yet") - - try: - exprs = self._index_exprs() - if isinstance(value, dict): - for col in self.columns: - col_expr = self.ref(col) - if col in value: - value_expr = LiteralExpr(value[col]) - res_type = _get_common_dtype(value_expr._dtype, col_expr._dtype) - exprs[col] = build_if_then_else( - col_expr.is_null(), value_expr, col_expr, res_type - ) - else: - exprs[col] = col_expr - elif np.isscalar(value): - value_expr = LiteralExpr(value) - for col in self.columns: - col_expr = self.ref(col) - res_type = _get_common_dtype(value_expr._dtype, col_expr._dtype) - exprs[col] = build_if_then_else( - col_expr.is_null(), value_expr, col_expr, res_type - ) - else: - raise NotImplementedError("unsupported value for fillna") - except TypeError: - raise NotImplementedError( - "Heterogenous data is not supported in HDK storage format" - ) - - new_op = TransformNode(self, exprs) - dtypes = self._dtypes_for_exprs(exprs) - new_frame = self.__constructor__( - columns=self.columns, - dtypes=dtypes, - op=new_op, - index=self._index_cache, - index_cols=self._index_cols, - force_execution_mode=self._force_execution_mode, - ) - - return new_frame - - def dropna(self, subset, how="any"): - """ - Drop rows with NULLs. - - Parameters - ---------- - subset : list of str - Columns to check. - how : {"any", "all"}, default: "any" - Determine if row is removed from DataFrame, when we have - at least one NULL or all NULLs. - - Returns - ------- - HdkOnNativeDataframe - The new frame. - """ - how_to_merge = {"any": "AND", "all": "OR"} - - # If index columns are not presented in the frame, then we have to create them - # based on "rowid". This is needed because 'dropna' preserves index. - if self._index_cols is None: - base = self._materialize_rowid() - else: - base = self - - checks = [base.ref(col).is_not_null() for col in subset] - condition = ( - checks[0] - if len(checks) == 1 - else OpExpr(how_to_merge[how], checks, np.dtype("bool")) - ) - result = base.__constructor__( - columns=base.columns, - dtypes=base.copy_dtypes_cache(), - op=FilterNode(base, condition), - index_cols=base._index_cols, - force_execution_mode=base._force_execution_mode, - ) - return result - - def isna(self, invert): - """ - Detect missing values. - - Parameters - ---------- - invert : bool - - Returns - ------- - HdkOnNativeDataframe - """ - expr = "is_not_null" if invert else "is_null" - exprs = self._index_exprs() - for col in self.columns: - exprs[col] = getattr(self.ref(col), expr)() - return self.__constructor__( - columns=self.columns, - dtypes=self._dtypes_for_exprs(exprs), - op=TransformNode(self, exprs), - index=self._index_cache, - index_cols=self._index_cols, - force_execution_mode=self._force_execution_mode, - ) - - def invert(self): - """ - Apply bitwise inverse to each column. - - Returns - ------- - HdkOnNativeDataframe - """ - exprs = self._index_exprs() - for col in self.columns: - exprs[col] = self.ref(col).invert() - return self.copy(op=TransformNode(self, exprs)) - - def dt_extract(self, obj): - """ - Extract a date or a time unit from a datetime value. - - Parameters - ---------- - obj : str - Datetime unit to extract. 
- - Returns - ------- - HdkOnNativeDataframe - The new frame. - """ - exprs = self._index_exprs() - for col in self.columns: - exprs[col] = build_dt_expr(obj, self.ref(col)) - new_op = TransformNode(self, exprs) - dtypes = self._dtypes_for_exprs(exprs) - return self.__constructor__( - columns=self.columns, - dtypes=dtypes, - op=new_op, - index=self._index_cache, - index_cols=self._index_cols, - force_execution_mode=self._force_execution_mode, - ) - - def astype(self, col_dtypes, **kwargs): - """ - Cast frame columns to specified types. - - Parameters - ---------- - col_dtypes : dict or str - Maps column names to new data types. - **kwargs : dict - Keyword args. Not used. - - Returns - ------- - HdkOnNativeDataframe - The new frame. - """ - if not isinstance(col_dtypes, dict): - col_dtypes = {column: col_dtypes for column in self.columns} - columns = col_dtypes.keys() - new_dtypes = self.copy_dtypes_cache() - for column in columns: - try: - old_dtype = np.dtype(self._dtypes[column]) - new_dtype = np.dtype(col_dtypes[column]) - except TypeError: - raise NotImplementedError( - f"Type conversion {self._dtypes[column]} -> {col_dtypes[column]}" - ) - if old_dtype != new_dtype: - # NotImplementedError is raised if the type cast is not supported. - _get_common_dtype(new_dtype, self._dtypes[column]) - new_dtypes[column] = new_dtype - - exprs = self._index_exprs() - for col in self.columns: - col_expr = self.ref(col) - if col in columns: - exprs[col] = col_expr.cast(new_dtypes[col]) - else: - exprs[col] = col_expr - - new_op = TransformNode(self, exprs) - return self.__constructor__( - columns=self.columns, - dtypes=new_dtypes, - op=new_op, - index=self._index_cache, - index_cols=self._index_cols, - force_execution_mode=self._force_execution_mode, - ) - - def join( - self, - other: "HdkOnNativeDataframe", - how: Optional[Union[str, JoinType]] = JoinType.INNER, - left_on: Optional[List[str]] = None, - right_on: Optional[List[str]] = None, - sort: Optional[bool] = False, - suffixes: Optional[Tuple[str]] = ("_x", "_y"), - ): - """ - Join operation. - - Parameters - ---------- - other : HdkOnNativeDataframe - A frame to join with. - how : str or modin.core.dataframe.base.utils.JoinType, default: JoinType.INNER - A type of join. - left_on : list of str, optional - A list of columns for the left frame to join on. - right_on : list of str, optional - A list of columns for the right frame to join on. - sort : bool, default: False - Sort the result by join keys. - suffixes : list-like of str, default: ("_x", "_y") - A length-2 sequence of suffixes to add to overlapping column names - of left and right operands respectively. - - Returns - ------- - HdkOnNativeDataframe - The new frame. 
- """ - check_join_supported(how) - assert ( - left_on is not None and right_on is not None - ), "Merge with unspecified 'left_on' or 'right_on' parameter is not supported in the engine" - assert len(left_on) == len( - right_on - ), "'left_on' and 'right_on' lengths don't match" - - if other is self: - # To avoid the self-join failure - #5891 - if isinstance(self._op, FrameNode): - other = self.copy() - else: - exprs = dict((c, self.ref(c)) for c in self._table_cols) - other = self.__constructor__( - columns=self.columns, - dtypes=self._dtypes_for_exprs(exprs), - op=TransformNode(self, exprs), - index_cols=self._index_cols, - force_execution_mode=self._force_execution_mode, - ) - - orig_left_on = left_on - orig_right_on = right_on - left, left_on = check_cols_to_join("left_on", self, left_on) - right, right_on = check_cols_to_join("right_on", other, right_on) - for left_col, right_col in zip(left_on, right_on): - left_dt = self._dtypes[left_col] - right_dt = other._dtypes[right_col] - if isinstance(left_dt, pd.CategoricalDtype) and isinstance( - right_dt, pd.CategoricalDtype - ): - left_dt = get_categories_dtype(left_dt) - right_dt = get_categories_dtype(right_dt) - if not ( - (is_integer_dtype(left_dt) and is_integer_dtype(right_dt)) - or (is_string_dtype(left_dt) and is_string_dtype(right_dt)) - or (is_datetime64_dtype(left_dt) and is_datetime64_dtype(right_dt)) - ): - raise NotImplementedError( - f"Join on columns of '{left_dt}' and '{right_dt}' dtypes" - ) - - # If either left_on or right_on has been changed, it means that there - # are index columns in the list. Joining by index in this case. - if (left_on is not orig_left_on) or (right_on is not orig_right_on): - index_cols, exprs, new_dtypes, new_columns = get_data_for_join_by_index( - self, other, how, orig_left_on, orig_right_on, sort, suffixes - ) - ignore_index = False - else: - ignore_index = True - index_cols = None - exprs = dict() - new_dtypes = [] - - new_columns, left_renamer, right_renamer = join_columns( - left.columns, right.columns, left_on, right_on, suffixes - ) - for old_c, new_c in left_renamer.items(): - new_dtypes.append(left._dtypes[old_c]) - exprs[new_c] = left.ref(old_c) - - for old_c, new_c in right_renamer.items(): - new_dtypes.append(right._dtypes[old_c]) - exprs[new_c] = right.ref(old_c) - - condition = left._build_equi_join_condition(right, left_on, right_on) - - op = JoinNode( - left, - right, - how=how, - exprs=exprs, - condition=condition, - ) - - res = left.__constructor__( - dtypes=new_dtypes, - columns=new_columns, - index_cols=index_cols, - op=op, - force_execution_mode=self._force_execution_mode, - ) - - if sort: - res = res.sort_rows( - left_on, ascending=True, ignore_index=ignore_index, na_position="last" - ) - - return res - - def _build_equi_join_condition(self, rhs, lhs_cols, rhs_cols): - """ - Build condition for equi-join. - - Parameters - ---------- - rhs : HdkOnNativeDataframe - Joined frame. - lhs_cols : list - Left frame columns to join by. - rhs_cols : list - Right frame columns to join by. - - Returns - ------- - BaseExpr - """ - condition = [ - self.ref(lhs_col).eq(rhs.ref(rhs_col)) - for lhs_col, rhs_col in zip(lhs_cols, rhs_cols) - ] - condition = ( - condition[0] - if len(condition) == 1 - else OpExpr("AND", condition, _get_dtype(bool)) - ) - return condition - - def _index_width(self): - """ - Return a number of columns in the frame's index. 
- - Returns - ------- - int - """ - if self._index_cols is None: - return 1 - return len(self._index_cols) - - def _union_all( - self, axis, other_modin_frames, join="outer", sort=False, ignore_index=False - ): - """ - Concatenate frames' rows. - - Parameters - ---------- - axis : {0, 1} - Should be 0. - other_modin_frames : list of HdkOnNativeDataframe - Frames to concat. - join : {"outer", "inner"}, default: "outer" - How to handle columns with mismatched names. - "inner" - drop such columns. "outer" - fill - with NULLs. - sort : bool, default: False - Sort unaligned columns for 'outer' join. - ignore_index : bool, default: False - Ignore index columns. - - Returns - ------- - HdkOnNativeDataframe - The new frame. - """ - index_cols = None - col_name_to_dtype = dict() - for col in self.columns: - col_name_to_dtype[col] = self._dtypes[col] - - if join == "inner": - for frame in other_modin_frames: - for col in list(col_name_to_dtype): - if col not in frame.columns: - del col_name_to_dtype[col] - elif join == "outer": - for frame in other_modin_frames: - for col in frame.columns: - if col not in col_name_to_dtype: - col_name_to_dtype[col] = frame._dtypes[col] - else: - raise NotImplementedError(f"Unsupported join type {join=}") - - frames = [] - for frame in [self] + other_modin_frames: - # Empty frames are filtered out only in case of the outer join. - if ( - join == "inner" - or len(frame.columns) != 0 - or (frame.has_materialized_index and len(frame.index) != 0) - or (not frame.has_materialized_index and frame.index_cols) - ): - if isinstance(frame._op, UnionNode): - frames.extend(frame._op.input) - else: - frames.append(frame) - - if len(col_name_to_dtype) == 0: - if len(frames) == 0: - dtypes = pd.Series() - elif ignore_index: - index_cols = [UNNAMED_IDX_COL_NAME] - dtypes = pd.Series([_get_dtype(int)], index=index_cols) - else: - index_names = ColNameCodec.concat_index_names(frames) - index_cols = list(index_names) - dtypes = pd.Series(index_names.values(), index=index_cols) - else: - # Find common dtypes - for frame in other_modin_frames: - frame_dtypes = frame._dtypes - for col in col_name_to_dtype: - if col in frame_dtypes: - col_name_to_dtype[col] = pd.core.dtypes.cast.find_common_type( - [col_name_to_dtype[col], frame_dtypes[col]] - ) - - if sort: - col_name_to_dtype = dict( - (col, col_name_to_dtype[col]) for col in sorted(col_name_to_dtype) - ) - - if ignore_index: - table_col_name_to_dtype = col_name_to_dtype - else: - table_col_name_to_dtype = ColNameCodec.concat_index_names(frames) - index_cols = list(table_col_name_to_dtype) - table_col_name_to_dtype.update(col_name_to_dtype) - - dtypes = pd.Series( - table_col_name_to_dtype.values(), index=table_col_name_to_dtype.keys() - ) - for i, frame in enumerate(frames): - frame_dtypes = frame._dtypes.get() - if ( - len(frame_dtypes) != len(dtypes) - or any(frame_dtypes.index != dtypes.index) - or any(frame_dtypes.values != dtypes.values) - ): - exprs = dict() - uses_rowid = False - for col in table_col_name_to_dtype: - if col in frame_dtypes: - expr = frame.ref(col) - elif col == UNNAMED_IDX_COL_NAME: - if frame._index_cols is not None: - assert len(frame._index_cols) == 1 - expr = frame.ref(frame._index_cols[0]) - else: - uses_rowid = True - expr = frame.ref(ROWID_COL_NAME) - else: - expr = LiteralExpr(None, table_col_name_to_dtype[col]) - if expr._dtype != table_col_name_to_dtype[col]: - expr = expr.cast(table_col_name_to_dtype[col]) - exprs[col] = expr - frames[i] = frame.__constructor__( - columns=dtypes.index, - dtypes=dtypes, 
- uses_rowid=uses_rowid, - op=TransformNode(frame, exprs), - force_execution_mode=frame._force_execution_mode, - ) - - return self.__constructor__( - index_cols=index_cols, - columns=col_name_to_dtype.keys(), - dtypes=dtypes, - op=UnionNode(frames, col_name_to_dtype, ignore_index), - force_execution_mode=self._force_execution_mode, - ) - - def _join_by_index(self, other_modin_frames, how, sort, ignore_index): - """ - Perform equi-join operation for multiple frames by index columns. - - Parameters - ---------- - other_modin_frames : list of HdkOnNativeDataframe - Frames to join with. - how : str - A type of join. - sort : bool - Sort the result by join keys. - ignore_index : bool - If True then reset column index for the resulting frame. - - Returns - ------- - HdkOnNativeDataframe - The new frame. - """ - try: - check_join_supported(how) - except NotImplementedError as err: - # The outer join is not supported by HDK, however, if all the frames - # have a trivial index, we can simply concatenate the columns with arrow. - if (frame := self._join_arrow_columns(other_modin_frames)) is not None: - return frame - raise err - - lhs = self._maybe_materialize_rowid() - reset_index_names = False - new_columns_dtype = self.columns.dtype - for rhs in other_modin_frames: - rhs = rhs._maybe_materialize_rowid() - if len(lhs._index_cols) != len(rhs._index_cols): - raise NotImplementedError( - "join by indexes with different sizes is not supported" - ) - if new_columns_dtype != rhs.columns.dtype: - new_columns_dtype = None - - reset_index_names = reset_index_names or lhs._index_cols != rhs._index_cols - - condition = lhs._build_equi_join_condition( - rhs, lhs._index_cols, rhs._index_cols - ) - - exprs = lhs._index_exprs() - new_columns = lhs.columns.to_list() - for col in lhs.columns: - exprs[col] = lhs.ref(col) - for col in rhs.columns: - # Handle duplicating column names here. When user specifies - # suffixes to make a join, actual renaming is done in front-end. - new_col_name = col - rename_idx = 0 - while new_col_name in exprs: - new_col_name = f"{col}{rename_idx}" - rename_idx += 1 - exprs[new_col_name] = rhs.ref(col) - new_columns.append(new_col_name) - - op = JoinNode( - lhs, - rhs, - how=how, - exprs=exprs, - condition=condition, - ) - - # in the case of heterogeneous data, using the `dtype` parameter of the - # `Index` constructor can lead to the following error: - # `ValueError: string values cannot be losslessly cast to int64` - # that's why we explicitly call astype below - new_columns = Index(new_columns) - if new_columns.dtype != new_columns_dtype and new_columns_dtype is not None: - # ValueError: string values cannot be losslessly cast to int64 - new_columns = new_columns.astype(new_columns_dtype) - lhs = lhs.__constructor__( - dtypes=lhs._dtypes_for_exprs(exprs), - columns=new_columns, - index_cols=lhs._index_cols, - op=op, - force_execution_mode=self._force_execution_mode, - ) - - if sort: - lhs = lhs.sort_rows( - lhs._index_cols, - ascending=True, - ignore_index=False, - na_position="last", - ) - - if reset_index_names: - lhs = lhs._reset_index_names() - - if ignore_index: - new_columns = RangeIndex(range(len(lhs.columns))) - lhs = lhs._set_columns(new_columns) - - return lhs - - def _join_arrow_columns(self, other_modin_frames): - """ - Join arrow table columns. - - If all the frames have a trivial index and an arrow - table in partitions, concatenate the table columns. - - Parameters - ---------- - other_modin_frames : list of HdkOnNativeDataframe - Frames to join with. 
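For context, the removed _union_all aligns mismatched column dtypes across the concatenated frames with pandas' internal common-type resolution; the call below is the same helper the removed code relies on, shown only to illustrate the dtype widening it performs:

    import numpy as np
    from pandas.core.dtypes.cast import find_common_type

    find_common_type([np.dtype("int64"), np.dtype("float64")])  # dtype('float64')
    find_common_type([np.dtype("int64"), np.dtype("O")])        # dtype('O')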
- - Returns - ------- - HdkOnNativeDataframe or None - """ - frames = [self] + other_modin_frames - if all( - f._index_cols is None - # Make sure all the frames have an arrow table in partitions. - and isinstance(f._execute(), (DbTable, pyarrow.Table)) - for f in frames - ): - tables = [f._partitions[0][0].get(to_arrow=True) for f in frames] - column_names = [c for t in tables for c in t.column_names] - if len(column_names) != len(set(column_names)): - raise NotImplementedError("Duplicate column names") - max_len = max(len(t) for t in tables) - columns = [c for t in tables for c in t.columns] - new_dtypes = [dt for frame in frames for dt in frame.dtypes] - # Make all columns of the same length, if required. - for i, col in enumerate(columns): - if len(col) < max_len: - columns[i] = pyarrow.chunked_array( - col.chunks + [pyarrow.nulls(max_len - len(col), col.type)] - ) - new_dtypes[i] = arrow_type_to_pandas(columns[i].type) - return self.from_arrow( - at=pyarrow.table(columns, column_names), - columns=[c for f in frames for c in f.columns], - encode_col_names=False, - new_dtypes=new_dtypes, - ) - return None - - def concat( - self, - axis: Union[int, Axis], - other_modin_frames: List["HdkOnNativeDataframe"], - join: Optional[str] = "outer", - sort: Optional[bool] = False, - ignore_index: Optional[bool] = False, - ): - """ - Concatenate frames along a particular axis. - - Parameters - ---------- - axis : int or modin.core.dataframe.base.utils.Axis - The axis to concatenate along. - other_modin_frames : list of HdkOnNativeDataframe - Frames to concat. - join : {"outer", "inner"}, default: "outer" - How to handle mismatched indexes on other axis. - sort : bool, default: False - Sort non-concatenation axis if it is not already aligned - when join is 'outer'. - ignore_index : bool, default: False - Ignore index along the concatenation axis. - - Returns - ------- - HdkOnNativeDataframe - The new frame. - """ - axis = Axis(axis) - if axis == Axis.ROW_WISE: - return self._union_all( - axis.value, other_modin_frames, join, sort, ignore_index - ) - - if not other_modin_frames: - return self - - base = self - for frame in other_modin_frames: - base = base._find_common_projections_base(frame) - if base is None: - return self._join_by_index( - other_modin_frames, how=join, sort=sort, ignore_index=ignore_index - ) - - exprs = self._index_exprs() - new_columns = self.columns.tolist() - for col in self.columns: - exprs[col] = self.ref(col) - for frame in other_modin_frames: - for col in frame.columns: - if col == "" or col in exprs: - new_col = f"__col{len(exprs)}__" - else: - new_col = col - exprs[new_col] = frame.ref(col) - new_columns.append(new_col) - - exprs = translate_exprs_to_base(exprs, base) - new_columns = Index.__new__(Index, data=new_columns, dtype=self.columns.dtype) - new_frame = self.__constructor__( - columns=new_columns, - dtypes=self._dtypes_for_exprs(exprs), - op=TransformNode(base, exprs), - index_cols=self._index_cols, - force_execution_mode=self._force_execution_mode, - ) - return new_frame - - def bin_op(self, other, op_name, **kwargs): - """ - Perform binary operation. - - An arithmetic binary operation or a comparison operation to - perform on columns. - - Parameters - ---------- - other : scalar, list-like, or HdkOnNativeDataframe - The second operand. - op_name : str - An operation to perform. - **kwargs : dict - Keyword args. - - Returns - ------- - HdkOnNativeDataframe - The new frame. 
- """ - if isinstance(other, (int, float, str)): - value_expr = LiteralExpr(other) - exprs = self._index_exprs() - for col in self.columns: - exprs[col] = self.ref(col).bin_op(value_expr, op_name) - return self.__constructor__( - columns=self.columns, - dtypes=self._dtypes_for_exprs(exprs), - op=TransformNode(self, exprs), - index=self._index_cache, - index_cols=self._index_cols, - force_execution_mode=self._force_execution_mode, - ) - elif isinstance(other, list): - if kwargs.get("axis", 1) == 0: - raise NotImplementedError(f"{op_name} on rows") - if len(other) != len(self.columns): - raise ValueError( - f"length must be {len(self.columns)}: given {len(other)}" - ) - exprs = self._index_exprs() - for col, val in zip(self.columns, other): - exprs[col] = self.ref(col).bin_op(LiteralExpr(val), op_name) - return self.__constructor__( - columns=self.columns, - dtypes=self._dtypes_for_exprs(exprs), - op=TransformNode(self, exprs), - index=self._index_cache, - index_cols=self._index_cols, - force_execution_mode=self._force_execution_mode, - ) - elif isinstance(other, type(self)): - # For now we only support binary operations on - # projections of the same frame, because we have - # no support for outer join. - base = self._find_common_projections_base(other) - if base is None: - raise NotImplementedError( - "unsupported binary op args (outer join is not supported)" - ) - - new_columns = self.columns.tolist() - for col in other.columns: - if col not in self.columns: - new_columns.append(col) - new_columns = sorted(new_columns) - - fill_value = kwargs.get("fill_value", None) - if fill_value is not None: - fill_value = LiteralExpr(fill_value) - if is_cmp_op(op_name): - null_value = LiteralExpr(op_name == "ne") - else: - null_value = LiteralExpr(None) - - exprs = self._index_exprs() - for col in new_columns: - lhs = self.ref(col) if col in self.columns else fill_value - rhs = other.ref(col) if col in other.columns else fill_value - if lhs is None or rhs is None: - exprs[col] = null_value - else: - exprs[col] = lhs.bin_op(rhs, op_name) - - exprs = translate_exprs_to_base(exprs, base) - return self.__constructor__( - columns=new_columns, - dtypes=self._dtypes_for_exprs(exprs), - op=TransformNode(base, exprs), - index=self._index_cache, - index_cols=self._index_cols, - force_execution_mode=self._force_execution_mode, - ) - else: - raise NotImplementedError(f"unsupported operand type: {type(other)}") - - def insert(self, loc, column, value): - """ - Insert a constant column. - - Parameters - ---------- - loc : int - Inserted column location. - column : str - Inserted column name. - value : scalar - Inserted column value. - - Returns - ------- - HdkOnNativeDataframe - The new frame. 
- """ - assert column not in self._table_cols - assert 0 <= loc <= len(self.columns) - - if is_list_like(value): - if isinstance(value, pd.Series) and not self.index.equals(value.index): - # Align by index - value = value.reindex(self.index) - value.reset_index(drop=True, inplace=True) - return self._insert_list(loc, column, value) - - exprs = self._index_exprs() - for i in range(0, loc): - col = self.columns[i] - exprs[col] = self.ref(col) - exprs[column] = LiteralExpr(value) - for i in range(loc, len(self.columns)): - col = self.columns[i] - exprs[col] = self.ref(col) - - new_columns = self.columns.insert(loc, column) - - return self.__constructor__( - columns=new_columns, - dtypes=self._dtypes_for_exprs(exprs), - op=TransformNode(self, exprs), - index=self._index_cache, - index_cols=self._index_cols, - force_execution_mode=self._force_execution_mode, - ) - - def _insert_list(self, loc, name, value): - """ - Insert a list-like value. - - Parameters - ---------- - loc : int - name : str - value : list - - Returns - ------- - HdkOnNativeDataframe - """ - ncols = len(self.columns) - - if loc == -1: - loc = ncols - - if ncols == 0: - assert loc == 0 - return self._list_to_df(name, value, True) - - if self._partitions and self._partitions[0][0].raw: - return self._insert_list_col(loc, name, value) - - if loc == 0 or loc == ncols: - in_idx = 0 if loc == 0 else 1 - if ( - isinstance(self._op, JoinNode) - and self._op.by_rowid - and self._op.input[in_idx]._partitions - and self._op.input[in_idx]._partitions[0][0].raw - ): - lhs = self._op.input[0] - rhs = self._op.input[1] - if loc == 0: - lhs = lhs._insert_list(0, name, value) - dtype = lhs.dtypes[0] - else: - rhs = rhs._insert_list(-1, name, value) - dtype = rhs.dtypes[-1] - elif loc == 0: - lhs = self._list_to_df(name, value, False) - rhs = self - dtype = lhs.dtypes[0] - else: - lhs = self - rhs = self._list_to_df(name, value, False) - dtype = rhs.dtypes[0] - elif isinstance(self._op, JoinNode) and self._op.by_rowid: - left_len = len(self._op.input[0].columns) - if loc < left_len: - lhs = self._op.input[0]._insert_list(loc, name, value) - rhs = self._op.input[1] - dtype = lhs.dtypes[loc] - else: - lhs = self._op.input[0] - rhs = self._op.input[1]._insert_list(loc - left_len, name, value) - dtype = rhs.dtypes[loc] - else: - lexprs = self._index_exprs() - rexprs = {} - for i, col in enumerate(self.columns): - (lexprs if i < loc else rexprs)[col] = self.ref(col) - lhs = self.__constructor__( - columns=self.columns[0:loc], - dtypes=self._dtypes_for_exprs(lexprs), - op=TransformNode(self, lexprs), - index=self._index_cache, - index_cols=self._index_cols, - force_execution_mode=self._force_execution_mode, - )._insert_list(loc, name, value) - rhs = self.__constructor__( - columns=self.columns[loc:], - dtypes=self._dtypes_for_exprs(rexprs), - op=TransformNode(self, rexprs), - force_execution_mode=self._force_execution_mode, - ) - dtype = lhs.dtypes[loc] - - op = self._join_by_rowid_op(lhs, rhs) - return self._insert_list_col(loc, name, value, dtype, op) - - def _insert_list_col(self, idx, name, value, dtype=None, op=None): - """ - Insert a list-like column. 
- - Parameters - ---------- - idx : int - name : str - value : list - dtype : dtype, default: None - op : DFAlgNode, default: None - - Returns - ------- - HdkOnNativeDataframe - """ - cols = self.columns.tolist() - cols.insert(idx, name) - has_unsupported_data = self._has_unsupported_data - if self._index_cols: - idx += len(self._index_cols) - if dtype is None: - part, dtype = self._partitions[0][0].insert(idx, name, value) - part = np.array([[part]]) - if not has_unsupported_data: - try: - ensure_supported_dtype(dtype) - except NotImplementedError: - has_unsupported_data = True - else: - part = None - dtypes = self._dtypes.tolist() - dtypes.insert(idx, dtype) - return self.copy( - partitions=part, - columns=cols, - dtypes=dtypes, - op=op, - has_unsupported_data=has_unsupported_data, - ) - - def _list_to_df(self, name, value, add_index): - """ - Create a single-column frame from the list-like value. - - Parameters - ---------- - name : str - value : list - add_index : bool - - Returns - ------- - HdkOnNativeDataframe - """ - df = pd.DataFrame({name: value}, index=self.index if add_index else None) - ensure_supported_dtype(df.dtypes[0]) - return self.from_pandas(df) - - @staticmethod - def _join_by_rowid_op(lhs, rhs): - """ - Create a JoinNode for join by rowid. - - Parameters - ---------- - lhs : HdkOnNativeDataframe - rhs : HdkOnNativeDataframe - - Returns - ------- - JoinNode - """ - exprs = lhs._index_exprs() if lhs._index_cols else rhs._index_exprs() - exprs.update((c, lhs.ref(c)) for c in lhs.columns) - exprs.update((c, rhs.ref(c)) for c in rhs.columns) - condition = lhs._build_equi_join_condition( - rhs, [ROWID_COL_NAME], [ROWID_COL_NAME] - ) - return JoinNode(lhs, rhs, exprs=exprs, condition=condition) - - def cat_codes(self): - """ - Extract codes for a category column. - - The frame should have a single data column. - - Returns - ------- - HdkOnNativeDataframe - The new frame. - """ - assert len(self.columns) == 1 - assert isinstance(self._dtypes[-1], pd.CategoricalDtype) - - exprs = self._index_exprs() - col_expr = self.ref(self.columns[-1]) - code_expr = OpExpr("KEY_FOR_STRING", [col_expr], _get_dtype("int32")) - null_val = LiteralExpr(np.int32(-1)) - col_name = MODIN_UNNAMED_SERIES_LABEL - exprs[col_name] = build_if_then_else( - col_expr.is_null(), null_val, code_expr, _get_dtype("int32") - ) - dtypes = [expr._dtype for expr in exprs.values()] - - return self.__constructor__( - columns=Index([col_name]), - dtypes=pd.Series(dtypes, index=Index(exprs.keys())), - op=TransformNode(self, exprs), - index=self._index_cache, - index_cols=self._index_cols, - force_execution_mode=self._force_execution_mode, - ) - - def sort_rows(self, columns, ascending, ignore_index, na_position): - """ - Sort rows of the frame. - - Parameters - ---------- - columns : str or list of str - Sorting keys. - ascending : bool or list of bool - Sort order. - ignore_index : bool - Drop index columns. - na_position : {"first", "last"} - NULLs position. - - Returns - ------- - HdkOnNativeDataframe - The new frame. - """ - if na_position != "first" and na_position != "last": - raise ValueError(f"Unsupported na_position value '{na_position}'") - - base = self - - # If index is preserved and we have no index columns then we - # need to create one using __rowid__ virtual column. 
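For context, the removed cat_codes reproduces pandas' convention of encoding a missing category as -1, which is what the NULL check around KEY_FOR_STRING achieves. The pandas behaviour it mirrors:

    import pandas as pd

    s = pd.Series(["a", None, "b"], dtype="category")
    list(s.cat.codes)   # [0, -1, 1]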
- if not ignore_index and base._index_cols is None: - base = base._materialize_rowid() - - if not isinstance(columns, list): - columns = [columns] - columns = [base._find_index_or_col(col) for col in columns] - - if isinstance(ascending, list): - if len(ascending) != len(columns): - raise ValueError("ascending list length doesn't match columns list") - else: - if not isinstance(ascending, bool): - raise ValueError("unsupported ascending value") - ascending = [ascending] * len(columns) - - if ignore_index: - # If index is ignored then we might need to drop some columns. - # At the same time some of dropped index columns can be used - # for sorting and should be droped after sorting is done. - if base._index_cols is not None: - drop_index_cols_before = [ - col for col in base._index_cols if col not in columns - ] - drop_index_cols_after = [ - col for col in base._index_cols if col in columns - ] - - if drop_index_cols_before: - exprs = dict() - index_cols = ( - drop_index_cols_after if drop_index_cols_after else None - ) - for col in drop_index_cols_after: - exprs[col] = base.ref(col) - for col in base.columns: - exprs[col] = base.ref(col) - base = base.__constructor__( - columns=base.columns, - dtypes=base._dtypes_for_exprs(exprs), - op=TransformNode(base, exprs), - index_cols=index_cols, - force_execution_mode=base._force_execution_mode, - ) - - base = base.__constructor__( - columns=base.columns, - dtypes=base.copy_dtypes_cache(), - op=SortNode(base, columns, ascending, na_position), - index_cols=base._index_cols, - force_execution_mode=base._force_execution_mode, - ) - - if drop_index_cols_after: - exprs = dict() - for col in base.columns: - exprs[col] = base.ref(col) - base = base.__constructor__( - columns=base.columns, - dtypes=base._dtypes_for_exprs(exprs), - op=TransformNode(base, exprs), - index_cols=None, - force_execution_mode=base._force_execution_mode, - ) - - return base - else: - return base.__constructor__( - columns=base.columns, - dtypes=base.copy_dtypes_cache(), - op=SortNode(base, columns, ascending, na_position), - index_cols=None, - force_execution_mode=base._force_execution_mode, - ) - else: - return base.__constructor__( - columns=base.columns, - dtypes=base.copy_dtypes_cache(), - op=SortNode(base, columns, ascending, na_position), - index_cols=base._index_cols, - force_execution_mode=base._force_execution_mode, - ) - - def filter(self, key): - """ - Filter rows by a boolean key column. - - Parameters - ---------- - key : HdkOnNativeDataframe - A frame with a single bool data column used as a filter. - - Returns - ------- - HdkOnNativeDataframe - The new frame. - """ - if not isinstance(key, type(self)): - raise NotImplementedError("Unsupported key type in filter") - - if not isinstance(key._op, TransformNode) or len(key.columns) != 1: - raise NotImplementedError("Unsupported key in filter") - - key_col = key.columns[0] - if not is_bool_dtype(key._dtypes[key_col]): - raise NotImplementedError("Unsupported key in filter") - - base = self._find_common_projections_base(key) - if base is None: - raise NotImplementedError("Unsupported key in filter") - - # We build the resulting frame by applying the filter to the - # base frame and then using the filtered result as a new base. - # If base frame has no index columns, then we need to create - # one. 
- key_exprs = translate_exprs_to_base(key._op.exprs, base) - if base._index_cols is None: - filter_base = base._materialize_rowid() - key_exprs = replace_frame_in_exprs(key_exprs, base, filter_base) - else: - filter_base = base - condition = key_exprs[key_col] - filtered_base = self.__constructor__( - columns=filter_base.columns, - dtypes=filter_base.copy_dtypes_cache(), - op=FilterNode(filter_base, condition), - index_cols=filter_base._index_cols, - force_execution_mode=self._force_execution_mode, - ) - - if self is base: - exprs = dict() - for col in filtered_base._table_cols: - exprs[col] = filtered_base.ref(col) - else: - assert isinstance( - self._op, TransformNode - ), f"unexpected op: {self._op.dumps()}" - exprs = translate_exprs_to_base(self._op.exprs, base) - exprs = replace_frame_in_exprs(exprs, base, filtered_base) - if base._index_cols is None: - idx_name = mangle_index_names([None])[0] - # `idx_name` should be first - exprs = {idx_name: filtered_base.ref(idx_name)} | exprs - - return self.__constructor__( - columns=self.columns, - dtypes=self._dtypes_for_exprs(exprs), - op=TransformNode(filtered_base, exprs), - index_cols=filtered_base._index_cols, - force_execution_mode=self._force_execution_mode, - ) - - def force_import(self) -> DbTable: - """ - Force table import. - - Returns - ------- - DbTable - The imported table. - """ - if self._has_unsupported_data: - raise NotImplementedError("Unable to import a frame with unsupported data") - self._execute() - return self._partition_mgr_cls.import_table(self) - - def _maybe_materialize_rowid(self): - """ - Materialize virtual 'rowid' column if frame uses it as an index. - - Returns - ------- - HdkOnNativeDataframe - The new frame. - """ - if self._index_cols is None: - return self._materialize_rowid() - return self - - def _materialize_rowid(self): - """ - Materialize virtual 'rowid' column. - - Make a projection with a virtual 'rowid' column materialized - as '__index__' column. - - Returns - ------- - HdkOnNativeDataframe - The new frame. - """ - name = self._index_cache.get().name if self.has_materialized_index else None - name = mangle_index_names([name])[0] - exprs = dict() - exprs[name] = self.ref(ROWID_COL_NAME) - for col in self._table_cols: - exprs[col] = self.ref(col) - return self.__constructor__( - columns=self.columns, - dtypes=self._dtypes_for_exprs(exprs), - op=TransformNode(self, exprs), - index_cols=[name], - uses_rowid=True, - force_execution_mode=self._force_execution_mode, - ) - - def _index_exprs(self): - """ - Build index column expressions. - - Build dictionary with references to all index columns - mapped to index column names. - - Returns - ------- - dict - """ - exprs = dict() - if self._index_cols: - for col in self._index_cols: - exprs[col] = self.ref(col) - return exprs - - def _find_common_projections_base(self, rhs): - """ - Try to find a common base for projections. - - Check if two frames can be expressed as `TransformNode` - operations from the same input frame. - - Parameters - ---------- - rhs : HdkOnNativeDataframe - The second frame. - - Returns - ------- - HdkOnNativeDataframe - The found common projection base or None. - """ - bases = {self} - while self._is_projection(): - self = self._op.input[0] - bases.add(self) - - while rhs not in bases and rhs._is_projection(): - rhs = rhs._op.input[0] - - if rhs in bases: - return rhs - - return None - - def _is_projection(self): - """ - Check if frame is a ``TranformNode`` operation. 
- - Returns - ------- - bool - """ - return isinstance(self._op, TransformNode) - - def _execute(self): - """ - Materialize lazy frame. - - After this call frame always has ``FrameNode`` operation. - - Returns - ------- - DbTable or pyarrow.Table or pandas.Dataframe - """ - if isinstance(self._op, FrameNode): - return self._op.execute_arrow() - - result = None - stack = [self._materialize, self] - while stack: - frame = stack.pop() - if callable(frame): - result = frame() - continue - if isinstance(frame._op, FrameNode): - result = frame._op.execute_arrow() - continue - if not frame._op.can_execute_hdk() and stack[-1] != frame._materialize: - stack.append(frame._materialize) - if frame._uses_rowid or frame._op.require_executed_base(): - for i in reversed(frame._op.input): - if not isinstance(i._op, FrameNode): - stack.append(i._materialize) - stack.append(i) - else: - stack.extend(reversed(frame._op.input)) - return result - - def _materialize(self): - """ - Materialize this frame. - - Returns - ------- - DbTable or pyarrow.Table - """ - mode = self._force_execution_mode - assert mode != "lazy", "Unexpected execution triggered on lazy frame!" - - if isinstance(self._op, FrameNode): - return self._op.execute_arrow() - - if ( - mode == "arrow" - or not self._op.can_execute_hdk() - or (self._can_execute_arrow() and mode != "hdk") - ): - new_table = self._execute_arrow() - partitions = self._partition_mgr_cls.from_arrow( - new_table, unsupported_cols=[], encode_col_names=False - )[0] - else: - partitions = self._partition_mgr_cls.run_exec_plan(self._op) - - self._partitions = partitions - self._op = FrameNode(self) - return partitions[0][0].get() - - def _can_execute_arrow(self): - """ - Check for possibility of Arrow execution. - - Check if operation's tree for the frame can be executed using - Arrow API instead of HDK query. - - Returns - ------- - bool - """ - if self._force_execution_mode == "hdk": - return False - - stack = [self] - while stack: - op = stack.pop()._op - if not op.can_execute_arrow(): - return False - if input := getattr(op, "input", None): - stack.extend(input) - return True - - def _execute_arrow(self): - """ - Compute the frame data using Arrow API. - - Returns - ------- - pyarrow.Table - The resulting table. - """ - result = None - stack = [self] - - while stack: - frame = stack.pop() - - if callable(frame): - if isinstance(result := frame(result), DbTable): - result = result.to_arrow() - elif input := getattr(frame._op, "input", None): - if len(input) == 1: - stack.append(frame._op.execute_arrow) - stack.append(input[0]) - else: - - def to_arrow(result, op=frame._op, tables=[], frames=iter(input)): - """ - Convert the input list to a list of arrow tables. - - This function is created for each input list. When the function - is created, the frames iterator is saved in the `frames` argument. - Then, the function is added to the stack followed by the first - frame from the `frames` iterator. When the frame is processed, the - arrow table is added to the `tables` list. This procedure is - repeated until the iterator is not empty. When all the frames are - processed, the arrow tables are passed to `execute_arrow` and the - result is returned. - """ - if (f := next(frames, None)) is None: - return op.execute_arrow(tables) - else: - # When this function is called, the `frame` attribute contains - # a reference to this function. 
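For context, the removed execution helpers walk the lazy operation tree iteratively with an explicit stack instead of recursing. A stripped-down sketch of that pattern (attribute names follow the removed code, but this is not the original implementation):

    def can_execute_arrow(frame):
        # Depth-first walk over the lazy operation tree with an explicit stack;
        # bail out as soon as any node cannot go through the Arrow path.
        stack = [frame]
        while stack:
            op = stack.pop()._op
            if not op.can_execute_arrow():
                return False
            stack.extend(getattr(op, "input", None) or [])
        return True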
- stack.append(frame if callable(frame) else to_arrow) - stack.append(tables.append) - stack.append(f) - return result - - to_arrow(result) - elif isinstance(result := frame._op.execute_arrow(result), DbTable): - result = result.to_arrow() - - return result - - def _compute_axis_labels_and_lengths(self, axis: int, partitions=None): - """ - Compute the labels for specific `axis`. - - Parameters - ---------- - axis : int - Axis to compute labels along. - partitions : np.ndarray, optional - This parameter serves compatibility purpose and must always be ``None``. - - Returns - ------- - pandas.Index - Labels for the specified `axis`. - List of int - Size of partitions alongside specified `axis`. - """ - ErrorMessage.catch_bugs_and_request_email( - failure_condition=partitions is not None, - extra_log="'._compute_axis_labels_and_lengths(partitions)' is not yet supported for HDK backend", - ) - - obj = self._execute() - - if axis == 1: - cols = self._table_cols - if self._index_cols is not None: - cols = cols[len(self._index_cols) :] - return (cols, [len(cols)]) - - if self._index_cols is None: - index = RangeIndex(range(len(obj))) - return (index, [len(index)]) - if isinstance(obj, DbTable): - # TODO: Get the index columns only - obj = obj.to_arrow() - if isinstance(obj, pyarrow.Table): - # The index columns must be in the beginning of the list - col_names = obj.column_names[len(self._index_cols) :] - index_at = obj.drop(col_names) - index_df = index_at.to_pandas() - index_df.set_index(self._index_cols, inplace=True) - idx = index_df.index - idx.rename(demangle_index_names(self._index_cols), inplace=True) - if ( - isinstance(idx, (pd.DatetimeIndex, pd.TimedeltaIndex)) - and len(idx) >= 3 # infer_freq() requires at least 3 values - ): - idx.freq = pd.infer_freq(idx) - return (idx, [len(idx)]) - else: - return (obj.index, [len(obj.index)]) - - def _build_index_cache(self): - """Materialize index and store it in the cache.""" - if self._partitions and not self._index_cols: - nrows = self._partitions[0][0]._length_cache - self.set_index_cache(RangeIndex(range(nrows))) - else: - index, _ = self._compute_axis_labels_and_lengths(axis=0) - self.set_index_cache(index) - - def _get_index(self): - """ - Get the index of the frame in pandas format. - - Materializes the frame if required. - - Returns - ------- - pandas.Index - """ - if not self.has_index_cache: - self._build_index_cache() - return self._index_cache.get() - - def _set_index(self, new_index): - """ - Set new index for the frame. - - Parameters - ---------- - new_index : pandas.Index - New index. - - Returns - ------- - HdkOnNativeDataframe - The new frame. 
- """ - if not isinstance(new_index, (Index, MultiIndex)): - raise NotImplementedError( - "HdkOnNativeDataframe._set_index is not yet suported" - ) - - obj = self._execute() - if isinstance(obj, pd.DataFrame): - raise NotImplementedError( - "HdkOnNativeDataframe._set_index is not yet suported" - ) - else: - at = obj if isinstance(obj, pyarrow.Table) else obj.to_arrow() - if self._index_cols: - at = at.drop(self._index_cols) - - new_index = new_index.copy() - index_names = mangle_index_names(new_index.names) - new_index.names = index_names - index_df = pd.DataFrame(data={}, index=new_index) - index_df = index_df.reset_index() - index_at = pyarrow.Table.from_pandas(index_df) - - for i, field in enumerate(at.schema): - index_at = index_at.append_column(field, at.column(i)) - - return self.from_arrow(index_at, index_names, new_index, self.columns) - - def reset_index(self, drop): - """ - Set the default index for the frame. - - Parameters - ---------- - drop : bool - If True then drop current index columns, otherwise - make them data columns. - - Returns - ------- - HdkOnNativeDataframe - The new frame. - """ - if drop: - exprs = dict() - for c in self.columns: - exprs[c] = self.ref(c) - return self.__constructor__( - columns=self.columns, - dtypes=self._dtypes_for_exprs(exprs), - op=TransformNode(self, exprs), - index_cols=None, - force_execution_mode=self._force_execution_mode, - ) - else: - if self._index_cols is None: - raise NotImplementedError( - "default index reset with no drop is not supported" - ) - # Need to demangle index names. - exprs = dict() - for i, c in enumerate(self._index_cols): - name = ColNameCodec.demangle_index_name(c) - if name is None: - name = f"level_{i}" - if name in exprs: - raise ValueError(f"cannot insert {name}, already exists") - if isinstance(self.columns, MultiIndex) and not isinstance(name, tuple): - name = (name, *([""] * (self.columns.nlevels - 1))) - exprs[name] = self.ref(c) - for c in self.columns: - if c in exprs: - raise ValueError(f"cannot insert {c}, already exists") - exprs[c] = self.ref(c) - new_columns = Index.__new__( - Index, - data=exprs.keys(), - dtype="O", - name=( - self.columns.names - if isinstance(self.columns, MultiIndex) - else self.columns.name - ), - ) - return self.__constructor__( - columns=new_columns, - dtypes=self._dtypes_for_exprs(exprs), - op=TransformNode(self, exprs), - index_cols=None, - force_execution_mode=self._force_execution_mode, - ) - - def _reset_index_names(self): - """ - Reset names for all index columns. - - Returns - ------- - HdkOnNativeDataframe - The new frame. - """ - if self.has_multiindex(): - return self.set_index_names([None] * len(self._index_cols)) - return self.set_index_name(None) - - def _set_columns(self, new_columns): - """ - Rename columns. - - Parameters - ---------- - new_columns : list-like of str - New column names. - - Returns - ------- - HdkOnNativeDataframe - The new frame. 
- """ - if ( - self.columns.identical(new_columns) - if isinstance(new_columns, Index) - else all(self.columns == new_columns) - ): - return self - exprs = self._index_exprs() - for old, new in zip(self.columns, new_columns): - expr = self.ref(old) - if exprs.setdefault(new, expr) is not expr: - raise NotImplementedError("duplicate column names are not supported") - return self.__constructor__( - columns=new_columns, - dtypes=self._dtypes.tolist(), - op=TransformNode(self, exprs), - index=self._index_cache, - index_cols=self._index_cols, - force_execution_mode=self._force_execution_mode, - ) - - def _get_columns(self): - """ - Return column labels of the frame. - - Returns - ------- - pandas.Index - """ - return super(HdkOnNativeDataframe, self)._get_columns() - - def __dataframe__(self, nan_as_null: bool = False, allow_copy: bool = True): - """ - Get a DataFrame exchange protocol object representing data of the Modin DataFrame. - - Parameters - ---------- - nan_as_null : bool, default: False - A keyword intended for the consumer to tell the producer - to overwrite null values in the data with ``NaN`` (or ``NaT``). - This currently has no effect; once support for nullable extension - dtypes is added, this value should be propagated to columns. - allow_copy : bool, default: True - A keyword that defines whether or not the library is allowed - to make a copy of the data. For example, copying data would be necessary - if a library supports strided buffers, given that this protocol - specifies contiguous buffers. Currently, if the flag is set to ``False`` - and a copy is needed, a ``RuntimeError`` will be raised. - - Returns - ------- - ProtocolDataframe - A dataframe object following the dataframe exchange protocol specification. - """ - if self._has_unsupported_data: - ErrorMessage.default_to_pandas(message="`__dataframe__`") - pd_df = self.to_pandas() - if hasattr(pd_df, "__dataframe__"): - return pd_df.__dataframe__() - raise NotImplementedError( - "HDK execution does not support exchange protocol if the frame contains data types " - + "that are unsupported by HDK." - ) - - from ..interchange.dataframe_protocol.dataframe import HdkProtocolDataframe - - return HdkProtocolDataframe( - self, nan_as_null=nan_as_null, allow_copy=allow_copy - ) - - @classmethod - def from_dataframe(cls, df: ProtocolDataframe) -> "HdkOnNativeDataframe": - """ - Convert a DataFrame implementing the dataframe exchange protocol to a Core Modin Dataframe. - - See more about the protocol in https://data-apis.org/dataframe-protocol/latest/index.html. - - Parameters - ---------- - df : ProtocolDataframe - The DataFrame object supporting the dataframe exchange protocol. - - Returns - ------- - HdkOnNativeDataframe - A new Core Modin Dataframe object. - """ - if isinstance(df, cls): - return df - - if not hasattr(df, "__dataframe__"): - raise ValueError( - "`df` does not support DataFrame exchange protocol, i.e. `__dataframe__` method" - ) - - from modin.core.dataframe.pandas.interchange.dataframe_protocol.from_dataframe import ( - from_dataframe_to_pandas, - ) - - # TODO: build a PyArrow table instead of a pandas DataFrame from the protocol object - # as it's possible to do zero-copy with `cls.from_arrow` - ErrorMessage.default_to_pandas(message="`from_dataframe`") - pd_df = from_dataframe_to_pandas(df) - return cls.from_pandas(pd_df) - - columns = property(_get_columns) - index = property(_get_index) - - @property - def dtypes(self): - """ - Return column data types. 
- - Returns - ------- - pandas.Series - A pandas Series containing the data types for this dataframe. - """ - if self._index_cols is not None: - # [] operator will return pandas.Series - return self._dtypes[len(self._index_cols) :] - return self._dtypes.get() - - def has_multiindex(self): - """ - Check for multi-index usage. - - Return True if the frame has a multi-index (index with - multiple columns) and False otherwise. - - Returns - ------- - bool - """ - if self.has_materialized_index: - return isinstance(self.index, MultiIndex) - return self._index_cols is not None and len(self._index_cols) > 1 - - def get_index_name(self): - """ - Get the name of the index column. - - Returns None for default index and multi-index. - - Returns - ------- - str or None - """ - if self.has_materialized_index: - return self._index_cache.get().name - if self._index_cols is None: - return None - if len(self._index_cols) > 1: - return None - return self._index_cols[0] - - def set_index_name(self, name): - """ - Set new name for the index column. - - Shouldn't be called for frames with multi-index. - - Parameters - ---------- - name : str or None - New index name. - - Returns - ------- - HdkOnNativeDataframe - The new frame. - """ - if self.has_multiindex(): - ErrorMessage.single_warning("Scalar name for MultiIndex is not supported!") - return self - - if self._index_cols is None and name is None: - return self - - names = mangle_index_names([name]) - exprs = dict() - if self._index_cols is None: - exprs[names[0]] = self.ref(ROWID_COL_NAME) - else: - exprs[names[0]] = self.ref(self._index_cols[0]) - - for col in self.columns: - exprs[col] = self.ref(col) - - return self.__constructor__( - columns=self.columns, - dtypes=self._dtypes_for_exprs(exprs), - op=TransformNode(self, exprs), - index_cols=names, - uses_rowid=self._index_cols is None, - force_execution_mode=self._force_execution_mode, - ) - - def get_index_names(self): - """ - Get index column names. - - Returns - ------- - list of str - """ - if self.has_materialized_index: - return self._index_cache.get().names - if self.has_multiindex(): - return self._index_cols.copy() - return [self.get_index_name()] - - def set_index_names(self, names): - """ - Set index labels for frames with multi-index. - - Parameters - ---------- - names : list of str - New index labels. - - Returns - ------- - HdkOnNativeDataframe - The new frame. - """ - if not self.has_multiindex(): - raise ValueError("Can set names for MultiIndex only") - - if len(names) != len(self._index_cols): - raise ValueError( - f"Unexpected names count: expected {len(self._index_cols)} got {len(names)}" - ) - - names = mangle_index_names(names) - exprs = dict() - for old, new in zip(self._index_cols, names): - exprs[new] = self.ref(old) - for col in self.columns: - exprs[col] = self.ref(col) - - return self.__constructor__( - columns=self.columns, - dtypes=self._dtypes_for_exprs(exprs), - op=TransformNode(self, exprs), - index_cols=names, - force_execution_mode=self._force_execution_mode, - ) - - def to_pandas(self): - """ - Transform the frame to pandas format. - - Returns - ------- - pandas.DataFrame - """ - if self._force_execution_mode == "lazy": - raise RuntimeError("unexpected to_pandas triggered on lazy frame") - - obj = self._execute() - - if isinstance(obj, DbTable): - obj = obj.to_arrow() - if isinstance(obj, pyarrow.Table): - # If the table is exported from HDK, the string columns are converted - # to dictionary. 
On conversion to pandas, these columns will be of type - # Categorical, that is not correct. To make the valid conversion, these - # fields are cast to string. - schema = obj.schema - cast = { - idx: arrow_type.name - for idx, (arrow_type, pandas_type) in enumerate( - zip(schema, self._dtypes) - ) - if is_dictionary(arrow_type.type) - and not isinstance(pandas_type, pd.CategoricalDtype) - } - if cast: - for idx, new_type in cast.items(): - schema = schema.set(idx, pyarrow.field(new_type, pyarrow.string())) - obj = obj.cast(schema) - # concatenate() is called by _partition_mgr_cls.to_pandas - # to preserve the categorical dtypes - df = concatenate([arrow_to_pandas(obj, self._dtypes)]) - else: - df = obj.copy() - - # If we make dataframe from Arrow table then we might need to set - # index columns. - if len(df.columns) != len(self.columns): - assert self._index_cols - if self.has_materialized_index: - df.drop(columns=self._index_cols, inplace=True) - df.index = self._index_cache.get().copy() - else: - df.set_index(self._index_cols, inplace=True) - df.index.rename(demangle_index_names(self._index_cols), inplace=True) - assert len(df.columns) == len(self.columns) - else: - assert self._index_cols is None - assert ( - df.index.name is None or self._has_unsupported_data - ), f"index name '{df.index.name}' is not None" - if self.has_materialized_index: - df.index = self._index_cache.get().copy() - - # Restore original column labels encoded in HDK to meet its - # restrictions on column names. - df.columns = self.columns - - return df - - def _find_index_or_col(self, col): - """ - Find a column name corresponding to a column or index label. - - Parameters - ---------- - col : str - A column or index label. - - Returns - ------- - str - A column name corresponding to a label. - """ - if col in self.columns: - return col - - if self._index_cols is not None: - if col in self._index_cols: - return col - - pattern = re.compile(f"{IDX_COL_NAME}\\d+_{encode_col_name(col)}") - for idx_col in self._index_cols: - if pattern.match(idx_col): - return idx_col - - raise ValueError(f"Unknown column '{col}'") - - @classmethod - def from_pandas(cls, df): - """ - Build a frame from a `pandas.DataFrame`. - - Parameters - ---------- - df : pandas.DataFrame - Source frame. - - Returns - ------- - HdkOnNativeDataframe - The new frame. - """ - new_index = df.index - new_columns = df.columns - - if isinstance(new_columns, MultiIndex): - # MultiIndex columns are not supported by the HDK backend. - # We just print this warning here and fall back to pandas. - index_cols = None - ErrorMessage.single_warning( - "MultiIndex columns are not currently supported by the HDK backend." - ) - # If there is non-trivial index, we put it into columns. - # If the index is trivial, but there are no columns, we put - # it into columns either because, otherwise, we don't know - # the number of rows and, thus, unable to restore the index. - # That's what we usually have for arrow tables and execution - # result. Unnamed index is renamed to {IDX_COL_PREF}. Also all - # columns get encoded to handle names unsupported in HDK. 
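For context, the removed to_pandas casts dictionary-encoded string columns back to plain strings so they are not returned as Categorical. The same cast expressed with plain pyarrow (illustrative, not the removed helper):

    import pyarrow as pa

    t = pa.table({"s": pa.array(["x", "y", "x"]).dictionary_encode()})
    schema = t.schema.set(0, pa.field("s", pa.string()))
    t.cast(schema).to_pandas()["s"].dtype   # object, not CategoricalDtype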
- elif ( - len(new_index) == 0 - and not isinstance(new_index, MultiIndex) - and new_index.name is None - ) or ( - new_index.name is None - and len(new_columns) != 0 - and is_trivial_index(new_index) - ): - index_cols = None - else: - orig_index_names = new_index.names - orig_df = df - index_cols = mangle_index_names(new_index.names) - df.index.names = index_cols - df = df.reset_index() - orig_df.index.names = orig_index_names - - new_dtypes = df.dtypes - - def encoder(n): - return ( - n - if n == MODIN_UNNAMED_SERIES_LABEL - else encode_col_name(n, ignore_reserved=False) - ) - - if index_cols is not None: - cols = index_cols.copy() - cols.extend([encoder(n) for n in df.columns[len(index_cols) :]]) - df.columns = cols - else: - df = df.rename(columns=encoder) - - ( - new_parts, - new_lengths, - new_widths, - unsupported_cols, - ) = cls._partition_mgr_cls.from_pandas( - df, return_dims=True, encode_col_names=False - ) - - if len(unsupported_cols) > 0: - ErrorMessage.single_warning( - f"Frame contain columns with unsupported data-types: {unsupported_cols}. " - + "All operations with this frame will be default to pandas!" - ) - - return cls( - new_parts, - new_index, - new_columns, - new_lengths, - new_widths, - dtypes=new_dtypes, - index_cols=index_cols, - has_unsupported_data=len(unsupported_cols) > 0, - ) - - @classmethod - def from_arrow( - cls, - at, - index_cols=None, - index=None, - columns=None, - encode_col_names=True, - new_dtypes=None, - ): - """ - Build a frame from an Arrow table. - - Parameters - ---------- - at : pyarrow.Table - Source table. - index_cols : list of str, optional - List of index columns in the source table which - are ignored in transformation. - index : pandas.Index, optional - An index to be used by the new frame. Should present - if `index_cols` is not None. - columns : Index or array-like, optional - Column labels to use for resulting frame. - encode_col_names : bool, default: True - Encode column names. - new_dtypes : pandas.Index or list, optional - Column data types. - - Returns - ------- - HdkOnNativeDataframe - The new frame. - """ - ( - new_frame, - new_lengths, - new_widths, - unsupported_cols, - ) = cls._partition_mgr_cls.from_arrow( - at, return_dims=True, encode_col_names=encode_col_names - ) - - if columns is not None: - new_columns = columns - new_index = index - elif index_cols: - data_cols = [col for col in at.column_names if col not in index_cols] - new_columns = pd.Index(data=data_cols, dtype="O") - new_index = index - else: - assert index is None - new_columns = pd.Index(data=at.column_names, dtype="O") - new_index = None - - dtype_index = [] if index_cols is None else list(index_cols) - dtype_index.extend(new_columns) - - if new_dtypes is None: - new_dtypes = [] - for col in at.columns: - if pyarrow.types.is_dictionary(col.type): - new_dtypes.append( - LazyProxyCategoricalDtype._build_proxy( - parent=at, - column_name=col._name, - materializer=build_categorical_from_at, - dtype=arrow_type_to_pandas(col.type.value_type), - ) - ) - else: - new_dtypes.append(cls._arrow_type_to_dtype(col.type)) - - if len(unsupported_cols) > 0: - ErrorMessage.single_warning( - f"Frame contain columns with unsupported data-types: {unsupported_cols}. " - + "All operations with this frame will be default to pandas!" 
- ) - - return cls( - partitions=new_frame, - index=new_index, - columns=new_columns, - row_lengths=new_lengths, - column_widths=new_widths, - dtypes=pd.Series(data=new_dtypes, index=dtype_index), - index_cols=index_cols, - has_unsupported_data=len(unsupported_cols) > 0, - ) diff --git a/modin/experimental/core/execution/native/implementations/hdk_on_native/dataframe/utils.py b/modin/experimental/core/execution/native/implementations/hdk_on_native/dataframe/utils.py deleted file mode 100644 index f99cc256baa..00000000000 --- a/modin/experimental/core/execution/native/implementations/hdk_on_native/dataframe/utils.py +++ /dev/null @@ -1,675 +0,0 @@ -# Licensed to Modin Development Team under one or more contributor license agreements. -# See the NOTICE file distributed with this work for additional information regarding -# copyright ownership. The Modin Development Team licenses this file to you under the -# Apache License, Version 2.0 (the "License"); you may not use this file except in -# compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under -# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific language -# governing permissions and limitations under the License. - -"""Utilities for internal use by the ``HdkOnNativeDataframe``.""" - -from __future__ import annotations - -import re -from functools import lru_cache -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union - -import numpy as np -import pandas -import pyarrow as pa -from pandas.core.arrays.arrow.extension_types import ArrowIntervalType -from pandas.core.dtypes.common import _get_dtype, is_string_dtype -from pyarrow.types import is_dictionary - -from modin.pandas.indexing import is_range_like -from modin.utils import MODIN_UNNAMED_SERIES_LABEL - -if TYPE_CHECKING: - from modin.core.dataframe.pandas.metadata import ModinDtypes - -EMPTY_ARROW_TABLE = pa.Table.from_pandas(pandas.DataFrame({})) - - -class ColNameCodec: - IDX_COL_NAME = "__index__" - ROWID_COL_NAME = "__rowid__" - UNNAMED_IDX_COL_NAME = "__index__0__N" - - _IDX_NAME_PATTERN = re.compile(f"{IDX_COL_NAME}\\d+_(.*)") - _RESERVED_NAMES = (MODIN_UNNAMED_SERIES_LABEL, ROWID_COL_NAME) - _COL_TYPES = Union[str, int, float, pandas.Timestamp, None] - _COL_NAME_TYPE = Union[_COL_TYPES, Tuple[_COL_TYPES, ...]] - - def _encode_tuple(values: Tuple[_COL_TYPES, ...]) -> str: # noqa: GL08 - dst = ["_T"] - count = len(values) - for value in values: - if isinstance(value, str): - dst.append(value.replace("_", "_Q")) - else: - dst.append(ColNameCodec._ENCODERS[type(value)](value)) - count -= 1 - if count != 0: - dst.append("_T") - return "".join(dst) - - def _decode_tuple(encoded: str) -> Tuple[_COL_TYPES, ...]: # noqa: GL08 - items = [] - for item in encoded[2:].split("_T"): - dec = ( - None - if len(item) < 2 or item[0] != "_" - else ColNameCodec._DECODERS.get(item[1], None) - ) - items.append(item.replace("_Q", "_") if dec is None else dec(item)) - return tuple(items) - - _ENCODERS = { - tuple: _encode_tuple, - type(None): lambda v: "_N", - str: lambda v: "_E" if len(v) == 0 else "_S" + v[1:] if v[0] == "_" else v, - int: lambda v: f"_I{v}", - float: lambda v: f"_F{v}", - pandas.Timestamp: lambda v: f"_D{v.timestamp()}_{v.tz}", - } - - _DECODERS = { - "T": _decode_tuple, - "N": lambda v: None, - "E": lambda v: 
"", - "S": lambda v: "_" + v[2:], - "I": lambda v: int(v[2:]), - "F": lambda v: float(v[2:]), - "D": lambda v: pandas.Timestamp.fromtimestamp( - float(v[2 : (idx := v.index("_", 2))]), tz=v[idx + 1 :] - ), - } - - @staticmethod - @lru_cache(1024) - def encode( - name: _COL_NAME_TYPE, - ignore_reserved: bool = True, - ) -> str: - """ - Encode column name. - - The supported name types are specified in the type hints. Non-string names - are converted to string and prefixed with a corresponding tag. - - Parameters - ---------- - name : str, int, float, Timestamp, None, tuple - Column name to be encoded. - ignore_reserved : bool, default: True - Do not encode reserved names. - - Returns - ------- - str - Encoded name. - """ - if ( - ignore_reserved - and isinstance(name, str) - and ( - name.startswith(ColNameCodec.IDX_COL_NAME) - or name in ColNameCodec._RESERVED_NAMES - ) - ): - return name - - try: - return ColNameCodec._ENCODERS[type(name)](name) - except KeyError: - raise TypeError(f"Unsupported column name: {name}") - - @staticmethod - @lru_cache(1024) - def decode(name: str) -> _COL_NAME_TYPE: - """ - Decode column name, previously encoded with encode_col_name(). - - Parameters - ---------- - name : str - Encoded name. - - Returns - ------- - str, int, float, Timestamp, None, tuple - Decoded name. - """ - if ( - len(name) < 2 - or name[0] != "_" - or name.startswith(ColNameCodec.IDX_COL_NAME) - or name in ColNameCodec._RESERVED_NAMES - ): - return name - - try: - return ColNameCodec._DECODERS[name[1]](name) - except KeyError: - raise ValueError(f"Invalid encoded column name: {name}") - - @staticmethod - def mangle_index_names(names: List[_COL_NAME_TYPE]) -> List[str]: - """ - Return mangled index names for index labels. - - Mangled names are used for index columns because index - labels cannot always be used as HDK table column - names. E.e. label can be a non-string value or an - unallowed string (empty strings, etc.) for a table column - name. - - Parameters - ---------- - names : list of str - Index labels. - - Returns - ------- - list of str - Mangled names. - """ - pref = ColNameCodec.IDX_COL_NAME - return [f"{pref}{i}_{ColNameCodec.encode(n)}" for i, n in enumerate(names)] - - @staticmethod - def demangle_index_names( - cols: List[str], - ) -> Union[_COL_NAME_TYPE, List[_COL_NAME_TYPE]]: - """ - Demangle index column names to index labels. - - Parameters - ---------- - cols : list of str - Index column names. - - Returns - ------- - list or a single demangled name - Demangled index names. - """ - if len(cols) == 1: - return ColNameCodec.demangle_index_name(cols[0]) - return [ColNameCodec.demangle_index_name(n) for n in cols] - - @staticmethod - def demangle_index_name(col: str) -> _COL_NAME_TYPE: - """ - Demangle index column name into index label. - - Parameters - ---------- - col : str - Index column name. - - Returns - ------- - str - Demangled index name. - """ - match = ColNameCodec._IDX_NAME_PATTERN.search(col) - if match: - name = match.group(1) - if name == MODIN_UNNAMED_SERIES_LABEL: - return None - return ColNameCodec.decode(name) - return col - - @staticmethod - def concat_index_names(frames) -> Dict[str, Any]: - """ - Calculate the index names and dtypes. - - Calculate the index names and dtypes, that the index - columns will have after the frames concatenation. 
- - Parameters - ---------- - frames : list[HdkOnNativeDataframe] - - Returns - ------- - Dict[str, Any] - """ - first = frames[0] - names = {} - if first._index_width() > 1: - # When we're dealing with a MultiIndex case the resulting index - # inherits the levels from the first frame in concatenation. - dtypes = first._dtypes - for n in first._index_cols: - names[n] = dtypes[n] - else: - # In a non-MultiIndex case, we check if all the indices have the same - # names, and if they do - inherit the name and dtype from the first frame, - # otherwise return metadata matching unnamed RangeIndex. - mangle = ColNameCodec.mangle_index_names - idx_names = set() - for f in frames: - if f._index_cols is not None: - idx_names.update(f._index_cols) - elif f.has_index_cache: - idx_names.update(mangle(f.index.names)) - else: - idx_names.add(ColNameCodec.UNNAMED_IDX_COL_NAME) - if len(idx_names) > 1: - idx_names = [ColNameCodec.UNNAMED_IDX_COL_NAME] - break - - name = next(iter(idx_names)) - # Inherit the Index's dtype from the first frame. - if first._index_cols is not None: - names[name] = first._dtypes.iloc[0] - elif first.has_index_cache: - names[name] = first.index.dtype - else: - # A trivial index with no name - names[name] = _get_dtype(int) - return names - - -def build_categorical_from_at(table, column_name): - """ - Build ``pandas.CategoricalDtype`` from a dictionary column of the passed PyArrow Table. - - Parameters - ---------- - table : pyarrow.Table - column_name : str - - Returns - ------- - pandas.CategoricalDtype - """ - chunks = table.column(column_name).chunks - cat = pandas.concat([chunk.dictionary.to_pandas() for chunk in chunks]) - # to reduce peak memory consumption - del chunks - return pandas.CategoricalDtype(cat.unique()) - - -def check_join_supported(join_type: str): - """ - Check if join type is supported by HDK. - - Parameters - ---------- - join_type : str - Join type. - - Returns - ------- - None - """ - if join_type not in ("inner", "left"): - raise NotImplementedError(f"{join_type} join") - - -def check_cols_to_join(what, df, col_names): - """ - Check the frame columns. - - Check if the frame (`df`) has the specified columns (`col_names`). The names referring to - the index columns are replaced with the actual index column names. - - Parameters - ---------- - what : str - Attribute name. - df : HdkOnNativeDataframe - The dataframe. - col_names : list of str - The column names to check. - - Returns - ------- - Tuple[HdkOnNativeDataframe, list] - The aligned data frame and column names. - """ - cols = df.columns - new_col_names = col_names - for i, col in enumerate(col_names): - if col in cols: - continue - new_name = None - if df._index_cols is not None: - for c in df._index_cols: - if col == ColNameCodec.demangle_index_name(c): - new_name = c - break - elif df.has_index_cache: - new_name = f"__index__{0}_{col}" - df = df._maybe_materialize_rowid() - if new_name is None: - raise ValueError(f"'{what}' references unknown column {col}") - if new_col_names is col_names: - # We are replacing the index names in the original list, - # but creating a copy. - new_col_names = col_names.copy() - new_col_names[i] = new_name - return df, new_col_names - - -def get_data_for_join_by_index( - left, - right, - how, - left_on, - right_on, - sort, - suffixes, -): - """ - Return the column names, dtypes and expres, required for join by index. - - This is a helper function, used by `HdkOnNativeDataframe.join()`, when joining by index. 
- - Parameters - ---------- - left : HdkOnNativeDataframe - A frame to join. - right : HdkOnNativeDataframe - A frame to join with. - how : str - A type of join. - left_on : list of str - A list of columns for the left frame to join on. - right_on : list of str - A list of columns for the right frame to join on. - sort : bool - Sort the result by join keys. - suffixes : list-like of str - A length-2 sequence of suffixes to add to overlapping column names - of left and right operands respectively. - - Returns - ------- - tuple - - The index columns, exprs, dtypes and columns. - """ - - def to_empty_pandas_df(df): - # Create an empty pandas frame with the same columns and index. - idx = df._index_cache.get() if df.has_index_cache else None - if idx is not None: - idx = idx[:1] - elif df._index_cols is not None: - if len(df._index_cols) > 1: - arrays = [[i] for i in range(len(df._index_cols))] - names = [ColNameCodec.demangle_index_name(n) for n in df._index_cols] - idx = pandas.MultiIndex.from_arrays(arrays, names=names) - else: - idx = pandas.Index( - name=ColNameCodec.demangle_index_name(df._index_cols[0]) - ) - return pandas.DataFrame(columns=df.columns, index=idx) - - new_dtypes = [] - exprs = {} - merged = to_empty_pandas_df(left).merge( - to_empty_pandas_df(right), - how=how, - left_on=left_on, - right_on=right_on, - sort=sort, - suffixes=suffixes, - ) - - if len(merged.index.names) == 1 and (merged.index.names[0] is None): - index_cols = None - else: - index_cols = ColNameCodec.mangle_index_names(merged.index.names) - for name in index_cols: - # Using _dtypes here since it contains all column names, - # including the index. - df = left if name in left._dtypes else right - exprs[name] = df.ref(name) - new_dtypes.append(df._dtypes[name]) - - left_col_names = set(left.columns) - right_col_names = set(right.columns) - for col in merged.columns: - orig_name = col - if orig_name in left_col_names: - df = left - elif orig_name in right_col_names: - df = right - elif suffixes is None: - raise ValueError(f"Unknown column {col}") - elif ( - col.endswith(suffixes[0]) - and (orig_name := col[0 : -len(suffixes[0])]) in left_col_names - and orig_name in right_col_names - ): - df = left # Overlapping column from the left frame - elif ( - col.endswith(suffixes[1]) - and (orig_name := col[0 : -len(suffixes[1])]) in right_col_names - and orig_name in left_col_names - ): - df = right # Overlapping column from the right frame - else: - raise ValueError(f"Unknown column {col}") - exprs[col] = df.ref(orig_name) - new_dtypes.append(df._dtypes[orig_name]) - - return index_cols, exprs, new_dtypes, merged.columns - - -def maybe_range(numbers: Union[List[int], range]) -> Union[List[int], range]: - """ - Try to convert the specified sequence of numbers to a range. - - Parameters - ---------- - numbers : list of ints or range - - Returns - ------- - list of ints or range - """ - if len(numbers) > 2 and not is_range_like(numbers): - diff = numbers[1] - numbers[0] - is_range = True - for i in range(2, len(numbers)): - if (numbers[i] - numbers[i - 1]) != diff: - is_range = False - break - if is_range: - numbers = range(numbers[0], numbers[-1] + diff, diff) - return numbers - - -def to_arrow_type(dtype) -> pa.lib.DataType: - """ - Convert the specified dtype to arrow. 
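A small usage sketch for the maybe_range helper above (assuming the removed function is in scope): evenly spaced positions collapse into a range, while irregular sequences are returned unchanged.

maybe_range([2, 4, 6, 8])   # range(2, 10, 2)
maybe_range([1, 3, 4])      # [1, 3, 4] (not evenly spaced, returned as-is)
maybe_range(range(5))       # range(0, 5) (already range-like)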
- - Parameters - ---------- - dtype : dtype - - Returns - ------- - pa.lib.DataType - """ - if is_string_dtype(dtype): - return pa.from_numpy_dtype(str) - return pa.from_numpy_dtype(dtype) - - -def get_common_arrow_type(t1: pa.lib.DataType, t2: pa.lib.DataType) -> pa.lib.DataType: - """ - Get common arrow data type. - - Parameters - ---------- - t1 : pa.lib.DataType - t2 : pa.lib.DataType - - Returns - ------- - pa.lib.DataType - """ - if t1 == t2: - return t1 - if pa.types.is_string(t1): - return t1 - if pa.types.is_string(t2): - return t2 - if pa.types.is_null(t1): - return t2 - if pa.types.is_null(t2): - return t1 - - t1 = t1.to_pandas_dtype() - t2 = t2.to_pandas_dtype() - return pa.from_numpy_dtype(np.promote_types(t1, t2)) - - -def is_supported_arrow_type(dtype: pa.lib.DataType) -> bool: - """ - Return True if the specified arrow type is supported by HDK. - - Parameters - ---------- - dtype : pa.lib.DataType - - Returns - ------- - bool - """ - if ( - pa.types.is_string(dtype) - or pa.types.is_time(dtype) - or pa.types.is_dictionary(dtype) - or pa.types.is_null(dtype) - ): - return True - if isinstance(dtype, pa.ExtensionType) or pa.types.is_duration(dtype): - return False - try: - pandas_dtype = dtype.to_pandas_dtype() - return pandas_dtype != pandas.api.types.pandas_dtype("O") - except NotImplementedError: - return False - - -def ensure_supported_dtype(dtype): - """ - Check if the specified `dtype` is supported by HDK. - - If `dtype` is not supported, `NotImplementedError` is raised. - - Parameters - ---------- - dtype : dtype - """ - try: - dtype = pa.from_numpy_dtype(dtype) - except pa.ArrowNotImplementedError as err: - raise NotImplementedError(f"Type {dtype}") from err - if not is_supported_arrow_type(dtype): - raise NotImplementedError(f"Type {dtype}") - - -def arrow_to_pandas( - at: pa.Table, dtypes: Optional[Union[ModinDtypes, pandas.Series]] = None -) -> pandas.DataFrame: - """ - Convert the specified arrow table to pandas. - - Parameters - ---------- - at : pyarrow.Table - The table to convert. - dtypes : Union[ModinDtypes, pandas.Series], optional - Dtypes are used to correctly map PyArrow types to pandas. 
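A few illustrative cases for the get_common_arrow_type promotion rules above; this assumes the removed helper is in scope and pyarrow is imported as pa, as in the deleted module.

import pyarrow as pa

get_common_arrow_type(pa.int32(), pa.float64())  # pa.float64() via numpy type promotion
get_common_arrow_type(pa.string(), pa.int64())   # pa.string() (strings win)
get_common_arrow_type(pa.null(), pa.int64())     # pa.int64() (null defers to the other type)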
- - Returns - ------- - pandas.DataFrame - """ - - def mapper(at): - if is_dictionary(at) and isinstance(at.value_type, ArrowIntervalType): - # The default mapper fails with TypeError: unhashable type: 'dict' - return _CategoricalDtypeMapper - elif dtypes is not None and any( - ( - isinstance(dtype, pandas.core.dtypes.dtypes.ArrowDtype) - for dtype in dtypes - ) - ): - # for pandas types that are backed by pyarrow, for example: uint8[pyarrow] - dtype_mapping = { - pa.int8(): pandas.ArrowDtype(pa.int8()), - pa.int16(): pandas.ArrowDtype(pa.int16()), - pa.int32(): pandas.ArrowDtype(pa.int32()), - pa.int64(): pandas.ArrowDtype(pa.int64()), - pa.uint8(): pandas.ArrowDtype(pa.uint8()), - pa.uint16(): pandas.ArrowDtype(pa.uint16()), - pa.uint32(): pandas.ArrowDtype(pa.uint32()), - pa.uint64(): pandas.ArrowDtype(pa.uint64()), - pa.bool_(): pandas.ArrowDtype(pa.bool_()), - pa.float32(): pandas.ArrowDtype(pa.float32()), - pa.float64(): pandas.ArrowDtype(pa.float64()), - pa.string(): pandas.ArrowDtype(pa.string()), - } - return dtype_mapping.get(at, None) - return None - - df = at.to_pandas(types_mapper=mapper) - dtype = {} - for idx, _type in enumerate(at.schema.types): - if isinstance(_type, pa.lib.TimestampType) and _type.unit != "ns": - dtype[at.schema.names[idx]] = f"datetime64[{_type.unit}]" - if dtype: - # TODO: remove after https://github.com/apache/arrow/pull/35656 is merge - df = df.astype(dtype) - return df - - -def arrow_type_to_pandas(at: pa.lib.DataType): - """ - Convert the specified arrow type to pandas dtype. - - Parameters - ---------- - at : pa.lib.DataType - - Returns - ------- - dtype - """ - if at == pa.string(): - return _get_dtype(str) - return at.to_pandas_dtype() - - -class _CategoricalDtypeMapper: # noqa: GL08 - @staticmethod - def __from_arrow__(arr): # noqa: GL08 - values = [] - categories = {} - chunks = arr.chunks if isinstance(arr, pa.ChunkedArray) else (arr,) - for chunk in chunks: - assert isinstance(chunk, pa.DictionaryArray) - cat = chunk.dictionary.to_pandas() - values.append(chunk.indices.to_pandas().map(cat)) - categories.update((c, None) for c in cat) - return pandas.Categorical( - pandas.concat(values, ignore_index=True), - dtype=pandas.CategoricalDtype(categories, ordered=True), - ) diff --git a/modin/experimental/core/execution/native/implementations/hdk_on_native/db_worker.py b/modin/experimental/core/execution/native/implementations/hdk_on_native/db_worker.py deleted file mode 100644 index 80bb792b7cd..00000000000 --- a/modin/experimental/core/execution/native/implementations/hdk_on_native/db_worker.py +++ /dev/null @@ -1,18 +0,0 @@ -# Licensed to Modin Development Team under one or more contributor license agreements. -# See the NOTICE file distributed with this work for additional information regarding -# copyright ownership. The Modin Development Team licenses this file to you under the -# Apache License, Version 2.0 (the "License"); you may not use this file except in -# compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under -# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific language -# governing permissions and limitations under the License. 
- -"""Module chooses a proper worker class.""" -from .base_worker import DbTable -from .hdk_worker import HdkWorker as DbWorker - -__all__ = ["DbTable", "DbWorker"] diff --git a/modin/experimental/core/execution/native/implementations/hdk_on_native/df_algebra.py b/modin/experimental/core/execution/native/implementations/hdk_on_native/df_algebra.py deleted file mode 100644 index e6a9fa8b894..00000000000 --- a/modin/experimental/core/execution/native/implementations/hdk_on_native/df_algebra.py +++ /dev/null @@ -1,1253 +0,0 @@ -# Licensed to Modin Development Team under one or more contributor license agreements. -# See the NOTICE file distributed with this work for additional information regarding -# copyright ownership. The Modin Development Team licenses this file to you under the -# Apache License, Version 2.0 (the "License"); you may not use this file except in -# compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under -# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific language -# governing permissions and limitations under the License. - -"""Module provides classes for lazy DataFrame algebra operations.""" - -import abc -from typing import TYPE_CHECKING, Dict, List, Union - -import numpy as np -import pandas -import pyarrow as pa -from pandas.core.dtypes.common import is_string_dtype - -from modin.pandas.indexing import is_range_like -from modin.utils import _inherit_docstrings - -from .dataframe.utils import EMPTY_ARROW_TABLE, ColNameCodec, get_common_arrow_type -from .db_worker import DbTable -from .expr import InputRefExpr, LiteralExpr, OpExpr - -if TYPE_CHECKING: - from .dataframe.dataframe import HdkOnNativeDataframe - - -class TransformMapper: - """ - A helper class for ``InputMapper``. - - This class is used to map column references to expressions used - for their computation. This mapper is used to fold expressions - from multiple ``TransformNode``-s into a single expression. - - Parameters - ---------- - op : TransformNode - Transformation used for mapping. - - Attributes - ---------- - _op : TransformNode - Transformation used for mapping. - """ - - def __init__(self, op): - self._op = op - - def translate(self, col): - """ - Translate column reference by its name. - - Parameters - ---------- - col : str - A name of the column to translate. - - Returns - ------- - BaseExpr - Translated expression. - """ - if col == ColNameCodec.ROWID_COL_NAME: - return self._op.input[0].ref(col) - return self._op.exprs[col] - - -class FrameMapper: - """ - A helper class for ``InputMapper``. - - This class is used to map column references to another frame. - This mapper is used to replace input frame in expressions. - - Parameters - ---------- - frame : HdkOnNativeDataframe - Target frame. - - Attributes - ---------- - _frame : HdkOnNativeDataframe - Target frame. - """ - - def __init__(self, frame): - self._frame = frame - - def translate(self, col): - """ - Translate column reference by its name. - - Parameters - ---------- - col : str - A name of the column to translate. - - Returns - ------- - BaseExpr - Translated expression. - """ - return self._frame.ref(col) - - -class InputMapper: - """ - Input reference mapper. - - This class is used for input translation/replacement in - expressions via ``BaseExpr.translate_input`` method. 
- - Translation is performed using column mappers registered via - `add_mapper` method. Each input frame can have at most one mapper. - References to frames with no registered mapper are not translated. - - Attributes - ---------- - _mappers : dict - Column mappers to use for translation. - """ - - def __init__(self): - self._mappers = {} - - def add_mapper(self, frame, mapper): - """ - Register a mapper for a frame. - - Parameters - ---------- - frame : HdkOnNativeDataframe - A frame for which a mapper is registered. - mapper : object - A mapper to register. - """ - self._mappers[frame] = mapper - - def translate(self, ref): - """ - Translate column reference by its name. - - Parameters - ---------- - ref : InputRefExpr - A column reference to translate. - - Returns - ------- - BaseExpr - Translated expression. - """ - if ref.modin_frame in self._mappers: - return self._mappers[ref.modin_frame].translate(ref.column) - return ref - - -class DFAlgNode(abc.ABC): - """ - A base class for dataframe algebra tree node. - - A dataframe algebra tree is used to describe how dataframe is computed. - - Attributes - ---------- - input : list of DFAlgNode, optional - Holds child nodes. - """ - - @abc.abstractmethod - def copy(self): - """ - Make a shallow copy of the node. - - Returns - ------- - DFAlgNode - """ - pass - - def walk_dfs(self, cb, *args, **kwargs): - """ - Perform a depth-first walk over a tree. - - Walk over an input in the depth-first order and call a callback function - for each node. - - Parameters - ---------- - cb : callable - A callback function. - *args : list - Arguments for the callback. - **kwargs : dict - Keyword arguments for the callback. - """ - if hasattr(self, "input"): - for i in self.input: - i._op.walk_dfs(cb, *args, **kwargs) - cb(self, *args, **kwargs) - - def collect_partitions(self): - """ - Collect all partitions participating in a tree. - - Returns - ------- - list - A list of collected partitions. - """ - partitions = [] - self.walk_dfs(lambda a, b: a._append_partitions(b), partitions) - return partitions - - def collect_frames(self): - """ - Collect all frames participating in a tree. - - Returns - ------- - list - A list of collected frames. - """ - frames = [] - self.walk_dfs(lambda a, b: a._append_frames(b), frames) - return frames - - def require_executed_base(self) -> bool: - """ - Check if materialization of input frames is required. - - Returns - ------- - bool - """ - return False - - def can_execute_hdk(self) -> bool: - """ - Check for possibility of HDK execution. - - Check if the computation can be executed using an HDK query. - - Returns - ------- - bool - """ - return True - - def can_execute_arrow(self) -> bool: - """ - Check for possibility of Arrow execution. - - Check if the computation can be executed using - the Arrow API instead of HDK query. - - Returns - ------- - bool - """ - return False - - def execute_arrow( - self, arrow_input: Union[None, pa.Table, List[pa.Table]] - ) -> pa.Table: - """ - Compute the frame data using the Arrow API. - - Parameters - ---------- - arrow_input : None, pa.Table or list of pa.Table - The input, converted to arrow. - - Returns - ------- - pyarrow.Table - The resulting table. - """ - raise RuntimeError(f"Arrow execution is not supported by {type(self)}") - - def _append_partitions(self, partitions): - """ - Append all used by the node partitions to `partitions` list. - - The default implementation is no-op. This method should be - overriden by all nodes referencing frame's partitions. 
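The traversal used by walk_dfs above is post-order: children are visited before the node itself. A simplified, self-contained mimic (a toy stand-in, not the actual DFAlgNode classes) to illustrate the order:

class _Node:  # toy stand-in for a DFAlgNode-like tree node
    def __init__(self, name, children=()):
        self.name = name
        self.input = list(children)

    def walk_dfs(self, cb, *args):
        # visit children first, then the node itself (post-order)
        for child in self.input:
            child.walk_dfs(cb, *args)
        cb(self, *args)

visited = []
tree = _Node("union", [_Node("frame_a"), _Node("transform", [_Node("frame_b")])])
tree.walk_dfs(lambda node, acc: acc.append(node.name), visited)
print(visited)  # ['frame_a', 'frame_b', 'transform', 'union']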
- - Parameters - ---------- - partitions : list - Output list of partitions. - """ - pass - - def _append_frames(self, frames): - """ - Append all used by the node frames to `frames` list. - - The default implementation is no-op. This method should be - overriden by all nodes referencing frames. - - Parameters - ---------- - frames : list - Output list of frames. - """ - pass - - def __repr__(self): - """ - Return a string representation of the tree. - - Returns - ------- - str - """ - return self.dumps() - - def dump(self, prefix=""): - """ - Dump the tree. - - Parameters - ---------- - prefix : str, default: '' - A prefix to add at each string of the dump. - """ - print(self.dumps(prefix)) # noqa: T201 - - def dumps(self, prefix=""): - """ - Return a string representation of the tree. - - Parameters - ---------- - prefix : str, default: '' - A prefix to add at each string of the dump. - - Returns - ------- - str - """ - return self._prints(prefix) - - @abc.abstractmethod - def _prints(self, prefix): - """ - Return a string representation of the tree. - - Parameters - ---------- - prefix : str - A prefix to add at each string of the dump. - - Returns - ------- - str - """ - pass - - def _prints_input(self, prefix): - """ - Return a string representation of node's operands. - - A helper method for `_prints` implementation in derived classes. - - Parameters - ---------- - prefix : str - A prefix to add at each string of the dump. - - Returns - ------- - str - """ - res = "" - if hasattr(self, "input"): - for i, node in enumerate(self.input): - if isinstance(node._op, FrameNode): - res += f"{prefix}input[{i}]: {node._op}\n" - else: - res += f"{prefix}input[{i}]:\n" + node._op._prints(prefix + " ") - return res - - -class FrameNode(DFAlgNode): - """ - A node to reference a materialized frame. - - Parameters - ---------- - modin_frame : HdkOnNativeDataframe - Referenced frame. - - Attributes - ---------- - modin_frame : HdkOnNativeDataframe - Referenced frame. - """ - - def __init__(self, modin_frame: "HdkOnNativeDataframe"): - self.modin_frame = modin_frame - - @_inherit_docstrings(DFAlgNode.can_execute_arrow) - def can_execute_arrow(self) -> bool: - return self.modin_frame._has_arrow_table() - - def execute_arrow(self, ignore=None) -> Union[DbTable, pa.Table, pandas.DataFrame]: - """ - Materialized frame. - - If `can_execute_arrow` returns True, this method returns an arrow table, - otherwise - a pandas Dataframe or DbTable. - - Parameters - ---------- - ignore : None, pa.Table or list of pa.Table, default: None - - Returns - ------- - DbTable or pa.Table or pandas.Dataframe - """ - frame = self.modin_frame - if frame._partitions is not None: - part = frame._partitions[0][0] - to_arrow = part.raw and not frame._has_unsupported_data - return part.get(to_arrow) - if frame._has_unsupported_data: - return pandas.DataFrame( - index=frame._index_cache, columns=frame._columns_cache - ) - if frame._index_cache or frame._columns_cache: - return pa.Table.from_pandas( - pandas.DataFrame(index=frame._index_cache, columns=frame._columns_cache) - ) - return EMPTY_ARROW_TABLE - - def copy(self): - """ - Make a shallow copy of the node. - - Returns - ------- - FrameNode - """ - return FrameNode(self.modin_frame) - - def _append_partitions(self, partitions): - """ - Append all partitions of the referenced frame to `partitions` list. - - Parameters - ---------- - partitions : list - Output list of partitions. 
- """ - partitions += self.modin_frame._partitions.flatten() - - def _append_frames(self, frames): - """ - Append the referenced frame to `frames` list. - - Parameters - ---------- - frames : list - Output list of frames. - """ - frames.append(self.modin_frame) - - def _prints(self, prefix): - """ - Return a string representation of the tree. - - Parameters - ---------- - prefix : str - A prefix to add at each string of the dump. - - Returns - ------- - str - """ - return f"{prefix}{self.modin_frame.id_str()}" - - -class MaskNode(DFAlgNode): - """ - A filtering node which filters rows by index values or row id. - - Parameters - ---------- - base : HdkOnNativeDataframe - A filtered frame. - row_labels : list, optional - List of row labels to select. - row_positions : list of int, optional - List of rows ids to select. - - Attributes - ---------- - input : list of HdkOnNativeDataframe - Holds a single filtered frame. - row_labels : list or None - List of row labels to select. - row_positions : list of int or None - List of rows ids to select. - """ - - def __init__( - self, - base: "HdkOnNativeDataframe", - row_labels: List[str] = None, - row_positions: List[int] = None, - ): - self.input = [base] - self.row_labels = row_labels - self.row_positions = row_positions - - @_inherit_docstrings(DFAlgNode.require_executed_base) - def require_executed_base(self) -> bool: - return True - - @_inherit_docstrings(DFAlgNode.can_execute_arrow) - def can_execute_arrow(self) -> bool: - return self.row_labels is None - - def execute_arrow(self, table: pa.Table) -> pa.Table: - """ - Perform row selection on the frame using Arrow API. - - Parameters - ---------- - table : pa.Table - - Returns - ------- - pyarrow.Table - The resulting table. - """ - row_positions = self.row_positions - - if not isinstance(row_positions, slice) and not is_range_like(row_positions): - if not isinstance(row_positions, (pa.Array, np.ndarray, list)): - row_positions = pa.array(row_positions) - return table.take(row_positions) - - if isinstance(row_positions, slice): - row_positions = range(*row_positions.indices(table.num_rows)) - - start, stop, step = ( - row_positions.start, - row_positions.stop, - row_positions.step, - ) - - if step == 1: - return table.slice(start, len(row_positions)) - else: - indices = np.arange(start, stop, step) - return table.take(indices) - - def copy(self): - """ - Make a shallow copy of the node. - - Returns - ------- - MaskNode - """ - return MaskNode(self.input[0], self.row_labels, self.row_positions) - - def _prints(self, prefix): - """ - Return a string representation of the tree. - - Parameters - ---------- - prefix : str - A prefix to add at each string of the dump. - - Returns - ------- - str - """ - return ( - f"{prefix}MaskNode:\n" - + f"{prefix} row_labels: {self.row_labels}\n" - + f"{prefix} row_positions: {self.row_positions}\n" - + self._prints_input(prefix + " ") - ) - - -class GroupbyAggNode(DFAlgNode): - """ - A node to represent a groupby aggregation operation. - - Parameters - ---------- - base : DFAlgNode - An aggregated frame. - by : list of str - A list of columns used for grouping. - agg_exprs : dict - Aggregates to compute. - groupby_opts : dict - Additional groupby parameters. - - Attributes - ---------- - input : list of DFAlgNode - Holds a single aggregated frame. - by : list of str - A list of columns used for grouping. - agg_exprs : dict - Aggregates to compute. - groupby_opts : dict - Additional groupby parameters. 
- """ - - def __init__(self, base, by, agg_exprs, groupby_opts): - self.by = by - self.agg_exprs = agg_exprs - self.groupby_opts = groupby_opts - self.input = [base] - - def copy(self): - """ - Make a shallow copy of the node. - - Returns - ------- - GroupbyAggNode - """ - return GroupbyAggNode(self.input[0], self.by, self.agg_exprs, self.groupby_opts) - - def _prints(self, prefix): - """ - Return a string representation of the tree. - - Parameters - ---------- - prefix : str - A prefix to add at each string of the dump. - - Returns - ------- - str - """ - return ( - f"{prefix}AggNode:\n" - + f"{prefix} by: {self.by}\n" - + f"{prefix} aggs: {self.agg_exprs}\n" - + f"{prefix} groupby_opts: {self.groupby_opts}\n" - + self._prints_input(prefix + " ") - ) - - -class TransformNode(DFAlgNode): - """ - A node to represent a projection of a single frame. - - Provides expressions to compute each column of the projection. - - Parameters - ---------- - base : HdkOnNativeDataframe - A transformed frame. - exprs : dict - Expressions for frame's columns computation. - fold : bool - - Attributes - ---------- - input : list of HdkOnNativeDataframe - Holds a single projected frame. - exprs : dict - Expressions used to compute frame's columns. - """ - - def __init__( - self, - base: "HdkOnNativeDataframe", - exprs: Dict[str, Union[InputRefExpr, LiteralExpr, OpExpr]], - fold: bool = True, - ): - # If base of this node is another `TransformNode`, then translate all - # expressions in `expr` to its base. - if fold and isinstance(base._op, TransformNode): - self.input = [base._op.input[0]] - self.exprs = exprs = translate_exprs_to_base(exprs, self.input[0]) - for col, expr in exprs.items(): - exprs[col] = expr.fold() - else: - self.input = [base] - self.exprs = exprs - - @_inherit_docstrings(DFAlgNode.can_execute_hdk) - def can_execute_hdk(self) -> bool: - return self._check_exprs("can_execute_hdk") - - @_inherit_docstrings(DFAlgNode.can_execute_arrow) - def can_execute_arrow(self) -> bool: - return self._check_exprs("can_execute_arrow") - - def execute_arrow(self, table: pa.Table) -> pa.Table: - """ - Perform column selection on the frame using Arrow API. - - Parameters - ---------- - table : pa.Table - - Returns - ------- - pyarrow.Table - The resulting table. - """ - cols = [expr.execute_arrow(table) for expr in self.exprs.values()] - names = [ColNameCodec.encode(c) for c in self.exprs] - return pa.table(cols, names) - - def copy(self): - """ - Make a shallow copy of the node. - - Returns - ------- - TransformNode - """ - return TransformNode(self.input[0], self.exprs) - - def is_simple_select(self): - """ - Check if transform node is a simple selection. - - Simple selection can only use InputRefExpr expressions. - - Returns - ------- - bool - True for simple select and False otherwise. - """ - return all(isinstance(expr, InputRefExpr) for expr in self.exprs.values()) - - def _prints(self, prefix): - """ - Return a string representation of the tree. - - Parameters - ---------- - prefix : str - A prefix to add at each string of the dump. - - Returns - ------- - str - """ - res = f"{prefix}TransformNode:\n" - for k, v in self.exprs.items(): - res += f"{prefix} {k}: {v}\n" - res += self._prints_input(prefix + " ") - return res - - def _check_exprs(self, attr) -> bool: - """ - Check if the specified attribute is True for all expressions. 
- - Parameters - ---------- - attr : str - - Returns - ------- - bool - """ - stack = list(self.exprs.values()) - while stack: - expr = stack.pop() - if not getattr(expr, attr)(): - return False - if isinstance(expr, OpExpr): - stack.extend(expr.operands) - return True - - -class JoinNode(DFAlgNode): - """ - A node to represent a join of two frames. - - Parameters - ---------- - left : DFAlgNode - A left frame to join. - right : DFAlgNode - A right frame to join. - how : str, default: "inner" - A type of join. - exprs : dict, default: None - Expressions for the resulting frame's columns. - condition : BaseExpr, default: None - Join condition. - - Attributes - ---------- - input : list of DFAlgNode - Holds joined frames. The first frame in the list is considered as - the left join operand. - how : str - A type of join. - exprs : dict - Expressions for the resulting frame's columns. - condition : BaseExpr - Join condition. - """ - - def __init__( - self, - left, - right, - how="inner", - exprs=None, - condition=None, - ): - self.input = [left, right] - self.how = how - self.exprs = exprs - self.condition = condition - - @property - def by_rowid(self): - """ - Return True if this is a join by the rowid column. - - Returns - ------- - bool - """ - return ( - isinstance(self.condition, OpExpr) - and self.condition.op == "=" - and all( - isinstance(o, InputRefExpr) and o.column == ColNameCodec.ROWID_COL_NAME - for o in self.condition.operands - ) - ) - - @_inherit_docstrings(DFAlgNode.require_executed_base) - def require_executed_base(self) -> bool: - return self.by_rowid and any( - not isinstance(i._op, FrameNode) for i in self.input - ) - - @_inherit_docstrings(DFAlgNode.can_execute_arrow) - def can_execute_arrow(self) -> bool: - return self.by_rowid and all( - isinstance(e, InputRefExpr) for e in self.exprs.values() - ) - - @_inherit_docstrings(DFAlgNode.execute_arrow) - def execute_arrow(self, tables: List[pa.Table]) -> pa.Table: - t1 = tables[0] - t2 = tables[1] - cols1 = t1.column_names - cols = [ - (t1 if (col := ColNameCodec.encode(e.column)) in cols1 else t2).column(col) - for e in self.exprs.values() - ] - names = [ColNameCodec.encode(c) for c in self.exprs] - return pa.table(cols, names) - - def copy(self): - """ - Make a shallow copy of the node. - - Returns - ------- - JoinNode - """ - return JoinNode( - self.input[0], - self.input[1], - self.how, - self.exprs, - self.condition, - ) - - def _prints(self, prefix): - """ - Return a string representation of the tree. - - Parameters - ---------- - prefix : str - A prefix to add at each string of the dump. - - Returns - ------- - str - """ - exprs_str = "" - for k, v in self.exprs.items(): - exprs_str += f"{prefix} {k}: {v}\n" - return ( - f"{prefix}JoinNode:\n" - + f"{prefix} Fields:\n" - + exprs_str - + f"{prefix} How: {self.how}\n" - + f"{prefix} Condition: {self.condition}\n" - + self._prints_input(prefix + " ") - ) - - -class UnionNode(DFAlgNode): - """ - A node to represent rows union of input frames. - - Parameters - ---------- - frames : list of HdkOnNativeDataframe - Input frames. - columns : dict - Column names and dtypes. - ignore_index : bool - - Attributes - ---------- - input : list of HdkOnNativeDataframe - Input frames. 
- """ - - def __init__( - self, - frames: List["HdkOnNativeDataframe"], - columns: Dict[str, np.dtype], - ignore_index: bool, - ): - self.input = frames - self.columns = columns - self.ignore_index = ignore_index - - @_inherit_docstrings(DFAlgNode.require_executed_base) - def require_executed_base(self) -> bool: - return not self.can_execute_hdk() - - @_inherit_docstrings(DFAlgNode.can_execute_hdk) - def can_execute_hdk(self) -> bool: - # Hdk does not support union of more than 2 frames. - if len(self.input) > 2: - return False - - # Arrow execution is required for empty frames to preserve the index. - if len(self.input) == 0 or len(self.columns) == 0: - return False - - # Only numeric columns of the same type are supported by HDK. - # See https://github.com/intel-ai/hdk/issues/182 - dtypes = self.input[0]._dtypes.to_dict() - if any(is_string_dtype(t) for t in dtypes.values()) or any( - f._dtypes.to_dict() != dtypes for f in self.input[1:] - ): - return False - - return True - - @_inherit_docstrings(DFAlgNode.can_execute_arrow) - def can_execute_arrow(self) -> bool: - return True - - def execute_arrow(self, tables: Union[pa.Table, List[pa.Table]]) -> pa.Table: - """ - Concat frames' rows using Arrow API. - - Parameters - ---------- - tables : pa.Table or list of pa.Table - - Returns - ------- - pyarrow.Table - The resulting table. - """ - if len(self.columns) == 0: - frames = self.input - if len(frames) == 0: - return EMPTY_ARROW_TABLE - elif self.ignore_index: - idx = pandas.RangeIndex(0, sum(len(frame.index) for frame in frames)) - else: - idx = frames[0].index.append([f.index for f in frames[1:]]) - idx_cols = ColNameCodec.mangle_index_names(idx.names) - idx_df = pandas.DataFrame(index=idx).reset_index() - obj_cols = idx_df.select_dtypes(include=["object"]).columns.tolist() - if len(obj_cols) != 0: - # PyArrow fails to convert object fields. Converting to str. - idx_df[obj_cols] = idx_df[obj_cols].astype(str) - idx_table = pa.Table.from_pandas(idx_df, preserve_index=False) - return idx_table.rename_columns(idx_cols) - - if isinstance(tables, pa.Table): - assert len(self.input) == 1 - return tables - - try: - return pa.concat_tables(tables) - except pa.lib.ArrowInvalid: - # Probably, some tables have different column types. - # Trying to find a common type and cast the columns. - fields: Dict[str, pa.Field] = {} - for table in tables: - for col_name in table.column_names: - field = table.field(col_name) - cur_field = fields.get(col_name, None) - if cur_field is None or ( - cur_field.type - != get_common_arrow_type(cur_field.type, field.type) - ): - fields[col_name] = field - schema = pa.schema(list(fields.values())) - for i, table in enumerate(tables): - tables[i] = pa.table(table.columns, schema=schema) - return pa.concat_tables(tables) - - def copy(self): - """ - Make a shallow copy of the node. - - Returns - ------- - UnionNode - """ - return UnionNode(self.input, self.columns, self.ignore_index) - - def _prints(self, prefix): - """ - Return a string representation of the tree. - - Parameters - ---------- - prefix : str - A prefix to add at each string of the dump. - - Returns - ------- - str - """ - return f"{prefix}UnionNode:\n" + self._prints_input(prefix + " ") - - -class SortNode(DFAlgNode): - """ - A sort node to order frame's rows in a specified order. - - Parameters - ---------- - frame : DFAlgNode - Sorted frame. - columns : list of str - A list of key columns for a sort. - ascending : list of bool - Ascending or descending sort. 
- na_position : {"first", "last"} - "first" to put NULLs at the start of the result, - "last" to put NULLs at the end of the result. - - Attributes - ---------- - input : list of DFAlgNode - Holds a single sorted frame. - columns : list of str - A list of key columns for a sort. - ascending : list of bool - Ascending or descending sort. - na_position : {"first", "last"} - "first" to put NULLs at the start of the result, - "last" to put NULLs at the end of the result. - """ - - def __init__(self, frame, columns, ascending, na_position): - self.input = [frame] - self.columns = columns - self.ascending = ascending - self.na_position = na_position - - def copy(self): - """ - Make a shallow copy of the node. - - Returns - ------- - SortNode - """ - return SortNode(self.input[0], self.columns, self.ascending, self.na_position) - - def _prints(self, prefix): - """ - Return a string representation of the tree. - - Parameters - ---------- - prefix : str - A prefix to add at each string of the dump. - - Returns - ------- - str - """ - return ( - f"{prefix}SortNode:\n" - + f"{prefix} Columns: {self.columns}\n" - + f"{prefix} Ascending: {self.ascending}\n" - + f"{prefix} NULLs position: {self.na_position}\n" - + self._prints_input(prefix + " ") - ) - - -class FilterNode(DFAlgNode): - """ - A node for generic rows filtering. - - For rows filter by row id a ``MaskNode`` should be preferred. - - Parameters - ---------- - frame : DFAlgNode - A filtered frame. - condition : BaseExpr - Filter condition. - - Attributes - ---------- - input : list of DFAlgNode - Holds a single filtered frame. - condition : BaseExpr - Filter condition. - """ - - def __init__(self, frame, condition): - self.input = [frame] - self.condition = condition - - def copy(self): - """ - Make a shallow copy of the node. - - Returns - ------- - FilterNode - """ - return FilterNode(self.input[0], self.condition) - - def _prints(self, prefix): - """ - Return a string representation of the tree. - - Parameters - ---------- - prefix : str - A prefix to add at each string of the dump. - - Returns - ------- - str - """ - return ( - f"{prefix}FilterNode:\n" - + f"{prefix} Condition: {self.condition}\n" - + self._prints_input(prefix + " ") - ) - - -def translate_exprs_to_base(exprs, base): - """ - Fold expressions. - - Fold expressions with their input nodes until `base` - frame is the only input frame. - - Parameters - ---------- - exprs : dict - Expressions to translate. - base : HdkOnNativeDataframe - Required input frame for translated expressions. - - Returns - ------- - dict - Translated expressions. - """ - new_exprs = dict(exprs) - - frames = set() - for expr in new_exprs.values(): - expr.collect_frames(frames) - frames.discard(base) - - while len(frames) > 0: - mapper = InputMapper() - new_frames = set() - for frame in frames: - frame_base = frame._op.input[0] - if frame_base != base: - new_frames.add(frame_base) - assert isinstance(frame._op, TransformNode) - mapper.add_mapper(frame, TransformMapper(frame._op)) - - for k, v in new_exprs.items(): - new_expr = v.translate_input(mapper) - new_expr.collect_frames(new_frames) - new_exprs[k] = new_expr - - new_frames.discard(base) - frames = new_frames - - res = {} - for col in exprs.keys(): - res[col] = new_exprs[col] - return res - - -def replace_frame_in_exprs(exprs, old_frame, new_frame): - """ - Translate input expression replacing an input frame in them. - - Parameters - ---------- - exprs : dict - Expressions to translate. 
- old_frame : HdkOnNativeDataframe - An input frame to replace. - new_frame : HdkOnNativeDataframe - A new input frame to use. - - Returns - ------- - dict - Translated expressions. - """ - mapper = InputMapper() - mapper.add_mapper(old_frame, FrameMapper(new_frame)) - - res = {} - for col in exprs.keys(): - res[col] = exprs[col].translate_input(mapper) - return res diff --git a/modin/experimental/core/execution/native/implementations/hdk_on_native/expr.py b/modin/experimental/core/execution/native/implementations/hdk_on_native/expr.py deleted file mode 100644 index 484d4b42e1e..00000000000 --- a/modin/experimental/core/execution/native/implementations/hdk_on_native/expr.py +++ /dev/null @@ -1,1421 +0,0 @@ -# Licensed to Modin Development Team under one or more contributor license agreements. -# See the NOTICE file distributed with this work for additional information regarding -# copyright ownership. The Modin Development Team licenses this file to you under the -# Apache License, Version 2.0 (the "License"); you may not use this file except in -# compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under -# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific language -# governing permissions and limitations under the License. - -"""Module provides classes for scalar expression trees.""" - -import abc -from typing import Generator, Type, Union - -import numpy as np -import pandas -import pyarrow as pa -import pyarrow.compute as pc -from pandas.core.dtypes.common import ( - _get_dtype, - is_bool_dtype, - is_datetime64_any_dtype, - is_datetime64_dtype, - is_float_dtype, - is_integer_dtype, - is_list_like, - is_numeric_dtype, - is_string_dtype, -) - -from modin.pandas.indexing import is_range_like -from modin.utils import _inherit_docstrings - -from .dataframe.utils import ColNameCodec, to_arrow_type - - -def _get_common_dtype(lhs_dtype, rhs_dtype): - """ - Get data type for a binary operation result. - - Parameters - ---------- - lhs_dtype : dtype - The type of the first operand. - rhs_dtype : dtype - The type of the second operand. - - Returns - ------- - dtype - The result data type. - """ - if lhs_dtype == rhs_dtype: - return lhs_dtype - if is_float_dtype(lhs_dtype) and ( - is_float_dtype(rhs_dtype) or is_integer_dtype(rhs_dtype) - ): - return _get_dtype(float) - if is_float_dtype(rhs_dtype) and ( - is_float_dtype(lhs_dtype) or is_integer_dtype(lhs_dtype) - ): - return _get_dtype(float) - if is_integer_dtype(lhs_dtype) and is_integer_dtype(rhs_dtype): - return _get_dtype(int) - if is_datetime64_dtype(lhs_dtype) and is_datetime64_dtype(rhs_dtype): - return np.promote_types(lhs_dtype, rhs_dtype) - if (is_datetime64_dtype(lhs_dtype) and rhs_dtype == np.int64) or ( - is_datetime64_dtype(rhs_dtype) and (lhs_dtype == np.int64) - ): - return _get_dtype(int) - raise NotImplementedError( - f"Cannot perform operation on types: {lhs_dtype}, {rhs_dtype}" - ) - - -_aggs_preserving_numeric_type = {"sum", "min", "max", "nlargest", "nsmallest"} -_aggs_with_int_result = {"count", "size"} -_aggs_with_float_result = {"mean", "median", "std", "skew"} - - -def _quantile_agg_dtype(dtype): - """ - Compute the quantile aggregate data type. 
- - Parameters - ---------- - dtype : dtype - - Returns - ------- - dtype - """ - return dtype if is_datetime64_any_dtype(dtype) else _get_dtype(float) - - -def _agg_dtype(agg, dtype): - """ - Compute aggregate data type. - - Parameters - ---------- - agg : str - Aggregate name. - dtype : dtype - Operand data type. - - Returns - ------- - dtype - The aggregate data type. - """ - if agg in _aggs_preserving_numeric_type: - return dtype - elif agg in _aggs_with_int_result: - return _get_dtype(int) - elif agg in _aggs_with_float_result: - return _get_dtype(float) - elif agg == "quantile": - return _quantile_agg_dtype(dtype) - else: - raise NotImplementedError(f"unsupported aggregate {agg}") - - -_cmp_ops = {"eq", "ge", "gt", "le", "lt", "ne"} - - -def is_cmp_op(op): - """ - Check if operation is a comparison. - - Parameters - ---------- - op : str - Operation to check. - - Returns - ------- - bool - True for comparison operations and False otherwise. - """ - return op in _cmp_ops - - -_logical_ops = {"and", "or"} - - -def is_logical_op(op): - """ - Check if operation is a logical one. - - Parameters - ---------- - op : str - Operation to check. - - Returns - ------- - bool - True for logical operations and False otherwise. - """ - return op in _logical_ops - - -class BaseExpr(abc.ABC): - """ - An abstract base class for expression tree node. - - An expression tree is used to describe how a single column of a dataframe - is computed. - - Each node can belong to multiple trees and therefore should be immutable - until proven to have no parent nodes (e.g. by making a copy). - - Attributes - ---------- - operands : list of BaseExpr, optional - Holds child nodes. Leaf nodes shouldn't have `operands` attribute. - """ - - binary_operations = { - "add": "+", - "sub": "-", - "mul": "*", - "mod": "MOD", - "floordiv": "//", - "truediv": "/", - "pow": "POWER", - "eq": "=", - "ge": ">=", - "gt": ">", - "le": "<=", - "lt": "<", - "ne": "<>", - "and": "AND", - "or": "OR", - } - - preserve_dtype_math_ops = {"add", "sub", "mul", "mod", "floordiv", "pow"} - promote_to_float_math_ops = {"truediv"} - - def eq(self, other): - """ - Build an equality comparison of `self` with `other`. - - Parameters - ---------- - other : BaseExpr or scalar - An operand to compare with. - - Returns - ------- - BaseExpr - The resulting comparison expression. - """ - return self.cmp("=", other) - - def le(self, other): - """ - Build a less or equal comparison with `other`. - - Parameters - ---------- - other : BaseExpr or scalar - An operand to compare with. - - Returns - ------- - BaseExpr - The resulting comparison expression. - """ - return self.cmp("<=", other) - - def ge(self, other): - """ - Build a greater or equal comparison with `other`. - - Parameters - ---------- - other : BaseExpr or scalar - An operand to compare with. - - Returns - ------- - BaseExpr - The resulting comparison expression. - """ - return self.cmp(">=", other) - - def cmp(self, op, other): - """ - Build a comparison expression with `other`. - - Parameters - ---------- - op : str - A comparison operation. - other : BaseExpr or scalar - An operand to compare with. - - Returns - ------- - BaseExpr - The resulting comparison expression. - """ - if not isinstance(other, BaseExpr): - other = LiteralExpr(other) - return OpExpr(op, [self, other], _get_dtype(bool)) - - def cast(self, res_type): - """ - Build a cast expression. - - Parameters - ---------- - res_type : dtype - A data type to cast to. - - Returns - ------- - BaseExpr - The cast expression. 
- """ - # From float to int cast we expect truncate behavior but CAST - # operation would give us round behavior. - if is_float_dtype(self._dtype) and is_integer_dtype(res_type): - return self.floor() - - new_expr = OpExpr("CAST", [self], res_type) - return new_expr - - def is_null(self): - """ - Build a NULL check expression. - - Returns - ------- - BaseExpr - The NULL check expression. - """ - new_expr = OpExpr("IS NULL", [self], _get_dtype(bool)) - return new_expr - - def is_not_null(self): - """ - Build a NOT NULL check expression. - - Returns - ------- - BaseExpr - The NOT NULL check expression. - """ - new_expr = OpExpr("IS NOT NULL", [self], _get_dtype(bool)) - return new_expr - - def bin_op(self, other, op_name): - """ - Build a binary operation expression. - - Parameters - ---------- - other : BaseExpr - The second operand. - op_name : str - A binary operation name. - - Returns - ------- - BaseExpr - The resulting binary operation expression. - """ - if op_name not in self.binary_operations: - raise NotImplementedError(f"unsupported binary operation {op_name}") - - if is_cmp_op(op_name): - return self._cmp_op(other, op_name) - - # True division may require prior cast to float to avoid integer division - if op_name == "truediv": - if is_integer_dtype(self._dtype) and is_integer_dtype(other._dtype): - other = other.cast(_get_dtype(float)) - res_type = self._get_bin_op_res_type(op_name, self._dtype, other._dtype) - new_expr = OpExpr(self.binary_operations[op_name], [self, other], res_type) - # Floor division may require additional FLOOR expr. - if op_name == "floordiv" and not is_integer_dtype(res_type): - return new_expr.floor() - return new_expr - - def add(self, other): - """ - Build an add expression. - - Parameters - ---------- - other : BaseExpr - The second operand. - - Returns - ------- - BaseExpr - The resulting add expression. - """ - return self.bin_op(other, "add") - - def sub(self, other): - """ - Build a sub expression. - - Parameters - ---------- - other : BaseExpr - The second operand. - - Returns - ------- - BaseExpr - The resulting sub expression. - """ - return self.bin_op(other, "sub") - - def mul(self, other): - """ - Build a mul expression. - - Parameters - ---------- - other : BaseExpr - The second operand. - - Returns - ------- - BaseExpr - The resulting mul expression. - """ - return self.bin_op(other, "mul") - - def mod(self, other): - """ - Build a mod expression. - - Parameters - ---------- - other : BaseExpr - The second operand. - - Returns - ------- - BaseExpr - The resulting mod expression. - """ - return self.bin_op(other, "mod") - - def truediv(self, other): - """ - Build a truediv expression. - - The result always has float data type. - - Parameters - ---------- - other : BaseExpr - The second operand. - - Returns - ------- - BaseExpr - The resulting truediv expression. - """ - return self.bin_op(other, "truediv") - - def floordiv(self, other): - """ - Build a floordiv expression. - - The result always has an integer data type. - - Parameters - ---------- - other : BaseExpr - The second operand. - - Returns - ------- - BaseExpr - The resulting floordiv expression. - """ - return self.bin_op(other, "floordiv") - - def pow(self, other): - """ - Build a power expression. - - Parameters - ---------- - other : BaseExpr - The power operand. - - Returns - ------- - BaseExpr - The resulting power expression. - """ - return self.bin_op(other, "pow") - - def floor(self): - """ - Build a floor expression. 
- - Returns - ------- - BaseExpr - The resulting floor expression. - """ - return OpExpr("FLOOR", [self], _get_dtype(int)) - - def invert(self) -> "OpExpr": - """ - Build a bitwise inverse expression. - - Returns - ------- - OpExpr - The resulting bitwise inverse expression. - """ - return OpExpr("BIT_NOT", [self], self._dtype) - - def _cmp_op(self, other, op_name): - """ - Build a comparison expression. - - Parameters - ---------- - other : BaseExpr - A value to compare with. - op_name : str - The comparison operation name. - - Returns - ------- - BaseExpr - The resulting comparison expression. - """ - lhs_dtype_class = self._get_dtype_cmp_class(self._dtype) - rhs_dtype_class = self._get_dtype_cmp_class(other._dtype) - res_dtype = _get_dtype(bool) - # In HDK comparison with NULL always results in NULL, - # but in pandas it is True for 'ne' comparison and False - # for others. - # Also pandas allows 'eq' and 'ne' comparison for values - # of incompatible types which doesn't work in HDK. - if lhs_dtype_class != rhs_dtype_class: - if op_name == "eq" or op_name == "ne": - return LiteralExpr(op_name == "ne") - else: - raise TypeError( - f"Invalid comparison between {self._dtype} and {other._dtype}" - ) - else: - cmp = OpExpr(self.binary_operations[op_name], [self, other], res_dtype) - return build_if_then_else( - self.is_null(), LiteralExpr(op_name == "ne"), cmp, res_dtype - ) - - @staticmethod - def _get_dtype_cmp_class(dtype): - """ - Get a comparison class name for specified data type. - - Values of different comparison classes cannot be compared. - - Parameters - ---------- - dtype : dtype - A data type of a compared value. - - Returns - ------- - str - The comparison class name. - """ - if is_numeric_dtype(dtype) or is_bool_dtype(dtype): - return "numeric" - if is_string_dtype(dtype) or isinstance(dtype, pandas.CategoricalDtype): - return "string" - if is_datetime64_any_dtype(dtype): - return "datetime" - return "other" - - def _get_bin_op_res_type(self, op_name, lhs_dtype, rhs_dtype): - """ - Return the result data type for a binary operation. - - Parameters - ---------- - op_name : str - A binary operation name. - lhs_dtype : dtype - A left operand's type. - rhs_dtype : dtype - A right operand's type. - - Returns - ------- - dtype - """ - if op_name in self.preserve_dtype_math_ops: - return _get_common_dtype(lhs_dtype, rhs_dtype) - elif op_name in self.promote_to_float_math_ops: - return _get_dtype(float) - elif is_cmp_op(op_name): - return _get_dtype(bool) - elif is_logical_op(op_name): - return _get_dtype(bool) - else: - raise NotImplementedError(f"unsupported binary operation {op_name}") - - @abc.abstractmethod - def copy(self): - """ - Make a shallow copy of the expression. - - Returns - ------- - BaseExpr - """ - pass - - def nested_expressions( - self, - ) -> Generator[Type["BaseExpr"], Type["BaseExpr"], Type["BaseExpr"]]: - """ - Return a generator that allows to iterate over and replace the nested expressions. - - If the generator receives a new expression, it creates a copy of `self` and - replaces the expression in the copy. The copy is returned to the sender. - - Returns - ------- - Generator - """ - expr = self - if operands := getattr(self, "operands", None): - for i, op in enumerate(operands): - new_op = yield op - if new_op is not None: - if new_op is not op: - if expr is self: - expr = self.copy() - expr.operands[i] = new_op - yield expr - return expr - - def collect_frames(self, frames): - """ - Recursively collect all frames participating in the expression. 
- - Collected frames are put into the `frames` set. Default implementation - collects frames from the operands of the expression. Derived classes - directly holding frames should provide their own implementations. - - Parameters - ---------- - frames : set - Output set of collected frames. - """ - for expr in self.nested_expressions(): - expr.collect_frames(frames) - - # currently we translate only exprs with a single input frame - def translate_input(self, mapper): - """ - Make a deep copy of the expression translating input nodes using `mapper`. - - The default implementation builds a copy and recursively run - translation for all its operands. For leaf expressions - `_translate_input` is called. - - Parameters - ---------- - mapper : InputMapper - A mapper to use for input columns translation. - - Returns - ------- - BaseExpr - The expression copy with translated input columns. - """ - res = None - gen = self.nested_expressions() - for expr in gen: - res = gen.send(expr.translate_input(mapper)) - return self._translate_input(mapper) if res is None else res - - def _translate_input(self, mapper): - """ - Make a deep copy of the expression translating input nodes using `mapper`. - - Called by default translator for leaf nodes. Method should be overriden - by derived classes holding input references. - - Parameters - ---------- - mapper : InputMapper - A mapper to use for input columns translation. - - Returns - ------- - BaseExpr - The expression copy with translated input columns. - """ - return self - - def fold(self): - """ - Fold the operands. - - This operation is used by `TransformNode` when translating to base. - - Returns - ------- - BaseExpr - """ - res = self - gen = self.nested_expressions() - for expr in gen: - res = gen.send(expr.fold()) - return res - - def can_execute_hdk(self) -> bool: - """ - Check for possibility of HDK execution. - - Check if the computation can be executed using an HDK query. - - Returns - ------- - bool - """ - return True - - def can_execute_arrow(self) -> bool: - """ - Check for possibility of Arrow execution. - - Check if the computation can be executed using - the Arrow API instead of HDK query. - - Returns - ------- - bool - """ - return False - - def execute_arrow(self, table: pa.Table) -> pa.ChunkedArray: - """ - Compute the column data using the Arrow API. - - Parameters - ---------- - table : pa.Table - - Returns - ------- - pa.ChunkedArray - """ - raise RuntimeError(f"Arrow execution is not supported by {type(self)}") - - -class InputRefExpr(BaseExpr): - """ - An expression tree node to represent an input frame column. - - Parameters - ---------- - frame : HdkOnNativeDataframe - An input frame. - col : str - An input column name. - dtype : dtype - Input column data type. - - Attributes - ---------- - modin_frame : HdkOnNativeDataframe - An input frame. - column : str - An input column name. - _dtype : dtype - Input column data type. - """ - - def __init__(self, frame, col, dtype): - self.modin_frame = frame - self.column = col - self._dtype = dtype - - def copy(self): - """ - Make a shallow copy of the expression. - - Returns - ------- - InputRefExpr - """ - return InputRefExpr(self.modin_frame, self.column, self._dtype) - - def collect_frames(self, frames): - """ - Add referenced frame to the `frames` set. - - Parameters - ---------- - frames : set - Output set of collected frames. - """ - frames.add(self.modin_frame) - - def _translate_input(self, mapper): - """ - Translate the referenced column using `mapper`. 
- - Parameters - ---------- - mapper : InputMapper - A mapper to use for input column translation. - - Returns - ------- - BaseExpr - The translated expression. - """ - return mapper.translate(self) - - @_inherit_docstrings(BaseExpr.fold) - def fold(self): - return self - - @_inherit_docstrings(BaseExpr.can_execute_arrow) - def can_execute_arrow(self) -> bool: - return True - - @_inherit_docstrings(BaseExpr.execute_arrow) - def execute_arrow(self, table: pa.Table) -> pa.ChunkedArray: - if self.column == ColNameCodec.ROWID_COL_NAME: - return pa.chunked_array([range(len(table))], pa.int64()) - return table.column(ColNameCodec.encode(self.column)) - - def __repr__(self): - """ - Return a string representation of the expression. - - Returns - ------- - str - """ - return f"{self.modin_frame.id_str()}.{self.column}[{self._dtype}]" - - -class LiteralExpr(BaseExpr): - """ - An expression tree node to represent a literal value. - - Parameters - ---------- - val : int, np.int, float, bool, str, np.datetime64 or None - Literal value. - dtype : None or dtype, default: None - Value dtype. - - Attributes - ---------- - val : int, np.int, float, bool, str, np.datetime64 or None - Literal value. - _dtype : dtype - Literal data type. - """ - - def __init__(self, val, dtype=None): - if val is not None and not isinstance( - val, - ( - int, - float, - bool, - str, - np.int8, - np.int16, - np.int32, - np.int64, - np.uint8, - np.uint16, - np.uint32, - np.uint64, - np.datetime64, - ), - ): - raise NotImplementedError(f"Literal value {val} of type {type(val)}") - self.val = val - if dtype is not None: - self._dtype = dtype - elif val is None: - self._dtype = _get_dtype(float) - else: - self._dtype = ( - val.dtype if isinstance(val, np.generic) else _get_dtype(type(val)) - ) - - def copy(self): - """ - Make a shallow copy of the expression. - - Returns - ------- - LiteralExpr - """ - return LiteralExpr(self.val) - - @_inherit_docstrings(BaseExpr.fold) - def fold(self): - return self - - @_inherit_docstrings(BaseExpr.cast) - def cast(self, res_type): - val = self.val - if val is not None: - if isinstance(val, np.generic): - val = val.astype(res_type) - elif is_integer_dtype(res_type): - val = int(val) - elif is_float_dtype(res_type): - val = float(val) - elif is_bool_dtype(res_type): - val = bool(val) - elif is_string_dtype(res_type): - val = str(val) - else: - raise TypeError(f"Cannot cast '{val}' to '{res_type}'") - return LiteralExpr(val, res_type) - - @_inherit_docstrings(BaseExpr.is_null) - def is_null(self): - return LiteralExpr(pandas.isnull(self.val), np.dtype(bool)) - - @_inherit_docstrings(BaseExpr.is_null) - def is_not_null(self): - return LiteralExpr(not pandas.isnull(self.val), np.dtype(bool)) - - @_inherit_docstrings(BaseExpr.can_execute_arrow) - def can_execute_arrow(self) -> bool: - return True - - @_inherit_docstrings(BaseExpr.execute_arrow) - def execute_arrow(self, table: pa.Table) -> pa.ChunkedArray: - return pa.chunked_array([[self.val] * len(table)], to_arrow_type(self._dtype)) - - def __repr__(self): - """ - Return a string representation of the expression. - - Returns - ------- - str - """ - return f"{self.val}[{self._dtype}]" - - def __eq__(self, obj): - """ - Check if `obj` is a `LiteralExpr` with an equal value. - - Parameters - ---------- - obj : Any object - - Returns - ------- - bool - """ - return isinstance(obj, LiteralExpr) and self.val == obj.val - - -class OpExpr(BaseExpr): - """ - A generic operation expression. - - Used for arithmetic, comparisons, conditional operations, etc. 
- - Parameters - ---------- - op : str - Operation name. - operands : list of BaseExpr - Operation operands. - dtype : dtype - Result data type. - - Attributes - ---------- - op : str - Operation name. - operands : list of BaseExpr - Operation operands. - _dtype : dtype - Result data type. - partition_keys : list of BaseExpr, optional - This attribute is used with window functions only and contains - a list of column expressions to partition the result set. - order_keys : list of dict, optional - This attribute is used with window functions only and contains - order clauses. - lower_bound : dict, optional - Lover bound for windowed aggregates. - upper_bound : dict, optional - Upper bound for windowed aggregates. - """ - - _FOLD_OPS = { - "+": lambda self: self._fold_arithm("__add__"), - "-": lambda self: self._fold_arithm("__sub__"), - "*": lambda self: self._fold_arithm("__mul__"), - "POWER": lambda self: self._fold_arithm("__pow__"), - "/": lambda self: self._fold_arithm("__truediv__"), - "//": lambda self: self._fold_arithm("__floordiv__"), - "BIT_NOT": lambda self: self._fold_invert(), - "CAST": lambda self: self._fold_literal("cast", self._dtype), - "IS NULL": lambda self: self._fold_literal("is_null"), - "IS NOT NULL": lambda self: self._fold_literal("is_not_null"), - } - - _ARROW_EXEC = { - "+": lambda self, table: self._pc("add", table), - "-": lambda self, table: self._pc("subtract", table), - "*": lambda self, table: self._pc("multiply", table), - "POWER": lambda self, table: self._pc("power", table), - "/": lambda self, table: self._pc("divide", table), - "//": lambda self, table: self._pc("divide", table), - "BIT_NOT": lambda self, table: self._invert(table), - "CAST": lambda self, table: self._col(table).cast(to_arrow_type(self._dtype)), - "IS NULL": lambda self, table: self._col(table).is_null(nan_is_null=True), - "IS NOT NULL": lambda self, table: pc.invert( - self._col(table).is_null(nan_is_null=True) - ), - } - - _UNSUPPORTED_HDK_OPS = {} - - def __init__(self, op, operands, dtype): - self.op = op - self.operands = operands - self._dtype = dtype - - def set_window_opts(self, partition_keys, order_keys, order_ascending, na_pos): - """ - Set the window function options. - - Parameters - ---------- - partition_keys : list of BaseExpr - order_keys : list of BaseExpr - order_ascending : list of bool - na_pos : {"FIRST", "LAST"} - """ - self.is_rows = True - self.partition_keys = partition_keys - self.order_keys = [] - for key, asc in zip(order_keys, order_ascending): - key = { - "field": key, - "direction": "ASCENDING" if asc else "DESCENDING", - "nulls": na_pos, - } - self.order_keys.append(key) - self.lower_bound = { - "unbounded": True, - "preceding": True, - "following": False, - "is_current_row": False, - "offset": None, - "order_key": 0, - } - self.upper_bound = { - "unbounded": False, - "preceding": False, - "following": False, - "is_current_row": True, - "offset": None, - "order_key": 1, - } - - def copy(self): - """ - Make a shallow copy of the expression. 
- - Returns - ------- - OpExpr - """ - op = OpExpr(self.op, self.operands.copy(), self._dtype) - if pk := getattr(self, "partition_keys", None): - op.partition_keys = pk - op.is_rows = self.is_rows - op.order_keys = self.order_keys - op.lower_bound = self.lower_bound - op.upper_bound = self.upper_bound - return op - - @_inherit_docstrings(BaseExpr.nested_expressions) - def nested_expressions( - self, - ) -> Generator[Type["BaseExpr"], Type["BaseExpr"], Type["BaseExpr"]]: - expr = yield from super().nested_expressions() - if partition_keys := getattr(self, "partition_keys", None): - for i, key in enumerate(partition_keys): - new_key = yield key - if new_key is not None: - if new_key is not key: - if expr is self: - expr = self.copy() - expr.partition_keys[i] = new_key - yield expr - for i, key in enumerate(self.order_keys): - field = key["field"] - new_field = yield field - if new_field is not None: - if new_field is not field: - if expr is self: - expr = self.copy() - expr.order_keys[i]["field"] = new_field - yield expr - return expr - - @_inherit_docstrings(BaseExpr.fold) - def fold(self): - super().fold() - return self if (op := self._FOLD_OPS.get(self.op, None)) is None else op(self) - - def _fold_arithm(self, op) -> Union["OpExpr", LiteralExpr]: - """ - Fold arithmetic expressions. - - Parameters - ---------- - op : str - - Returns - ------- - OpExpr or LiteralExpr - """ - operands = self.operands - i = 0 - while i < len(operands): - if isinstance((o := operands[i]), OpExpr): - if self.op == o.op: - # Fold operands in case of the same operation - operands[i : i + 1] = o.operands - else: - i += 1 - continue - if i == 0: - i += 1 - continue - if isinstance(o, LiteralExpr) and isinstance(operands[i - 1], LiteralExpr): - # Fold two sequential literal expressions - val = getattr(operands[i - 1].val, op)(o.val) - operands[i - 1] = LiteralExpr(val).cast(o._dtype) - del operands[i] - else: - i += 1 - return operands[0] if len(operands) == 1 else self - - def _fold_invert(self) -> Union["OpExpr", LiteralExpr]: - """ - Fold invert expression. - - Returns - ------- - OpExpr or LiteralExpr - """ - assert len(self.operands) == 1 - op = self.operands[0] - if isinstance(op, LiteralExpr): - return LiteralExpr(~op.val, op._dtype) - if isinstance(op, OpExpr): - if op.op == "IS NULL": - return OpExpr("IS NOT NULL", op.operands, op._dtype) - if op.op == "IS NOT NULL": - return OpExpr("IS NULL", op.operands, op._dtype) - return self - - def _fold_literal(self, op, *args): - """ - Fold literal expressions. - - Parameters - ---------- - op : str - - *args : list - - Returns - ------- - OpExpr or LiteralExpr - """ - assert len(self.operands) == 1 - expr = self.operands[0] - return getattr(expr, op)(*args) if isinstance(expr, LiteralExpr) else self - - @_inherit_docstrings(BaseExpr.can_execute_hdk) - def can_execute_hdk(self) -> bool: - return self.op not in self._UNSUPPORTED_HDK_OPS - - @_inherit_docstrings(BaseExpr.can_execute_arrow) - def can_execute_arrow(self) -> bool: - return self.op in self._ARROW_EXEC - - @_inherit_docstrings(BaseExpr.execute_arrow) - def execute_arrow(self, table: pa.Table) -> pa.ChunkedArray: - return self._ARROW_EXEC[self.op](self, table) - - def __repr__(self): - """ - Return a string representation of the expression. 
- - Returns - ------- - str - """ - if pk := getattr(self, "partition_keys", None): - return f"({self.op} {self.operands} {pk} {self.order_keys} [{self._dtype}])" - return f"({self.op} {self.operands} [{self._dtype}])" - - def _col(self, table: pa.Table) -> pa.ChunkedArray: - """ - Return the column referenced by the `InputRefExpr` operand. - - Parameters - ---------- - table : pa.Table - - Returns - ------- - pa.ChunkedArray - """ - assert isinstance(self.operands[0], InputRefExpr) - return self.operands[0].execute_arrow(table) - - def _pc(self, op: str, table: pa.Table) -> pa.ChunkedArray: - """ - Perform the specified pyarrow.compute operation on the operands. - - Parameters - ---------- - op : str - table : pyarrow.Table - - Returns - ------- - pyarrow.ChunkedArray - """ - op = getattr(pc, op) - val = self._op_value(0, table) - for i in range(1, len(self.operands)): - val = op(val, self._op_value(i, table)) - if not isinstance(val, pa.ChunkedArray): - val = LiteralExpr(val).execute_arrow(table) - if val.type != (at := to_arrow_type(self._dtype)): - val = val.cast(at) - return val - - def _op_value(self, op_idx: int, table: pa.Table): - """ - Get the specified operand value. - - Parameters - ---------- - op_idx : int - table : pyarrow.Table - - Returns - ------- - pyarrow.ChunkedArray or expr.val - """ - expr = self.operands[op_idx] - return expr.val if isinstance(expr, LiteralExpr) else expr.execute_arrow(table) - - def _invert(self, table: pa.Table) -> pa.ChunkedArray: - """ - Bitwise inverse the column values. - - Parameters - ---------- - table : pyarrow.Table - - Returns - ------- - pyarrow.ChunkedArray - """ - if is_bool_dtype(self._dtype): - return pc.invert(self._col(table)) - - try: - return pc.bit_wise_not(self._col(table)) - except pa.ArrowNotImplementedError as err: - raise TypeError(str(err)) - - -class AggregateExpr(BaseExpr): - """ - An aggregate operation expression. - - Parameters - ---------- - agg : str - Aggregate name. - op : BaseExpr or list of BaseExpr - Aggregate operand. - distinct : bool, default: False - Distinct modifier for 'count' aggregate. - dtype : dtype, optional - Aggregate data type. Computed if not specified. - - Attributes - ---------- - agg : str - Aggregate name. - operands : list of BaseExpr - Aggregate operands. - distinct : bool - Distinct modifier for 'count' aggregate. - _dtype : dtype - Aggregate data type. - """ - - def __init__(self, agg, op, distinct=False, dtype=None): - if agg == "nunique": - self.agg = "count" - self.distinct = True - else: - self.agg = agg - self.distinct = distinct - self.operands = op if isinstance(op, list) else [op] - self._dtype = dtype or _agg_dtype(self.agg, self.operands[0]._dtype) - assert self._dtype is not None - - def copy(self): - """ - Make a shallow copy of the expression. - - Returns - ------- - AggregateExpr - """ - return AggregateExpr(self.agg, self.operands, self.distinct, self._dtype) - - def __repr__(self): - """ - Return a string representation of the expression. - - Returns - ------- - str - """ - if len(self.operands) == 1: - return f"{self.agg}({self.operands[0]})[{self._dtype}]" - return f"{self.agg}({self.operands})[{self._dtype}]" - - -def build_row_idx_filter_expr(row_idx, row_col): - """ - Build an expression to filter rows by rowid. - - Parameters - ---------- - row_idx : int or list of int - The row numeric indices to select. - row_col : InputRefExpr - The rowid column reference expression. - - Returns - ------- - BaseExpr - The resulting filtering expression. 
- """ - if not is_list_like(row_idx): - return row_col.eq(row_idx) - - if is_range_like(row_idx): - start = row_idx.start - stop = row_idx.stop - step = row_idx.step - if step < 0: - start, stop = stop, start - step = -step - exprs = [row_col.ge(start), row_col.cmp("<", stop)] - if step > 1: - mod = OpExpr("MOD", [row_col, LiteralExpr(step)], _get_dtype(int)) - exprs.append(mod.eq(0)) - return OpExpr("AND", exprs, _get_dtype(bool)) - - exprs = [row_col.eq(idx) for idx in row_idx] - return OpExpr("OR", exprs, _get_dtype(bool)) - - -def build_if_then_else(cond, then_val, else_val, res_type): - """ - Build a conditional operator expression. - - Parameters - ---------- - cond : BaseExpr - A condition to check. - then_val : BaseExpr - A value to use for passed condition. - else_val : BaseExpr - A value to use for failed condition. - res_type : dtype - The result data type. - - Returns - ------- - BaseExpr - The conditional operator expression. - """ - if is_datetime64_dtype(res_type): - if then_val._dtype != res_type: - then_val = then_val.cast(res_type) - if else_val._dtype != res_type: - else_val = else_val.cast(res_type) - return OpExpr("CASE", [cond, then_val, else_val], res_type) - - -def build_dt_expr(dt_operation, col_expr): - """ - Build a datetime extraction expression. - - Parameters - ---------- - dt_operation : str - Datetime field to extract. - col_expr : BaseExpr - An expression to extract from. - - Returns - ------- - BaseExpr - The extract expression. - """ - operation = LiteralExpr(dt_operation) - - res = OpExpr("PG_EXTRACT", [operation, col_expr], _get_dtype("int32")) - - if dt_operation == "isodow": - res = res.sub(LiteralExpr(1)) - elif dt_operation == "microsecond": - res = res.mod(LiteralExpr(1000000)) - elif dt_operation == "nanosecond": - res = res.mod(LiteralExpr(1000)) - - return res diff --git a/modin/experimental/core/execution/native/implementations/hdk_on_native/hdk_worker.py b/modin/experimental/core/execution/native/implementations/hdk_on_native/hdk_worker.py deleted file mode 100644 index 4db43ffa194..00000000000 --- a/modin/experimental/core/execution/native/implementations/hdk_on_native/hdk_worker.py +++ /dev/null @@ -1,182 +0,0 @@ -# Licensed to Modin Development Team under one or more contributor license agreements. -# See the NOTICE file distributed with this work for additional information regarding -# copyright ownership. The Modin Development Team licenses this file to you under the -# Apache License, Version 2.0 (the "License"); you may not use this file except in -# compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under -# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific language -# governing permissions and limitations under the License. - -"""Module provides ``HdkWorker`` class.""" -from typing import List, Optional, Tuple, Union - -import pyarrow as pa -import pyhdk -from packaging import version -from pyhdk.hdk import HDK, ExecutionResult, QueryNode, RelAlgExecutor - -from modin.config import CpuCount, HdkFragmentSize, HdkLaunchParameters -from modin.utils import _inherit_docstrings - -from .base_worker import BaseDbWorker, DbTable - -_CAST_DICT = version.parse(pyhdk.__version__) <= version.parse("0.7.0") - - -class HdkTable(DbTable): - """ - Represents a table in the HDK database. 
- - Parameters - ---------- - table : QueryNode or ExecutionResult - """ - - def __init__(self, table: Union[QueryNode, ExecutionResult]): - self.name = table.table_name - self._table = table - - def __del__(self): - """Drop table.""" - # The ExecutionResults are cleared by HDK. - if not isinstance(self._table, ExecutionResult): - HdkWorker.dropTable(self.name) - - @property - @_inherit_docstrings(DbTable.shape) - def shape(self) -> Tuple[int, int]: - shape = getattr(self, "_shape", None) - if shape is None: - self._shape = shape = self.scan().shape - return shape - - @property - @_inherit_docstrings(DbTable.column_names) - def column_names(self) -> List[str]: - names = getattr(self, "_column_names", None) - if names is None: - self._column_names = names = list(self.scan().schema) - return names - - @_inherit_docstrings(DbTable.to_arrow) - def to_arrow(self) -> pa.Table: - return ( - self._table.to_arrow() - if isinstance(self._table, ExecutionResult) - else self._table.run().to_arrow() - ) - - def scan(self): - """ - Return a scan query node referencing this table. - - Returns - ------- - QueryNode - """ - if isinstance(self._table, QueryNode): - return self._table - scan = getattr(self, "_scan", None) - if scan is None: - self._scan = scan = HdkWorker._hdk().scan(self.name) - return scan - - -@_inherit_docstrings(BaseDbWorker) -class HdkWorker(BaseDbWorker): # noqa: PR01 - """PyHDK based wrapper class for HDK storage format.""" - - def __new__(cls, *args, **kwargs): - instance = getattr(cls, "_instance", None) - if instance is None: - cls._instance = instance = object.__new__(cls) - return instance - - @classmethod - def dropTable(cls, name: str): - cls.dropTable = cls._hdk().drop_table - cls.dropTable(name) - - @classmethod - def executeDML(cls, query: str): - return cls.executeRA(query, True) - - @classmethod - def executeRA(cls, query: str, exec_calcite=False, **exec_args): - hdk = cls._hdk() - if exec_calcite or query.startswith("execute calcite"): - ra = hdk._calcite.process(query, db_name="hdk", legacy_syntax=True) - else: - ra = query - ra_executor = RelAlgExecutor(hdk._executor, hdk._schema_mgr, hdk._data_mgr, ra) - table = ra_executor.execute(device_type=cls._preferred_device, **exec_args) - return HdkTable(table) - - @classmethod - def import_arrow_table(cls, table: pa.Table, name: Optional[str] = None): - name = cls._genName(name) - table = cls.cast_to_compatible_types(table, _CAST_DICT) - hdk = cls._hdk() - fragment_size = cls.compute_fragment_size(table) - return HdkTable(hdk.import_arrow(table, name, fragment_size)) - - @classmethod - def compute_fragment_size(cls, table): - """ - Compute fragment size to be used for table import. - - Parameters - ---------- - table : pyarrow.Table - A table to import. - - Returns - ------- - int - Fragment size to use for import. - """ - fragment_size = HdkFragmentSize.get() - if fragment_size is None: - if cls._preferred_device == "CPU": - cpu_count = CpuCount.get() - if cpu_count is not None: - fragment_size = table.num_rows // cpu_count - fragment_size = min(fragment_size, 2**25) - fragment_size = max(fragment_size, 2**18) - else: - fragment_size = 0 - else: - fragment_size = 2**25 - else: - fragment_size = int(fragment_size) - return fragment_size - - @classmethod - def _hdk(cls) -> HDK: - """ - Initialize and return an HDK instance. 
- - Returns - ------- - HDK - """ - params = HdkLaunchParameters.get() - cls._preferred_device = "CPU" if params["cpu_only"] else "GPU" - cls._hdk_instance = HDK(**params) - cls._hdk = cls._get_hdk_instance - return cls._hdk() - - @classmethod - def _get_hdk_instance(cls) -> HDK: - """ - Return the initialized HDK instance. - - Returns - ------- - HDK - """ - return cls._hdk_instance diff --git a/modin/experimental/core/execution/native/implementations/hdk_on_native/interchange/__init__.py b/modin/experimental/core/execution/native/implementations/hdk_on_native/interchange/__init__.py deleted file mode 100644 index cae6413e559..00000000000 --- a/modin/experimental/core/execution/native/implementations/hdk_on_native/interchange/__init__.py +++ /dev/null @@ -1,12 +0,0 @@ -# Licensed to Modin Development Team under one or more contributor license agreements. -# See the NOTICE file distributed with this work for additional information regarding -# copyright ownership. The Modin Development Team licenses this file to you under the -# Apache License, Version 2.0 (the "License"); you may not use this file except in -# compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under -# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific language -# governing permissions and limitations under the License. diff --git a/modin/experimental/core/execution/native/implementations/hdk_on_native/interchange/dataframe_protocol/__init__.py b/modin/experimental/core/execution/native/implementations/hdk_on_native/interchange/dataframe_protocol/__init__.py deleted file mode 100644 index cae6413e559..00000000000 --- a/modin/experimental/core/execution/native/implementations/hdk_on_native/interchange/dataframe_protocol/__init__.py +++ /dev/null @@ -1,12 +0,0 @@ -# Licensed to Modin Development Team under one or more contributor license agreements. -# See the NOTICE file distributed with this work for additional information regarding -# copyright ownership. The Modin Development Team licenses this file to you under the -# Apache License, Version 2.0 (the "License"); you may not use this file except in -# compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under -# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific language -# governing permissions and limitations under the License. diff --git a/modin/experimental/core/execution/native/implementations/hdk_on_native/interchange/dataframe_protocol/buffer.py b/modin/experimental/core/execution/native/implementations/hdk_on_native/interchange/dataframe_protocol/buffer.py deleted file mode 100644 index decbcb3b6d0..00000000000 --- a/modin/experimental/core/execution/native/implementations/hdk_on_native/interchange/dataframe_protocol/buffer.py +++ /dev/null @@ -1,79 +0,0 @@ -# Licensed to Modin Development Team under one or more contributor license agreements. -# See the NOTICE file distributed with this work for additional information regarding -# copyright ownership. 
The Modin Development Team licenses this file to you under the -# Apache License, Version 2.0 (the "License"); you may not use this file except in -# compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under -# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific language -# governing permissions and limitations under the License. - -"""The module houses HdkOnNative implementation of the Buffer class of DataFrame exchange protocol.""" - -from typing import Optional, Tuple - -import pyarrow as pa - -from modin.core.dataframe.base.interchange.dataframe_protocol.dataframe import ( - ProtocolBuffer, -) -from modin.core.dataframe.base.interchange.dataframe_protocol.utils import ( - DlpackDeviceType, -) -from modin.utils import _inherit_docstrings - - -@_inherit_docstrings(ProtocolBuffer) -class HdkProtocolBuffer(ProtocolBuffer): - """ - Wrapper of the ``pyarrow.Buffer`` object representing a continuous segment of memory. - - Parameters - ---------- - buff : pyarrow.Buffer - Data to be held by ``Buffer``. - size : int, optional - Size of the buffer in bytes, if not specified use ``buff.size``. - The parameter may be usefull for specifying the size of a virtual chunk. - """ - - def __init__(self, buff: pa.Buffer, size: Optional[int] = None) -> None: - self._buff = buff - self._size = self._buff.size if size is None else size - - @property - def bufsize(self) -> int: - return self._size - - @property - def ptr(self) -> int: - return self._buff.address - - def __dlpack__(self): - raise NotImplementedError("__dlpack__") - - def __dlpack_device__(self) -> Tuple[DlpackDeviceType, int]: - return (DlpackDeviceType.CPU, None) - - def __repr__(self) -> str: - """ - Produce string representation of the buffer. - - Returns - ------- - str - """ - return ( - "Buffer(" - + str( - { - "bufsize": self.bufsize, - "ptr": self.ptr, - "device": self.__dlpack_device__()[0].name, - } - ) - + ")" - ) diff --git a/modin/experimental/core/execution/native/implementations/hdk_on_native/interchange/dataframe_protocol/column.py b/modin/experimental/core/execution/native/implementations/hdk_on_native/interchange/dataframe_protocol/column.py deleted file mode 100644 index 49f98fe4651..00000000000 --- a/modin/experimental/core/execution/native/implementations/hdk_on_native/interchange/dataframe_protocol/column.py +++ /dev/null @@ -1,534 +0,0 @@ -# Licensed to Modin Development Team under one or more contributor license agreements. -# See the NOTICE file distributed with this work for additional information regarding -# copyright ownership. The Modin Development Team licenses this file to you under the -# Apache License, Version 2.0 (the "License"); you may not use this file except in -# compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under -# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific language -# governing permissions and limitations under the License. 
- -"""The module houses HdkOnNative implementation of the Column class of DataFrame exchange protocol.""" - -from math import ceil -from typing import TYPE_CHECKING, Any, Dict, Iterable, Optional, Tuple - -import numpy as np -import pandas -import pyarrow as pa - -from modin.core.dataframe.base.interchange.dataframe_protocol.dataframe import ( - CategoricalDescription, - ProtocolColumn, -) -from modin.core.dataframe.base.interchange.dataframe_protocol.utils import ( - ArrowCTypes, - ColumnNullType, - DTypeKind, - Endianness, - pandas_dtype_to_arrow_c, - raise_copy_alert, -) -from modin.utils import _inherit_docstrings - -from .buffer import HdkProtocolBuffer -from .utils import arrow_dtype_to_arrow_c, arrow_types_map - -if TYPE_CHECKING: - from .dataframe import HdkProtocolDataframe - - -@_inherit_docstrings(ProtocolColumn) -class HdkProtocolColumn(ProtocolColumn): - """ - Wrapper of ``HdkProtocolDataframe`` holding a single column. - - The Column object wraps a ``ProtocolDataframe`` to ease referencing original - Modin DataFrame with no materialization of PyArrow table where possible. - ``ProtocolDataframe`` also already implements methods like chunking and ``allow_copy`` - checks, so we can just forward calls for the methods to ``ProtocolDataFrame`` without - reimplementing them. - - Parameters - ---------- - column : HdkProtocolDataframe - DataFrame protocol object holding a PyArrow table with a single column. - - Notes - ----- - The object could be modified inplace due to either casting PyArrow buffers to a new dtype - or combining physical chunks into a single congingous buffer: - ``_propagate_dtype``, ``_cast_at``, ``_combine_chunks`` - the methods replace the wrapped - ``HdkProtocolDataframe`` object with the new one holding the modified PyArrow table. - """ - - def __init__(self, column: "HdkProtocolDataframe") -> None: - self._col = column - - def size(self) -> int: - return self._col.num_rows() - - @property - def offset(self) -> int: - # The offset may change if it would require to cast buffers as the casted ones - # no longer depend on their parent tables. So materializing buffers - # before returning the offset - self._materialize_actual_buffers() - return self._pyarrow_table.column(-1).chunks[0].offset - - @property - def dtype(self) -> Tuple[DTypeKind, int, str, str]: - dtype = self._pandas_dtype - - if pandas.api.types.is_bool_dtype(dtype): - return (DTypeKind.BOOL, 1, ArrowCTypes.BOOL, Endianness.NATIVE) - elif pandas.api.types.is_datetime64_dtype(dtype) or isinstance( - dtype, pandas.CategoricalDtype - ): - # We can't fully describe an actual underlying type's metadata from pandas dtype, - # use a `._arrow_dtype` for missing parts of information like datetime resulution, - # dictionary metadata, etc?... - return self._dtype_from_pyarrow(self._arrow_dtype) - elif pandas.api.types.is_string_dtype(dtype): - return ( - DTypeKind.STRING, - 8, - pandas_dtype_to_arrow_c(dtype), - Endianness.NATIVE, - ) - else: - return self._dtype_from_primitive_numpy(dtype) - - def _dtype_from_pyarrow(self, dtype): - """ - Build protocol dtype from PyArrow type. - - Parameters - ---------- - dtype : pyarrow.DataType - Data type to convert from. 
- - Returns - ------- - tuple(DTypeKind, bitwidth: int, format_str: str, edianess: str) - """ - kind = None - if ( - pa.types.is_timestamp(dtype) - or pa.types.is_date(dtype) - or pa.types.is_time(dtype) - ): - kind = DTypeKind.DATETIME - bit_width = dtype.bit_width - elif pa.types.is_dictionary(dtype): - kind = DTypeKind.CATEGORICAL - bit_width = dtype.bit_width - elif pa.types.is_string(dtype): - kind = DTypeKind.STRING - bit_width = 8 - elif pa.types.is_boolean(dtype): - kind = DTypeKind.BOOL - bit_width = dtype.bit_width - - if kind is not None: - return (kind, bit_width, arrow_dtype_to_arrow_c(dtype), Endianness.NATIVE) - else: - return self._dtype_from_primitive_numpy(np.dtype(dtype.to_pandas_dtype())) - - def _dtype_from_primitive_numpy( - self, dtype: np.dtype - ) -> Tuple[DTypeKind, int, str, str]: - """ - Build protocol dtype from primitive pandas dtype. - - Parameters - ---------- - dtype : np.dtype - Data type to convert from. - - Returns - ------- - tuple(DTypeKind, bitwidth: int, format_str: str, edianess: str) - """ - np_kinds = { - "i": DTypeKind.INT, - "u": DTypeKind.UINT, - "f": DTypeKind.FLOAT, - "b": DTypeKind.BOOL, - } - kind = np_kinds.get(dtype.kind, None) - if kind is None: - raise NotImplementedError( - f"Data type {dtype} not supported by exchange protocol" - ) - return ( - kind, - dtype.itemsize * 8, - pandas_dtype_to_arrow_c(dtype), - dtype.byteorder, - ) - - @property - def describe_categorical(self) -> CategoricalDescription: - dtype = self._pandas_dtype - - if dtype != "category": - raise TypeError( - "`describe_categorical only works on a column with " - + "categorical dtype!" - ) - - ordered = dtype.ordered - - # Category codes may change during materialization flow, so trigger - # materialization before returning the codes - self._materialize_actual_buffers() - - # Although we can retrieve codes from pandas dtype, they're unsynced with - # the actual PyArrow data most of the time. So getting the mapping directly - # from the materialized PyArrow table. - col = self._pyarrow_table.column(-1) - if len(col.chunks) > 1: - if not self._col._allow_copy: - raise_copy_alert( - copy_reason="physical chunks combining due to contiguous buffer materialization" - ) - col = col.combine_chunks() - - from .dataframe import HdkOnNativeDataframe - - col = col.chunks[0] - cat_frame = HdkOnNativeDataframe.from_arrow( - pa.Table.from_pydict({next(iter(self._col.column_names())): col.dictionary}) - ) - from .dataframe import HdkProtocolDataframe - - return { - "is_ordered": ordered, - "is_dictionary": True, - "categories": HdkProtocolColumn( - HdkProtocolDataframe( - cat_frame, self._col._nan_as_null, self._col._allow_copy - ) - ), - } - - @property - def describe_null(self) -> Tuple[ColumnNullType, Any]: - null_buffer = self._pyarrow_table.column(-1).chunks[0].buffers()[0] - if null_buffer is None: - return (ColumnNullType.NON_NULLABLE, None) - else: - return (ColumnNullType.USE_BITMASK, 0) - - @property - def null_count(self) -> int: - return self._pyarrow_table.column(-1).null_count - - @property - def metadata(self) -> Dict[str, Any]: - return self._col.metadata - - @property - def _pandas_dtype(self) -> np.dtype: - """ - Get column's dtype representation in Modin DataFrame. - - Returns - ------- - numpy.dtype - """ - return self._col._df.dtypes.iloc[-1] - - @property - def _arrow_dtype(self) -> pa.DataType: - """ - Get column's dtype representation in underlying PyArrow table. 
- - Returns - ------- - pyarrow.DataType - """ - return self._pyarrow_table.column(-1).type - - @property - def _pyarrow_table(self) -> pa.Table: - """ - Get PyArrow table representing the column. - - Returns - ------- - pyarrow.Table - """ - return self._col._pyarrow_table - - def num_chunks(self) -> int: - return self._col.num_chunks() - - def get_chunks(self, n_chunks: Optional[int] = None) -> Iterable[ProtocolColumn]: - for chunk in self._col.get_chunks(n_chunks): - yield HdkProtocolColumn(chunk) - - def get_buffers(self) -> Dict[str, Any]: - self._materialize_actual_buffers() - at = self._pyarrow_table - # Get the last column since the first one could be the index - pyarrow_array = at.column(-1).chunks[0] - - result = dict() - result["data"] = self._get_data_buffer(pyarrow_array) - result["validity"] = self._get_validity_buffer(pyarrow_array) - result["offsets"] = self._get_offsets_buffer(pyarrow_array) - - return result - - def _materialize_actual_buffers(self): - """ - Materialize PyArrow table's buffers that can be zero-copy returned to a consumer, if they aren't already materialized. - - Besides materializing PyArrow table itself (if there were some delayed computations) - the function also may do the following if required: - 1. Propagate external dtypes to the PyArrow table. For example, - if ``self.dtype`` is a string kind, but internal PyArrow dtype is a dictionary - (if the table were just exported from HDK), then the dictionary will be casted - to string dtype. - 2. Combine physical chunks of PyArrow table into a single contiguous buffer. - """ - if self.num_chunks() != 1: - if not self._col._allow_copy: - raise_copy_alert( - copy_reason="physical chunks combining due to contiguous buffer materialization" - ) - self._combine_chunks() - - external_dtype = self.dtype - internal_dtype = self._dtype_from_pyarrow(self._arrow_dtype) - - if external_dtype[0] != internal_dtype[0]: - self._propagate_dtype(external_dtype) - - def _get_buffer_size(self, bit_width: int, is_offset_buffer: bool = False) -> int: - """ - Compute buffer's size in bytes for the current chunk. - - Parameters - ---------- - bit_width : int - Bit width of the underlying data type. - is_offset_buffer : bool, default: False - Whether the buffer describes offsets. - - Returns - ------- - int - Number of bytes to read from the start of the buffer + offset to retrieve the whole chunk. - """ - # Offset buffer always has ``size + 1`` elements in it as it describes slices bounds - elements_in_buffer = self.size() + 1 if is_offset_buffer else self.size() - result = ceil((bit_width * elements_in_buffer) / 8) - # For a bitmask, if the chunk started in the middle of the byte then we need to - # read one extra byte from the buffer to retrieve the chunk's tail in the last byte. Example: - # Bitmask of 3 bytes, the chunk offset is 3 elements and its size is 16 - # |* * * * * * * *|* * * * * * * *|* * * * * * * *| - # ^- the chunk starts here ^- the chunk ends here - # Although ``ceil(bit_width * elements_in_buffer / 8)`` gives us '2 bytes', - # the chunk is located in 3 bytes, that's why we assume the chunk's buffer size - # to be 'result += 1' in this case: - if bit_width == 1 and self.offset % 8 + self.size() > result * 8: - result += 1 - return result - - def _get_data_buffer( - self, arr: pa.Array - ) -> Tuple[HdkProtocolBuffer, Tuple[DTypeKind, int, str, str]]: - """ - Get column's data buffer. - - Parameters - ---------- - arr : pa.Array - PyArrow array holding column's data. 
- - Returns - ------- - tuple - Tuple of ``HdkProtocolBuffer`` and protocol dtype representation of the buffer's underlying data. - """ - if self.dtype[0] == DTypeKind.CATEGORICAL: - # For dictionary data the buffer has to return categories codes - arr = arr.indices - - arrow_type = self._dtype_from_pyarrow(arr.type) - buff_size = ( - self._get_buffer_size(bit_width=arrow_type[1]) - if self.dtype[0] != DTypeKind.STRING - # We don't chunk string buffers as it would require modifying offset values, - # so just return the whole data buffer for every chunk. - else None - ) - - return ( - # According to the Arrow's memory layout, the data buffer is always present - # at the last position of `.buffers()`: - # https://arrow.apache.org/docs/format/Columnar.html#buffer-listing-for-each-layout - HdkProtocolBuffer(arr.buffers()[-1], buff_size), - arrow_type, - ) - - def _get_validity_buffer( - self, arr: pa.Array - ) -> Optional[Tuple[HdkProtocolBuffer, Tuple[DTypeKind, int, str, str]]]: - """ - Get column's validity buffer. - - Parameters - ---------- - arr : pa.Array - PyArrow array holding column's data. - - Returns - ------- - tuple or None - Tuple of ``HdkProtocolBuffer`` and protocol dtype representation of the buffer's underlying data. - None if column is non-nullable (``self.describe_null == ColumnNullType.NON_NULLABLE``). - """ - # According to the Arrow's memory layout, the validity buffer is always present at zero position: - # https://arrow.apache.org/docs/format/Columnar.html#buffer-listing-for-each-layout - validity_buffer = arr.buffers()[0] - if validity_buffer is None: - return None - - # If exist, validity buffer is always a bit-mask. - data_size = self._get_buffer_size(bit_width=1) - return ( - HdkProtocolBuffer(validity_buffer, data_size), - (DTypeKind.BOOL, 1, ArrowCTypes.BOOL, Endianness.NATIVE), - ) - - def _get_offsets_buffer( - self, arr: pa.Array - ) -> Optional[Tuple[HdkProtocolBuffer, Tuple[DTypeKind, int, str, str]]]: - """ - Get column's offsets buffer. - - Parameters - ---------- - arr : pa.Array - PyArrow array holding column's data. - - Returns - ------- - tuple or None - Tuple of ``HdkProtocolBuffer`` and protocol dtype representation of the buffer's underlying data. - None if the column's dtype is fixed-size. - """ - buffs = arr.buffers() - # According to the Arrow's memory layout, the offsets buffer is always at the second position - # of `.buffers()` if present. Considering the support of only Primitive, Variable-length binary, - # and Dict-encoded types from the layout table, we can assume that there's no offsets buffer - # if there are fewer than 3 buffers available. - # https://arrow.apache.org/docs/format/Columnar.html#buffer-listing-for-each-layout - if len(buffs) < 3: - return None - - offset_buff = buffs[1] - # According to Arrow's data layout, the offset buffer type is "int32" - dtype = self._dtype_from_primitive_numpy(np.dtype("int32")) - return ( - HdkProtocolBuffer( - offset_buff, - self._get_buffer_size(bit_width=dtype[1], is_offset_buffer=True), - ), - dtype, - ) - - def _propagate_dtype(self, dtype: Tuple[DTypeKind, int, str, str]): - """ - Propagate `dtype` to the underlying PyArrow table. - - Modifies the column object inplace by replacing underlying PyArrow table with - the casted one. - - Parameters - ---------- - dtype : tuple - Data type conforming protocol dtypes format to cast underlying PyArrow table. 
- """ - if not self._col._allow_copy: - raise_copy_alert( - copy_reason="casting to align pandas and PyArrow data types" - ) - - kind, bit_width, format_str, _ = dtype - arrow_type = None - - if kind in arrow_types_map: - arrow_type = arrow_types_map[kind].get(bit_width, None) - elif kind == DTypeKind.DATETIME: - arrow_type = pa.timestamp("ns") - elif kind == DTypeKind.CATEGORICAL: - index_type = arrow_types_map[DTypeKind.INT].get(bit_width, None) - if index_type is not None: - arrow_type = pa.dictionary( - index_type=index_type, - # There is no way to deduce an actual value type, so casting to a string - # as it's the most common one - value_type=pa.string(), - ) - - if arrow_type is None: - raise NotImplementedError(f"Propagation for type {dtype} is not supported.") - - at = self._pyarrow_table - schema_to_cast = at.schema - field = at.schema[-1] - - schema_to_cast = schema_to_cast.set( - len(schema_to_cast) - 1, pa.field(field.name, arrow_type, field.nullable) - ) - - # TODO: currently, each column chunk casts its buffers independently which results - # in an `N_CHUNKS - 1` amount of redundant casts. We can make the PyArrow table - # being shared across all the chunks, so the cast being triggered in a single chunk - # propagate to all of them. - self._cast_at(schema_to_cast) - - def _cast_at(self, new_schema: pa.Schema): - """ - Cast underlying PyArrow table with the passed schema. - - Parameters - ---------- - new_schema : pyarrow.Schema - New schema to cast the table. - - Notes - ----- - This method modifies the column inplace by replacing the wrapped ``HdkProtocolDataframe`` - with the new one holding the casted PyArrow table. - """ - casted_at = self._pyarrow_table.cast(new_schema) - self._col = type(self._col)( - self._col._df.from_arrow(casted_at), - self._col._nan_as_null, - self._col._allow_copy, - ) - - def _combine_chunks(self): - """ - Combine physical chunks of underlying PyArrow table. - - Notes - ----- - This method modifies the column inplace by replacing the wrapped ``HdkProtocolDataframe`` - with the new one holding PyArrow table with the column's data placed in a single contingous buffer. - """ - contiguous_at = self._pyarrow_table.combine_chunks() - self._col = type(self._col)( - self._col._df.from_arrow(contiguous_at), - self._col._nan_as_null, - self._col._allow_copy, - ) diff --git a/modin/experimental/core/execution/native/implementations/hdk_on_native/interchange/dataframe_protocol/dataframe.py b/modin/experimental/core/execution/native/implementations/hdk_on_native/interchange/dataframe_protocol/dataframe.py deleted file mode 100644 index 36c9af9ed04..00000000000 --- a/modin/experimental/core/execution/native/implementations/hdk_on_native/interchange/dataframe_protocol/dataframe.py +++ /dev/null @@ -1,388 +0,0 @@ -# Licensed to Modin Development Team under one or more contributor license agreements. -# See the NOTICE file distributed with this work for additional information regarding -# copyright ownership. The Modin Development Team licenses this file to you under the -# Apache License, Version 2.0 (the "License"); you may not use this file except in -# compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under -# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. 
See the License for the specific language -# governing permissions and limitations under the License. - -"""The module houses HdkOnNative implementation of the Dataframe class of DataFrame exchange protocol.""" - -import collections -from typing import Any, Dict, Iterable, Optional, Sequence - -import numpy as np -import pyarrow as pa - -from modin.core.dataframe.base.interchange.dataframe_protocol.dataframe import ( - ProtocolDataframe, -) -from modin.error_message import ErrorMessage -from modin.experimental.core.execution.native.implementations.hdk_on_native.dataframe.dataframe import ( - HdkOnNativeDataframe, -) -from modin.experimental.core.execution.native.implementations.hdk_on_native.df_algebra import ( - FrameNode, - MaskNode, - TransformNode, - UnionNode, -) -from modin.pandas.indexing import is_range_like -from modin.utils import _inherit_docstrings - -from .column import HdkProtocolColumn -from .utils import raise_copy_alert_if_materialize - - -@_inherit_docstrings(ProtocolDataframe) -class HdkProtocolDataframe(ProtocolDataframe): - """ - Implement the DataFrame exchange protocol class for ``HdkOnNative`` execution. - - Parameters - ---------- - df : HdkOnNativeDataframe - DataFrame object that holds the data. - nan_as_null : bool, default: False - Whether to overwrite null values in the data with ``NaN``. - allow_copy : bool, default: True - Whether allow to doing copy of the underlying data during export flow. - If a copy or any kind of data transfer/materialization would be required raise ``RuntimeError``. - """ - - def __init__( - self, - df: HdkOnNativeDataframe, - nan_as_null: bool = False, - allow_copy: bool = True, - ) -> None: - if nan_as_null: - raise NotImplementedError( - "Proccessing of `nan_as_null=True` is not yet supported." - ) - - self._df = df - self._nan_as_null = nan_as_null - self._allow_copy = allow_copy - - def __dataframe__(self, nan_as_null: bool = False, allow_copy: bool = True): - return HdkProtocolDataframe( - self._df, nan_as_null=nan_as_null, allow_copy=allow_copy - ) - - @property - @raise_copy_alert_if_materialize - def metadata(self) -> Dict[str, Any]: - # TODO: as the frame's index is stored as a separate column inside PyArrow table - # we may want to return the column's name here instead of materialized index. - # This will require the internal index column to be visible in the protocol's column - # accessor methods. - return {"modin.index": self._df.index} - - def num_columns(self) -> int: - return len(self._df.columns) - - @raise_copy_alert_if_materialize - def num_rows(self) -> int: - return len(self._df.index) - - def num_chunks(self) -> int: - # `._ chunk_slices` describe chunk offsets (start-stop indices of the chunks) - # meaning that there are actually `len(self._chunk_slices) - 1` amount of chunks - return len(self._chunk_slices) - 1 - - __chunk_slices = None - - @property - def _chunk_slices(self) -> np.ndarray: - """ - Compute chunks start-stop indices in the underlying PyArrow table. - - Returns - ------- - np.ndarray - An array holding start-stop indices of the chunks, for ex. ``[0, 5, 10, 20]`` - describes 3 chunks bound by the following indices: - chunk1: [0, 5), - chunk2: [5, 10), - chunk3: [10, 20). - - Notes - ----- - Arrow table allows for the columns to be chunked independently, so in order to satisfy - the protocol's requirement of equally chunked columns, we have to align column chunks - with the minimal one. 
For example: - Originally chunked table: Aligned table: - |col0|col1| |col0|col1| - | | | | | | - |0 |a | |0 |a | - |----|b | |----|----| - |1 |----| |1 |b | - |2 |c | |----|----| - |3 |d | |2 |c | - |----|----| |3 |d | - |4 |e | |----|----| - |4 |e | - """ - if self.__chunk_slices is None: - at = self._pyarrow_table - # What we need to do is to union offsets of all the columns - col_slices = set({0}) - for col in at.columns: - col_slices = col_slices.union( - np.cumsum([len(chunk) for chunk in col.chunks]) - ) - self.__chunk_slices = np.sort( - np.fromiter(col_slices, dtype=int, count=len(col_slices)) - ) - - return self.__chunk_slices - - __is_zero_copy_possible = None - - @property - def _is_zero_copy_possible(self) -> bool: - """ - Check whether it's possible to retrieve data from the DataFrame zero-copy. - - The 'zero-copy' term also means that no extra computations or data transers - are needed to access the data. - - Returns - ------- - bool - """ - if self.__is_zero_copy_possible is None: - if self._df._has_arrow_table(): - # If PyArrow table is already materialized then we can - # retrieve data zero-copy - self.__is_zero_copy_possible = True - elif not self._df._can_execute_arrow(): - # When not able to execute the plan via PyArrow means - # that we have to involve HDK, so no zero-copy. - self.__is_zero_copy_possible = False - else: - # Check whether the plan for PyArrow can be executed zero-copy - self.__is_zero_copy_possible = self._is_zero_copy_arrow_op(self._df._op) - return self.__is_zero_copy_possible - - @classmethod - def _is_zero_copy_arrow_op(cls, op) -> bool: - """ - Check whether the passed node of the delayed computation tree could be executed zero-copy via PyArrow execution. - - Parameters - ---------- - op : DFAlgNode - - Returns - ------- - bool - """ - is_zero_copy_op = False - if isinstance(op, (FrameNode, TransformNode, UnionNode)): - # - FrameNode: already materialized PyArrow table - # - TransformNode: select certain columns of the table, implemented zero-copy - # - UnionNode: concatenate PyArrow tables, implemented zero-copy - is_zero_copy_op = True - elif isinstance(op, MaskNode) and ( - isinstance(op.row_positions, slice) or is_range_like(op.row_positions) - ): - # Can select rows zero-copy if indexer is a slice-like - is_zero_copy_op = True - return is_zero_copy_op and all( - # Walk the computation tree - cls._is_zero_copy_arrow_op(_op) - for _op in getattr(op, "inputs", []) - ) - - @property - @raise_copy_alert_if_materialize - def _pyarrow_table(self) -> pa.Table: - """ - Get PyArrow table representing the DataFrame. 
- - Returns - ------- - pyarrow.Table - """ - at = self._df._execute() - if not isinstance(at, pa.Table): - at = at.to_arrow() - assert at is not None - return at - - def column_names(self) -> Iterable[str]: - return self._df.columns - - def get_column(self, i: int) -> HdkProtocolColumn: - return HdkProtocolColumn( - HdkProtocolDataframe( - self._df.take_2d_labels_or_positional(col_positions=[i]), - allow_copy=self._allow_copy, - ), - ) - - def get_column_by_name(self, name: str) -> HdkProtocolColumn: - return HdkProtocolColumn( - HdkProtocolDataframe( - self._df.take_2d_labels_or_positional(col_labels=[name]), - allow_copy=self._allow_copy, - ), - ) - - def get_columns(self) -> Iterable[HdkProtocolColumn]: - for name in self._df.columns: - yield HdkProtocolColumn( - HdkProtocolDataframe( - self._df.take_2d_labels_or_positional(col_labels=[name]), - nan_as_null=self._nan_as_null, - allow_copy=self._allow_copy, - ), - ) - - def select_columns(self, indices: Sequence[int]) -> "HdkProtocolDataframe": - if not isinstance(indices, collections.abc.Sequence): - raise ValueError("`indices` is not a sequence") - - return HdkProtocolDataframe( - self._df.take_2d_labels_or_positional(col_positions=list(indices)), - nan_as_null=self._nan_as_null, - allow_copy=self._allow_copy, - ) - - def select_columns_by_name(self, names: Sequence[str]) -> "HdkProtocolDataframe": - if not isinstance(names, collections.abc.Sequence): - raise ValueError("`names` is not a sequence") - - return HdkProtocolDataframe( - self._df.take_2d_labels_or_positional(col_labels=list(names)), - nan_as_null=self._nan_as_null, - allow_copy=self._allow_copy, - ) - - def get_chunks( - self, n_chunks: Optional[int] = None - ) -> Iterable["HdkProtocolDataframe"]: - """ - Return an iterator yielding the chunks. - - If `n_chunks` is not specified, yields the chunks that the data is stored underneath. - If given, `n_chunks` must be a multiple of ``self.num_chunks()``, meaning that each physical - chunk is going to be split into ``n_chunks // self.num_chunks()`` virtual chunks, that are - backed by the same physical buffers but have different ``.offset`` values. - - Parameters - ---------- - n_chunks : int, optional - Number of chunks to yield. - - Returns - ------- - Iterable["HdkProtocolDataframe"] - An iterator yielding ``HdkProtocolDataframe`` objects. - - Raises - ------ - ``RuntimeError`` if ``n_chunks`` is not a multiple of ``self.num_chunks()`` or ``n_chunks`` - is greater than ``self.num_rows()``. - - Notes - ----- - There is a special casing in handling variable-sized columns (i.e. strings) when virtually chunked. - In order to make the offsets buffer be valid for each virtual chunk, the data buffer shouldn't be - chunked at all, meaning that ``.get_buffers()["data"]`` always returns a buffer owning the whole - physical chunk and the consumer must always interpret it with zero offset (validity and offsets - buffers have to be interpreted respecting the column's offset value). - """ - if n_chunks is None or n_chunks == self.num_chunks(): - return self._yield_chunks(self._chunk_slices) - - if n_chunks % self.num_chunks() != 0: - raise RuntimeError( - "The passed `n_chunks` has to be a multiple of `num_chunks`." - ) - - if n_chunks > self.num_rows(): - raise RuntimeError( - "The passed `n_chunks` value is bigger than the amout of rows in the frame." 
- ) - - extra_chunks = 0 - to_subdivide = n_chunks // self.num_chunks() - subdivided_slices = [] - - # The loop subdivides each chunk into `to_subdivide` chunks if possible - for i in range(len(self._chunk_slices) - 1): - chunk_length = self._chunk_slices[i + 1] - self._chunk_slices[i] - step = chunk_length // to_subdivide - if step == 0: - # Bad case: we're requested to subdivide a chunk in more pieces than it has rows in it. - # This means that there is a bigger chunk that we can subdivide into more pieces to get - # the required amount of chunks. For now, subdividing the current chunk into maximum possible - # pieces (TODO: maybe we should subdivide it into `sqrt(chunk_length)` chunks to make - # this more oprimal?), writing a number of missing pieces into `extra_chunks` variable - # to extract them from bigger chunks later. - step = 1 - extra_chunks += to_subdivide - chunk_length - to_subdivide_chunk = chunk_length - else: - to_subdivide_chunk = to_subdivide - - for j in range(to_subdivide_chunk): - subdivided_slices.append(self._chunk_slices[i] + step * j) - subdivided_slices.append(self._chunk_slices[-1]) - - if extra_chunks != 0: - # Making more pieces from big chunks to get the required amount of `n_chunks` - for _ in range(extra_chunks): - # 1. Find the biggest chunk - # 2. Split it in the middle - biggest_chunk_idx = np.argmax(np.diff(subdivided_slices)) - new_chunk_offset = ( - subdivided_slices[biggest_chunk_idx + 1] - - subdivided_slices[biggest_chunk_idx] - ) // 2 - ErrorMessage.catch_bugs_and_request_email( - failure_condition=new_chunk_offset == 0, - extra_log="No more chunks to subdivide", - ) - subdivided_slices = np.insert( - subdivided_slices, - biggest_chunk_idx + 1, - subdivided_slices[biggest_chunk_idx] + new_chunk_offset, - ) - - ErrorMessage.catch_bugs_and_request_email( - failure_condition=len(subdivided_slices) != n_chunks + 1, - extra_log=f"Chunks were incorrectly split: {len(subdivided_slices)} != {n_chunks + 1}", - ) - - return self._yield_chunks(subdivided_slices) - - def _yield_chunks(self, chunk_slices) -> "HdkProtocolDataframe": - """ - Yield DataFrame chunks according to the passed offsets. - - Parameters - ---------- - chunk_slices : list - Chunking offsets. - - Yields - ------ - HdkProtocolDataframe - """ - for i in range(len(chunk_slices) - 1): - yield HdkProtocolDataframe( - df=self._df.take_2d_labels_or_positional( - row_positions=range(chunk_slices[i], chunk_slices[i + 1]) - ), - nan_as_null=self._nan_as_null, - allow_copy=self._allow_copy, - ) diff --git a/modin/experimental/core/execution/native/implementations/hdk_on_native/interchange/dataframe_protocol/utils.py b/modin/experimental/core/execution/native/implementations/hdk_on_native/interchange/dataframe_protocol/utils.py deleted file mode 100644 index 5e03cc1d6ab..00000000000 --- a/modin/experimental/core/execution/native/implementations/hdk_on_native/interchange/dataframe_protocol/utils.py +++ /dev/null @@ -1,98 +0,0 @@ -# Licensed to Modin Development Team under one or more contributor license agreements. -# See the NOTICE file distributed with this work for additional information regarding -# copyright ownership. The Modin Development Team licenses this file to you under the -# Apache License, Version 2.0 (the "License"); you may not use this file except in -# compliance with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under -# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific language -# governing permissions and limitations under the License. - -"""Utility functions for the DataFrame exchange protocol implementation for ``HdkOnNative`` execution.""" - -import functools - -import numpy as np -import pyarrow as pa - -from modin.core.dataframe.base.interchange.dataframe_protocol.utils import ( - ArrowCTypes, - DTypeKind, - pandas_dtype_to_arrow_c, - raise_copy_alert, -) - -arrow_types_map = { - DTypeKind.BOOL: {8: pa.bool_()}, - DTypeKind.INT: { - 8: pa.int8(), - 16: pa.int16(), - 32: pa.int32(), - 64: pa.int64(), - }, - DTypeKind.UINT: { - 8: pa.uint8(), - 16: pa.uint16(), - 32: pa.uint32(), - 64: pa.uint64(), - }, - DTypeKind.FLOAT: {16: pa.float16(), 32: pa.float32(), 64: pa.float64()}, - DTypeKind.STRING: {8: pa.string()}, -} - - -def arrow_dtype_to_arrow_c(dtype: pa.DataType) -> str: - """ - Represent PyArrow `dtype` as a format string in Apache Arrow C notation. - - Parameters - ---------- - dtype : pa.DataType - Datatype of PyArrow table to represent. - - Returns - ------- - str - Format string in Apache Arrow C notation of the given `dtype`. - """ - if pa.types.is_timestamp(dtype): - return ArrowCTypes.TIMESTAMP.format( - resolution=dtype.unit[:1], tz=dtype.tz or "" - ) - elif pa.types.is_date(dtype): - return getattr(ArrowCTypes, f"DATE{dtype.bit_width}", "DATE64") - elif pa.types.is_time(dtype): - # TODO: for some reason `time32` type doesn't have a `unit` attribute, - # always return "s" for now. - # return ArrowCTypes.TIME.format(resolution=dtype.unit[:1]) - return ArrowCTypes.TIME.format(resolution=getattr(dtype, "unit", "s")[:1]) - elif pa.types.is_dictionary(dtype): - return arrow_dtype_to_arrow_c(dtype.index_type) - else: - return pandas_dtype_to_arrow_c(np.dtype(dtype.to_pandas_dtype())) - - -def raise_copy_alert_if_materialize(fn): - """ - Decorate ``HdkProtocolDataframe`` method with a check raising a copy-alert if it's impossible to retrieve the data in zero-copy way. - - Parameters - ---------- - fn : callable - ``HdkProtocolDataframe`` method. - - Returns - ------- - callable - """ - - @functools.wraps(fn) - def method(self, *args, **kwargs): - if not self._allow_copy and not self._is_zero_copy_possible: - raise_copy_alert() - return fn(self, *args, **kwargs) - - return method diff --git a/modin/experimental/core/execution/native/implementations/hdk_on_native/io/__init__.py b/modin/experimental/core/execution/native/implementations/hdk_on_native/io/__init__.py deleted file mode 100644 index e6f0ad34a4d..00000000000 --- a/modin/experimental/core/execution/native/implementations/hdk_on_native/io/__init__.py +++ /dev/null @@ -1,18 +0,0 @@ -# Licensed to Modin Development Team under one or more contributor license agreements. -# See the NOTICE file distributed with this work for additional information regarding -# copyright ownership. The Modin Development Team licenses this file to you under the -# Apache License, Version 2.0 (the "License"); you may not use this file except in -# compliance with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under -# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific language -# governing permissions and limitations under the License. - -"""Base IO classes optimized for HDK on Native execution.""" - -from .io import HdkOnNativeIO - -__all__ = ["HdkOnNativeIO"] diff --git a/modin/experimental/core/execution/native/implementations/hdk_on_native/io/io.py b/modin/experimental/core/execution/native/implementations/hdk_on_native/io/io.py deleted file mode 100644 index c12dc8f18fd..00000000000 --- a/modin/experimental/core/execution/native/implementations/hdk_on_native/io/io.py +++ /dev/null @@ -1,674 +0,0 @@ -# Licensed to Modin Development Team under one or more contributor license agreements. -# See the NOTICE file distributed with this work for additional information regarding -# copyright ownership. The Modin Development Team licenses this file to you under the -# Apache License, Version 2.0 (the "License"); you may not use this file except in -# compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under -# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific language -# governing permissions and limitations under the License. - -""" -Module houses ``HdkOnNativeIO`` class. - -``HdkOnNativeIO`` is used for storing IO functions implementations with HDK storage format and Native engine. 
-""" - -import functools -import inspect -import os -from csv import Dialect -from typing import Callable, Dict, Sequence, Tuple, Union - -import pandas -import pandas._libs.lib as lib -import pyarrow as pa -from pandas.core.dtypes.common import is_list_like -from pandas.io.common import get_handle, is_url -from pyarrow.csv import ConvertOptions, ParseOptions, ReadOptions, read_csv - -from modin.core.io import BaseIO -from modin.core.io.text.text_file_dispatcher import TextFileDispatcher -from modin.error_message import ErrorMessage -from modin.experimental.core.execution.native.implementations.hdk_on_native.dataframe.dataframe import ( - HdkOnNativeDataframe, -) -from modin.experimental.core.storage_formats.hdk.query_compiler import ( - DFAlgQueryCompiler, -) -from modin.utils import _inherit_docstrings - -ReadCsvKwargsType = Dict[ - str, - Union[ - str, - int, - bool, - dict, - object, - Sequence, - Callable, - Dialect, - None, - ], -] - - -class ArrowEngineException(Exception): - """Exception raised in case of Arrow engine-specific incompatibilities are found.""" - - -class HdkOnNativeIO(BaseIO, TextFileDispatcher): - """Class contains IO functions implementations with HDK storage format and Native engine.""" - - frame_cls = HdkOnNativeDataframe - query_compiler_cls = DFAlgQueryCompiler - - unsupported_args = [ - "decimal", - "thousands", - "index_col", - "prefix", - "converters", - "skipfooter", - "nrows", - "skipinitialspace", - "na_values", - "keep_default_na", - "na_filter", - "verbose", - "infer_datetime_format", - "keep_date_col", - "date_parser", - "date_format", - "dayfirst", - "cache_dates", - "iterator", - "chunksize", - "encoding", - "encoding_errors", - "lineterminator", - "dialect", - "quoting", - "comment", - "on_bad_lines", - "low_memory", - "memory_map", - "float_precision", - "storage_options", - "dtype_backend", - ] - - @classmethod - def read_csv(cls, **kwargs): # noqa: PR01 - """ - Read csv data according to the passed `kwargs` parameters. - - Returns - ------- - BaseQueryCompiler - Query compiler with imported data for further processing. - - Notes - ----- - Reading performed by using of `pyarrow.read_csv` function. - """ - if eng := kwargs["engine"]: - eng = eng.lower().strip() - try: - if eng in ("pandas", "c"): - return super().read_csv(**kwargs) - - cls._validate_read_csv_kwargs(kwargs) - use_modin_impl, error_message = cls._read_csv_check_support( - kwargs, - ) - if not use_modin_impl: - raise ArrowEngineException(error_message) - - if (names := kwargs["names"]) is lib.no_default: - names = None - skiprows = kwargs["skiprows"] - if names and kwargs["header"] == 0: - skiprows = skiprows + 1 if skiprows is not None else 1 - - @functools.lru_cache(maxsize=None) - def get_col_names(): - # Using pandas to read the column names - return pandas.read_csv( - kwargs["filepath_or_buffer"], nrows=0, engine="c" - ).columns.tolist() - - dtype = kwargs["dtype"] - # For details: https://github.com/pandas-dev/pandas/issues/57024 - entire_dataframe_dtype = dtype is not None and not isinstance(dtype, dict) - if dtype: - if isinstance(dtype, dict): - column_types = {c: cls._dtype_to_arrow(t) for c, t in dtype.items()} - else: - dtype = cls._dtype_to_arrow(dtype) - column_types = {name: dtype for name in get_col_names()} - else: - column_types = {} - - if parse_dates := ( - None if entire_dataframe_dtype else kwargs["parse_dates"] - ): - # Either list of column names or list of column indices is supported. 
- if isinstance(parse_dates, list) and ( - all(isinstance(col, str) for col in parse_dates) - or all(isinstance(col, int) for col in parse_dates) - ): - # Pandas uses datetime64[ns] dtype for dates. - timestamp_dt = pa.timestamp("ns") - if names and isinstance(parse_dates[0], str): - # The `names` parameter could be used to override the - # column names. If new names are specified in `parse_dates` - # they should be replaced with the real names. Replacing - # with the column indices first. - parse_dates = [names.index(name) for name in parse_dates] - if isinstance(parse_dates[0], int): - # If column indices are specified, load the column names - # with pandas and replace the indices with column names. - column_names = get_col_names() - parse_dates = [column_names[i] for i in parse_dates] - for c in parse_dates: - column_types[c] = timestamp_dt - elif not isinstance(parse_dates, bool): - raise NotImplementedError( - f"Argument parse_dates={parse_dates} is not supported" - ) - - sep = kwargs["sep"] - delimiter = kwargs["delimiter"] - if delimiter is None and sep is not lib.no_default: - delimiter = sep - - usecols_md = cls._prepare_pyarrow_usecols(kwargs) - - po = ParseOptions( - delimiter="\\s+" if kwargs["delim_whitespace"] is True else delimiter, - quote_char=kwargs["quotechar"], - double_quote=kwargs["doublequote"], - escape_char=kwargs["escapechar"], - newlines_in_values=False, - ignore_empty_lines=kwargs["skip_blank_lines"], - ) - true_values = kwargs["true_values"] - false_values = kwargs["false_values"] - co = ConvertOptions( - check_utf8=None, - column_types=column_types, - null_values=None, - # we need to add default true/false_values like Pandas does - true_values=( - true_values + ["TRUE", "True", "true"] - if true_values is not None - else true_values - ), - false_values=( - false_values + ["False", "FALSE", "false"] - if false_values is not None - else false_values - ), - # timestamp fields should be handled as strings if parse_dates - # didn't passed explicitly as an array or a dict - timestamp_parsers=( - [""] - if parse_dates is None or isinstance(parse_dates, bool) - else None - ), - strings_can_be_null=None, - include_columns=usecols_md, - include_missing_columns=None, - auto_dict_encode=None, - auto_dict_max_cardinality=None, - ) - ro = ReadOptions( - use_threads=True, - block_size=None, - skip_rows=skiprows, - column_names=names if names is not lib.no_default else None, - autogenerate_column_names=None, - ) - - at = read_csv( - kwargs["filepath_or_buffer"], - read_options=ro, - parse_options=po, - convert_options=co, - ) - - if names: - at = at.rename_columns(names) - else: - col_names = at.column_names - col_counts = {} - for name in col_names: - col_counts[name] = 1 if name in col_counts else 0 - - if len(col_names) != len(col_counts): - for i, name in enumerate(col_names): - count = col_counts[name] - if count != 0: - if count == 1: - col_counts[name] = 2 - else: - new_name = f"{name}.{count - 1}" - while new_name in col_counts: - new_name = f"{name}.{count}" - count += 1 - col_counts[name] = count + 1 - col_names[i] = new_name - at = at.rename_columns(col_names) - - return cls.from_arrow(at) - except ( - pa.ArrowNotImplementedError, - pa.ArrowInvalid, - NotImplementedError, - ArrowEngineException, - ) as err: - if eng in ["arrow"]: - raise - - ErrorMessage.warn( - f"Failed to read csv {kwargs['filepath_or_buffer']} " - + f"due to error: {err}. Defaulting to pandas." 
- ) - return super().read_csv(**kwargs) - - @classmethod - def _dtype_to_arrow(cls, dtype): - """ - Convert `pandas.read_csv` `dtype` parameter into PyArrow compatible type. - - Parameters - ---------- - dtype : str, pandas extension or NumPy dtype - Data type for data or columns, `pandas.read_csv` `dtype` parameter. - - Returns - ------- - pa.DataType or pa.DictionaryType - PyArrow compatible type. - """ - if dtype is None: - return None - tname = dtype if isinstance(dtype, str) else dtype.name - if tname == "category": - return pa.dictionary(index_type=pa.int32(), value_type=pa.string()) - elif tname == "string" or tname == "object": - return pa.string() - else: - return pa.from_numpy_dtype(tname) - - @classmethod - def _prepare_pyarrow_usecols(cls, read_csv_kwargs): - """ - Define `usecols` parameter in the way PyArrow can process it. - - Parameters - ---------- - read_csv_kwargs : dict - Parameters of read_csv. - - Returns - ------- - list - Redefined `usecols` parameter. - """ - usecols = read_csv_kwargs["usecols"] - engine = read_csv_kwargs["engine"] - usecols_md, usecols_names_dtypes = cls._validate_usecols_arg(usecols) - if usecols_md: - empty_pd_df = pandas.read_csv( - **dict( - read_csv_kwargs, - nrows=0, - skipfooter=0, - usecols=None, - engine=None if engine == "arrow" else engine, - ) - ) - column_names = empty_pd_df.columns - if usecols_names_dtypes == "string": - if usecols_md.issubset(set(column_names)): - # columns should be sorted because pandas doesn't preserve columns order - usecols_md = [ - col_name for col_name in column_names if col_name in usecols_md - ] - else: - raise NotImplementedError( - "values passed in the `usecols` parameter don't match columns names" - ) - elif usecols_names_dtypes == "integer": - # columns should be sorted because pandas doesn't preserve columns order - usecols_md = sorted(usecols_md) - if len(column_names) < usecols_md[-1]: - raise NotImplementedError( - "max usecols value is higher than the number of columns" - ) - usecols_md = [column_names[i] for i in usecols_md] - elif callable(usecols_md): - usecols_md = [ - col_name for col_name in column_names if usecols_md(col_name) - ] - else: - raise NotImplementedError("unsupported `usecols` parameter") - - return usecols_md - - read_csv_unsup_defaults = {} - for k, v in inspect.signature(pandas.read_csv).parameters.items(): - if v.default is not inspect.Parameter.empty and k in unsupported_args: - read_csv_unsup_defaults[k] = v.default - - @classmethod - def _read_csv_check_support( - cls, - read_csv_kwargs: ReadCsvKwargsType, - ) -> Tuple[bool, str]: - """ - Check if passed parameters are supported by current ``modin.pandas.read_csv`` implementation. - - Parameters - ---------- - read_csv_kwargs : dict - Parameters of read_csv function. - - Returns - ------- - bool - Whether passed parameters are supported or not. - str - Error message that should be raised if user explicitly set `engine="arrow"`. 
- """ - filepath_or_buffer = read_csv_kwargs["filepath_or_buffer"] - header = read_csv_kwargs["header"] - names = read_csv_kwargs["names"] - engine = read_csv_kwargs["engine"] - skiprows = read_csv_kwargs["skiprows"] - delimiter = read_csv_kwargs["delimiter"] - parse_dates = read_csv_kwargs["parse_dates"] - - if read_csv_kwargs["compression"] != "infer": - return ( - False, - "read_csv with 'arrow' engine doesn't support explicit compression parameter, compression" - + " must be inferred automatically (supported compression types are gzip and bz2)", - ) - - if isinstance(filepath_or_buffer, str): - if not os.path.exists(filepath_or_buffer): - if cls.file_exists(filepath_or_buffer) or is_url(filepath_or_buffer): - return ( - False, - "read_csv with 'arrow' engine supports only local files", - ) - else: - raise FileNotFoundError("No such file or directory") - elif not cls.pathlib_or_pypath(filepath_or_buffer): - if hasattr(filepath_or_buffer, "read"): - return ( - False, - "read_csv with 'arrow' engine doesn't support file-like objects", - ) - else: - raise ValueError( - f"Invalid file path or buffer object type: {type(filepath_or_buffer)}" - ) - - if read_csv_kwargs.get("skipfooter") and read_csv_kwargs.get("nrows"): - return (False, "Exception is raised by pandas itself") - - for arg, def_value in cls.read_csv_unsup_defaults.items(): - if read_csv_kwargs[arg] != def_value: - return ( - False, - f"read_csv with 'arrow' engine doesn't support {arg} parameter", - ) - if delimiter is not None and read_csv_kwargs["delim_whitespace"] is True: - raise ValueError( - "Specified a delimiter with both sep and delim_whitespace=True; you can only specify one." - ) - - parse_dates_unsupported = isinstance(parse_dates, dict) or ( - isinstance(parse_dates, list) - and any(not isinstance(date, str) for date in parse_dates) - ) - if parse_dates_unsupported: - return ( - False, - ( - "read_csv with 'arrow' engine supports only bool and " - + "flattened list of string column names for the " - + "'parse_dates' parameter" - ), - ) - if names and names != lib.no_default: - if header not in [None, 0, "infer"]: - return ( - False, - "read_csv with 'arrow' engine and provided 'names' parameter supports only 0, None and " - + "'infer' header values", - ) - if isinstance(parse_dates, list) and not set(parse_dates).issubset(names): - missing_columns = set(parse_dates) - set(names) - raise ValueError( - f"Missing column provided to 'parse_dates': '{', '.join(missing_columns)}'" - ) - - empty_pandas_df = pandas.read_csv( - **dict( - read_csv_kwargs, - nrows=0, - skiprows=None, - skipfooter=0, - usecols=None, - index_col=None, - names=None, - parse_dates=None, - engine=None if engine == "arrow" else engine, - ), - ) - columns_number = len(empty_pandas_df.columns) - if columns_number != len(names): - return ( - False, - "read_csv with 'arrow' engine doesn't support names parameter, which length doesn't match " - + "with actual number of columns", - ) - else: - if header not in [0, "infer"]: - return ( - False, - "read_csv with 'arrow' engine without 'names' parameter provided supports only 0 and 'infer' " - + "header values", - ) - if isinstance(parse_dates, list): - empty_pandas_df = pandas.read_csv( - **dict( - read_csv_kwargs, - nrows=0, - skiprows=None, - skipfooter=0, - usecols=None, - index_col=None, - engine=None if engine == "arrow" else engine, - ), - ) - if not set(parse_dates).issubset(empty_pandas_df.columns): - raise ValueError("Missing column provided to 'parse_dates'") - - if not 
read_csv_kwargs["skip_blank_lines"]: - # in some corner cases empty lines are handled as '', - # while pandas handles it as NaNs - issue #3084 - return ( - False, - "read_csv with 'arrow' engine doesn't support skip_blank_lines = False parameter", - ) - - if skiprows is not None and not isinstance(skiprows, int): - return ( - False, - "read_csv with 'arrow' engine doesn't support non-integer skiprows parameter", - ) - - return True, None - - @classmethod - def _validate_read_csv_kwargs( - cls, - read_csv_kwargs: ReadCsvKwargsType, - ): - """ - Validate `read_csv` keyword arguments. - - Should be done to mimic `pandas.read_csv` behavior. - - Parameters - ---------- - read_csv_kwargs : dict - Parameters of `read_csv` function. - """ - delimiter = read_csv_kwargs["delimiter"] - sep = read_csv_kwargs["sep"] - on_bad_lines = read_csv_kwargs["on_bad_lines"] - delim_whitespace = read_csv_kwargs["delim_whitespace"] - - if delimiter and (sep is not lib.no_default): - raise ValueError( - "Specified a sep and a delimiter; you can only specify one." - ) - - # Alias sep -> delimiter. - if delimiter is None: - delimiter = sep - - if delim_whitespace is True and (delimiter is not lib.no_default): - raise ValueError( - "Specified a delimiter with both sep and " - + "delim_whitespace=True; you can only specify one." - ) - - if on_bad_lines not in ["error", "warn", "skip", None]: - raise ValueError(f"Argument {on_bad_lines} is invalid for on_bad_lines.") - - @classmethod - @_inherit_docstrings(BaseIO.to_csv, apilink="pandas.to_csv") - def to_csv(cls, qc, **kwargs): - df = qc._modin_frame - write_opts = pa.csv.WriteOptions(include_header=True, delimiter=",") - for key, value in kwargs.items(): - if value is None: - pass - elif key == "sep": - write_opts.delimiter = value - elif key == "chunksize": - write_opts.batch_size = value - elif not ( - (key == "na_rep" and len(value) == 0) - or (key == "decimal" and value == ".") - or (key == "quotechar" and value == '"') - or (key == "doublequote" and value is True) - or (key == "encoding" and value == "utf-8") - or (key == "lineterminator" and value == os.linesep) - or key - in ( - "path_or_buf", - "columns", - "header", - "index", - "index_label", - "mode", - "compression", - "errors", - "storage_options", - ) - ): - ErrorMessage.default_to_pandas(f"Argument {key}={value}") - return df.to_pandas().to_csv(**kwargs) - - at = df._execute() - if not isinstance(at, pa.Table): - return df.to_pandas().to_csv(**kwargs) - idx_names = df._index_cols - - if kwargs.get("index", True): - if idx_names is None: # Trivial index - idx_col = pa.array(range(len(df.index)), type=pa.int64()) - at = at.add_column(0, "", idx_col) - if (idx_names := kwargs.get("index_label", None)) is None: - idx_names = df.index.names - elif idx_names is False: - idx_names = [""] * len(df.index.names) - elif not is_list_like(idx_names): - idx_names = [idx_names] - idx_names = ["" if n is None else str(n) for n in idx_names] - at = at.rename_columns(idx_names + df.columns.tolist()) - elif idx_names is not None: - at = at.drop(idx_names) - at = at.rename_columns(df.columns.tolist()) - idx_names = None - else: - at = at.rename_columns(df.columns.tolist()) - - if (value := kwargs.get("columns", None)) is not None: - if idx_names is not None: - value = idx_names + value - at = at.select(value) - - if (value := kwargs.get("header", None)) is False: - write_opts.include_header = False - elif isinstance(value, list): - if idx_names is not None: - value = idx_names + value - at = at.rename_columns(value) - - def 
write_header(out): - # Using pandas to write the header, because pyarrow - # writes column names enclosed in double quotes. - if write_opts.include_header: - pdf = pandas.DataFrame(columns=at.column_names) - pdf.to_csv(out, sep=write_opts.delimiter, index=False) - write_opts.include_header = False - - if (path_or_buf := kwargs.get("path_or_buf", None)) is None: - out = pa.BufferOutputStream() - write_header(out) - pa.csv.write_csv(at, out, write_opts) - return out.getvalue().to_pybytes().decode() - - # Pyarrow fails to write in text mode. - mode = kwargs.get("mode", "w").replace("t", "") - if "b" not in mode: - mode += "b" - - with get_handle( - path_or_buf=path_or_buf, - mode=mode, - errors=kwargs.get("errors", "strict"), - compression=kwargs.get("compression", "infer"), - storage_options=kwargs.get("storage_options", None), - is_text=False, - ) as handles: - out = handles.handle - write_header(out) - pa.csv.write_csv(at, out, write_opts) - - @classmethod - @_inherit_docstrings(BaseIO.read_sql, apilink="pandas.read_sql") - def read_sql(cls, **kwargs): - impl = super(HdkOnNativeIO, cls) - varnames = impl.read_sql.__code__.co_varnames - filtered = {k: v for k, v in kwargs.items() if k in varnames} - if len(filtered) != len(kwargs): - if unsupported := { - k: v for k, v in kwargs.items() if k not in filtered and v is not None - }: - raise NotImplementedError(f"Unsupported arguments: {unsupported}") - return impl.read_sql(**filtered) diff --git a/modin/experimental/core/execution/native/implementations/hdk_on_native/partitioning/__init__.py b/modin/experimental/core/execution/native/implementations/hdk_on_native/partitioning/__init__.py deleted file mode 100644 index 70cf0a275c6..00000000000 --- a/modin/experimental/core/execution/native/implementations/hdk_on_native/partitioning/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -# Licensed to Modin Development Team under one or more contributor license agreements. -# See the NOTICE file distributed with this work for additional information regarding -# copyright ownership. The Modin Development Team licenses this file to you under the -# Apache License, Version 2.0 (the "License"); you may not use this file except in -# compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under -# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific language -# governing permissions and limitations under the License. - -"""Base Modin Dataframe classes related to its partitioning and optimized for HDK on Native execution.""" diff --git a/modin/experimental/core/execution/native/implementations/hdk_on_native/partitioning/partition.py b/modin/experimental/core/execution/native/implementations/hdk_on_native/partitioning/partition.py deleted file mode 100644 index 11b9482aa0d..00000000000 --- a/modin/experimental/core/execution/native/implementations/hdk_on_native/partitioning/partition.py +++ /dev/null @@ -1,198 +0,0 @@ -# Licensed to Modin Development Team under one or more contributor license agreements. -# See the NOTICE file distributed with this work for additional information regarding -# copyright ownership. The Modin Development Team licenses this file to you under the -# Apache License, Version 2.0 (the "License"); you may not use this file except in -# compliance with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under -# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific language -# governing permissions and limitations under the License. - -"""Module provides a partition class for ``HdkOnNativeDataframe`` frame.""" -from typing import Union - -import pandas -import pyarrow as pa -from pandas._typing import AnyArrayLike - -from modin.core.dataframe.pandas.partitioning.partition import PandasDataframePartition - -from ..dataframe.utils import ColNameCodec, arrow_to_pandas -from ..db_worker import DbTable - - -class HdkOnNativeDataframePartition(PandasDataframePartition): - """ - A partition of ``HdkOnNativeDataframe`` frame. - - Class holds either a ``DbTable`` or ``pandas.DataFrame`` or ``pyarrow.Table``. - - Parameters - ---------- - data : DbTable or pandas.DataFrame or pyarrow.Table - Partition data in either pandas or PyArrow format. - - Attributes - ---------- - _data : DbTable or pandas.DataFrame or pyarrow.Table - Partition data in either pandas or PyArrow format. - _length_cache : int - Length of the partition. - _width_cache : int - Width of the partition. - """ - - def __init__( - self, - data: Union[DbTable, pa.Table, pandas.DataFrame], - ): - super().__init__() - assert isinstance(data, (DbTable, pa.Table, pandas.DataFrame)) - self._data = data - - def to_pandas(self): - """ - Transform to pandas format. - - Returns - ------- - pandas.DataFrame - """ - obj = self.get() - if isinstance(obj, pandas.DataFrame): - return obj - if isinstance(obj, DbTable): - obj = obj.to_arrow() - return arrow_to_pandas(obj) - - def to_numpy(self, **kwargs): - """ - Transform to NumPy format. - - Parameters - ---------- - **kwargs : dict - Additional keyword arguments to be passed in ``to_numpy``. - - Returns - ------- - np.ndarray - """ - return self.to_pandas().to_numpy(**kwargs) - - def get(self, to_arrow: bool = False) -> Union[DbTable, pandas.DataFrame, pa.Table]: - """ - Get partition data. - - Parameters - ---------- - to_arrow : bool, default: False - Convert the data to ``pyarrow.Table``. - - Returns - ------- - ``DbTable`` or ``pandas.DataFrame`` or ``pyarrow.Table`` - """ - if to_arrow: - if isinstance(self._data, pandas.DataFrame): - self._data = pa.Table.from_pandas(self._data, preserve_index=False) - elif isinstance(self._data, DbTable): - return self._data.to_arrow() - return self._data - - @classmethod - def put(cls, obj): - """ - Create partition from ``DbTable`` or ``pandas.DataFrame`` or ``pyarrow.Table``. - - Parameters - ---------- - obj : DbTable or pandas.DataFrame or pyarrow.Table - Source frame. - - Returns - ------- - HdkOnNativeDataframePartition - The new partition. - """ - return cls(obj) - - def insert(self, idx: int, name: str, value: AnyArrayLike): - """ - Insert column into this raw partition. 
- - Parameters - ---------- - idx : int - name : str - value : AnyArrayLike - - Returns - ------- - tuple of HdkOnNativeDataframePartition, dtype - """ - data = self._data - name = ColNameCodec.encode(name) - - if isinstance(data, pandas.DataFrame): - data = data.copy(False) - data.insert(idx, name, value) - dtype = data.dtypes[idx] - elif isinstance(data, pa.Table): - try: - new_data = data.add_column(idx, name, [value]) - dtype = new_data.field(idx).type.to_pandas_dtype() - data = new_data - except Exception: - try: - df = pandas.DataFrame({name: value}) - at = pa.Table.from_pandas(df, preserve_index=False) - data = data.add_column(idx, at.field(0), at.column(0)) - dtype = df.dtypes[0] - except Exception as err: - raise NotImplementedError(repr(err)) - else: - raise NotImplementedError(f"Insertion into {type(data)}") - - return HdkOnNativeDataframePartition(data), dtype - - @property - def raw(self): - """ - True if the partition contains a raw data. - - The raw data is either ``pandas.DataFrame`` or ``pyarrow.Table``. - - Returns - ------- - bool - """ - return isinstance(self._data, (pandas.DataFrame, pa.Table)) - - @property - def _length_cache(self): - """ - Number of rows. - - Returns - ------- - int - """ - return len(self._data) - - @property - def _width_cache(self): - """ - Number of columns. - - Returns - ------- - int - """ - if isinstance(self._data, pa.Table): - return self._data.num_columns - else: - return self._data.shape[1] diff --git a/modin/experimental/core/execution/native/implementations/hdk_on_native/partitioning/partition_manager.py b/modin/experimental/core/execution/native/implementations/hdk_on_native/partitioning/partition_manager.py deleted file mode 100644 index 443c53a0388..00000000000 --- a/modin/experimental/core/execution/native/implementations/hdk_on_native/partitioning/partition_manager.py +++ /dev/null @@ -1,328 +0,0 @@ -# Licensed to Modin Development Team under one or more contributor license agreements. -# See the NOTICE file distributed with this work for additional information regarding -# copyright ownership. The Modin Development Team licenses this file to you under the -# Apache License, Version 2.0 (the "License"); you may not use this file except in -# compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under -# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific language -# governing permissions and limitations under the License. - -"""Module provides a partition manager class for ``HdkOnNativeDataframe`` frame.""" - -import re - -import numpy as np -import pandas -import pyarrow - -from modin.config import DoUseCalcite -from modin.core.dataframe.pandas.partitioning.partition_manager import ( - PandasDataframePartitionManager, -) -from modin.error_message import ErrorMessage -from modin.pandas.utils import is_scalar - -from ..calcite_builder import CalciteBuilder -from ..calcite_serializer import CalciteSerializer -from ..dataframe.utils import ColNameCodec, is_supported_arrow_type -from ..db_worker import DbTable, DbWorker -from ..partitioning.partition import HdkOnNativeDataframePartition - - -class HdkOnNativeDataframePartitionManager(PandasDataframePartitionManager): - """ - Frame manager for ``HdkOnNativeDataframe``. 
- - This class handles several features of ``HdkOnNativeDataframe``: - - frame always has a single partition - - frame cannot process some data types - - frame has to use mangling for index labels - - frame uses HDK storage format for execution - """ - - _partition_class = HdkOnNativeDataframePartition - - @classmethod - def from_pandas(cls, df, return_dims=False, encode_col_names=True): - """ - Build partitions from a ``pandas.DataFrame``. - - Parameters - ---------- - df : pandas.DataFrame - Source frame. - return_dims : bool, default: False - Include resulting dimensions into the returned value. - encode_col_names : bool, default: True - Encode column names. - - Returns - ------- - tuple - Tuple holding array of partitions, list of columns with unsupported - data and optionally partitions' dimensions. - """ - unsupported_cols = cls._get_unsupported_cols(df) - parts = np.array([[cls._partition_class(df)]]) - if not return_dims: - return parts, unsupported_cols - else: - return parts, [len(df)], [len(df.columns)], unsupported_cols - - @classmethod - def from_arrow( - cls, at, return_dims=False, unsupported_cols=None, encode_col_names=True - ): - """ - Build partitions from a ``pyarrow.Table``. - - Parameters - ---------- - at : pyarrow.Table - Input table. - return_dims : bool, default: False - True to include dimensions into returned tuple. - unsupported_cols : list of str, optional - List of columns holding unsupported data. If None then - check all columns to compute the list. - encode_col_names : bool, default: True - Encode column names. - - Returns - ------- - tuple - Tuple holding array of partitions, list of columns with unsupported - data and optionally partitions' dimensions. - """ - if encode_col_names: - encoded_names = [ColNameCodec.encode(n) for n in at.column_names] - encoded_at = at - if encoded_names != at.column_names: - encoded_at = at.rename_columns(encoded_names) - else: - encoded_at = at - - parts = np.array([[cls._partition_class(encoded_at)]]) - if unsupported_cols is None: - unsupported_cols = cls._get_unsupported_cols(at) - - if not return_dims: - return parts, unsupported_cols - else: - return parts, [at.num_rows], [at.num_columns], unsupported_cols - - @classmethod - def _get_unsupported_cols(cls, obj): - """ - Return a list of columns with unsupported by HDK data types. - - Parameters - ---------- - obj : pandas.DataFrame or pyarrow.Table - Object to inspect on unsupported column types. - - Returns - ------- - list - List of unsupported columns. - """ - if isinstance(obj, (pandas.Series, pandas.DataFrame)): - # picking first rows from cols with `dtype="object"` to check its actual type, - # in case of homogen columns that saves us unnecessary convertion to arrow table - - if obj.empty: - unsupported_cols = [] - elif isinstance(obj.columns, pandas.MultiIndex): - unsupported_cols = [str(c) for c in obj.columns] - else: - cols = [name for name, col in obj.dtypes.items() if col == "object"] - type_samples = obj.iloc[0][cols] - unsupported_cols = [ - name - for name, col in type_samples.items() - if not isinstance(col, str) - and not (is_scalar(col) and pandas.isna(col)) - ] - - if len(unsupported_cols) > 0: - return unsupported_cols - - try: - schema = pyarrow.Schema.from_pandas(obj, preserve_index=False) - except ( - pyarrow.lib.ArrowTypeError, - pyarrow.lib.ArrowInvalid, - ValueError, - TypeError, - ) as err: - # The TypeError could be raised when converting a sparse data to - # arrow table - https://github.com/apache/arrow/pull/4497. 
If this - # is the case - fall back to pandas, otherwise - rethrow the error. - if type(err) is TypeError: - if any([isinstance(t, pandas.SparseDtype) for t in obj.dtypes]): - ErrorMessage.single_warning( - "Sparse data is not currently supported!" - ) - else: - raise err - - # The ValueError is raised by pyarrow in case of duplicate columns. - # We catch and handle this error here. If there are no duplicates - # (is_unique is True), then the error is caused by something different - # and we just rethrow it. - if (type(err) is ValueError) and obj.columns.is_unique: - raise err - - regex = r"Conversion failed for column ([^\W]*)" - unsupported_cols = [] - for msg in err.args: - match = re.findall(regex, msg) - unsupported_cols.extend(match) - - if len(unsupported_cols) == 0: - unsupported_cols = obj.columns.tolist() - return unsupported_cols - else: - schema = obj.schema - - return [ - field.name for field in schema if not is_supported_arrow_type(field.type) - ] - - @classmethod - def run_exec_plan(cls, plan): - """ - Run execution plan in HDK storage format to materialize frame. - - Parameters - ---------- - plan : DFAlgNode - A root of an execution plan tree. - - Returns - ------- - np.array - Created frame's partitions. - """ - worker = DbWorker() - - # First step is to make sure all partitions are in HDK. - frames = plan.collect_frames() - for frame in frames: - cls.import_table(frame, worker) - - builder = CalciteBuilder() - calcite_plan = builder.build(plan) - calcite_json = CalciteSerializer().serialize(calcite_plan) - if DoUseCalcite.get(): - exec_calcite = True - calcite_json = "execute calcite " + calcite_json - else: - exec_calcite = False - exec_args = {} - if builder.has_groupby and not builder.has_join: - exec_args = {"enable_lazy_fetch": 0, "enable_columnar_output": 0} - elif not builder.has_groupby and builder.has_join: - exec_args = {"enable_lazy_fetch": 1, "enable_columnar_output": 1} - table = worker.executeRA(calcite_json, exec_calcite, **exec_args) - - res = np.empty((1, 1), dtype=np.dtype(object)) - res[0][0] = cls._partition_class(table) - - return res - - @classmethod - def import_table(cls, frame, worker=DbWorker()) -> DbTable: - """ - Import the frame's partition data, if required. - - Parameters - ---------- - frame : HdkOnNativeDataframe - worker : DbWorker, optional - - Returns - ------- - DbTable - """ - part = frame._partitions[0][0] - table = part.get(part.raw) - if isinstance(table, pyarrow.Table): - if table.num_columns == 0: - # Tables without columns are not supported. - # Creating an empty table with index columns only. - idx_names = ( - frame.index.names if frame.has_materialized_index else [None] - ) - idx_names = ColNameCodec.mangle_index_names(idx_names) - table = pyarrow.table( - {n: [] for n in idx_names}, - schema=pyarrow.schema({n: pyarrow.int64() for n in idx_names}), - ) - table = worker.import_arrow_table(table) - frame._partitions[0][0] = cls._partition_class(table) - return table - - @classmethod - def _names_from_index_cols(cls, cols): - """ - Get index labels. - - Deprecated. - - Parameters - ---------- - cols : list of str - Index columns. - - Returns - ------- - list of str - """ - if len(cols) == 1: - return cls._name_from_index_col(cols[0]) - return [cls._name_from_index_col(n) for n in cols] - - @classmethod - def _name_from_index_col(cls, col): - """ - Get index label. - - Deprecated. - - Parameters - ---------- - col : str - Index column. 
- - Returns - ------- - str - """ - if col.startswith(ColNameCodec.IDX_COL_NAME): - return None - return col - - @classmethod - def _maybe_scalar(cls, lst): - """ - Transform list with a single element to scalar. - - Deprecated. - - Parameters - ---------- - lst : list - Input list. - - Returns - ------- - Any - """ - if len(lst) == 1: - return lst[0] - return lst diff --git a/modin/experimental/core/storage_formats/hdk/__init__.py b/modin/experimental/core/storage_formats/hdk/__init__.py deleted file mode 100644 index 4d7d5a91a2f..00000000000 --- a/modin/experimental/core/storage_formats/hdk/__init__.py +++ /dev/null @@ -1,18 +0,0 @@ -# Licensed to Modin Development Team under one or more contributor license agreements. -# See the NOTICE file distributed with this work for additional information regarding -# copyright ownership. The Modin Development Team licenses this file to you under the -# Apache License, Version 2.0 (the "License"); you may not use this file except in -# compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under -# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific language -# governing permissions and limitations under the License. - -"""Experimental query compiler for the HDK storage format.""" - -from .query_compiler import DFAlgQueryCompiler - -__all__ = ["DFAlgQueryCompiler"] diff --git a/modin/experimental/core/storage_formats/hdk/query_compiler.py b/modin/experimental/core/storage_formats/hdk/query_compiler.py deleted file mode 100644 index 42344016c5e..00000000000 --- a/modin/experimental/core/storage_formats/hdk/query_compiler.py +++ /dev/null @@ -1,919 +0,0 @@ -# Licensed to Modin Development Team under one or more contributor license agreements. -# See the NOTICE file distributed with this work for additional information regarding -# copyright ownership. The Modin Development Team licenses this file to you under the -# Apache License, Version 2.0 (the "License"); you may not use this file except in -# compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under -# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific language -# governing permissions and limitations under the License. - -""" -Module contains ``DFAlgQueryCompiler`` class. - -``DFAlgQueryCompiler`` is used for lazy DataFrame Algebra based engine. 
-""" - -from functools import wraps - -import numpy as np -import pandas -from pandas._libs.lib import no_default -from pandas.core.common import is_bool_indexer -from pandas.core.dtypes.common import is_bool_dtype, is_integer_dtype - -from modin.core.storage_formats import BaseQueryCompiler -from modin.core.storage_formats.base.query_compiler import ( - _get_axis as default_axis_getter, -) -from modin.core.storage_formats.base.query_compiler import ( - _set_axis as default_axis_setter, -) -from modin.core.storage_formats.pandas.query_compiler import PandasQueryCompiler -from modin.error_message import ErrorMessage -from modin.utils import MODIN_UNNAMED_SERIES_LABEL, _inherit_docstrings - - -def is_inoperable(value): - """ - Check if value cannot be processed by HDK engine. - - Parameters - ---------- - value : any - A value to check. - - Returns - ------- - bool - """ - if isinstance(value, (tuple, list)): - result = False - for val in value: - result = result or is_inoperable(val) - return result - elif isinstance(value, dict): - return is_inoperable(list(value.values())) - else: - value = getattr(value, "_query_compiler", value) - if hasattr(value, "_modin_frame"): - return value._modin_frame._has_unsupported_data - return False - - -def build_method_wrapper(name, method): - """ - Build method wrapper to handle inoperable data types. - - Wrapper calls the original method if all its arguments can be processed - by HDK engine and fallback to parent's method otherwise. - - Parameters - ---------- - name : str - Parent's method name to fallback to. - method : callable - A method to wrap. - - Returns - ------- - callable - """ - - @wraps(method) - def method_wrapper(self, *args, **kwargs): - # If the method wasn't found in the parent query compiler that means, - # that we're calling one that is HDK-specific, if we intend - # to fallback to pandas on 'NotImplementedError' then the call of this - # private method is caused by some public QC method, so we catch - # the exception here and do fallback properly - default_method = getattr(super(type(self), self), name, None) - if is_inoperable([self, args, kwargs]): - if default_method is None: - raise NotImplementedError("Frame contains data of unsupported types.") - return default_method(*args, **kwargs) - try: - return method(self, *args, **kwargs) - # Defaulting to pandas if `NotImplementedError` was arisen - except NotImplementedError as err: - if default_method is None: - raise err - ErrorMessage.default_to_pandas(message=str(err)) - return default_method(*args, **kwargs) - - return method_wrapper - - -def bind_wrappers(cls): - """ - Wrap class methods. - - Decorator allows to fallback to the parent query compiler methods when unsupported - data types are used in a frame. - - Returns - ------- - class - """ - exclude = set( - [ - "__init__", - "to_pandas", - "from_pandas", - "from_arrow", - "default_to_pandas", - "_get_index", - "_set_index", - "_get_columns", - "_set_columns", - ] - ) - for name, method in cls.__dict__.items(): - if name in exclude: - continue - - if callable(method): - setattr( - cls, - name, - build_method_wrapper(name, method), - ) - - return cls - - -@bind_wrappers -@_inherit_docstrings(BaseQueryCompiler) -class DFAlgQueryCompiler(BaseQueryCompiler): - """ - Query compiler for the HDK storage format. 
- - This class doesn't perform much processing and mostly forwards calls to - :py:class:`~modin.experimental.core.execution.native.implementations.hdk_on_native.dataframe.dataframe.HdkOnNativeDataframe` - for lazy execution trees build. - - Parameters - ---------- - frame : HdkOnNativeDataframe - Modin Frame to query with the compiled queries. - shape_hint : {"row", "column", None}, default: None - Shape hint for frames known to be a column or a row, otherwise None. - - Attributes - ---------- - _modin_frame : HdkOnNativeDataframe - Modin Frame to query with the compiled queries. - _shape_hint : {"row", "column", None} - Shape hint for frames known to be a column or a row, otherwise None. - """ - - lazy_execution = True - - def __init__(self, frame, shape_hint=None): - assert frame is not None - self._modin_frame = frame - if shape_hint is None and len(self._modin_frame.columns) == 1: - shape_hint = "column" - self._shape_hint = shape_hint - - def finalize(self): - # TODO: implement this for HDK storage format - raise NotImplementedError() - - def execute(self): - self._modin_frame._execute() - - def force_import(self): - """Force table import.""" - # HDK-specific method - self._modin_frame.force_import() - - def support_materialization_in_worker_process(self) -> bool: - return True - - def to_pandas(self): - return self._modin_frame.to_pandas() - - @classmethod - def from_pandas(cls, df, data_cls): - if len(df.columns) == 1: - shape_hint = "column" - elif len(df) == 1: - shape_hint = "row" - else: - shape_hint = None - return cls(data_cls.from_pandas(df), shape_hint=shape_hint) - - @classmethod - def from_arrow(cls, at, data_cls): - if len(at.columns) == 1: - shape_hint = "column" - elif len(at) == 1: - shape_hint = "row" - else: - shape_hint = None - return cls(data_cls.from_arrow(at), shape_hint=shape_hint) - - # Dataframe exchange protocol - - def to_dataframe(self, nan_as_null: bool = False, allow_copy: bool = True): - return self._modin_frame.__dataframe__( - nan_as_null=nan_as_null, allow_copy=allow_copy - ) - - @classmethod - def from_dataframe(cls, df, data_cls): - return cls(data_cls.from_dataframe(df)) - - # END Dataframe exchange protocol - - default_to_pandas = PandasQueryCompiler.default_to_pandas - - def copy(self): - return self.__constructor__(self._modin_frame.copy(), self._shape_hint) - - def getitem_column_array(self, key, numeric=False, ignore_order=False): - shape_hint = "column" if len(key) == 1 else None - if numeric: - new_modin_frame = self._modin_frame.take_2d_labels_or_positional( - col_positions=key - ) - else: - new_modin_frame = self._modin_frame.take_2d_labels_or_positional( - col_labels=key - ) - return self.__constructor__(new_modin_frame, shape_hint) - - def getitem_array(self, key): - if isinstance(key, type(self)): - new_modin_frame = self._modin_frame.filter(key._modin_frame) - return self.__constructor__(new_modin_frame, self._shape_hint) - - if is_bool_indexer(key): - return self.default_to_pandas(lambda df: df[key]) - - if any(k not in self.columns for k in key): - raise KeyError( - "{} not index".format( - str([k for k in key if k not in self.columns]).replace(",", "") - ) - ) - return self.getitem_column_array(key) - - # Merge - - def merge(self, right, **kwargs): - on = kwargs.get("on", None) - left_on = kwargs.get("left_on", None) - right_on = kwargs.get("right_on", None) - left_index = kwargs.get("left_index", False) - right_index = kwargs.get("right_index", False) - """Only non-index joins with explicit 'on' are supported""" - if left_index is 
False and right_index is False: - if left_on is None and right_on is None: - if on is None: - on = [c for c in self.columns if c in right.columns] - left_on = on - right_on = on - - if not isinstance(left_on, list): - left_on = [left_on] - if not isinstance(right_on, list): - right_on = [right_on] - - how = kwargs.get("how", "inner") - sort = kwargs.get("sort", False) - suffixes = kwargs.get("suffixes", None) - return self.__constructor__( - self._modin_frame.join( - right._modin_frame, - how=how, - left_on=left_on, - right_on=right_on, - sort=sort, - suffixes=suffixes, - ) - ) - else: - return self.default_to_pandas(pandas.DataFrame.merge, right, **kwargs) - - def take_2d_positional(self, index=None, columns=None): - return self.__constructor__( - self._modin_frame.take_2d_labels_or_positional( - row_positions=index, col_positions=columns - ) - ) - - def groupby_size( - self, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, - ): - # Grouping on empty frame or on index level. - if len(self.columns) == 0: - raise NotImplementedError( - "Grouping on empty frame or on index level is not yet implemented." - ) - - groupby_kwargs = groupby_kwargs.copy() - as_index = groupby_kwargs.get("as_index", True) - # Setting 'as_index' to True to avoid 'by' and 'agg' columns naming conflict - groupby_kwargs["as_index"] = True - new_frame = self._modin_frame.groupby_agg( - by, - axis, - {self._modin_frame.columns[0]: "size"}, - groupby_kwargs, - agg_args=agg_args, - agg_kwargs=agg_kwargs, - drop=drop, - ) - if as_index: - shape_hint = "column" - new_frame = new_frame._set_columns([MODIN_UNNAMED_SERIES_LABEL]) - else: - shape_hint = None - new_frame = new_frame._set_columns(["size"]).reset_index(drop=False) - return self.__constructor__(new_frame, shape_hint=shape_hint) - - def groupby_sum(self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False): - new_frame = self._modin_frame.groupby_agg( - by, - axis, - "sum", - groupby_kwargs, - agg_args=agg_args, - agg_kwargs=agg_kwargs, - drop=drop, - ) - return self.__constructor__(new_frame) - - def groupby_count(self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False): - new_frame = self._modin_frame.groupby_agg( - by, - axis, - "count", - groupby_kwargs, - agg_args=agg_args, - agg_kwargs=agg_kwargs, - drop=drop, - ) - return self.__constructor__(new_frame) - - def groupby_agg( - self, - by, - agg_func, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - how="axis_wise", - drop=False, - series_groupby=False, - ): - # TODO: handle `drop` args - if callable(agg_func): - raise NotImplementedError( - "Python callable is not a valid aggregation function for HDK storage format." - ) - if how != "axis_wise": - raise NotImplementedError( - f"'{how}' type of groupby-aggregation functions is not supported for HDK storage format." - ) - - new_frame = self._modin_frame.groupby_agg( - by, - axis, - agg_func, - groupby_kwargs, - agg_args=agg_args, - agg_kwargs=agg_kwargs, - drop=drop, - ) - return self.__constructor__(new_frame) - - def count(self, **kwargs): - return self._agg("count", **kwargs) - - def max(self, **kwargs): - return self._agg("max", **kwargs) - - def min(self, **kwargs): - return self._agg("min", **kwargs) - - def sum(self, **kwargs): - min_count = kwargs.pop("min_count", 0) - if min_count != 0: - raise NotImplementedError( - f"HDK's sum does not support such set of parameters: min_count={min_count}." 
- ) - _check_int_or_float("sum", self.dtypes) - return self._agg("sum", **kwargs) - - def mean(self, **kwargs): - _check_int_or_float("mean", self.dtypes) - return self._agg("mean", **kwargs) - - def nunique(self, axis=0, dropna=True): - if axis != 0 or not dropna: - raise NotImplementedError( - f"HDK's nunique does not support such set of parameters: axis={axis}, dropna={dropna}." - ) - return self._agg("nunique") - - def _agg(self, agg, axis=0, level=None, **kwargs): - """ - Perform specified aggregation along rows/columns. - - Parameters - ---------- - agg : str - Name of the aggregation function to perform. - axis : {0, 1}, default: 0 - Axis to perform aggregation along. 0 is to apply function against each column, - all the columns will be reduced into a single scalar. 1 is to aggregate - across rows. - *Note:* HDK storage format supports aggregation for 0 axis only, aggregation - along rows will be defaulted to pandas. - level : None, default: None - Serves the compatibility purpose, always have to be None. - **kwargs : dict - Additional parameters to pass to the aggregation function. - - Returns - ------- - DFAlgQueryCompiler - New single-column (``axis=1``) or single-row (``axis=0``) query compiler containing - the result of aggregation. - """ - if level is not None or axis != 0: - raise NotImplementedError( - "HDK's aggregation functions does not support 'level' and 'axis' parameters." - ) - - # TODO: Do filtering on numeric columns if `numeric_only=True` - if not kwargs.get("skipna", True) or kwargs.get("numeric_only"): - raise NotImplementedError( - "HDK's aggregation functions does not support 'skipna' and 'numeric_only' parameters." - ) - # Processed above, so can be omitted - kwargs.pop("skipna", None) - kwargs.pop("numeric_only", None) - - new_frame = self._modin_frame.agg(agg) - new_frame = new_frame._set_index( - pandas.Index.__new__( - pandas.Index, data=[MODIN_UNNAMED_SERIES_LABEL], dtype="O" - ) - ) - return self.__constructor__(new_frame, shape_hint="row") - - def _get_index(self): - """ - Return frame's index. - - Returns - ------- - pandas.Index - """ - if self._modin_frame._has_unsupported_data: - return default_axis_getter(0)(self) - return self._modin_frame.index - - def _set_index(self, index): - """ - Set new index. - - Parameters - ---------- - index : pandas.Index - A new index. - """ - # NotImplementedError: HdkOnNativeDataframe._set_index is not yet suported - default_axis_setter(0)(self, index) - - def _get_columns(self): - """ - Return frame's columns. - - Returns - ------- - pandas.Index - """ - if self._modin_frame._has_unsupported_data: - return default_axis_getter(1)(self) - return self._modin_frame.columns - - def _set_columns(self, columns): - """ - Set new columns. - - Parameters - ---------- - columns : list-like - New columns. 
- """ - if self._modin_frame._has_unsupported_data: - default_axis_setter(1)(self, columns) - else: - try: - self._modin_frame = self._modin_frame._set_columns(columns) - except NotImplementedError: - default_axis_setter(1)(self, columns) - self._modin_frame._has_unsupported_data = True - - def fillna( - self, - squeeze_self=False, - squeeze_value=False, - value=None, - method=None, - axis=None, - inplace=False, - limit=None, - downcast=None, - ): - assert not inplace, "inplace=True should be handled on upper level" - - if ( - isinstance(value, dict) - and len(self._modin_frame.columns) == 1 - and self._modin_frame.columns[0] == MODIN_UNNAMED_SERIES_LABEL - ): - raise NotImplementedError("Series fillna with dict value") - - new_frame = self._modin_frame.fillna( - value=value, - method=method, - axis=axis, - limit=limit, - downcast=downcast, - ) - return self.__constructor__(new_frame, self._shape_hint) - - def concat(self, axis, other, **kwargs): - if not isinstance(other, list): - other = [other] - assert all( - isinstance(o, type(self)) for o in other - ), "Different Manager objects are being used. This is not allowed" - sort = kwargs.get("sort", False) - if sort is None: - raise ValueError( - "The 'sort' keyword only accepts boolean values; None was passed." - ) - join = kwargs.get("join", "outer") - ignore_index = kwargs.get("ignore_index", False) - other_modin_frames = [o._modin_frame for o in other] - - new_modin_frame = self._modin_frame.concat( - axis, other_modin_frames, join=join, sort=sort, ignore_index=ignore_index - ) - return self.__constructor__(new_modin_frame) - - def drop(self, index=None, columns=None, errors: str = "raise"): - if index is not None: - # Only column drop is supported by the HDK engine - raise NotImplementedError("Row drop") - if errors != "raise": - raise NotImplementedError( - "This lazy query compiler will always " - + "raise an error on invalid columns." - ) - - columns = self.columns.drop(columns) - new_frame = self._modin_frame.take_2d_labels_or_positional( - row_labels=index, col_labels=columns - ) - - # If all columns are dropped and the index is trivial, we are - # not able to restore it, since we don't know the number of rows. - # In this case, we copy the index from the current frame. - if len(columns) == 0 and new_frame._index_cols is None: - assert index is None, "Can't copy old indexes as there was a row drop" - new_frame.set_index_cache(self._modin_frame.index.copy()) - - return self.__constructor__(new_frame) - - def dropna(self, axis=0, how=no_default, thresh=no_default, subset=None): - if thresh is not no_default or axis != 0: - raise NotImplementedError( - "HDK's dropna does not support 'thresh' and 'axis' parameters." 
- ) - - if subset is None: - subset = self.columns - if how is no_default: - how = "any" - return self.__constructor__( - self._modin_frame.dropna(subset=subset, how=how), - shape_hint=self._shape_hint, - ) - - def isna(self): - return self.__constructor__(self._modin_frame.isna(invert=False)) - - def notna(self): - return self.__constructor__(self._modin_frame.isna(invert=True)) - - def invert(self): - return self.__constructor__(self._modin_frame.invert()) - - def dt_year(self): - return self.__constructor__( - self._modin_frame.dt_extract("year"), self._shape_hint - ) - - def dt_month(self): - return self.__constructor__( - self._modin_frame.dt_extract("month"), self._shape_hint - ) - - def dt_day(self): - return self.__constructor__( - self._modin_frame.dt_extract("day"), self._shape_hint - ) - - def dt_hour(self): - return self.__constructor__( - self._modin_frame.dt_extract("hour"), self._shape_hint - ) - - def dt_minute(self): - return self.__constructor__( - self._modin_frame.dt_extract("minute"), self._shape_hint - ) - - def dt_second(self): - return self.__constructor__( - self._modin_frame.dt_extract("second"), self._shape_hint - ) - - def dt_microsecond(self): - return self.__constructor__( - self._modin_frame.dt_extract("microsecond"), self._shape_hint - ) - - def dt_nanosecond(self): - return self.__constructor__( - self._modin_frame.dt_extract("nanosecond"), self._shape_hint - ) - - def dt_quarter(self): - return self.__constructor__( - self._modin_frame.dt_extract("quarter"), self._shape_hint - ) - - def dt_dayofweek(self): - return self.__constructor__( - self._modin_frame.dt_extract("isodow"), self._shape_hint - ) - - def dt_weekday(self): - return self.__constructor__( - self._modin_frame.dt_extract("isodow"), self._shape_hint - ) - - def dt_dayofyear(self): - return self.__constructor__( - self._modin_frame.dt_extract("doy"), self._shape_hint - ) - - def _bin_op(self, other, op_name, **kwargs): - """ - Perform a binary operation on a frame. - - Parameters - ---------- - other : any - The second operand. - op_name : str - Operation name. - **kwargs : dict - Keyword args. - - Returns - ------- - DFAlgQueryCompiler - A new query compiler. 
- """ - level = kwargs.get("level", None) - if level is not None: - return getattr(super(), op_name)(other=other, op_name=op_name, **kwargs) - - if isinstance(other, DFAlgQueryCompiler): - shape_hint = ( - self._shape_hint if self._shape_hint == other._shape_hint else None - ) - other = other._modin_frame - else: - shape_hint = self._shape_hint - - new_modin_frame = self._modin_frame.bin_op(other, op_name, **kwargs) - return self.__constructor__(new_modin_frame, shape_hint) - - def add(self, other, **kwargs): - return self._bin_op(other, "add", **kwargs) - - def sub(self, other, **kwargs): - return self._bin_op(other, "sub", **kwargs) - - def mul(self, other, **kwargs): - return self._bin_op(other, "mul", **kwargs) - - def pow(self, other, **kwargs): - return self._bin_op(other, "pow", **kwargs) - - def mod(self, other, **kwargs): - def check_int(obj): - if isinstance(obj, DFAlgQueryCompiler): - cond = all(is_integer_dtype(t) for t in obj._modin_frame.dtypes) - elif isinstance(obj, list): - cond = all(isinstance(i, int) for i in obj) - else: - cond = isinstance(obj, int) - if not cond: - raise NotImplementedError("Non-integer operands in modulo operation") - - check_int(self) - check_int(other) - return self._bin_op(other, "mod", **kwargs) - - def floordiv(self, other, **kwargs): - return self._bin_op(other, "floordiv", **kwargs) - - def truediv(self, other, **kwargs): - return self._bin_op(other, "truediv", **kwargs) - - def eq(self, other, **kwargs): - return self._bin_op(other, "eq", **kwargs) - - def ge(self, other, **kwargs): - return self._bin_op(other, "ge", **kwargs) - - def gt(self, other, **kwargs): - return self._bin_op(other, "gt", **kwargs) - - def le(self, other, **kwargs): - return self._bin_op(other, "le", **kwargs) - - def lt(self, other, **kwargs): - return self._bin_op(other, "lt", **kwargs) - - def ne(self, other, **kwargs): - return self._bin_op(other, "ne", **kwargs) - - def __and__(self, other, **kwargs): - return self._bool_op(other, "and", **kwargs) - - def __or__(self, other, **kwargs): - return self._bool_op(other, "or", **kwargs) - - def _bool_op(self, other, op, **kwargs): # noqa: GL08 - def check_bool(obj): - if isinstance(obj, DFAlgQueryCompiler): - cond = all(is_bool_dtype(t) for t in obj._modin_frame.dtypes) - elif isinstance(obj, list): - cond = all(isinstance(i, bool) for i in obj) - else: - cond = isinstance(obj, bool) - if not cond: - raise NotImplementedError("Non-boolean operands in logic operation") - - check_bool(self) - check_bool(other) - return self._bin_op(other, op, **kwargs) - - def reset_index(self, **kwargs): - level = kwargs.get("level", None) - if level is not None: - raise NotImplementedError( - "HDK's reset_index does not support 'level' parameter." - ) - - drop = kwargs.get("drop", False) - shape_hint = self._shape_hint if drop else None - - return self.__constructor__( - self._modin_frame.reset_index(drop), shape_hint=shape_hint - ) - - def astype(self, col_dtypes, errors: str = "raise"): - if errors != "raise": - raise NotImplementedError( - "This lazy query compiler will always " - + "raise an error on invalid type keys." - ) - return self.__constructor__( - self._modin_frame.astype(col_dtypes), - shape_hint=self._shape_hint, - ) - - def setitem(self, axis, key, value): - if axis == 1 or not isinstance(value, type(self)): - raise NotImplementedError( - f"HDK's setitem does not support such set of parameters: axis={axis}, value={value}." 
- ) - return self._setitem(axis, key, value) - - _setitem = PandasQueryCompiler._setitem - - def insert(self, loc, column, value): - if isinstance(value, type(self)): - value.columns = [column] - return self.insert_item(axis=1, loc=loc, value=value) - return self.__constructor__(self._modin_frame.insert(loc, column, value)) - - def sort_rows_by_column_values(self, columns, ascending=True, **kwargs): - if kwargs.get("key", None) is not None: - raise NotImplementedError("Sort with key function") - - ignore_index = kwargs.get("ignore_index", False) - na_position = kwargs.get("na_position", "last") - return self.__constructor__( - self._modin_frame.sort_rows(columns, ascending, ignore_index, na_position), - self._shape_hint, - ) - - def columnarize(self): - if self._shape_hint == "column": - assert len(self.columns) == 1, "wrong shape hint" - return self - - if self._shape_hint == "row": - # It is OK to trigger execution here because we cannot - # transpose in HDK anyway. - assert len(self.index) == 1, "wrong shape hint" - return self.transpose() - - if len(self.columns) != 1 or ( - len(self.index) == 1 and self.index[0] == MODIN_UNNAMED_SERIES_LABEL - ): - res = self.transpose() - res._shape_hint = "column" - return res - - self._shape_hint = "column" - return self - - def is_series_like(self): - if self._shape_hint is not None: - return True - return len(self.columns) == 1 or len(self.index) == 1 - - def cat_codes(self): - return self.__constructor__(self._modin_frame.cat_codes(), self._shape_hint) - - def has_multiindex(self, axis=0): - if axis == 0: - return self._modin_frame.has_multiindex() - assert axis == 1 - return isinstance(self.columns, pandas.MultiIndex) - - def get_index_name(self, axis=0): - return self.columns.name if axis else self._modin_frame.get_index_name() - - def set_index_name(self, name, axis=0): - if axis == 0: - self._modin_frame = self._modin_frame.set_index_name(name) - else: - self.columns.name = name - - def get_index_names(self, axis=0): - return self.columns.names if axis else self._modin_frame.get_index_names() - - def set_index_names(self, names=None, axis=0): - if axis == 0: - self._modin_frame = self._modin_frame.set_index_names(names) - else: - self.columns.names = names - - def free(self): - return - - index = property(_get_index, _set_index) - columns = property(_get_columns, _set_columns) - - @property - def dtypes(self): - return self._modin_frame.dtypes - - -# "?" is the boolean type code. -_SUPPORTED_NUM_TYPE_CODES = set( - np.typecodes["AllInteger"] + np.typecodes["Float"] + "?" -) - {np.dtype(np.float16).char} - - -def _check_int_or_float(op, dtypes): # noqa: GL08 - for t in dtypes: - if not isinstance(t, np.dtype) or t.char not in _SUPPORTED_NUM_TYPE_CODES: - raise NotImplementedError(f"Operation '{op}' on type '{t.name}'") diff --git a/modin/experimental/sql/__init__.py b/modin/experimental/sql/__init__.py deleted file mode 100644 index 0aea01598fb..00000000000 --- a/modin/experimental/sql/__init__.py +++ /dev/null @@ -1,46 +0,0 @@ -# Licensed to Modin Development Team under one or more contributor license agreements. -# See the NOTICE file distributed with this work for additional information regarding -# copyright ownership. The Modin Development Team licenses this file to you under the -# Apache License, Version 2.0 (the "License"); you may not use this file except in -# compliance with the License. 
You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software distributed under
-# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
-# ANY KIND, either express or implied. See the License for the specific language
-# governing permissions and limitations under the License.
-
-import modin.config as cfg
-import modin.pandas as pd
-
-_query_impl = None
-
-
-def query(sql: str, *args, **kwargs) -> pd.DataFrame:
-    """
-    Execute SQL query using HDK engine.
-
-    Parameters
-    ----------
-    sql : str
-        SQL query to be executed.
-    *args : *tuple
-        Positional arguments, passed to the execution engine.
-    **kwargs : **dict
-        Keyword arguments, passed to the execution engine.
-
-    Returns
-    -------
-    modin.pandas.DataFrame
-        Execution result.
-    """
-    global _query_impl
-
-    if _query_impl is None:
-        if cfg.StorageFormat.get() == "Hdk":
-            from modin.experimental.sql.hdk.query import hdk_query as _query_impl
-        else:
-            raise NotImplementedError
-
-    return _query_impl(sql, *args, **kwargs)
diff --git a/modin/experimental/sql/hdk/__init__.py b/modin/experimental/sql/hdk/__init__.py
deleted file mode 100644
index 31de5addb64..00000000000
--- a/modin/experimental/sql/hdk/__init__.py
+++ /dev/null
@@ -1,14 +0,0 @@
-# Licensed to Modin Development Team under one or more contributor license agreements.
-# See the NOTICE file distributed with this work for additional information regarding
-# copyright ownership. The Modin Development Team licenses this file to you under the
-# Apache License, Version 2.0 (the "License"); you may not use this file except in
-# compliance with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software distributed under
-# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
-# ANY KIND, either express or implied. See the License for the specific language
-# governing permissions and limitations under the License.
-
-"""Implementation of HDK SQL functionality."""
diff --git a/modin/experimental/sql/hdk/query.py b/modin/experimental/sql/hdk/query.py
deleted file mode 100644
index 9a3ec8effa2..00000000000
--- a/modin/experimental/sql/hdk/query.py
+++ /dev/null
@@ -1,123 +0,0 @@
-# Licensed to Modin Development Team under one or more contributor license agreements.
-# See the NOTICE file distributed with this work for additional information regarding
-# copyright ownership. The Modin Development Team licenses this file to you under the
-# Apache License, Version 2.0 (the "License"); you may not use this file except in
-# compliance with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software distributed under
-# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
-# ANY KIND, either express or implied. See the License for the specific language
-# governing permissions and limitations under the License.
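# ---------------------------------------------------------------------------
# Illustrative usage sketch, for context on the query() entry point removed
# above. It assumes a Modin release that still shipped the HDK storage format
# together with a working pyhdk installation; it will not run against the tree
# produced by this change.
import modin.config as cfg

cfg.StorageFormat.put("Hdk")  # must be selected before the first query

import modin.pandas as pd
from modin.experimental.sql import query

student = pd.DataFrame({"id": [1, 2, 3], "first_name": ["James", "Peter", "Claus"]})
# Frames are passed as keyword arguments and referenced by name in the SQL text;
# query() lazily resolves to hdk_query() on the first call and forwards them.
res = query("SELECT first_name FROM student WHERE id > 1", student=student)
print(res)
# ---------------------------------------------------------------------------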
- -import pyarrow as pa -from pandas.core.dtypes.common import _get_dtype - -import modin.pandas as pd -from modin.experimental.core.execution.native.implementations.hdk_on_native.dataframe.utils import ( - ColNameCodec, -) -from modin.experimental.core.execution.native.implementations.hdk_on_native.hdk_worker import ( - HdkWorker, -) -from modin.experimental.core.storage_formats.hdk import DFAlgQueryCompiler -from modin.pandas.utils import from_arrow - - -def hdk_query(query: str, **kwargs) -> pd.DataFrame: - """ - Execute SQL queries on the HDK backend. - - DataFrames are referenced in the query by names and are - passed to this function as name=value arguments. - - Here is an example of a query to three data frames: - - ids = [1, 2, 3] - first_names = ["James", "Peter", "Claus"] - last_names = ["Bond", "Pan", "Santa"] - courses_names = ["Mathematics", "Physics", "Geography"] - student = pd.DataFrame({"id": ids, "first_name": first_names, "last_name": last_names}) - course = pd.DataFrame({"id": ids, "course_name": courses_names}) - student_course = pd.DataFrame({"student_id": ids, "course_id": [3, 2, 1]}) - query = ''' - SELECT - student.first_name, - student.last_name, - course.course_name - FROM student - JOIN student_course - ON student.id = student_course.student_id - JOIN course - ON course.id = student_course.course_id - ORDER BY - last_name - ''' - res = hdk_query(query, student=student, course=course, student_course=student_course) - print(res) - - Parameters - ---------- - query : str - SQL query to be executed. - **kwargs : **dict - DataFrames referenced by the query. - - Returns - ------- - modin.pandas.DataFrame - Execution result. - """ - if len(kwargs) > 0: - query = _build_query(query, kwargs) - table = HdkWorker().executeDML(query) - df = from_arrow(table.to_arrow()) - mdf = df._query_compiler._modin_frame - schema = mdf._partitions[0][0].get().schema - # HDK returns strings as dictionary. For the proper conversion to - # Pandas, we need to replace dtypes of the corresponding columns. - if replace := [ - i for i, col in enumerate(schema) if pa.types.is_dictionary(col.type) - ]: - dtypes = mdf._dtypes - obj_type = _get_dtype(object) - for i in replace: - dtypes[i] = obj_type - return df - - -def _build_query(query: str, frames: dict) -> str: - """ - Build query to be executed. - - Table and column names are mapped to the real names - using the WITH statement. - - Parameters - ---------- - query : str - SQL query to be processed. - frames : dict - DataFrames referenced by the query. - - Returns - ------- - str - SQL query to be executed. 
- """ - alias = [] - for name, df in frames.items(): - assert isinstance(df._query_compiler, DFAlgQueryCompiler) - mf = df._query_compiler._modin_frame - table = mf.force_import() - alias.append("WITH " if len(alias) == 0 else "\n),\n") - alias.extend((name, " AS (\n", " SELECT\n")) - - for i, col in enumerate(table.column_names): - alias.append(" " if i == 0 else ",\n ") - alias.extend(('"', col, '"', " AS ", '"', ColNameCodec.decode(col), '"')) - alias.extend(("\n FROM\n ", table.name)) - - alias.extend(("\n)\n", query)) - return "".join(alias) diff --git a/modin/pandas/__init__.py b/modin/pandas/__init__.py index 068fbd11acb..01f83ce9a20 100644 --- a/modin/pandas/__init__.py +++ b/modin/pandas/__init__.py @@ -116,45 +116,11 @@ def _update_engine(publisher: Parameter): # Set this so that Pandas doesn't try to multithread by itself os.environ["OMP_NUM_THREADS"] = "1" - sfmt = StorageFormat.get() - - if sfmt == "Hdk": - is_hdk = True - elif sfmt == "Omnisci": - is_hdk = True - StorageFormat.put("Hdk") - warnings.warn( - "The OmniSci storage format has been deprecated. Please use " - + '`StorageFormat.put("hdk")` or `MODIN_STORAGE_FORMAT="hdk"` instead.' - ) - else: - is_hdk = False - - if is_hdk and publisher.get_value_source() == ValueSource.DEFAULT: - publisher.put("Native") - IsExperimental.put(True) - if ( - publisher.get() == "Native" - and StorageFormat.get_value_source() == ValueSource.DEFAULT - ): - is_hdk = True - StorageFormat.put("Hdk") - IsExperimental.put(True) - if publisher.get() == "Ray": if _is_first_update.get("Ray", True): from modin.core.execution.ray.common import initialize_ray initialize_ray() - elif publisher.get() == "Native": - # With HDK storage format there is only a single worker per node - # and we allow it to work on all cores. - if is_hdk: - os.environ["OMP_NUM_THREADS"] = str(CpuCount.get()) - else: - raise ValueError( - f"Storage format should be 'Hdk' with 'Native' engine, but provided {sfmt}." - ) elif publisher.get() == "Dask": if _is_first_update.get("Dask", True): from modin.core.execution.dask.common import initialize_dask diff --git a/modin/tests/config/test_envvars.py b/modin/tests/config/test_envvars.py index 34110e62014..4341a64d79d 100644 --- a/modin/tests/config/test_envvars.py +++ b/modin/tests/config/test_envvars.py @@ -16,7 +16,6 @@ import warnings import pytest -from packaging import version import modin.config as cfg import modin.pandas as pd @@ -122,48 +121,6 @@ def test_ray_cluster_resources(): assert ray.cluster_resources()["special_hardware"] == 1.0 -def test_hdk_envvar(): - try: - import pyhdk - - defaults = cfg.HdkLaunchParameters.get() - assert defaults["enable_union"] == 1 - if version.parse(pyhdk.__version__) >= version.parse("0.6.1"): - assert defaults["log_dir"] == "pyhdk_log" - del cfg.HdkLaunchParameters._value - except ImportError: - # This test is intended to check pyhdk internals. If pyhdk is not available, skip the version check test. 
- pass - - os.environ[cfg.HdkLaunchParameters.varname] = "enable_union=2,enable_thrift_logs=3" - params = cfg.HdkLaunchParameters.get() - assert params["enable_union"] == 2 - assert params["enable_thrift_logs"] == 3 - - os.environ[cfg.HdkLaunchParameters.varname] = "unsupported=X" - del cfg.HdkLaunchParameters._value - params = cfg.HdkLaunchParameters.get() - assert params["unsupported"] == "X" - try: - import pyhdk - - pyhdk.buildConfig(**cfg.HdkLaunchParameters.get()) - except RuntimeError as e: - assert str(e) == "unrecognised option '--unsupported'" - except ImportError: - # This test is intended to check pyhdk internals. If pyhdk is not available, skip the version check test. - pass - - os.environ[cfg.HdkLaunchParameters.varname] = ( - "enable_union=4,enable_thrift_logs=5,enable_lazy_dict_materialization=6" - ) - del cfg.HdkLaunchParameters._value - params = cfg.HdkLaunchParameters.get() - assert params["enable_union"] == 4 - assert params["enable_thrift_logs"] == 5 - assert params["enable_lazy_dict_materialization"] == 6 - - @pytest.mark.parametrize( "deprecated_var, new_var", [ diff --git a/modin/tests/core/storage_formats/hdk/test_internals.py b/modin/tests/core/storage_formats/hdk/test_internals.py deleted file mode 100644 index da74b16cb8e..00000000000 --- a/modin/tests/core/storage_formats/hdk/test_internals.py +++ /dev/null @@ -1,152 +0,0 @@ -# Licensed to Modin Development Team under one or more contributor license agreements. -# See the NOTICE file distributed with this work for additional information regarding -# copyright ownership. The Modin Development Team licenses this file to you under the -# Apache License, Version 2.0 (the "License"); you may not use this file except in -# compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under -# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific language -# governing permissions and limitations under the License. - -import subprocess -import sys - -import pytest - - -@pytest.mark.parametrize( - "import_strategy", - [ - pytest.param( - """ -import modin.config as cfg -cfg.Engine.put('Native') # 'hdk'/'dbe' would be imported with dlopen flags first time -cfg.StorageFormat.put('HDK') -cfg.IsExperimental.put(True) -import modin.pandas as pd -""", - id="config_hdk_first-import_modin_second", - ), - pytest.param( - """ -import modin.pandas as pd -import modin.config as cfg -cfg.Engine.put('Native') -cfg.StorageFormat.put('HDK') -cfg.IsExperimental.put(True) -""", - id="import_modin_first-config_hdk_second", - ), - ], -) -@pytest.mark.parametrize("has_other_engines", [True, False]) -def test_hdk_import(import_strategy, has_other_engines): - """ - Test import of HDK engine. - - The import of DbWorker requires to set special dlopen flags which make it then - incompatible to import some other libraries further (like ``pyarrow.gandiva``). - This test verifies that it's not the case when a user naturally imports Modin - with HDK engine. - - Parameters - ---------- - import_strategy : str - There are several scenarios of how a user can import Modin with HDK engine: - configure Modin first to use HDK engine and then import ``modin.pandas`` or vice versa. - This parameters holds a python code, implementing one of these scenarios. 
- has_other_engines : bool - The problem with import may appear depending on whether other engines are - installed. This parameter indicates whether to remove modules for - non-hdk engines before the test. - - Notes - ----- - The failed import flow may cause segfault, which causes to crash the pytest itself. - This makes us to run the test in a separate process and check its exit-code to - decide the success of the test. - """ - - remove_other_engines = """ -import sys -sys.modules['ray'] = None -sys.modules['dask'] = None -""" - - if not has_other_engines: - import_strategy = f"{remove_other_engines}\n{import_strategy}" - - res = subprocess.run( - [sys.executable, "-c", import_strategy], - stderr=subprocess.PIPE, - stdout=subprocess.PIPE, - ) - - if res.returncode != 0: - pytest.fail(str(res.stderr)) - - -@pytest.mark.parametrize( - "import_strategy, expected_to_fail", - [ - pytest.param( - """ -from modin.experimental.core.execution.native.implementations.hdk_on_native.db_worker import DbWorker -import pyarrow.gandiva -""", - True, - id="import_pydbe_first-pyarrow_gandiva_second", - ), - pytest.param( - """ -import pyarrow.gandiva -from modin.experimental.core.execution.native.implementations.hdk_on_native.db_worker import DbWorker -""", - False, - id="import_pyarrow_gandiva_first-pydbe_second", - ), - ], -) -def test_hdk_compatibility_with_pyarrow_gandiva(import_strategy, expected_to_fail): - """ - Test the current status of compatibility of DbWorker and pyarrow.gandiva packages. - - If this test appears to fail, it means that these packages are now compatible/incopmatible, - if it's so, please post the actual compatibility status to the issue: - https://github.com/modin-project/modin/issues/3865 - And then inverse `expected_to_fail` parameter for the scenario that has changed its behavior. - - Parameters - ---------- - import_strategy : str - There are several scenarios of how a user can import DbWorker and pyarrow.gandiva. - This parameters holds a python code, implementing one of the scenarios. - expected_to_fail : bool - Indicates the estimated compatibility status for the specified `import_strategy`. - True - the strategy expected to fail, False - the strategy expected to pass. - Note: we can't use built-in ``pytest.marks.xfail`` as we need to check that the - expected failure was caused by LLVM error. - """ - res = subprocess.run( - [sys.executable, "-c", import_strategy], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - ) - - if expected_to_fail: - assert ( - res.returncode != 0 - ), "DbWorker and pyarrow.gandiva are now compatible! Please check the test's doc-string for further instructions." - else: - assert ( - res.returncode == 0 - ), "DbWorker and pyarrow.gandiva are now incompatible! Please check the test's doc-string for further instructions." - - if res.returncode != 0: - error_msg = res.stderr.decode("utf-8") - assert ( - error_msg.find("LLVM ERROR") != -1 - ), f"Expected to fail because of LLVM error, but failed because of:\n{error_msg}" diff --git a/modin/tests/core/storage_formats/pandas/test_internals.py b/modin/tests/core/storage_formats/pandas/test_internals.py index bf95d52ae9d..9a16022a371 100644 --- a/modin/tests/core/storage_formats/pandas/test_internals.py +++ b/modin/tests/core/storage_formats/pandas/test_internals.py @@ -26,7 +26,6 @@ MinPartitionSize, NPartitions, RangePartitioning, - StorageFormat, context, ) from modin.core.dataframe.algebra import Fold @@ -2711,10 +2710,6 @@ def map_func(df, first_arg, extra_arg=0): ), "Invalid map function result." 
-@pytest.mark.skipif(
-    StorageFormat.get() == "Hdk",
-    reason="HDK is deprecated and doesn't allow to register a custom function.",
-)
 def test_fold_operator():
     new_index = list(range(500, 1000))
     new_columns = ["b"]
diff --git a/modin/tests/experimental/hdk_on_native/__init__.py b/modin/tests/experimental/hdk_on_native/__init__.py
deleted file mode 100644
index cae6413e559..00000000000
--- a/modin/tests/experimental/hdk_on_native/__init__.py
+++ /dev/null
@@ -1,12 +0,0 @@
-# Licensed to Modin Development Team under one or more contributor license agreements.
-# See the NOTICE file distributed with this work for additional information regarding
-# copyright ownership. The Modin Development Team licenses this file to you under the
-# Apache License, Version 2.0 (the "License"); you may not use this file except in
-# compliance with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software distributed under
-# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
-# ANY KIND, either express or implied. See the License for the specific language
-# governing permissions and limitations under the License.
diff --git a/modin/tests/experimental/hdk_on_native/test_dataframe.py b/modin/tests/experimental/hdk_on_native/test_dataframe.py
deleted file mode 100644
index 6f894e48e22..00000000000
--- a/modin/tests/experimental/hdk_on_native/test_dataframe.py
+++ /dev/null
@@ -1,2996 +0,0 @@
-# Licensed to Modin Development Team under one or more contributor license agreements.
-# See the NOTICE file distributed with this work for additional information regarding
-# copyright ownership. The Modin Development Team licenses this file to you under the
-# Apache License, Version 2.0 (the "License"); you may not use this file except in
-# compliance with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software distributed under
-# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
-# ANY KIND, either express or implied. See the License for the specific language
-# governing permissions and limitations under the License.
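# ---------------------------------------------------------------------------
# Illustrative configuration sketch, for context on the test suites deleted
# here: the HDK tests selected the engine, the storage format and the
# experimental flag before touching modin.pandas. This pattern only applies to
# Modin releases that still shipped the HDK backend.
import modin.config as cfg

cfg.Engine.put("Native")       # HDK ran in-process, without Ray/Dask workers
cfg.StorageFormat.put("Hdk")   # select the (now removed) HDK storage format
cfg.IsExperimental.put(True)   # HDK execution was gated behind this flag

import modin.pandas as pd  # imported after configuration on purpose

df = pd.DataFrame({"a": [1, 1, 2], "b": [10, 20, 30]})
print(df.groupby("a").sum())   # executed through HDK when it was available
# ---------------------------------------------------------------------------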
- -import os -import re - -import numpy as np -import pandas -import pyarrow -import pytest -from pandas._testing import ensure_clean -from pandas.core.dtypes.common import is_list_like -from pyhdk import __version__ as hdk_version - -from modin.config import StorageFormat -from modin.tests.interchange.dataframe_protocol.hdk.utils import split_df_into_chunks -from modin.tests.pandas.utils import ( - create_test_dfs, - default_to_pandas_ignore_string, - random_state, - test_data, -) - -from .utils import ForceHdkImport, eval_io, run_and_compare, set_execution_mode - -StorageFormat.put("hdk") - -import modin.pandas as pd -from modin.experimental.core.execution.native.implementations.hdk_on_native.calcite_serializer import ( - CalciteSerializer, -) -from modin.experimental.core.execution.native.implementations.hdk_on_native.df_algebra import ( - FrameNode, -) -from modin.experimental.core.execution.native.implementations.hdk_on_native.partitioning.partition_manager import ( - HdkOnNativeDataframePartitionManager, -) -from modin.pandas.io import from_arrow -from modin.tests.pandas.utils import ( - bool_arg_values, - df_equals, - df_equals_with_non_stable_indices, - eval_general, - generate_multiindex, - test_data_keys, - test_data_values, - time_parsing_csv_path, - to_pandas, -) -from modin.utils import try_cast_to_pandas - -# Our configuration in pytest.ini requires that we explicitly catch all -# instances of defaulting to pandas, but some test modules, like this one, -# have too many such instances. -# TODO(https://github.com/modin-project/modin/issues/3655): catch all instances -# of defaulting to pandas. -pytestmark = pytest.mark.filterwarnings(default_to_pandas_ignore_string) - - -@pytest.mark.usefixtures("TestReadCSVFixture") -class TestCSV: - from modin import __file__ as modin_root - - root = os.path.dirname( - os.path.dirname(os.path.abspath(modin_root)) + ".." 
- ) # root of modin repo - - boston_housing_names = [ - "index", - "CRIM", - "ZN", - "INDUS", - "CHAS", - "NOX", - "RM", - "AGE", - "DIS", - "RAD", - "TAX", - "PTRATIO", - "B", - "LSTAT", - "PRICE", - ] - boston_housing_dtypes = { - "index": "int64", - "CRIM": "float64", - "ZN": "float64", - "INDUS": "float64", - "CHAS": "float64", - "NOX": "float64", - "RM": "float64", - "AGE": "float64", - "DIS": "float64", - "RAD": "float64", - "TAX": "float64", - "PTRATIO": "float64", - "B": "float64", - "LSTAT": "float64", - "PRICE": "float64", - } - - def test_usecols_csv(self): - """check with the following arguments: names, dtype, skiprows, delimiter""" - csv_file = os.path.join( - self.root, "modin/tests/pandas/data", "test_usecols.csv" - ) - - for kwargs in ( - {"delimiter": ","}, - {"sep": None}, - {"skiprows": 1, "names": ["A", "B", "C", "D", "E"]}, - {"dtype": {"a": "int32", "e": "string"}}, - {"dtype": {"a": np.dtype("int32"), "b": np.dtype("int64"), "e": "string"}}, - ): - eval_io( - fn_name="read_csv", - md_extra_kwargs={"engine": "arrow"}, - # read_csv kwargs - filepath_or_buffer=csv_file, - **kwargs, - ) - - def test_housing_csv(self): - csv_file = os.path.join(self.root, "examples/data/boston_housing.csv") - for kwargs in ( - { - "skiprows": 1, - "names": self.boston_housing_names, - "dtype": self.boston_housing_dtypes, - }, - ): - eval_io( - fn_name="read_csv", - md_extra_kwargs={"engine": "arrow"}, - # read_csv kwargs - filepath_or_buffer=csv_file, - **kwargs, - ) - - def test_time_parsing(self): - csv_file = os.path.join(self.root, time_parsing_csv_path) - for kwargs in ( - { - "skiprows": 1, - "names": [ - "timestamp", - "year", - "month", - "date", - "symbol", - "high", - "low", - "open", - "close", - "spread", - "volume", - ], - "parse_dates": ["timestamp"], - "dtype": {"symbol": "string"}, - }, - ): - rp = pandas.read_csv(csv_file, **kwargs) - rm = pd.read_csv(csv_file, engine="arrow", **kwargs) - with ForceHdkImport(rm): - rm = to_pandas(rm) - df_equals(rm["timestamp"].dt.year, rp["timestamp"].dt.year) - df_equals(rm["timestamp"].dt.month, rp["timestamp"].dt.month) - df_equals(rm["timestamp"].dt.day, rp["timestamp"].dt.day) - df_equals(rm["timestamp"].dt.hour, rp["timestamp"].dt.hour) - - def test_csv_fillna(self): - csv_file = os.path.join(self.root, "examples/data/boston_housing.csv") - for kwargs in ( - { - "skiprows": 1, - "names": self.boston_housing_names, - "dtype": self.boston_housing_dtypes, - }, - ): - eval_io( - fn_name="read_csv", - md_extra_kwargs={"engine": "arrow"}, - comparator=lambda df1, df2: df_equals( - df1["CRIM"].fillna(1000), df2["CRIM"].fillna(1000) - ), - # read_csv kwargs - filepath_or_buffer=csv_file, - **kwargs, - ) - - @pytest.mark.parametrize("null_dtype", ["category", "float64"]) - def test_null_col(self, null_dtype): - csv_file = os.path.join( - self.root, "modin/tests/pandas/data", "test_null_col.csv" - ) - ref = pandas.read_csv( - csv_file, - names=["a", "b", "c"], - dtype={"a": "int64", "b": "int64", "c": null_dtype}, - skiprows=1, - ) - ref["a"] = ref["a"] + ref["b"] - - exp = pd.read_csv( - csv_file, - names=["a", "b", "c"], - dtype={"a": "int64", "b": "int64", "c": null_dtype}, - skiprows=1, - ) - exp["a"] = exp["a"] + exp["b"] - - # df_equals cannot compare empty categories - if null_dtype == "category": - ref["c"] = ref["c"].astype("string") - with ForceHdkImport(exp): - exp = to_pandas(exp) - exp["c"] = exp["c"].astype("string") - - df_equals(ref, exp) - - def test_read_and_concat(self): - csv_file = os.path.join( - self.root, 
"modin/tests/pandas/data", "test_usecols.csv" - ) - ref1 = pandas.read_csv(csv_file) - ref2 = pandas.read_csv(csv_file) - ref = pandas.concat([ref1, ref2]) - - exp1 = pandas.read_csv(csv_file) - exp2 = pandas.read_csv(csv_file) - exp = pd.concat([exp1, exp2]) - with ForceHdkImport(exp): - df_equals(ref, exp) - - @pytest.mark.parametrize("names", [None, ["a", "b", "c", "d", "e"]]) - @pytest.mark.parametrize("header", [None, 0]) - def test_from_csv(self, header, names): - csv_file = os.path.join( - self.root, "modin/tests/pandas/data", "test_usecols.csv" - ) - eval_io( - fn_name="read_csv", - # read_csv kwargs - filepath_or_buffer=csv_file, - header=header, - names=names, - ) - - @pytest.mark.parametrize("kwargs", [{"sep": "|"}, {"delimiter": "|"}]) - def test_sep_delimiter(self, kwargs): - csv_file = os.path.join(self.root, "modin/tests/pandas/data", "test_delim.csv") - eval_io( - fn_name="read_csv", - # read_csv kwargs - filepath_or_buffer=csv_file, - **kwargs, - ) - - @pytest.mark.skip(reason="https://github.com/modin-project/modin/issues/2174") - def test_float32(self): - csv_file = os.path.join( - self.root, "modin/tests/pandas/data", "test_usecols.csv" - ) - kwargs = { - "dtype": {"a": "float32", "b": "float32"}, - } - - pandas_df = pandas.read_csv(csv_file, **kwargs) - pandas_df["a"] = pandas_df["a"] + pandas_df["b"] - - modin_df = pd.read_csv(csv_file, **kwargs, engine="arrow") - modin_df["a"] = modin_df["a"] + modin_df["b"] - with ForceHdkImport(modin_df): - df_equals(modin_df, pandas_df) - - # Datetime Handling tests - @pytest.mark.parametrize("engine", [None, "arrow"]) - @pytest.mark.parametrize( - "parse_dates", - [ - True, - False, - ["col2"], - ["c2"], - [["col2", "col3"]], - {"col23": ["col2", "col3"]}, - [], - ], - ) - @pytest.mark.parametrize("names", [None, [f"c{x}" for x in range(1, 7)]]) - def test_read_csv_datetime( - self, - engine, - parse_dates, - names, - request, - ): - parse_dates_unsupported = isinstance(parse_dates, dict) or ( - isinstance(parse_dates, list) - and any(not isinstance(date, str) for date in parse_dates) - ) - if parse_dates_unsupported and engine == "arrow" and not names: - pytest.skip( - "In these cases Modin raises `ArrowEngineException` while pandas " - + "doesn't raise any exceptions that causes tests fails" - ) - # In these cases Modin raises `ArrowEngineException` while pandas - # raises `ValueError`, so skipping exception type checking - skip_exc_type_check = parse_dates_unsupported and engine == "arrow" - if skip_exc_type_check: - pytest.xfail(reason="https://github.com/modin-project/modin/issues/7012") - - expected_exception = None - if "names1-parse_dates2" in request.node.callspec.id: - expected_exception = ValueError( - "Missing column provided to 'parse_dates': 'col2'" - ) - elif ( - "names1-parse_dates5-None" in request.node.callspec.id - or "names1-parse_dates4-None" in request.node.callspec.id - ): - expected_exception = ValueError( - "Missing column provided to 'parse_dates': 'col2, col3'" - ) - elif "None-parse_dates3" in request.node.callspec.id: - expected_exception = ValueError( - "Missing column provided to 'parse_dates': 'c2'" - ) - eval_io( - fn_name="read_csv", - md_extra_kwargs={"engine": engine}, - expected_exception=expected_exception, - # read_csv kwargs - filepath_or_buffer=pytest.csvs_names["test_read_csv_regular"], - parse_dates=parse_dates, - names=names, - ) - - @pytest.mark.parametrize("engine", [None, "arrow"]) - @pytest.mark.parametrize("parse_dates", [None, True, False]) - def test_read_csv_datetime_tz(self, 
engine, parse_dates): - with ensure_clean(".csv") as file: - with open(file, "w") as f: - f.write("test\n2023-01-01T00:00:00.000-07:00") - - eval_io( - fn_name="read_csv", - filepath_or_buffer=file, - md_extra_kwargs={"engine": engine}, - parse_dates=parse_dates, - ) - - @pytest.mark.parametrize("engine", [None, "arrow"]) - @pytest.mark.parametrize( - "usecols", - [ - None, - ["col1"], - ["col1", "col1"], - ["col1", "col2", "col6"], - ["col6", "col2", "col1"], - [0], - [0, 0], - [0, 1, 5], - [5, 1, 0], - lambda x: x in ["col1", "col2"], - ], - ) - def test_read_csv_col_handling( - self, - engine, - usecols, - ): - eval_io( - fn_name="read_csv", - check_kwargs_callable=not callable(usecols), - md_extra_kwargs={"engine": engine}, - # read_csv kwargs - filepath_or_buffer=pytest.csvs_names["test_read_csv_regular"], - usecols=usecols, - ) - - @pytest.mark.parametrize( - "cols", - [ - "c1,c2,c3", - "c1,c1,c2", - "c1,c1,c1.1,c1.2,c1", - "c1,c1,c1,c1.1,c1.2,c1.3", - "c1.1,c1.2,c1.3,c1,c1,c1", - "c1.1,c1,c1.2,c1,c1.3,c1", - "c1,c1.1,c1,c1.2,c1,c1.3", - "c1,c1,c1.1,c1.1,c1.2,c2", - "c1,c1,c1.1,c1.1,c1.2,c1.2,c2", - "c1.1,c1.1,c1,c1,c1.2,c1.2,c2", - "c1.1,c1,c1.1,c1,c1.1,c1.2,c1.2,c2", - ], - ) - def test_read_csv_duplicate_cols(self, cols): - def test(df, lib, **kwargs): - data = f"{cols}\n" - with ensure_clean(".csv") as fname: - with open(fname, "w") as f: - f.write(data) - return lib.read_csv(fname) - - run_and_compare(test, data={}) - - def test_read_csv_dtype_object(self): - with pytest.warns(UserWarning) as warns: - with ensure_clean(".csv") as file: - with open(file, "w") as f: - f.write("test\ntest") - - def test(**kwargs): - return pd.read_csv(file, dtype={"test": "object"}) - - run_and_compare(test, data={}) - for warn in warns.list: - assert not re.match(r".*defaulting to pandas.*", str(warn)) - - -class TestMasks: - data = { - "a": [1, 1, 2, 2, 3], - "b": [None, None, 2, 1, 3], - "c": [3, None, None, 2, 1], - } - cols_values = ["a", ["a", "b"], ["a", "b", "c"]] - - @pytest.mark.parametrize("cols", cols_values) - def test_projection(self, cols): - def projection(df, cols, **kwargs): - return df[cols] - - run_and_compare(projection, data=self.data, cols=cols) - - def test_drop(self): - def drop(df, column_names, **kwargs): - return df.drop(columns=column_names) - - run_and_compare(drop, data=self.data, column_names="a") - run_and_compare(drop, data=self.data, column_names=self.data.keys()) - - def test_drop_index(self): - def drop(df, **kwargs): - return df.drop(df.index[0]) - - idx = list(map(str, self.data["a"])) - run_and_compare( - drop, data=self.data, constructor_kwargs={"index": idx}, force_lazy=False - ) - - def test_iloc(self): - def mask(df, **kwargs): - return df.iloc[[0, 1]] - - run_and_compare(mask, data=self.data, allow_subqueries=True) - - def test_empty(self): - def empty(df, **kwargs): - return df - - run_and_compare(empty, data=None) - - def test_filter(self): - def filter(df, **kwargs): - return df[df["a"] == 1] - - run_and_compare(filter, data=self.data) - - def test_filter_with_index(self): - def filter(df, **kwargs): - df = df.groupby("a").sum() - return df[df["b"] > 1] - - run_and_compare(filter, data=self.data) - - def test_filter_proj(self): - def filter(df, **kwargs): - df1 = df + 2 - return df1[(df["a"] + df1["b"]) > 1] - - run_and_compare(filter, data=self.data) - - def test_filter_drop(self): - def filter(df, **kwargs): - df = df[["a", "b"]] - df = df[df["a"] != 1] - df["a"] = df["a"] * df["b"] - return df - - run_and_compare(filter, data=self.data) - - def 
test_filter_str_categorical(self): - def filter(df, **kwargs): - return df[df["A"] != ""] - - data = {"A": ["A", "B", "C"]} - run_and_compare(filter, data=data) - run_and_compare(filter, data=data, constructor_kwargs={"dtype": "category"}) - - -class TestMultiIndex: - data = {"a": np.arange(24), "b": np.arange(24)} - - @pytest.mark.parametrize("names", [None, ["", ""], ["name", "name"]]) - def test_dup_names(self, names): - index = pandas.MultiIndex.from_tuples( - [(i, j) for i in range(3) for j in range(8)], names=names - ) - - pandas_df = pandas.DataFrame(self.data, index=index) + 1 - modin_df = pd.DataFrame(self.data, index=index) + 1 - - df_equals(pandas_df, modin_df) - - @pytest.mark.parametrize( - "names", - [ - None, - [None, "s", None], - ["i1", "i2", "i3"], - ["i1", "i1", "i3"], - ["i1", "i2", "a"], - ], - ) - def test_reset_index(self, names, request): - index = pandas.MultiIndex.from_tuples( - [(i, j, k) for i in range(2) for j in range(3) for k in range(4)], - names=names, - ) - - def applier(lib): - df = lib.DataFrame(self.data, index=index) + 1 - return df.reset_index() - - expected_exception = None - if "names3" in request.node.callspec.id: - expected_exception = ValueError("cannot insert i1, already exists") - elif "names4" in request.node.callspec.id: - expected_exception = ValueError("cannot insert a, already exists") - eval_general(pd, pandas, applier, expected_exception=expected_exception) - - @pytest.mark.parametrize("is_multiindex", [True, False]) - def test_reset_index_multicolumns(self, is_multiindex): - index = ( - pandas.MultiIndex.from_tuples( - [(i, j, k) for i in range(2) for j in range(3) for k in range(4)], - names=["l1", "l2", "l3"], - ) - if is_multiindex - else pandas.Index(np.arange(1, len(self.data["a"]) + 1), name="index") - ) - data = np.array(list(self.data.values())).T - - def applier(df, **kwargs): - df = df + 1 - return df.reset_index(drop=False) - - run_and_compare( - fn=applier, - data=data, - constructor_kwargs={"index": index}, - ) - - def test_set_index_name(self): - index = pandas.Index.__new__(pandas.Index, data=[i for i in range(24)]) - - pandas_df = pandas.DataFrame(self.data, index=index) - pandas_df.index.name = "new_name" - modin_df = pd.DataFrame(self.data, index=index) - modin_df._query_compiler.set_index_name("new_name") - - df_equals(pandas_df, modin_df) - - def test_set_index_names(self): - index = pandas.MultiIndex.from_tuples( - [(i, j, k) for i in range(2) for j in range(3) for k in range(4)] - ) - - pandas_df = pandas.DataFrame(self.data, index=index) - pandas_df.index.names = ["new_name1", "new_name2", "new_name3"] - modin_df = pd.DataFrame(self.data, index=index) - modin_df._query_compiler.set_index_names( - ["new_name1", "new_name2", "new_name3"] - ) - - df_equals(pandas_df, modin_df) - - def test_rename(self): - index = pandas.MultiIndex.from_tuples( - [("foo1", "bar1"), ("foo2", "bar2")], names=["foo", "bar"] - ) - columns = pandas.MultiIndex.from_tuples( - [("fizz1", "buzz1"), ("fizz2", "buzz2")], names=["fizz", "buzz"] - ) - - def rename(df, **kwargs): - return df.rename( - index={"foo1": "foo3", "bar2": "bar3"}, - columns={"fizz1": "fizz3", "buzz2": "buzz3"}, - ) - - run_and_compare( - fn=rename, - data=[(0, 0), (1, 1)], - constructor_kwargs={"index": index, "columns": columns}, - force_lazy=False, - ) - - -class TestFillna: - data = {"a": [1, 1, None], "b": [None, None, 2], "c": [3, None, None]} - values = [1, {"a": 1, "c": 3}, {"a": 1, "b": 2, "c": 3}] - - @pytest.mark.parametrize("value", values) - def 
test_fillna_all(self, value): - def fillna(df, value, **kwargs): - return df.fillna(value) - - run_and_compare(fillna, data=self.data, value=value) - - def test_fillna_bool(self): - def fillna(df, **kwargs): - df["a"] = df["a"] == 1 - df["a"] = df["a"].fillna(False) - return df - - run_and_compare(fillna, data=self.data) - - -class TestConcat: - data = { - "a": [1, 2, 3], - "b": [10, 20, 30], - "d": [1000, 2000, 3000], - "e": [11, 22, 33], - } - data2 = { - "a": [4, 5, 6], - "c": [400, 500, 600], - "b": [40, 50, 60], - "f": [444, 555, 666], - } - data3 = { - "f": [2, 3, 4], - "g": [400, 500, 600], - "h": [20, 30, 40], - } - - @pytest.mark.parametrize("join", ["inner", "outer"]) - @pytest.mark.parametrize("sort", bool_arg_values) - @pytest.mark.parametrize("ignore_index", bool_arg_values) - def test_concat(self, join, sort, ignore_index): - def concat(lib, df1, df2, join, sort, ignore_index): - return lib.concat( - [df1, df2], join=join, sort=sort, ignore_index=ignore_index - ) - - run_and_compare( - concat, - data=self.data, - data2=self.data2, - join=join, - sort=sort, - ignore_index=ignore_index, - ) - - def test_concat_with_same_df(self): - def concat(df, **kwargs): - df["f"] = df["a"] - return df - - run_and_compare(concat, data=self.data) - - def test_setitem_lazy(self): - def applier(df, **kwargs): - df = df + 1 - df["a"] = df["a"] + 1 - df["e"] = df["a"] + 1 - df["new_int8"] = np.int8(10) - df["new_int16"] = np.int16(10) - df["new_int32"] = np.int32(10) - df["new_int64"] = np.int64(10) - df["new_int"] = 10 - df["new_float"] = 5.5 - df["new_float64"] = np.float64(10.1) - return df - - run_and_compare(applier, data=self.data) - - def test_setitem_default(self): - def applier(df, lib, **kwargs): - df = df + 1 - df["a"] = np.arange(3) - df["b"] = lib.Series(np.arange(3)) - return df - - run_and_compare(applier, data=self.data, force_lazy=False) - - def test_insert_lazy(self): - def applier(df, **kwargs): - df = df + 1 - df.insert(2, "new_int", 10) - df.insert(1, "new_float", 5.5) - df.insert(0, "new_a", df["a"] + 1) - return df - - run_and_compare(applier, data=self.data) - - def test_insert_default(self): - def applier(df, lib, **kwargs): - df = df + 1 - df.insert(1, "new_range", np.arange(3)) - df.insert(1, "new_series", lib.Series(np.arange(3))) - return df - - run_and_compare(applier, data=self.data, force_lazy=False) - - @pytest.mark.parametrize( - "data", [None, {"A": range(10)}, pandas.DataFrame({"A": range(10)})] - ) - @pytest.mark.parametrize( - "index", - [None, pandas.RangeIndex(10), pandas.RangeIndex(start=10, stop=0, step=-1)], - ) - @pytest.mark.parametrize("value", [list(range(10)), pandas.Series(range(10))]) - @pytest.mark.parametrize("part_type", [None, "arrow", "hdk"]) - @pytest.mark.parametrize("insert_scalar", [True, False]) - def test_insert_list(self, data, index, value, part_type, insert_scalar): - def create(): - mdf, pdf = create_test_dfs(data, index=index) - if part_type == "arrow": # Make sure the partition contains an arrow table - mdf._query_compiler._modin_frame._partitions[0][0].get(True) - elif part_type == "hdk": - mdf._query_compiler._modin_frame.force_import() - return mdf, pdf - - def insert(loc, name, value): - nonlocal mdf, pdf - mdf.insert(loc, name, value) - pdf.insert(loc, name, value) - if insert_scalar: - mdf[f"S{loc}"] = 1 - pdf[f"S{loc}"] = 1 - - niter = 3 - - mdf, pdf = create() - for i in range(niter): - insert(len(pdf.columns), f"B{i}", value) - df_equals(mdf, pdf) - - mdf, pdf = create() - for i in range(niter): - insert(0, f"C{i}", 
value) - df_equals(mdf, pdf) - - mdf, pdf = create() - for i in range(niter): - insert(len(pdf.columns), f"B{i}", value) - insert(0, f"C{i}", value) - insert(len(pdf.columns) // 2, f"D{i}", value) - df_equals(mdf, pdf) - - def test_concat_many(self): - def concat(df1, df2, lib, **kwargs): - df3 = df1.copy() - df4 = df2.copy() - return lib.concat([df1, df2, df3, df4]) - - def sort_comparator(df1, df2): - """Sort and verify equality of the passed frames.""" - # We sort values because order of rows in the 'union all' result is inconsistent in HDK - df1, df2 = ( - try_cast_to_pandas(df).sort_values(df.columns[0]) for df in (df1, df2) - ) - return df_equals(df1, df2) - - run_and_compare( - concat, - data=self.data, - data2=self.data2, - comparator=sort_comparator, - allow_subqueries=True, - ) - - def test_concat_agg(self): - def concat(lib, df1, df2): - df1 = df1.groupby("a", as_index=False).agg( - {"b": "sum", "d": "sum", "e": "sum"} - ) - df2 = df2.groupby("a", as_index=False).agg( - {"c": "sum", "b": "sum", "f": "sum"} - ) - return lib.concat([df1, df2]) - - run_and_compare(concat, data=self.data, data2=self.data2, allow_subqueries=True) - - @pytest.mark.parametrize("join", ["inner", "outer"]) - @pytest.mark.parametrize("sort", bool_arg_values) - @pytest.mark.parametrize("ignore_index", bool_arg_values) - def test_concat_single(self, join, sort, ignore_index): - def concat(lib, df, join, sort, ignore_index): - return lib.concat([df], join=join, sort=sort, ignore_index=ignore_index) - - run_and_compare( - concat, - data=self.data, - join=join, - sort=sort, - ignore_index=ignore_index, - ) - - def test_groupby_concat_single(self): - def concat(lib, df): - df = lib.concat([df]) - return df.groupby("a").agg({"b": "min"}) - - run_and_compare( - concat, - data=self.data, - ) - - @pytest.mark.parametrize("join", ["inner"]) - @pytest.mark.parametrize("sort", bool_arg_values) - @pytest.mark.parametrize("ignore_index", bool_arg_values) - def test_concat_join(self, join, sort, ignore_index): - def concat(lib, df1, df2, join, sort, ignore_index, **kwargs): - return lib.concat( - [df1, df2], axis=1, join=join, sort=sort, ignore_index=ignore_index - ) - - run_and_compare( - concat, - data=self.data, - data2=self.data3, - join=join, - sort=sort, - ignore_index=ignore_index, - ) - - def test_concat_index_name(self): - df1 = pandas.DataFrame(self.data) - df1 = df1.set_index("a") - df2 = pandas.DataFrame(self.data3) - df2 = df2.set_index("f") - - ref = pandas.concat([df1, df2], axis=1, join="inner") - exp = pd.concat([df1, df2], axis=1, join="inner") - - df_equals(ref, exp) - - df2.index.name = "a" - ref = pandas.concat([df1, df2], axis=1, join="inner") - exp = pd.concat([df1, df2], axis=1, join="inner") - - df_equals(ref, exp) - - def test_concat_index_names(self): - df1 = pandas.DataFrame(self.data) - df1 = df1.set_index(["a", "b"]) - df2 = pandas.DataFrame(self.data3) - df2 = df2.set_index(["f", "h"]) - - ref = pandas.concat([df1, df2], axis=1, join="inner") - exp = pd.concat([df1, df2], axis=1, join="inner") - - df_equals(ref, exp) - - df2.index.names = ["a", "b"] - ref = pandas.concat([df1, df2], axis=1, join="inner") - exp = pd.concat([df1, df2], axis=1, join="inner") - - df_equals(ref, exp) - - def test_concat_str(self): - def concat(df1, df2, lib, **kwargs): - return lib.concat([df1.dropna(), df2.dropna()]).astype(str) - - run_and_compare( - concat, - data={"a": ["1", "2", "3"]}, - data2={"a": ["4", "5", "6"]}, - force_lazy=False, - ) - - @pytest.mark.parametrize("transform", [True, False]) - 
@pytest.mark.parametrize("sort_last", [True, False]) - # RecursionError in case of concatenation of big number of frames - def test_issue_5889(self, transform, sort_last): - with ensure_clean(".csv") as file: - data = {"a": [1, 2, 3], "b": [1, 2, 3]} if transform else {"a": [1, 2, 3]} - pandas.DataFrame(data).to_csv(file, index=False) - - def test_concat(lib, **kwargs): - if transform: - - def read_csv(): - return lib.read_csv(file)["b"] - - else: - - def read_csv(): - return lib.read_csv(file) - - df = read_csv() - for _ in range(100): - df = lib.concat([df, read_csv()]) - if sort_last: - df = lib.concat([df, read_csv()], sort=True) - return df - - run_and_compare(test_concat, data={}) - - -class TestGroupby: - data = { - "a": [1, 1, 2, 2, 2, 1], - "b": [11, 21, 12, 22, 32, 11], - "c": [101, 201, 202, 202, 302, 302], - "d": [True, True, False, True, False, True], - } - cols_value = ["a", ["a", "b"]] - - @pytest.mark.parametrize("cols", cols_value) - @pytest.mark.parametrize("as_index", bool_arg_values) - def test_groupby_sum(self, cols, as_index): - def groupby_sum(df, cols, as_index, **kwargs): - return df.groupby(cols, as_index=as_index).sum() - - run_and_compare(groupby_sum, data=self.data, cols=cols, as_index=as_index) - - @pytest.mark.parametrize("cols", cols_value) - @pytest.mark.parametrize("as_index", bool_arg_values) - def test_groupby_count(self, cols, as_index): - def groupby_count(df, cols, as_index, **kwargs): - return df.groupby(cols, as_index=as_index).count() - - run_and_compare(groupby_count, data=self.data, cols=cols, as_index=as_index) - - @pytest.mark.parametrize("cols", cols_value) - @pytest.mark.parametrize("as_index", bool_arg_values) - def test_groupby_mean(self, cols, as_index): - def groupby_mean(df, cols, as_index, **kwargs): - return df.groupby(cols, as_index=as_index).mean() - - run_and_compare(groupby_mean, data=self.data, cols=cols, as_index=as_index) - - @pytest.mark.parametrize("cols", cols_value) - @pytest.mark.parametrize("as_index", bool_arg_values) - def test_groupby_proj_sum(self, cols, as_index): - def groupby_sum(df, cols, as_index, **kwargs): - return df.groupby(cols, as_index=as_index).c.sum() - - run_and_compare( - groupby_sum, data=self.data, cols=cols, as_index=as_index, force_lazy=False - ) - - @pytest.mark.parametrize("agg", ["count", "size", "nunique"]) - def test_groupby_agg(self, agg): - def groupby(df, agg, **kwargs): - return df.groupby("a").agg({"b": agg}) - - run_and_compare(groupby, data=self.data, agg=agg) - - def test_groupby_agg_default_to_pandas(self): - def lambda_func(df, **kwargs): - return df.groupby("a").agg(lambda df: (df.mean() - df.sum()) // 2) - - run_and_compare(lambda_func, data=self.data, force_lazy=False) - - def not_implemented_func(df, **kwargs): - return df.groupby("a").agg("cumprod") - - run_and_compare(lambda_func, data=self.data, force_lazy=False) - - @pytest.mark.parametrize("cols", cols_value) - @pytest.mark.parametrize("as_index", bool_arg_values) - def test_groupby_agg_mean(self, cols, as_index): - def groupby_mean(df, cols, as_index, **kwargs): - return df.groupby(cols, as_index=as_index).agg("mean") - - run_and_compare(groupby_mean, data=self.data, cols=cols, as_index=as_index) - - def test_groupby_lazy_multiindex(self): - index = generate_multiindex(len(self.data["a"])) - - def groupby(df, *args, **kwargs): - df = df[["a", "b", "c"]] + 1 - return df.groupby("a").agg({"b": "size"}) - - run_and_compare(groupby, data=self.data, constructor_kwargs={"index": index}) - - def test_groupby_lazy_squeeze(self): - 
def applier(df, **kwargs): - return df.groupby("a").sum().squeeze(axis=1) - - run_and_compare( - applier, - data=self.data, - constructor_kwargs={"columns": ["a", "b"]}, - force_lazy=True, - ) - - @pytest.mark.parametrize("method", ["sum", "size"]) - def test_groupby_series(self, method): - def groupby(df, **kwargs): - ser = df[df.columns[0]] - return getattr(ser.groupby(ser), method)() - - run_and_compare(groupby, data=self.data) - - def test_groupby_size(self): - def groupby(df, **kwargs): - return df.groupby("a").size() - - run_and_compare(groupby, data=self.data) - - @pytest.mark.parametrize("by", [["a"], ["a", "b", "c"]]) - @pytest.mark.parametrize("agg", ["sum", "size", "mean", "median"]) - @pytest.mark.parametrize("as_index", [True, False]) - def test_groupby_agg_by_col(self, by, agg, as_index): - def simple_agg(df, **kwargs): - return df.groupby(by, as_index=as_index).agg(agg) - - run_and_compare(simple_agg, data=self.data) - - def dict_agg(df, **kwargs): - return df.groupby(by, as_index=as_index).agg({by[0]: agg}) - - run_and_compare(dict_agg, data=self.data) - - def dict_agg_all_cols(df, **kwargs): - return df.groupby(by, as_index=as_index).agg({col: agg for col in by}) - - run_and_compare(dict_agg_all_cols, data=self.data) - - def test_groupby_agg_list(self): - def agg(df, **kwargs): - return df.groupby("a")[["b", "c"]].agg(["sum", "size", "mean", "median"]) - - run_and_compare(agg, data=self.data) - - # modin-issue#3461 - def test_groupby_pure_by(self): - data = [1, 1, 2, 2] - # Test when 'by' is a 'TransformNode' - run_and_compare(lambda df: df.groupby(df).sum(), data=data, force_lazy=True) - - # Test when 'by' is a 'FrameNode' - md_ser, pd_ser = pd.Series(data), pandas.Series(data) - - md_ser._query_compiler._modin_frame._execute() - assert isinstance( - md_ser._query_compiler._modin_frame._op, FrameNode - ), "Triggering execution of the Modin frame supposed to set 'FrameNode' as a frame's op" - - set_execution_mode(md_ser, "lazy") - md_res = md_ser.groupby(md_ser).sum() - set_execution_mode(md_res, None) - - pd_res = pd_ser.groupby(pd_ser).sum() - df_equals(md_res, pd_res) - - taxi_data = { - "a": [1, 1, 2, 2], - "b": [11, 21, 12, 11], - "c": pandas.to_datetime( - ["20190902", "20180913", "20190921", "20180903"], format="%Y%m%d" - ), - "d": [11.5, 21.2, 12.8, 13.4], - } - - # TODO: emulate taxi queries with group by category types when we have loading - # using arrow - # Another way of doing taxi q1 is - # res = df.groupby("cab_type").size() - this should be tested later as well - def test_taxi_q1(self): - def taxi_q1(df, **kwargs): - return df.groupby("a").size() - - run_and_compare(taxi_q1, data=self.taxi_data) - - def test_taxi_q2(self): - def taxi_q2(df, **kwargs): - return df.groupby("a").agg({"b": "mean"}) - - run_and_compare(taxi_q2, data=self.taxi_data) - - @pytest.mark.parametrize("as_index", bool_arg_values) - def test_taxi_q3(self, as_index): - def taxi_q3(df, as_index, **kwargs): - # TODO: remove 'astype' temp fix - return df.groupby( - ["b", df["c"].dt.year.astype("int32")], as_index=as_index - ).size() - - run_and_compare(taxi_q3, data=self.taxi_data, as_index=as_index) - - def test_groupby_expr_col(self): - def groupby(df, **kwargs): - df = df.loc[:, ["b", "c"]] - df["year"] = df["c"].dt.year - df["month"] = df["c"].dt.month - df["id1"] = df["year"] * 12 + df["month"] - df["id2"] = (df["id1"] - 24000) // 12 - df = df.groupby(["id1", "id2"], as_index=False).agg({"b": "max"}) - return df - - run_and_compare(groupby, data=self.taxi_data) - - def 
test_series_astype(self): - def series_astype(df, **kwargs): - return df["d"].astype("int") - - run_and_compare(series_astype, data=self.taxi_data) - - def test_df_astype(self): - def df_astype(df, **kwargs): - return df.astype({"b": "float", "d": "int"}) - - run_and_compare(df_astype, data=self.taxi_data) - - def test_df_indexed_astype(self): - def df_astype(df, **kwargs): - df = df.groupby("a").agg({"b": "sum"}) - return df.astype({"b": "float"}) - - run_and_compare(df_astype, data=self.taxi_data) - - @pytest.mark.parametrize("as_index", bool_arg_values) - def test_taxi_q4(self, as_index): - def taxi_q4(df, **kwargs): - df["c"] = df["c"].dt.year - df["d"] = df["d"].astype("int64") - df = df.groupby(["b", "c", "d"], sort=True, as_index=as_index).size() - if as_index: - df = df.reset_index() - return df.sort_values( - by=["c", 0 if as_index else "size"], - ignore_index=True, - ascending=[True, False], - ) - - run_and_compare(taxi_q4, data=self.taxi_data) - - h2o_data = { - "id1": ["id1", "id2", "id3", "id1", "id2", "id3", "id1", "id2", "id3", "id1"], - "id2": ["id1", "id2", "id1", "id2", "id1", "id2", "id1", "id2", "id1", "id2"], - "id3": ["id4", "id5", "id6", "id4", "id5", "id6", "id4", "id5", "id6", "id4"], - "id4": [4, 5, 4, 5, 4, 5, 4, 5, 4, 5], - "id5": [7, 8, 9, 7, 8, 9, 7, 8, 9, 7], - "id6": [7, 8, 7, 8, 7, 8, 7, 8, 7, 8], - "v1": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], - "v2": [1, 3, 5, 7, 9, 10, 8, 6, 4, 2], - "v3": [1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8, 9.9, 10.0], - } - - def _get_h2o_df(self): - df = pandas.DataFrame(self.h2o_data) - df["id1"] = df["id1"].astype("category") - df["id2"] = df["id2"].astype("category") - df["id3"] = df["id3"].astype("category") - return df - - def test_h2o_q1(self): - df = self._get_h2o_df() - - ref = df.groupby(["id1"], observed=True).agg({"v1": "sum"}) - ref.reset_index(inplace=True) - - modin_df = pd.DataFrame(df) - set_execution_mode(modin_df, "lazy") - modin_df = modin_df.groupby(["id1"], observed=True, as_index=False).agg( - {"v1": "sum"} - ) - set_execution_mode(modin_df, None) - - exp = to_pandas(modin_df) - exp["id1"] = exp["id1"].astype("category") - - df_equals(ref, exp) - - def test_h2o_q2(self): - df = self._get_h2o_df() - - ref = df.groupby(["id1", "id2"], observed=True).agg({"v1": "sum"}) - ref.reset_index(inplace=True) - - modin_df = pd.DataFrame(df) - set_execution_mode(modin_df, "lazy") - modin_df = modin_df.groupby(["id1", "id2"], observed=True, as_index=False).agg( - {"v1": "sum"} - ) - set_execution_mode(modin_df, None) - - exp = to_pandas(modin_df) - exp["id1"] = exp["id1"].astype("category") - exp["id2"] = exp["id2"].astype("category") - - df_equals(ref, exp) - - def test_h2o_q3(self): - df = self._get_h2o_df() - - ref = df.groupby(["id3"], observed=True).agg({"v1": "sum", "v3": "mean"}) - ref.reset_index(inplace=True) - - modin_df = pd.DataFrame(df) - set_execution_mode(modin_df, "lazy") - modin_df = modin_df.groupby(["id3"], observed=True, as_index=False).agg( - {"v1": "sum", "v3": "mean"} - ) - set_execution_mode(modin_df, None) - - exp = to_pandas(modin_df) - exp["id3"] = exp["id3"].astype("category") - - df_equals(ref, exp) - - def test_h2o_q4(self): - df = self._get_h2o_df() - - ref = df.groupby(["id4"], observed=True).agg( - {"v1": "mean", "v2": "mean", "v3": "mean"} - ) - ref.reset_index(inplace=True) - - modin_df = pd.DataFrame(df) - set_execution_mode(modin_df, "lazy") - modin_df = modin_df.groupby(["id4"], observed=True, as_index=False).agg( - {"v1": "mean", "v2": "mean", "v3": "mean"} - ) - 
set_execution_mode(modin_df, None) - - exp = to_pandas(modin_df) - - df_equals(ref, exp) - - def test_h2o_q5(self): - df = self._get_h2o_df() - - ref = df.groupby(["id6"], observed=True).agg( - {"v1": "sum", "v2": "sum", "v3": "sum"} - ) - ref.reset_index(inplace=True) - - modin_df = pd.DataFrame(df) - set_execution_mode(modin_df, "lazy") - modin_df = modin_df.groupby(["id6"], observed=True, as_index=False).agg( - {"v1": "sum", "v2": "sum", "v3": "sum"} - ) - set_execution_mode(modin_df, None) - - exp = to_pandas(modin_df) - - df_equals(ref, exp) - - def test_h2o_q7(self): - df = self._get_h2o_df() - - ref = ( - df.groupby(["id3"], observed=True) - .agg({"v1": "max", "v2": "min"}) - .assign(range_v1_v2=lambda x: x["v1"] - x["v2"])[["range_v1_v2"]] - ) - ref.reset_index(inplace=True) - - modin_df = pd.DataFrame(df) - set_execution_mode(modin_df, "lazy") - modin_df = modin_df.groupby(["id3"], observed=True).agg( - {"v1": "max", "v2": "min"} - ) - modin_df["range_v1_v2"] = modin_df["v1"] - modin_df["v2"] - modin_df = modin_df[["range_v1_v2"]] - modin_df.reset_index(inplace=True) - set_execution_mode(modin_df, None) - - exp = to_pandas(modin_df) - exp["id3"] = exp["id3"].astype("category") - - df_equals(ref, exp) - - def test_h2o_q10(self): - df = self._get_h2o_df() - - ref = df.groupby(["id1", "id2", "id3", "id4", "id5", "id6"], observed=True).agg( - {"v3": "sum", "v1": "count"} - ) - ref.reset_index(inplace=True) - - modin_df = pd.DataFrame(df) - modin_df = modin_df.groupby( - ["id1", "id2", "id3", "id4", "id5", "id6"], observed=True - ).agg({"v3": "sum", "v1": "count"}) - modin_df.reset_index(inplace=True) - - exp = to_pandas(modin_df) - exp["id1"] = exp["id1"].astype("category") - exp["id2"] = exp["id2"].astype("category") - exp["id3"] = exp["id3"].astype("category") - - df_equals(ref, exp) - - std_data = { - "a": [1, 2, 1, 1, 1, 2, 2, 2, 1, 2], - "b": [4, 3, 1, 6, 9, 8, 0, 9, 5, 13], - "c": [12.8, 45.6, 23.5, 12.4, 11.2, None, 56.4, 12.5, 1, 55], - } - - def test_agg_std(self): - def std(df, **kwargs): - df = df.groupby("a").agg({"b": "std", "c": "std"}) - if not isinstance(df, pandas.DataFrame): - df = to_pandas(df) - df["b"] = df["b"].apply(lambda x: round(x, 10)) - df["c"] = df["c"].apply(lambda x: round(x, 10)) - return df - - run_and_compare(std, data=self.std_data, force_lazy=False) - - skew_data = { - "a": [1, 2, 1, 1, 1, 2, 2, 2, 1, 2, 3, 4, 4], - "b": [4, 3, 1, 6, 9, 8, 0, 9, 5, 13, 12, 44, 6], - "c": [12.8, 45.6, 23.5, 12.4, 11.2, None, 56.4, 12.5, 1, 55, 4.5, 7.8, 9.4], - } - - def test_agg_skew(self): - def std(df, **kwargs): - df = df.groupby("a").agg({"b": "skew", "c": "skew"}) - if not isinstance(df, pandas.DataFrame): - df = to_pandas(df) - df["b"] = df["b"].apply(lambda x: round(x, 10)) - df["c"] = df["c"].apply(lambda x: round(x, 10)) - return df - - run_and_compare(std, data=self.skew_data, force_lazy=False) - - def test_multilevel(self): - def groupby(df, **kwargs): - return df.groupby("a").agg({"b": "min", "c": ["min", "max", "sum", "skew"]}) - - run_and_compare(groupby, data=self.data) - - @pytest.mark.parametrize("op", ["head", "tail"]) - @pytest.mark.parametrize("n", [10, -10]) - @pytest.mark.parametrize("invert", [True, False]) - @pytest.mark.parametrize("select", [True, False]) - @pytest.mark.parametrize("ascending", [True, False]) - def test_head_tail(self, op, n, invert, select, ascending): - def head(df, **kwargs): - if invert: - df = df[~df["col3"].isna()] - if select: - df = df[["col1", "col10", "col2", "col20"]] - if ascending is not None: - df = 
df.sort_values(["col2", "col10"], ascending=ascending) - df = df.groupby(["col1", "col20"]) - df = getattr(df, op)(n) - return df.sort_values(list(df.columns)) - - # When invert is false, the rowid column is materialized. - run_and_compare(head, data=test_data["int_data"], force_lazy=invert) - - @pytest.mark.parametrize("agg", ["nlargest", "nsmallest"]) - @pytest.mark.parametrize("n", [1, 5, 10]) - def test_topk(self, agg, n): - def topk(df, **kwargs): - return getattr(df.groupby("id6")["v3"], agg)(n).reset_index()[["id6", "v3"]] - - run_and_compare(topk, data=self.h2o_data) - - @pytest.mark.parametrize("time", [False, True]) - @pytest.mark.parametrize("q", [0.1, 0.5, 1.0]) - @pytest.mark.parametrize( - "interpolation", ["linear", "lower", "higher", "midpoint", "nearest"] - ) - def test_quantile(self, time, q, interpolation): - def quantile(df, **kwargs): - if time: - df["v1"] = df["v1"].astype("datetime64[ns]") - return df.groupby("id4")[["v1", "v2", "v3"]].quantile(q, interpolation) - - run_and_compare(quantile, data=self.h2o_data) - - -class TestAgg: - data = { - "a": [1, 2, None, None, 1, None], - "b": [10, 20, None, 20, 10, None], - "c": [None, 200, None, 400, 500, 600], - "d": [11, 22, 33, 22, 33, 22], - "e": [True, True, False, True, False, True], - } - int_data = pandas.DataFrame(data).fillna(0).astype("int").to_dict() - - @pytest.mark.parametrize("agg", ["max", "min", "sum", "mean"]) - @pytest.mark.parametrize("skipna", bool_arg_values) - def test_simple_agg(self, agg, skipna): - def apply(df, agg, skipna, **kwargs): - return getattr(df, agg)(skipna=skipna) - - run_and_compare(apply, data=self.data, agg=agg, skipna=skipna, force_lazy=False) - - def test_count_agg(self): - def apply(df, **kwargs): - return df.count() - - run_and_compare(apply, data=self.data, force_lazy=False) - - @pytest.mark.parametrize("data", [data, int_data], ids=["nan_data", "int_data"]) - @pytest.mark.parametrize("cols", ["a", "d", ["a", "d"]]) - @pytest.mark.parametrize("dropna", [True, False]) - @pytest.mark.parametrize("sort", [True]) - @pytest.mark.parametrize("ascending", [True, False]) - def test_value_counts(self, data, cols, dropna, sort, ascending): - def value_counts(df, cols, dropna, sort, ascending, **kwargs): - return df[cols].value_counts(dropna=dropna, sort=sort, ascending=ascending) - - if dropna and pandas.DataFrame( - data, columns=cols if is_list_like(cols) else [cols] - ).isna().any(axis=None): - pytest.xfail( - reason="'dropna' parameter is forcibly disabled in HDK's GroupBy" - + "due to performance issues, you can track this problem at:" - + "https://github.com/modin-project/modin/issues/2896" - ) - - # Custom comparator is required because pandas is inconsistent about - # the order of equal values, we can't match this behavior. For more details: - # https://github.com/modin-project/modin/issues/1650 - run_and_compare( - value_counts, - data=data, - cols=cols, - dropna=dropna, - sort=sort, - ascending=ascending, - comparator=df_equals_with_non_stable_indices, - ) - - @pytest.mark.parametrize( - "method", ["sum", "mean", "max", "min", "count", "nunique"] - ) - def test_simple_agg_no_default(self, method): - def applier(df, **kwargs): - if isinstance(df, pd.DataFrame): - # At the end of reduce function it does inevitable `transpose`, which - # is defaulting to pandas. The following logic check that `transpose` is the only - # function that falling back to pandas in the reduce operation flow. 
- with pytest.warns(UserWarning) as warns: - res = getattr(df, method)() - for warn in warns.list: - message = warn.message.args[0] - if ( - "is_sparse is deprecated" in message - # TODO: make sure we can ignore this warning - or "Frame contain columns with unsupported data-types" - in message - # Looks like the warning comes from pyarrow, more details: - # https://github.com/pandas-dev/pandas/pull/52419 - or "Passing a BlockManager to DataFrame is deprecated" - in message - ): - continue - assert ( - re.match(r".*transpose.*defaulting to pandas", message) - is not None - ), f"Expected DataFrame.transpose defaulting to pandas warning, got: {message}" - else: - res = getattr(df, method)() - return res - - run_and_compare(applier, data=self.data, force_lazy=False) - - @pytest.mark.parametrize("data", [data, int_data]) - @pytest.mark.parametrize("dropna", bool_arg_values) - def test_nunique(self, data, dropna): - def applier(df, **kwargs): - return df.nunique(dropna=dropna) - - run_and_compare(applier, data=data, force_lazy=False) - - -class TestMerge: - data = { - "a": [1, 2, 3, 6, 5, 4], - "b": [10, 20, 30, 60, 50, 40], - "e": [11, 22, 33, 66, 55, 44], - } - data2 = { - "a": [4, 2, 3, 7, 1, 5], - "b": [40, 20, 30, 70, 10, 50], - "d": [4000, 2000, 3000, 7000, 1000, 5000], - } - on_values = ["a", ["a"], ["a", "b"], ["b", "a"], None] - how_values = ["inner", "left"] - - @pytest.mark.parametrize("on", on_values) - @pytest.mark.parametrize("how", how_values) - @pytest.mark.parametrize("sort", [True, False]) - def test_merge(self, on, how, sort): - def merge(lib, df1, df2, on, how, sort, **kwargs): - return df1.merge(df2, on=on, how=how, sort=sort) - - run_and_compare( - merge, data=self.data, data2=self.data2, on=on, how=how, sort=sort - ) - - def test_merge_non_str_column_name(self): - def merge(lib, df1, df2, on, **kwargs): - return df1.merge(df2, on=on, how="inner") - - run_and_compare(merge, data=[[1, 2], [3, 4]], data2=[[1, 2], [3, 4]], on=1) - - h2o_data = { - "id1": ["id1", "id10", "id100", "id1000"], - "id2": ["id2", "id20", "id200", "id2000"], - "id3": ["id3", "id30", "id300", "id3000"], - "id4": [4, 40, 400, 4000], - "id5": [5, 50, 500, 5000], - "id6": [6, 60, 600, 6000], - "v1": [3.3, 4.4, 7.7, 8.8], - } - - h2o_data_small = { - "id1": ["id10", "id100", "id1000", "id10000"], - "id4": [40, 400, 4000, 40000], - "v2": [30.3, 40.4, 70.7, 80.8], - } - - h2o_data_medium = { - "id1": ["id10", "id100", "id1000", "id10000"], - "id2": ["id20", "id200", "id2000", "id20000"], - "id4": [40, 400, 4000, 40000], - "id5": [50, 500, 5000, 50000], - "v2": [30.3, 40.4, 70.7, 80.8], - } - - h2o_data_big = { - "id1": ["id10", "id100", "id1000", "id10000"], - "id2": ["id20", "id200", "id2000", "id20000"], - "id3": ["id30", "id300", "id3000", "id30000"], - "id4": [40, 400, 4000, 40000], - "id5": [50, 500, 5000, 50000], - "id6": [60, 600, 6000, 60000], - "v2": [30.3, 40.4, 70.7, 80.8], - } - - def _get_h2o_df(self, data): - df = pandas.DataFrame(data) - if "id1" in data: - df["id1"] = df["id1"].astype("category") - if "id2" in data: - df["id2"] = df["id2"].astype("category") - if "id3" in data: - df["id3"] = df["id3"].astype("category") - return df - - # Currently HDK returns category as string columns - # and therefore cast to category it would only have - # values from actual data. In Pandas category would - # have old values as well. Simply casting category - # to string for comparison doesn't work because None - # cast to category and back to string becomes - # "nan". 
So we cast everything to category and then - # to string. - def _fix_category_cols(self, df): - if "id1" in df.columns: - df["id1"] = df["id1"].astype("category") - df["id1"] = df["id1"].astype(str) - if "id1_x" in df.columns: - df["id1_x"] = df["id1_x"].astype("category") - df["id1_x"] = df["id1_x"].astype(str) - if "id1_y" in df.columns: - df["id1_y"] = df["id1_y"].astype("category") - df["id1_y"] = df["id1_y"].astype(str) - if "id2" in df.columns: - df["id2"] = df["id2"].astype("category") - df["id2"] = df["id2"].astype(str) - if "id2_x" in df.columns: - df["id2_x"] = df["id2_x"].astype("category") - df["id2_x"] = df["id2_x"].astype(str) - if "id2_y" in df.columns: - df["id2_y"] = df["id2_y"].astype("category") - df["id2_y"] = df["id2_y"].astype(str) - if "id3" in df.columns: - df["id3"] = df["id3"].astype("category") - df["id3"] = df["id3"].astype(str) - - def test_h2o_q1(self): - lhs = self._get_h2o_df(self.h2o_data) - rhs = self._get_h2o_df(self.h2o_data_small) - - ref = lhs.merge(rhs, on="id1") - self._fix_category_cols(ref) - - modin_lhs = pd.DataFrame(lhs) - modin_rhs = pd.DataFrame(rhs) - modin_res = modin_lhs.merge(modin_rhs, on="id1") - - exp = to_pandas(modin_res) - self._fix_category_cols(exp) - - df_equals(ref, exp) - - def test_h2o_q2(self): - lhs = self._get_h2o_df(self.h2o_data) - rhs = self._get_h2o_df(self.h2o_data_medium) - - ref = lhs.merge(rhs, on="id2") - self._fix_category_cols(ref) - - modin_lhs = pd.DataFrame(lhs) - modin_rhs = pd.DataFrame(rhs) - modin_res = modin_lhs.merge(modin_rhs, on="id2") - - exp = to_pandas(modin_res) - self._fix_category_cols(exp) - - df_equals(ref, exp) - - def test_h2o_q3(self): - lhs = self._get_h2o_df(self.h2o_data) - rhs = self._get_h2o_df(self.h2o_data_medium) - - ref = lhs.merge(rhs, how="left", on="id2") - self._fix_category_cols(ref) - - modin_lhs = pd.DataFrame(lhs) - modin_rhs = pd.DataFrame(rhs) - modin_res = modin_lhs.merge(modin_rhs, how="left", on="id2") - - exp = to_pandas(modin_res) - self._fix_category_cols(exp) - - df_equals(ref, exp) - - def test_h2o_q4(self): - lhs = self._get_h2o_df(self.h2o_data) - rhs = self._get_h2o_df(self.h2o_data_medium) - - ref = lhs.merge(rhs, on="id5") - self._fix_category_cols(ref) - - modin_lhs = pd.DataFrame(lhs) - modin_rhs = pd.DataFrame(rhs) - modin_res = modin_lhs.merge(modin_rhs, on="id5") - - exp = to_pandas(modin_res) - self._fix_category_cols(exp) - - df_equals(ref, exp) - - def test_h2o_q5(self): - lhs = self._get_h2o_df(self.h2o_data) - rhs = self._get_h2o_df(self.h2o_data_big) - - ref = lhs.merge(rhs, on="id3") - self._fix_category_cols(ref) - - modin_lhs = pd.DataFrame(lhs) - modin_rhs = pd.DataFrame(rhs) - modin_res = modin_lhs.merge(modin_rhs, on="id3") - - exp = to_pandas(modin_res) - self._fix_category_cols(exp) - - df_equals(ref, exp) - - dt_data1 = { - "id": [1, 2], - "timestamp": pandas.to_datetime(["20000101", "20000201"], format="%Y%m%d"), - } - dt_data2 = {"id": [1, 2], "timestamp_year": [2000, 2000]} - - def test_merge_dt(self): - def merge(df1, df2, **kwargs): - df1["timestamp_year"] = df1["timestamp"].dt.year - res = df1.merge(df2, how="left", on=["id", "timestamp_year"]) - res["timestamp_year"] = res["timestamp_year"].fillna(np.int64(-1)) - return res - - run_and_compare(merge, data=self.dt_data1, data2=self.dt_data2) - - left_data = {"a": [1, 2, 3, 4], "b": [10, 20, 30, 40], "c": [11, 12, 13, 14]} - right_data = {"c": [1, 2, 3, 4], "b": [10, 20, 30, 40], "d": [100, 200, 300, 400]} - - @pytest.mark.parametrize("how", how_values) - @pytest.mark.parametrize( - 
"left_on, right_on", [["a", "c"], [["a", "b"], ["c", "b"]]] - ) - def test_merge_left_right_on(self, how, left_on, right_on): - def merge(df1, df2, how, left_on, right_on, **kwargs): - return df1.merge(df2, how=how, left_on=left_on, right_on=right_on) - - run_and_compare( - merge, - data=self.left_data, - data2=self.right_data, - how=how, - left_on=left_on, - right_on=right_on, - ) - run_and_compare( - merge, - data=self.right_data, - data2=self.left_data, - how=how, - left_on=right_on, - right_on=left_on, - ) - - def test_self_merge(self): - def merge(df, lib, iterations, **kwargs): - for _ in range(iterations): - df = lib.merge(df, df) - return df - - for i in range(1, 3): - run_and_compare( - merge, - data={"a": [1]}, - iterations=i, - ) - - def test_merge_float(self): - def merge(df, df2, on_columns, **kwargs): - return df.merge(df2, on=on_columns) - - run_and_compare( - merge, - data={"A": [1, 2] * 1000}, - data2={"A": [1.0, 3.0] * 1000}, - on_columns="A", - force_lazy=False, - ) - - def test_merge_categorical(self): - def merge(df, df2, on_columns, **kwargs): - return df.merge(df2, on=on_columns) - - run_and_compare( - merge, - data={"A": [1, 2] * 1000}, - data2={"A": [1.0, 3.0] * 1000}, - on_columns="A", - constructor_kwargs={"dtype": "category"}, - comparator=lambda df1, df2: df_equals(df1.astype(float), df2.astype(float)), - force_lazy=False, - ) - - def test_merge_date(self): - def merge(df, df2, on_columns, **kwargs): - return df.merge(df2, on=on_columns) - - run_and_compare( - merge, - data={ - "A": [ - pd.Timestamp("2023-01-01"), - pd.Timestamp("2023-01-02"), - ] - }, - data2={ - "A": [ - pd.Timestamp("2023-01-01"), - pd.Timestamp("2023-01-03"), - ] - }, - on_columns="A", - ) - - -class TestBinaryOp: - data = { - "a": [1, 1, 1, 1, 1], - "b": [10, 10, 10, 10, 10], - "c": [100, 100, 100, 100, 100], - "d": [1000, 1000, 1000, 1000, 1000], - } - data2 = { - "a": [1, 1, 1, 1, 1], - "f": [2, 2, 2, 2, 2], - "b": [3, 3, 3, 3, 3], - "d": [4, 4, 4, 4, 4], - } - fill_values = [None, 1] - - def test_binary_level(self): - def applier(df1, df2, **kwargs): - df2.index = generate_multiindex(len(df2)) - return df1.add(df2, level=1) - - # setting `force_lazy=False`, because we're expecting to fallback - # to pandas in that case, which is not supported in lazy mode - run_and_compare(applier, data=self.data, data2=self.data, force_lazy=False) - - def test_add_cst(self): - def add(df, **kwargs): - return df + 1 - - run_and_compare(add, data=self.data) - - def test_add_list(self): - def add(df, **kwargs): - return df + [1, 2, 3, 4] - - run_and_compare(add, data=self.data) - - @pytest.mark.parametrize("fill_value", fill_values) - def test_add_method_columns(self, fill_value): - def add1(df, fill_value, **kwargs): - return df["a"].add(df["b"], fill_value=fill_value) - - def add2(df, fill_value, **kwargs): - return df[["a", "c"]].add(df[["b", "a"]], fill_value=fill_value) - - run_and_compare(add1, data=self.data, fill_value=fill_value) - run_and_compare(add2, data=self.data, fill_value=fill_value) - - def test_add_columns(self): - def add1(df, **kwargs): - return df["a"] + df["b"] - - def add2(df, **kwargs): - return df[["a", "c"]] + df[["b", "a"]] - - run_and_compare(add1, data=self.data) - run_and_compare(add2, data=self.data) - - def test_add_columns_and_assign(self): - def add(df, **kwargs): - df["sum"] = df["a"] + df["b"] - return df - - run_and_compare(add, data=self.data) - - def test_add_columns_and_assign_to_existing(self): - def add(df, **kwargs): - df["a"] = df["a"] + df["b"] - return df - - 
run_and_compare(add, data=self.data) - - def test_mul_cst(self): - def mul(df, **kwargs): - return df * 2 - - run_and_compare(mul, data=self.data) - - def test_mul_list(self): - def mul(df, **kwargs): - return df * [2, 3, 4, 5] - - run_and_compare(mul, data=self.data) - - @pytest.mark.parametrize("fill_value", fill_values) - def test_mul_method_columns(self, fill_value): - def mul1(df, fill_value, **kwargs): - return df["a"].mul(df["b"], fill_value=fill_value) - - def mul2(df, fill_value, **kwargs): - return df[["a", "c"]].mul(df[["b", "a"]], fill_value=fill_value) - - run_and_compare(mul1, data=self.data, fill_value=fill_value) - run_and_compare(mul2, data=self.data, fill_value=fill_value) - - def test_mul_columns(self): - def mul1(df, **kwargs): - return df["a"] * df["b"] - - def mul2(df, **kwargs): - return df[["a", "c"]] * df[["b", "a"]] - - run_and_compare(mul1, data=self.data) - run_and_compare(mul2, data=self.data) - - def test_mod_cst(self): - def mod(df, **kwargs): - return df % 2 - - run_and_compare(mod, data=self.data) - - def test_mod_list(self): - def mod(df, **kwargs): - return df % [2, 3, 4, 5] - - run_and_compare(mod, data=self.data) - - @pytest.mark.parametrize("fill_value", fill_values) - def test_mod_method_columns(self, fill_value): - def mod1(df, fill_value, **kwargs): - return df["a"].mod(df["b"], fill_value=fill_value) - - def mod2(df, fill_value, **kwargs): - return df[["a", "c"]].mod(df[["b", "a"]], fill_value=fill_value) - - run_and_compare(mod1, data=self.data, fill_value=fill_value) - run_and_compare(mod2, data=self.data, fill_value=fill_value) - - def test_mod_columns(self): - def mod1(df, **kwargs): - return df["a"] % df["b"] - - def mod2(df, **kwargs): - return df[["a", "c"]] % df[["b", "a"]] - - run_and_compare(mod1, data=self.data) - run_and_compare(mod2, data=self.data) - - def test_truediv_cst(self): - def truediv(df, **kwargs): - return df / 2 - - run_and_compare(truediv, data=self.data) - - def test_truediv_list(self): - def truediv(df, **kwargs): - return df / [1, 0.5, 0.2, 2.0] - - run_and_compare(truediv, data=self.data) - - @pytest.mark.parametrize("fill_value", fill_values) - def test_truediv_method_columns(self, fill_value): - def truediv1(df, fill_value, **kwargs): - return df["a"].truediv(df["b"], fill_value=fill_value) - - def truediv2(df, fill_value, **kwargs): - return df[["a", "c"]].truediv(df[["b", "a"]], fill_value=fill_value) - - run_and_compare(truediv1, data=self.data, fill_value=fill_value) - run_and_compare(truediv2, data=self.data, fill_value=fill_value) - - def test_truediv_columns(self): - def truediv1(df, **kwargs): - return df["a"] / df["b"] - - def truediv2(df, **kwargs): - return df[["a", "c"]] / df[["b", "a"]] - - run_and_compare(truediv1, data=self.data) - run_and_compare(truediv2, data=self.data) - - def test_floordiv_cst(self): - def floordiv(df, **kwargs): - return df // 2 - - run_and_compare(floordiv, data=self.data) - - def test_floordiv_list(self): - def floordiv(df, **kwargs): - return df // [1, 0.54, 0.24, 2.01] - - run_and_compare(floordiv, data=self.data) - - @pytest.mark.parametrize("fill_value", fill_values) - def test_floordiv_method_columns(self, fill_value): - def floordiv1(df, fill_value, **kwargs): - return df["a"].floordiv(df["b"], fill_value=fill_value) - - def floordiv2(df, fill_value, **kwargs): - return df[["a", "c"]].floordiv(df[["b", "a"]], fill_value=fill_value) - - run_and_compare(floordiv1, data=self.data, fill_value=fill_value) - run_and_compare(floordiv2, data=self.data, fill_value=fill_value) - - 
def test_floordiv_columns(self): - def floordiv1(df, **kwargs): - return df["a"] // df["b"] - - def floordiv2(df, **kwargs): - return df[["a", "c"]] // df[["b", "a"]] - - run_and_compare(floordiv1, data=self.data) - run_and_compare(floordiv2, data=self.data) - - cmp_data = { - "a": [1, 2, 3, 4, 5], - "b": [10, 20, 30, 40, 50], - "c": [50.0, 40.0, 30.1, 20.0, 10.0], - } - cmp_fn_values = ["eq", "ne", "le", "lt", "ge", "gt"] - - @pytest.mark.parametrize("cmp_fn", cmp_fn_values) - def test_cmp_cst(self, cmp_fn): - def cmp1(df, cmp_fn, **kwargs): - return getattr(df["a"], cmp_fn)(3) - - def cmp2(df, cmp_fn, **kwargs): - return getattr(df, cmp_fn)(30) - - run_and_compare(cmp1, data=self.cmp_data, cmp_fn=cmp_fn) - run_and_compare(cmp2, data=self.cmp_data, cmp_fn=cmp_fn) - - @pytest.mark.parametrize("cmp_fn", cmp_fn_values) - def test_cmp_list(self, cmp_fn): - def cmp(df, cmp_fn, **kwargs): - return getattr(df, cmp_fn)([3, 30, 30.1]) - - run_and_compare(cmp, data=self.cmp_data, cmp_fn=cmp_fn) - - @pytest.mark.parametrize("cmp_fn", cmp_fn_values) - def test_cmp_cols(self, cmp_fn): - def cmp1(df, cmp_fn, **kwargs): - return getattr(df["b"], cmp_fn)(df["c"]) - - def cmp2(df, cmp_fn, **kwargs): - return getattr(df[["b", "c"]], cmp_fn)(df[["a", "b"]]) - - run_and_compare(cmp1, data=self.cmp_data, cmp_fn=cmp_fn) - run_and_compare(cmp2, data=self.cmp_data, cmp_fn=cmp_fn) - - @pytest.mark.parametrize("cmp_fn", cmp_fn_values) - @pytest.mark.parametrize("value", [2, 2.2, "a"]) - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_cmp_mixed_types(self, cmp_fn, value, data): - def cmp(df, cmp_fn, value, **kwargs): - return getattr(df, cmp_fn)(value) - - run_and_compare(cmp, data=data, cmp_fn=cmp_fn, value=value) - - def test_filter_dtypes(self): - def filter(df, **kwargs): - return df[df.a < 4].dtypes - - run_and_compare(filter, data=self.cmp_data) - - def test_filter_empty_result(self): - def filter(df, **kwargs): - return df[df.a < 0] - - run_and_compare(filter, data=self.cmp_data) - - def test_complex_filter(self): - def filter_and(df, **kwargs): - return df[(df.a < 5) & (df.b > 20)] - - def filter_or(df, **kwargs): - return df[(df.a < 3) | (df.b > 40)] - - run_and_compare(filter_and, data=self.cmp_data) - run_and_compare(filter_or, data=self.cmp_data) - - def test_string_bin_op(self): - def test_bin_op(df, op_name, op_arg, **kwargs): - return getattr(df, op_name)(op_arg) - - bin_ops = { - "__add__": "_sfx", - "__radd__": "pref_", - "__mul__": 10, - } - - for op, arg in bin_ops.items(): - run_and_compare( - test_bin_op, data={"a": ["a"]}, op_name=op, op_arg=arg, force_lazy=False - ) - - @pytest.mark.parametrize("force_hdk", [False, True]) - def test_arithmetic_ops(self, force_hdk): - def compute(df, operation, **kwargs): - df = getattr(df, operation)(3) - return df - - for op in ( - "__add__", - "__sub__", - "__mul__", - "__pow__", - "__truediv__", - "__floordiv__", - ): - run_and_compare( - compute, - {"A": [1, 2, 3, 4, 5]}, - operation=op, - force_hdk_execute=force_hdk, - ) - - @pytest.mark.parametrize("force_hdk", [False, True]) - def test_invert_op(self, force_hdk): - def invert(df, **kwargs): - return ~df - - run_and_compare(invert, {"A": [1, 2, 3, 4, 5]}, force_hdk_execute=force_hdk) - - -class TestDateTime: - datetime_data = { - "a": [1, 1, 2, 2], - "b": [11, 21, 12, 11], - "c": pandas.to_datetime( - ["20190902", "20180913", "20190921", "20180903"], format="%Y%m%d" - ), - "d": pandas.to_datetime( - [ - "2018-10-26 12:00", - "2018-10-26 13:00:15", - "2020-10-26 
04:00:15.000000002", - "2020-10-26", - ], - format="mixed", - ), - } - - def test_dt_year(self): - def dt_year(df, **kwargs): - return df["c"].dt.year - - run_and_compare(dt_year, data=self.datetime_data) - - def test_dt_month(self): - def dt_month(df, **kwargs): - return df["c"].dt.month - - run_and_compare(dt_month, data=self.datetime_data) - - def test_dt_day(self): - def dt_day(df, **kwargs): - return df["c"].dt.day - - run_and_compare(dt_day, data=self.datetime_data) - - def test_dt_hour(self): - def dt_hour(df, **kwargs): - return df["d"].dt.hour - - run_and_compare(dt_hour, data=self.datetime_data) - - def test_dt_minute(self): - def dt_minute(df, **kwargs): - return df["d"].dt.minute - - run_and_compare(dt_minute, data=self.datetime_data) - - def test_dt_second(self): - def dt_second(df, **kwargs): - return df["d"].dt.second - - run_and_compare(dt_second, data=self.datetime_data) - - def test_dt_microsecond(self): - def dt_microsecond(df, **kwargs): - return df["d"].dt.microsecond - - run_and_compare(dt_microsecond, data=self.datetime_data) - - def test_dt_nanosecond(self): - def dt_nanosecond(df, **kwargs): - return df["d"].dt.nanosecond - - run_and_compare(dt_nanosecond, data=self.datetime_data) - - def test_dt_quarter(self): - def dt_quarter(df, **kwargs): - return df["c"].dt.quarter - - run_and_compare(dt_quarter, data=self.datetime_data) - - def test_dt_dayofweek(self): - def dt_dayofweek(df, **kwargs): - return df["c"].dt.dayofweek - - run_and_compare(dt_dayofweek, data=self.datetime_data) - - def test_dt_weekday(self): - def dt_weekday(df, **kwargs): - return df["c"].dt.weekday - - run_and_compare(dt_weekday, data=self.datetime_data) - - def test_dt_dayofyear(self): - def dt_dayofyear(df, **kwargs): - return df["c"].dt.dayofyear - - run_and_compare(dt_dayofyear, data=self.datetime_data) - - @pytest.mark.parametrize("cast", [True, False]) - @pytest.mark.parametrize("unit", CalciteSerializer._TIMESTAMP_PRECISION.keys()) - def test_dt_serialization(self, cast, unit): - fill_value = np.datetime64(3, unit) - - def serialize(df, **kwargs): - if cast: - df = df.astype(f"datetime64[{unit}]") - return df.fillna(fill_value) - - def cmp(df1, df2): - assert df1["date"].max().asm8 == fill_value - assert df2["date"].max().asm8 == fill_value - df_equals(df1, df2) - - run_and_compare( - serialize, - data={ - "date": [ - np.datetime64(1, unit), - np.datetime64(2, unit), - None, - ] - }, - comparator=cmp, - ) - - -class TestCategory: - data = { - "a": ["str1", "str2", "str1", "str3", "str2", None], - } - - def test_cat_codes(self): - pandas_df = pandas.DataFrame(self.data) - pandas_df["a"] = pandas_df["a"].astype("category") - - modin_df = pd.DataFrame(pandas_df) - - modin_df["a"] = modin_df["a"].cat.codes - exp = to_pandas(modin_df) - - pandas_df["a"] = pandas_df["a"].cat.codes - - df_equals(pandas_df, exp) - - -class TestSort: - # In order for the row order to be deterministic after sorting, - # the `by` columns should not contain duplicate values. 
- data = { - "a": [1, 2, 5, -2, -5, 4, -4, 6, 3], - "b": [1, 2, 3, 6, 5, 1, 4, 5, 3], - "c": [5, 4, 2, 3, 1, 1, 4, 5, 6], - "d": ["1", "4", "3", "2", "1", "6", "7", "5", "0"], - } - data_nulls = { - "a": [1, 2, 5, -2, -5, 4, -4, None, 3], - "b": [1, 2, 3, 6, 5, None, 4, 5, 3], - "c": [None, 4, 2, 3, 1, 1, 4, 5, 6], - } - data_multiple_nulls = { - "a": [1, 2, None, -2, 5, 4, -4, None, 3], - "b": [1, 2, 3, 6, 5, None, 4, 5, None], - "c": [None, 4, 2, None, 1, 1, 4, 5, 6], - } - cols_values = ["a", ["a", "b"], ["b", "a"], ["c", "a", "b"]] - index_cols_values = [None, "a", ["a", "b"]] - ascending_values = [True, False] - ascending_list_values = [[True, False], [False, True]] - na_position_values = ["first", "last"] - - @pytest.mark.parametrize("cols", cols_values) - @pytest.mark.parametrize("ignore_index", bool_arg_values) - @pytest.mark.parametrize("ascending", ascending_values) - @pytest.mark.parametrize("index_cols", index_cols_values) - def test_sort_cols(self, cols, ignore_index, index_cols, ascending): - def sort(df, cols, ignore_index, index_cols, ascending, **kwargs): - if index_cols: - df = df.set_index(index_cols) - df_equals_with_non_stable_indices() - return df.sort_values(cols, ignore_index=ignore_index, ascending=ascending) - - run_and_compare( - sort, - data=self.data, - cols=cols, - ignore_index=ignore_index, - index_cols=index_cols, - ascending=ascending, - # we're expecting to fallback to pandas in that case, - # which is not supported in lazy mode - force_lazy=(index_cols is None), - ) - - @pytest.mark.parametrize("ascending", ascending_list_values) - def test_sort_cols_asc_list(self, ascending): - def sort(df, ascending, **kwargs): - return df.sort_values(["a", "b"], ascending=ascending) - - run_and_compare( - sort, - data=self.data, - ascending=ascending, - ) - - @pytest.mark.skipif( - hdk_version == "0.7.0", - reason="https://github.com/modin-project/modin/issues/6514", - ) - @pytest.mark.parametrize("ascending", ascending_values) - def test_sort_cols_str(self, ascending): - def sort(df, ascending, **kwargs): - return df.sort_values("d", ascending=ascending) - - run_and_compare( - sort, - data=self.data, - ascending=ascending, - ) - - @pytest.mark.parametrize("cols", cols_values) - @pytest.mark.parametrize("ascending", ascending_values) - @pytest.mark.parametrize("na_position", na_position_values) - def test_sort_cols_nulls(self, cols, ascending, na_position): - def sort(df, cols, ascending, na_position, **kwargs): - return df.sort_values(cols, ascending=ascending, na_position=na_position) - - run_and_compare( - sort, - data=self.data_nulls, - cols=cols, - ascending=ascending, - na_position=na_position, - ) - - # Issue #1767 - rows order is not preserved for NULL keys - # @pytest.mark.parametrize("cols", cols_values) - # @pytest.mark.parametrize("ascending", ascending_values) - # @pytest.mark.parametrize("na_position", na_position_values) - # def test_sort_cols_multiple_nulls(self, cols, ascending, na_position): - # def sort(df, cols, ascending, na_position, **kwargs): - # return df.sort_values(cols, ascending=ascending, na_position=na_position) - # - # run_and_compare( - # sort, - # data=self.data_multiple_nulls, - # cols=cols, - # ascending=ascending, - # na_position=na_position, - # ) - - -class TestBadData: - bad_for_arrow = { - "a": ["a", [[1, 2], [3]], [3, 4]], - "b": ["b", [1, 2], [3, 4]], - "c": ["1", "2", 3], - } - bad_for_hdk = { - "b": [[1, 2], [3, 4], [5, 6]], - "c": ["1", "2", "3"], - } - ok_data = {"d": np.arange(3), "e": np.arange(3), "f": np.arange(3)} - - 
def _get_pyarrow_table(self, obj): - if not isinstance(obj, (pandas.DataFrame, pandas.Series)): - obj = pandas.DataFrame(obj) - - return pyarrow.Table.from_pandas(obj) - - @pytest.mark.parametrize("data", [bad_for_arrow, bad_for_hdk]) - def test_construct(self, data): - def applier(df, *args, **kwargs): - return repr(df) - - run_and_compare(applier, data=data, force_lazy=False) - - def test_from_arrow(self): - at = self._get_pyarrow_table(self.bad_for_hdk) - pd_df = pandas.DataFrame(self.bad_for_hdk) - md_df = pd.utils.from_arrow(at) - - # force materialization - repr(md_df) - df_equals(md_df, pd_df) - - @pytest.mark.parametrize("data", [bad_for_arrow, bad_for_hdk]) - def test_methods(self, data): - def applier(df, *args, **kwargs): - return df.T.drop(columns=[0]) - - run_and_compare(applier, data=data, force_lazy=False) - - def test_with_normal_frame(self): - def applier(df1, df2, *args, **kwargs): - return df2.join(df1) - - run_and_compare( - applier, data=self.bad_for_hdk, data2=self.ok_data, force_lazy=False - ) - - def test_heterogenous_fillna(self): - def fillna(df, **kwargs): - return df["d"].fillna("a") - - run_and_compare(fillna, data=self.ok_data, force_lazy=False) - - @pytest.mark.parametrize( - "md_df_constructor", - [ - pytest.param(pd.DataFrame, id="from_pandas_dataframe"), - pytest.param( - lambda pd_df: from_arrow(pyarrow.Table.from_pandas(pd_df)), - id="from_pyarrow_table", - ), - ], - ) - def test_uint(self, md_df_constructor): - """ - Verify that unsigned integer data can be imported and exported via HDK with no errors. - - Originally, HDK does not support unsigned integers; there's logic in Modin that - upcasts unsigned types to the compatible ones prior to importing to HDK. - """ - pd_df = pandas.DataFrame( - { - "uint8_in_int_bounds": np.array([1, 2, 3], dtype="uint8"), - "uint8_out-of_int_bounds": np.array( - [(2**8) - 1, (2**8) - 2, (2**8) - 3], dtype="uint8" - ), - "uint16_in_int_bounds": np.array([1, 2, 3], dtype="uint16"), - "uint16_out-of_int_bounds": np.array( - [(2**16) - 1, (2**16) - 2, (2**16) - 3], dtype="uint16" - ), - "uint32_in_int_bounds": np.array([1, 2, 3], dtype="uint32"), - "uint32_out-of_int_bounds": np.array( - [(2**32) - 1, (2**32) - 2, (2**32) - 3], dtype="uint32" - ), - "uint64_in_int_bounds": np.array([1, 2, 3], dtype="uint64"), - } - ) - md_df = md_df_constructor(pd_df) - - with ForceHdkImport(md_df) as instance: - md_df_exported = instance.export_frames()[0] - result = md_df_exported.values - reference = pd_df.values - np.testing.assert_array_equal(result, reference) - - @pytest.mark.parametrize( - "md_df_constructor", - [ - pytest.param(pd.DataFrame, id="from_pandas_dataframe"), - pytest.param( - lambda pd_df: from_arrow(pyarrow.Table.from_pandas(pd_df)), - id="from_pyarrow_table", - ), - ], - ) - def test_uint_overflow(self, md_df_constructor): - """ - Verify that an exception is raised when overflow occurs due to 'uint -> int' compatibility conversion. - - Originally, HDK does not support unsigned integers; there's logic in Modin that upcasts - unsigned types to the compatible ones prior to importing to HDK. This test ensures that an - error is raised when such a conversion causes data loss. 
- """ - md_df = md_df_constructor( - pandas.DataFrame( - { - "col": np.array( - [(2**64) - 1, (2**64) - 2, (2**64) - 3], dtype="uint64" - ) - } - ) - ) - - with pytest.raises(OverflowError): - with ForceHdkImport(md_df): - pass - - def test_uint_serialization(self): - # Tests for CalciteSerializer.serialize_literal() - df = pd.DataFrame({"A": [np.nan, 1]}) - assert ( - df.fillna(np.uint8(np.iinfo(np.uint8).max)).sum()[0] - == np.iinfo(np.uint8).max + 1 - ) - assert ( - df.fillna(np.uint16(np.iinfo(np.uint16).max)).sum()[0] - == np.iinfo(np.uint16).max + 1 - ) - assert ( - df.fillna(np.uint32(np.iinfo(np.uint32).max)).sum()[0] - == np.iinfo(np.uint32).max + 1 - ) - # HDK represents 'uint64' as 'int64' internally due to a lack of support - # for unsigned ints, that's why using 'int64.max' here - assert ( - df.fillna(np.uint64(np.iinfo(np.int64).max - 1)).sum()[0] - == np.iinfo(np.int64).max - ) - - # Tests for CalciteSerializer.serialize_dtype() - df = pd.DataFrame({"A": [np.iinfo(np.uint8).max, 1]}) - assert df.astype(np.uint8).sum()[0] == np.iinfo(np.uint8).max + 1 - df = pd.DataFrame({"A": [np.iinfo(np.uint16).max, 1]}) - assert df.astype(np.uint16).sum()[0] == np.iinfo(np.uint16).max + 1 - df = pd.DataFrame({"A": [np.iinfo(np.uint32).max, 1]}) - assert df.astype(np.uint32).sum()[0] == np.iinfo(np.uint32).max + 1 - # HDK represents 'uint64' as 'int64' internally due to a lack of support - # for unsigned ints, that's why using 'int64.max' here - df = pd.DataFrame({"A": [np.iinfo(np.int64).max - 1, 1]}) - assert df.astype(np.uint64).sum()[0] == np.iinfo(np.int64).max - - def test_mean_sum(self): - all_codes = np.typecodes["All"] - exclude_codes = np.typecodes["Datetime"] + np.typecodes["Complex"] + "gSUVO" - supported_codes = set(all_codes) - set(exclude_codes) - - def test(df, dtype_code, operation, **kwargs): - df = type(df)({"A": [0, 1], "B": [1, 0]}, dtype=np.dtype(dtype_code)) - return getattr(df, operation)() - - for c in supported_codes: - for op in ("sum", "mean"): - run_and_compare(test, data={}, dtype_code=c, operation=op) - - -class TestDropna: - data = { - "col1": [1, 2, None, 2, 1], - "col2": [None, 3, None, 2, 1], - "col3": [2, 3, 4, None, 5], - "col4": [1, 2, 3, 4, 5], - } - - @pytest.mark.parametrize("subset", [None, ["col1", "col2"]]) - @pytest.mark.parametrize("how", ["all", "any"]) - def test_dropna(self, subset, how): - def applier(df, *args, **kwargs): - return df.dropna(subset=subset, how=how) - - run_and_compare(applier, data=self.data) - - def test_dropna_multiindex(self): - index = generate_multiindex(len(self.data["col1"])) - - md_df = pd.DataFrame(self.data, index=index) - pd_df = pandas.DataFrame(self.data, index=index) - - md_res = md_df.dropna()._to_pandas() - pd_res = pd_df.dropna() - - # HACK: all strings in HDK considered to be categories, that breaks - # checks for equality with pandas, this line discards category dtype - md_res.index = pandas.MultiIndex.from_tuples( - md_res.index.values, names=md_res.index.names - ) - - df_equals(md_res, pd_res) - - @pytest.mark.skip("Dropna logic for GroupBy is disabled for now") - @pytest.mark.parametrize("by", ["col1", ["col1", "col2"], ["col1", "col4"]]) - @pytest.mark.parametrize("dropna", [True, False]) - def test_dropna_groupby(self, by, dropna): - def applier(df, *args, **kwargs): - # HDK engine preserves NaNs at the result of groupby, - # so replacing NaNs with '0' to match with Pandas. 
- # https://github.com/modin-project/modin/issues/2878 - return df.groupby(by=by, dropna=dropna).sum().fillna(0) - - run_and_compare(applier, data=self.data) - - -class TestUnsupportedColumns: - @pytest.mark.parametrize( - "data,is_good", - [ - [["1", "2", None, "2", "1"], True], - [[None, "3", None, "2", "1"], True], - [[1, "2", None, "2", "1"], False], - [[None, 3, None, "2", "1"], False], - ], - ) - def test_unsupported_columns(self, data, is_good): - pandas_df = pandas.DataFrame({"col": data}) - bad_cols = HdkOnNativeDataframePartitionManager._get_unsupported_cols(pandas_df) - if is_good: - assert not bad_cols - else: - assert bad_cols == ["col"] - - -class TestConstructor: - @pytest.mark.parametrize( - "data", - [ - None, - {"A": range(10)}, - pandas.Series(range(10)), - pandas.DataFrame({"A": range(10)}), - ], - ) - @pytest.mark.parametrize( - "index", - [None, pandas.RangeIndex(10), pandas.RangeIndex(start=10, stop=0, step=-1)], - ) - @pytest.mark.parametrize("columns", [None, ["A"], ["A", "B", "C"]]) - @pytest.mark.parametrize("dtype", [None, float]) - def test_raw_data(self, data, index, columns, dtype): - if ( - isinstance(data, pandas.Series) - and data.name is None - and columns is not None - and len(columns) > 1 - ): - data = data.copy() - # Pandas constructor fails if an unnamed Series is passed along with columns argument - data.name = "D" - mdf, pdf = create_test_dfs(data, index=index, columns=columns, dtype=dtype) - df_equals(mdf, pdf) - - @pytest.mark.parametrize( - "index", - [ - None, - pandas.Index([1, 2, 3]), - pandas.MultiIndex.from_tuples([(1, 1), (2, 2), (3, 3)]), - ], - ) - def test_shape_hint_detection(self, index): - df = pd.DataFrame({"a": [1, 2, 3]}, index=index) - assert df._query_compiler._shape_hint == "column" - - transposed_data = df._to_pandas().T.to_dict() - df = pd.DataFrame(transposed_data) - assert df._query_compiler._shape_hint == "row" - - df = pd.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}, index=index) - assert df._query_compiler._shape_hint is None - - df = pd.DataFrame({"a": [1]}, index=None if index is None else index[:1]) - assert df._query_compiler._shape_hint == "column" - - def test_shape_hint_detection_from_arrow(self): - at = pyarrow.Table.from_pydict({"a": [1, 2, 3]}) - df = pd.utils.from_arrow(at) - assert df._query_compiler._shape_hint == "column" - - at = pyarrow.Table.from_pydict({"a": [1], "b": [2], "c": [3]}) - df = pd.utils.from_arrow(at) - assert df._query_compiler._shape_hint == "row" - - at = pyarrow.Table.from_pydict({"a": [1, 2, 3], "b": [1, 2, 3]}) - df = pd.utils.from_arrow(at) - assert df._query_compiler._shape_hint is None - - at = pyarrow.Table.from_pydict({"a": [1]}) - df = pd.utils.from_arrow(at) - assert df._query_compiler._shape_hint == "column" - - def test_constructor_from_modin_series(self): - def construct_has_common_projection(lib, df, **kwargs): - return lib.DataFrame({"col1": df.iloc[:, 0], "col2": df.iloc[:, 1]}) - - def construct_no_common_projection(lib, df1, df2, **kwargs): - return lib.DataFrame( - {"col1": df1.iloc[:, 0], "col2": df2.iloc[:, 0], "col3": df1.iloc[:, 1]} - ) - - def construct_mixed_data(lib, df1, df2, **kwargs): - return lib.DataFrame( - { - "col1": df1.iloc[:, 0], - "col2": df2.iloc[:, 0], - "col3": df1.iloc[:, 1], - "col4": np.arange(len(df1)), - } - ) - - run_and_compare( - construct_has_common_projection, data={"a": [1, 2, 3, 4], "b": [3, 4, 5, 6]} - ) - run_and_compare( - construct_no_common_projection, - data={"a": [1, 2, 3, 4], "b": [3, 4, 5, 6]}, - data2={"a": [10, 20, 30, 40]}, - 
# HDK doesn't support concatenation of frames that has no common projection - force_lazy=False, - ) - run_and_compare( - construct_mixed_data, - data={"a": [1, 2, 3, 4], "b": [3, 4, 5, 6]}, - data2={"a": [10, 20, 30, 40]}, - # HDK doesn't support concatenation of frames that has no common projection - force_lazy=False, - ) - - -class TestArrowExecution: - data1 = {"a": [1, 2, 3], "b": [3, 4, 5], "c": [6, 7, 8]} - data2 = {"a": [1, 2, 3], "d": [3, 4, 5], "e": [6, 7, 8]} - data3 = {"a": [4, 5, 6], "b": [6, 7, 8], "c": [9, 10, 11]} - - def test_drop_rename_concat(self): - def drop_rename_concat(df1, df2, lib, **kwargs): - df1 = df1.rename(columns={"a": "new_a", "c": "new_b"}) - df1 = df1.drop(columns="b") - df2 = df2.rename(columns={"a": "new_a", "d": "new_b"}) - df2 = df2.drop(columns="e") - return lib.concat([df1, df2], ignore_index=True) - - run_and_compare( - drop_rename_concat, - data=self.data1, - data2=self.data2, - force_lazy=False, - force_arrow_execute=True, - ) - - def test_drop_row(self): - def drop_row(df, **kwargs): - return df.drop(labels=1) - - run_and_compare( - drop_row, - data=self.data1, - force_lazy=False, - ) - - def test_series_pop(self): - def pop(df, **kwargs): - col = df["a"] - col.pop(0) - return col - - run_and_compare( - pop, - data=self.data1, - force_lazy=False, - ) - - def test_empty_transform(self): - def apply(df, **kwargs): - return df + 1 - - run_and_compare(apply, data={}, force_arrow_execute=True) - - def test_append(self): - def apply(df1, df2, **kwargs): - tmp = df1.append(df2) - return tmp - - run_and_compare( - apply, data=self.data1, data2=self.data3, force_arrow_execute=True - ) - - -class TestNonStrCols: - data = {0: [1, 2, 3], "1": [3, 4, 5], 2: [6, 7, 8]} - - def test_sum(self): - mdf = pd.DataFrame(self.data).sum() - pdf = pandas.DataFrame(self.data).sum() - df_equals(mdf, pdf) - - def test_set_index(self): - df = pd.DataFrame(self.data) - df._query_compiler._modin_frame._set_index(pd.Index([1, 2, 3])) - - -class TestLoc: - def test_loc(self): - data = [1, 2, 3, 4, 5, 6] - idx = ["a", "b", "c", "d", "e", "f"] - key = ["b", "c", "d", "e"] - mdf = pd.DataFrame(data, index=idx).loc[key] - pdf = pandas.DataFrame(data, index=idx).loc[key] - df_equals(mdf, pdf) - - def test_iloc_bool(self): - data = [1, 2, 3, 4, 5, 6] - idx = ["a", "b", "c", "d", "e", "f"] - key = [False, True, True, True, True, False] - mdf = pd.DataFrame(data, index=idx).iloc[key] - pdf = pandas.DataFrame(data, index=idx).iloc[key] - df_equals(mdf, pdf) - - def test_iloc_int(self): - data = range(11, 265) - key = list(range(0, 11)) + list(range(243, 254)) - mdf = pd.DataFrame(data).iloc[key] - pdf = pandas.DataFrame(data).iloc[key] - df_equals(mdf, pdf) - - mdf = pd.DataFrame(data).iloc[range(10, 100)] - pdf = pandas.DataFrame(data).iloc[range(10, 100)] - df_equals(mdf, pdf) - - data = test_data_values[0] - mds = pd.Series(data[next(iter(data.keys()))]).iloc[1:] - pds = pandas.Series(data[next(iter(data.keys()))]).iloc[1:] - df_equals(mds, pds) - - def test_iloc_issue_6037(self): - def iloc(df, **kwargs): - return df.iloc[:-1].dropna() - - run_and_compare( - fn=iloc, - data={"A": range(1000000)}, - force_lazy=False, - ) - - -class TestStr: - def test_str(self): - data = test_data_values[0] - mdf = pd.DataFrame(data[next(iter(data.keys()))]) - pdf = pandas.DataFrame(data[next(iter(data.keys()))]) - df_equals(mdf, pdf) - - mds = pd.Series(data[next(iter(data.keys()))]) - pds = pandas.Series(data[next(iter(data.keys()))]) - assert str(mds) == str(pds) - - def test_no_cols(self): - def 
run_cols(df, **kwargs): - return df.loc[1] - - run_and_compare( - fn=run_cols, - data=None, - constructor_kwargs={"index": range(5)}, - force_lazy=False, - ) - - -class TestCompare: - def test_compare_float(self): - def run_compare(df1, df2, **kwargs): - return df1.compare(df2, align_axis="columns", keep_shape=False) - - data1 = random_state.randn(100, 10) - data2 = random_state.randn(100, 10) - columns = list("abcdefghij") - - run_and_compare( - run_compare, - data=data1, - data2=data2, - constructor_kwargs={"columns": columns}, - force_lazy=False, - ) - - -class TestDuplicateColumns: - def test_init(self): - def init(df, **kwargs): - return df - - data = [ - [1, 2, 3, 4], - [5, 6, 7, 8], - [9, 10, 11, 12], - [13, 14, 15, 16], - [17, 18, 19, 20], - ] - columns = ["c1", "c2", "c1", "c3"] - run_and_compare( - fn=init, - data=data, - force_lazy=False, - constructor_kwargs={"columns": columns}, - ) - - def test_loc(self): - def loc(df, **kwargs): - return df.loc[:, ["col1", "col3", "col3"]] - - run_and_compare( - fn=loc, - data=test_data_values[0], - force_lazy=False, - ) - - def test_set_columns(self): - def set_cols(df, **kwargs): - df.columns = ["col1", "col3", "col3"] - return df - - run_and_compare( - fn=set_cols, - data=[[1, 2, 3], [4, 5, 6], [7, 8, 9]], - force_lazy=False, - ) - - def test_set_axis(self): - def set_axis(df, **kwargs): - sort_index = df.axes[1] - labels = [ - np.nan if i % 2 == 0 else sort_index[i] for i in range(len(sort_index)) - ] - return df.set_axis(labels, axis=1, copy=kwargs["copy"]) - - run_and_compare( - fn=set_axis, - data=test_data["float_nan_data"], - force_lazy=False, - copy=True, - ) - run_and_compare( - fn=set_axis, - data=test_data["float_nan_data"], - force_lazy=False, - copy=False, - ) - - -class TestFromArrow: - def test_dict(self): - indices = pyarrow.array([0, 1, 0, 1, 2, 0, None, 2]) - dictionary = pyarrow.array(["first", "second", "third"]) - dict_array = pyarrow.DictionaryArray.from_arrays(indices, dictionary) - at = pyarrow.table( - {"col1": dict_array, "col2": [1, 2, 3, 4, 5, 6, 7, 8], "col3": dict_array} - ) - pdf = at.to_pandas() - nchunks = 3 - chunks = split_df_into_chunks(pdf, nchunks) - at = pyarrow.concat_tables([pyarrow.Table.from_pandas(c) for c in chunks]) - mdf = from_arrow(at) - at = mdf._query_compiler._modin_frame._partitions[0][0].get() - assert len(at.column(0).chunks) == nchunks - - mdt = mdf.dtypes.iloc[0] - pdt = pdf.dtypes.iloc[0] - assert mdt == "category" - assert isinstance(mdt, pandas.CategoricalDtype) - assert str(mdt) == str(pdt) - - # Make sure the lazy proxy dtype is not materialized yet. 
- assert type(mdt) is not pandas.CategoricalDtype - assert mdt._parent is not None - assert mdt._update_proxy(at, at.column(0)._name) is mdt - assert mdt._update_proxy(at, at.column(2)._name) is not mdt - assert ( - type(mdt._update_proxy(at, at.column(2)._name)) != pandas.CategoricalDtype - ) - - assert mdt == pdt - assert pdt == mdt - assert repr(mdt) == repr(pdt) - - # `df_equals` triggers categories materialization and thus - # has to be called after all checks for laziness - df_equals(mdf, pdf) - # Should be materialized now - assert ( - type(mdt._update_proxy(at, at.column(2)._name)) == pandas.CategoricalDtype - ) - - -class TestSparseArray: - def test_sparse_series(self): - data = pandas.arrays.SparseArray(np.array([3, 1, 2, 3, 4, np.nan])) - mds = pd.Series(data) - pds = pandas.Series(data) - df_equals(mds, pds) - - -class TestEmpty: - def test_frame_insert(self): - def insert(df, **kwargs): - df["a"] = [1, 2, 3, 4, 5] - return df - - run_and_compare( - insert, - data=None, - ) - run_and_compare( - insert, - data=None, - constructor_kwargs={"index": ["a", "b", "c", "d", "e"]}, - ) - run_and_compare( - insert, - data=None, - constructor_kwargs={"columns": ["a", "b", "c", "d", "e"]}, - # Do not force lazy since setitem() defaults to pandas - force_lazy=False, - ) - - def test_series_getitem(self): - df_equals(pd.Series([])[:30], pandas.Series([])[:30]) - - def test_series_to_pandas(self): - df_equals(pd.Series([])._to_pandas(), pandas.Series([])) - - -if __name__ == "__main__": - pytest.main(["-v", __file__]) diff --git a/modin/tests/experimental/hdk_on_native/test_init.py b/modin/tests/experimental/hdk_on_native/test_init.py deleted file mode 100644 index 1d1cfa348b2..00000000000 --- a/modin/tests/experimental/hdk_on_native/test_init.py +++ /dev/null @@ -1,29 +0,0 @@ -# Licensed to Modin Development Team under one or more contributor license agreements. -# See the NOTICE file distributed with this work for additional information regarding -# copyright ownership. The Modin Development Team licenses this file to you under the -# Apache License, Version 2.0 (the "License"); you may not use this file except in -# compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under -# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific language -# governing permissions and limitations under the License. - - -class TestInit: - def test_num_threads(self): - import os - - import modin.pandas as pd - - assert "OMP_NUM_THREADS" not in os.environ - - import modin.config as cfg - - cfg.IsExperimental.put(True) - cfg.Engine.put("Native") - cfg.StorageFormat.put("Hdk") - pd.DataFrame() - assert os.environ["OMP_NUM_THREADS"] == str(cfg.CpuCount.get()) diff --git a/modin/tests/experimental/hdk_on_native/test_utils.py b/modin/tests/experimental/hdk_on_native/test_utils.py deleted file mode 100644 index 26fb4573d33..00000000000 --- a/modin/tests/experimental/hdk_on_native/test_utils.py +++ /dev/null @@ -1,97 +0,0 @@ -# Licensed to Modin Development Team under one or more contributor license agreements. -# See the NOTICE file distributed with this work for additional information regarding -# copyright ownership. 
The Modin Development Team licenses this file to you under the -# Apache License, Version 2.0 (the "License"); you may not use this file except in -# compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under -# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific language -# governing permissions and limitations under the License. - -import sys -import timeit -from random import choice, randint, uniform - -import pandas -import pytz - -from modin.experimental.core.execution.native.implementations.hdk_on_native.dataframe.utils import ( - ColNameCodec, -) - -UNICODE_RANGES = [ - (0x0020, 0x007F), # Basic Latin - (0x00A0, 0x00FF), # Latin-1 Supplement - (0x0100, 0x017F), # Latin Extended-A - (0x0180, 0x024F), # Latin Extended-B - (0x0250, 0x02AF), # IPA Extensions - (0x02B0, 0x02FF), # Spacing Modifier Letters - (0x0300, 0x036F), # Combining Diacritical Marks - (0x0370, 0x03FF), # Greek and Coptic - (0x10330, 0x1034F), # Gothic - (0xE0000, 0xE007F), # Tags -] -UNICODE_ALPHABET = [chr(c) for r in UNICODE_RANGES for c in range(r[0], r[1] + 1)] - - -def test_encode_col_name(): - def test(name): - encoded = ColNameCodec.encode(name) - assert ColNameCodec.decode(encoded) == name - - test("") - test(None) - test(("", "")) - - for i in range(0, 1000): - test(randint(-sys.maxsize, sys.maxsize)) - for i in range(0, 1000): - test(uniform(-sys.maxsize, sys.maxsize)) - for i in range(0, 1000): - test(rnd_unicode(randint(0, 100))) - for i in range(0, 1000): - test((rnd_unicode(randint(0, 100)), rnd_unicode(randint(0, 100)))) - for i in range(0, 1000): - tz = choice(pytz.all_timezones) - test(pandas.Timestamp(randint(0, 0xFFFFFFFF), unit="s", tz=tz)) - - -def rnd_unicode(length): - return "".join(choice(UNICODE_ALPHABET) for _ in range(length)) - - -def test_time(): - ranges = [ - (0x0041, 0x005A), # Alpha chars - (0x0020, 0x007F), # Basic Latin - (0x00A0, 0x00FF), # Latin-1 Supplement - ] - repeat = 10 - text_len = 100000 - - for r in ranges: - alphabet = "".join([chr(c) for c in range(r[0], r[1] + 1)]) - text = ( - alphabet * int(text_len / len(alphabet)) - + alphabet[0 : divmod(text_len, len(alphabet))[1]] - ) - encoded_text = ColNameCodec.encode(text) - assert text == ColNameCodec.decode(encoded_text) - print(f"Alphabet: {alphabet}") # noqa: T201 - print(f"Text len: {len(text)}") # noqa: T201 - print(f"Encoded text len: {len(encoded_text)}") # noqa: T201 - - def test_encode(): - ColNameCodec.encode(text) - - def test_decode(): - ColNameCodec.decode(encoded_text) - - time = timeit.timeit(stmt=test_encode, number=repeat) - print(f"Encode time: {time/repeat} seconds") # noqa: T201 - time = timeit.timeit(stmt=test_decode, number=repeat) - print(f"Decode time: {time/repeat} seconds") # noqa: T201 - print("--------------------------------------") # noqa: T201 diff --git a/modin/tests/experimental/hdk_on_native/utils.py b/modin/tests/experimental/hdk_on_native/utils.py deleted file mode 100644 index cb349474d49..00000000000 --- a/modin/tests/experimental/hdk_on_native/utils.py +++ /dev/null @@ -1,314 +0,0 @@ -# Licensed to Modin Development Team under one or more contributor license agreements. -# See the NOTICE file distributed with this work for additional information regarding -# copyright ownership. 
The Modin Development Team licenses this file to you under the -# Apache License, Version 2.0 (the "License"); you may not use this file except in -# compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under -# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific language -# governing permissions and limitations under the License. - -import datetime - -import pandas -import pytest -from pandas.core.dtypes.common import is_datetime64_any_dtype, is_object_dtype - -import modin.pandas as pd -from modin.tests.pandas.utils import df_equals -from modin.tests.pandas.utils import eval_io as general_eval_io -from modin.utils import try_cast_to_pandas - - -def eval_io( - fn_name, - comparator=df_equals, - cast_to_str=False, - expected_exception=None, - check_kwargs_callable=True, - modin_warning=None, - md_extra_kwargs=None, - *args, - **kwargs, -): - """ - Evaluate I/O operation and do equality check after importing Modin's data to HDK. - - Notes - ----- - For parameters description please refer to ``modin.tests.pandas.utils.eval_io``. - """ - - def hdk_comparator(df1, df2, **kwargs): - """Evaluate equality comparison of the passed frames after importing the Modin's one to HDK.""" - with ForceHdkImport(df1, df2): - # Aligning DateTime dtypes because of the bug related to the `parse_dates` parameter: - # https://github.com/modin-project/modin/issues/3485 - dfs = align_datetime_dtypes(df1, df2) - - # 1. Replace NA with empty strings. HDK treats empty strings and NA equally. - cols = {c for df in dfs for c, t in df.dtypes.items() if is_object_dtype(t)} - if len(cols) != 0: - cols = pandas.Index(cols) - for df in dfs: - df[cols] = df[cols].fillna("") - # 2. HdkWorker.cast_to_compatible_types() converts all categorical columns to string. - cols = { - c - for df in dfs - for c, t in df.dtypes.items() - if isinstance(t, pandas.CategoricalDtype) - } - if len(cols) != 0: - cols = pandas.Index(cols) - for df in dfs: - df[cols] = df[cols].astype(str) - comparator(*dfs, **kwargs) - - general_eval_io( - fn_name, - comparator=hdk_comparator, - cast_to_str=cast_to_str, - expected_exception=expected_exception, - check_kwargs_callable=check_kwargs_callable, - modin_warning=modin_warning, - md_extra_kwargs=md_extra_kwargs, - *args, - **kwargs, - ) - - -def align_datetime_dtypes(*dfs): - """ - Make all of the passed frames have DateTime dtype for the same columns. - - Cast column type of the certain frame to the DateTime type if any frame in - the `dfs` sequence has DateTime type for this column. - - Parameters - ---------- - *dfs : iterable of DataFrames - DataFrames to align DateTime dtypes. - - Notes - ----- - Passed Modin frames may be casted to pandas in the result. 
- """ - datetime_cols = {} - time_cols = set() - for df in dfs: - for col, dtype in df.dtypes.items(): - # If we already decided to cast this column to DateTime no more actions are needed - if col not in datetime_cols and is_datetime64_any_dtype(dtype): - datetime_cols[col] = dtype - # datetime.time is considered to be an 'object' dtype in pandas that's why - # we have to explicitly check the values type in the column - elif ( - dtype == pandas.api.types.pandas_dtype("O") - and col not in time_cols - # HDK has difficulties with empty frames, so explicitly skip them - # https://github.com/modin-project/modin/issues/3428 - and len(df) > 0 - and all( - isinstance(val, datetime.time) or pandas.isna(val) - for val in df[col] - ) - ): - time_cols.add(col) - - if len(datetime_cols) == 0 and len(time_cols) == 0: - return dfs - - def convert_to_time(value): - """Convert passed value to `datetime.time`.""" - if isinstance(value, datetime.time): - return value - elif isinstance(value, str): - return datetime.time.fromisoformat(value) - else: - return datetime.time(value) - - time_cols_list = list(time_cols) - casted_dfs = [] - for df in dfs: - # HDK has difficulties with casting to certain dtypes (i.e. datetime64), - # so casting it to pandas - pandas_df = try_cast_to_pandas(df) - if datetime_cols: - pandas_df = pandas_df.astype(datetime_cols) - if time_cols: - pandas_df[time_cols_list] = pandas_df[time_cols_list].map(convert_to_time) - casted_dfs.append(pandas_df) - - return casted_dfs - - -class ForceHdkImport: - """ - Trigger import execution for Modin DataFrames obtained by HDK engine if already not. - - When using as a context class also cleans up imported tables at the end of the context. - - Parameters - ---------- - *dfs : iterable - DataFrames to trigger import. - """ - - def __init__(self, *dfs): - self._imported_frames = [] - for df in dfs: - if not isinstance(df, (pd.DataFrame, pd.Series)): - continue - if df.empty: - continue - try: - modin_frame = df._query_compiler._modin_frame - modin_frame.force_import() - self._imported_frames.append(df) - except NotImplementedError: - ... - - def __enter__(self): - return self - - def export_frames(self): - """ - Export tables from HDK that was imported by this instance. - - Returns - ------- - list - A list of Modin DataFrames whose payload is ``pyarrow.Table`` - that was just exported from HDK. - """ - result = [] - for df in self._imported_frames: - # Append `TransformNode`` selecting all the columns (SELECT * FROM frame_id) - df = df[df.columns.tolist()] - modin_frame = df._query_compiler._modin_frame - # Forcibly executing plan via HDK. - mode = modin_frame._force_execution_mode - modin_frame._force_execution_mode = "hdk" - modin_frame._execute() - modin_frame._force_execution_mode = mode - result.append(df) - return result - - def __exit__(self, exc_type, exc_val, exc_tb): - self._imported_frames.clear() - - -def set_execution_mode(frame, mode, recursive=False): - """ - Enable execution mode assertions for the passed frame. - - Enabled execution mode checks mean, that the frame raises an AssertionError - if the execution flow is out of the scope of the selected mode. - - Parameters - ---------- - frame : DataFrame or Series - Modin frame to set execution mode at. - mode : {None, "lazy", "arrow"} - Execution mode to set: - - "lazy": only delayed computations. - - "arrow": only computations via Pyarrow. - - None: allow any type of computations. 
- recursive : bool, default: False - Whether to set the specified execution mode for every frame - in the delayed computation tree. - """ - if isinstance(frame, (pd.Series, pd.DataFrame)): - frame = frame._query_compiler._modin_frame - frame._force_execution_mode = mode - if recursive and hasattr(frame._op, "input"): - for child in frame._op.input: - set_execution_mode(child, mode, True) - - -def run_and_compare( - fn, - data, - data2=None, - force_lazy=True, - force_hdk_execute=False, - force_arrow_execute=False, - allow_subqueries=False, - comparator=df_equals, - **kwargs, -): - """Verify equality of the results of the passed function executed against pandas and modin frame.""" - - def run_modin( - fn, - data, - data2, - force_lazy, - force_hdk_execute, - force_arrow_execute, - allow_subqueries, - constructor_kwargs, - **kwargs, - ): - kwargs["df1"] = pd.DataFrame(data, **constructor_kwargs) - kwargs["df2"] = pd.DataFrame(data2, **constructor_kwargs) - kwargs["df"] = kwargs["df1"] - - if force_hdk_execute: - set_execution_mode(kwargs["df1"], "hdk") - set_execution_mode(kwargs["df2"], "hdk") - elif force_arrow_execute: - set_execution_mode(kwargs["df1"], "arrow") - set_execution_mode(kwargs["df2"], "arrow") - elif force_lazy: - set_execution_mode(kwargs["df1"], "lazy") - set_execution_mode(kwargs["df2"], "lazy") - - exp_res = fn(lib=pd, **kwargs) - - if force_hdk_execute: - set_execution_mode(exp_res, "hdk", allow_subqueries) - elif force_arrow_execute: - set_execution_mode(exp_res, "arrow", allow_subqueries) - elif force_lazy: - set_execution_mode(exp_res, None, allow_subqueries) - - return exp_res - - constructor_kwargs = kwargs.pop("constructor_kwargs", {}) - try: - kwargs["df1"] = pandas.DataFrame(data, **constructor_kwargs) - kwargs["df2"] = pandas.DataFrame(data2, **constructor_kwargs) - kwargs["df"] = kwargs["df1"] - ref_res = fn(lib=pandas, **kwargs) - except Exception as err: - with pytest.raises(type(err)): - exp_res = run_modin( - fn=fn, - data=data, - data2=data2, - force_lazy=force_lazy, - force_hdk_execute=force_hdk_execute, - force_arrow_execute=force_arrow_execute, - allow_subqueries=allow_subqueries, - constructor_kwargs=constructor_kwargs, - **kwargs, - ) - _ = exp_res.index - else: - exp_res = run_modin( - fn=fn, - data=data, - data2=data2, - force_lazy=force_lazy, - force_hdk_execute=force_hdk_execute, - force_arrow_execute=force_arrow_execute, - allow_subqueries=allow_subqueries, - constructor_kwargs=constructor_kwargs, - **kwargs, - ) - comparator(ref_res, exp_res) diff --git a/modin/tests/experimental/test_sql.py b/modin/tests/experimental/test_sql.py deleted file mode 100644 index 594140d6a5d..00000000000 --- a/modin/tests/experimental/test_sql.py +++ /dev/null @@ -1,70 +0,0 @@ -# Licensed to Modin Development Team under one or more contributor license agreements. -# See the NOTICE file distributed with this work for additional information regarding -# copyright ownership. The Modin Development Team licenses this file to you under the -# Apache License, Version 2.0 (the "License"); you may not use this file except in -# compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under -# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific language -# governing permissions and limitations under the License. 
- -import io - -import pandas -import pytest - -import modin.pandas as pd -from modin.config import StorageFormat -from modin.tests.pandas.utils import default_to_pandas_ignore_string, df_equals - -pytestmark = pytest.mark.filterwarnings(default_to_pandas_ignore_string) - -titanic_snippet = """passenger_id,survived,p_class,name,sex,age,sib_sp,parch,ticket,fare,cabin,embarked -1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S -2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38,1,0,PC 17599,71.2833,C85,C -3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,,S -4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,S -5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.05,,S -6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q -7,0,1,"McCarthy, Mr. Timothy J",male,54,0,0,17463,51.8625,E46,S -8,0,3,"Palsson, Master. Gosta Leonard",male,2,3,1,349909,21.075,,S -9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27,0,2,347742,11.1333,,S -""" - - -@pytest.mark.skipif( - StorageFormat.get() != "Hdk", - reason="Lack of implementation for other storage formats.", -) -def test_sql_query(): - from modin.experimental.sql import query - - df = pd.read_csv(io.StringIO(titanic_snippet)) - sql = "SELECT survived, p_class, count(passenger_id) as cnt FROM (SELECT * FROM titanic WHERE survived = 1) as t1 GROUP BY survived, p_class" - query_result = query(sql, titanic=df) - expected_df = ( - df[df.survived == 1] - .groupby(["survived", "p_class"]) - .agg({"passenger_id": "count"}) - .reset_index() - ) - assert query_result.shape == expected_df.shape - values_left = expected_df.dropna().values - values_right = query_result.dropna().values - assert (values_left == values_right).all() - - -@pytest.mark.skipif( - StorageFormat.get() != "Hdk", - reason="Lack of implementation for other storage formats.", -) -def test_string_cast(): - from modin.experimental.sql import query - - data = {"A": ["A", "B", "C"], "B": ["A", "B", "C"]} - mdf = pd.DataFrame(data) - pdf = pandas.DataFrame(data) - df_equals(pdf, query("SELECT * FROM df", df=mdf)) diff --git a/modin/tests/interchange/dataframe_protocol/hdk/__init__.py b/modin/tests/interchange/dataframe_protocol/hdk/__init__.py deleted file mode 100644 index cae6413e559..00000000000 --- a/modin/tests/interchange/dataframe_protocol/hdk/__init__.py +++ /dev/null @@ -1,12 +0,0 @@ -# Licensed to Modin Development Team under one or more contributor license agreements. -# See the NOTICE file distributed with this work for additional information regarding -# copyright ownership. The Modin Development Team licenses this file to you under the -# Apache License, Version 2.0 (the "License"); you may not use this file except in -# compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under -# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific language -# governing permissions and limitations under the License. 
diff --git a/modin/tests/interchange/dataframe_protocol/hdk/test_protocol.py b/modin/tests/interchange/dataframe_protocol/hdk/test_protocol.py deleted file mode 100644 index 86ffcb4c41b..00000000000 --- a/modin/tests/interchange/dataframe_protocol/hdk/test_protocol.py +++ /dev/null @@ -1,337 +0,0 @@ -# Licensed to Modin Development Team under one or more contributor license agreements. -# See the NOTICE file distributed with this work for additional information regarding -# copyright ownership. The Modin Development Team licenses this file to you under the -# Apache License, Version 2.0 (the "License"); you may not use this file except in -# compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under -# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific language -# governing permissions and limitations under the License. - -"""Dataframe exchange protocol tests that are specific for HDK implementation.""" - -import numpy as np -import pandas -import pyarrow as pa -import pytest - -import modin.pandas as pd -from modin.core.dataframe.pandas.interchange.dataframe_protocol.from_dataframe import ( - buffer_to_ndarray, - primitive_column_to_ndarray, - set_nulls, -) -from modin.pandas.io import from_arrow, from_dataframe -from modin.tests.pandas.utils import df_equals -from modin.tests.test_utils import warns_that_defaulting_to_pandas - -from .utils import export_frame, get_data_of_all_types, split_df_into_chunks - - -@pytest.mark.parametrize("data_has_nulls", [True, False]) -@pytest.mark.parametrize("from_hdk", [True, False]) -@pytest.mark.parametrize("n_chunks", [None, 3, 5, 12]) -def test_simple_export(data_has_nulls, from_hdk, n_chunks): - if from_hdk: - # HDK can't import 'uint64' as well as booleans - # issue for bool: https://github.com/modin-project/modin/issues/4299 - exclude_dtypes = ["bool", "uint64"] - else: - exclude_dtypes = [] - - data = get_data_of_all_types( - has_nulls=data_has_nulls, exclude_dtypes=exclude_dtypes - ) - md_df = pd.DataFrame(data) - - exported_df = export_frame(md_df, from_hdk, n_chunks=n_chunks) - - # export_frame() splits the frame into multiple chunks. When it's - # split with HDK, each categorical column will have a different - # set of categories. When concatenating the chunks, the categorical - # column will be of type object. 
- cat_cols = md_df.select_dtypes(include=["category"]).columns - with warns_that_defaulting_to_pandas(): - md_df[cat_cols] = md_df[cat_cols].astype(str) - exported_df[cat_cols] = exported_df[cat_cols].astype(str) - - df_equals(md_df, exported_df) - - -@pytest.mark.parametrize("n_chunks", [2, 4, 7]) -@pytest.mark.parametrize("data_has_nulls", [True, False]) -def test_export_aligned_at_chunks(n_chunks, data_has_nulls): - """Test export from DataFrame exchange protocol when internal PyArrow table is equaly chunked.""" - # Modin DataFrame constructor can't process PyArrow's category when using `from_arrow`, so exclude it - data = get_data_of_all_types(has_nulls=data_has_nulls, exclude_dtypes=["category"]) - pd_df = pandas.DataFrame(data) - pd_chunks = split_df_into_chunks(pd_df, n_chunks) - - chunked_at = pa.concat_tables([pa.Table.from_pandas(pd_df) for pd_df in pd_chunks]) - md_df = from_arrow(chunked_at) - assert ( - len(md_df._query_compiler._modin_frame._partitions[0][0].get().column(0).chunks) - == md_df.__dataframe__().num_chunks() - == n_chunks - ) - - exported_df = export_frame(md_df) - df_equals(md_df, exported_df) - - exported_df = export_frame(md_df, n_chunks=n_chunks) - df_equals(md_df, exported_df) - - exported_df = export_frame(md_df, n_chunks=n_chunks * 2) - df_equals(md_df, exported_df) - - exported_df = export_frame(md_df, n_chunks=n_chunks * 3) - df_equals(md_df, exported_df) - - -@pytest.mark.parametrize("data_has_nulls", [True, False]) -def test_export_unaligned_at_chunks(data_has_nulls): - """ - Test export from DataFrame exchange protocol when internal PyArrow table's chunks are unaligned. - - Arrow table allows for its columns to be chunked independently. Unaligned chunking means that - each column has its individual chunking and so some preprocessing is required in order - to emulate equaly chunked columns in the protocol. - """ - # Modin DataFrame constructor can't process PyArrow's category when using `from_arrow`, so exclude it - data = get_data_of_all_types(has_nulls=data_has_nulls, exclude_dtypes=["category"]) - pd_df = pandas.DataFrame(data) - # divide columns in 3 groups: unchunked, 2-chunked, 7-chunked - chunk_groups = [1, 2, 7] - chunk_col_ilocs = [ - slice( - i * len(pd_df.columns) // len(chunk_groups), - (i + 1) * len(pd_df.columns) // len(chunk_groups), - ) - for i in range(len(chunk_groups)) - ] - - pd_chunk_groups = [ - split_df_into_chunks(pd_df.iloc[:, cols], n_chunks) - for n_chunks, cols in zip(chunk_groups, chunk_col_ilocs) - ] - at_chunk_groups = [ - pa.concat_tables([pa.Table.from_pandas(pd_df) for pd_df in chunk_group]) - for chunk_group in pd_chunk_groups - ] - - chunked_at = at_chunk_groups[0] - # TODO: appending columns one by one looks inefficient, is there a better way? 
- for _at in at_chunk_groups[1:]: - for field in _at.schema: - chunked_at = chunked_at.append_column(field, _at[field.name]) - md_df = from_arrow(chunked_at) - - # verify that test generated the correct chunking - internal_at = md_df._query_compiler._modin_frame._partitions[0][0].get() - for n_chunks_group, cols in zip(chunk_groups, chunk_col_ilocs): - for col in internal_at.select(range(cols.start, cols.stop)).columns: - assert len(col.chunks) == n_chunks_group - - n_chunks = md_df.__dataframe__().num_chunks() - - exported_df = export_frame(md_df) - df_equals(md_df, exported_df) - - exported_df = export_frame(md_df, n_chunks=n_chunks) - df_equals(md_df, exported_df) - - exported_df = export_frame(md_df, n_chunks=n_chunks * 2) - df_equals(md_df, exported_df) - - exported_df = export_frame(md_df, n_chunks=n_chunks * 3) - df_equals(md_df, exported_df) - - -@pytest.mark.parametrize("data_has_nulls", [True, False]) -def test_export_indivisible_chunking(data_has_nulls): - """ - Test ``.get_chunks(n_chunks)`` when internal PyArrow table's is 'indivisibly chunked'. - - The setup for the test is a PyArrow table having one of the chunk consisting of a single row, - meaning that the chunk can't be subdivide. - """ - data = get_data_of_all_types(has_nulls=data_has_nulls, exclude_dtypes=["category"]) - pd_df = pandas.DataFrame(data) - pd_chunks = (pd_df.iloc[:1], pd_df.iloc[1:]) - - chunked_at = pa.concat_tables([pa.Table.from_pandas(pd_df) for pd_df in pd_chunks]) - md_df = from_arrow(chunked_at) - assert ( - len(md_df._query_compiler._modin_frame._partitions[0][0].get().column(0).chunks) - == md_df.__dataframe__().num_chunks() - == 2 - ) - # Meaning that we can't subdivide first chunk - np.testing.assert_array_equal( - md_df.__dataframe__()._chunk_slices, [0, 1, len(pd_df)] - ) - - exported_df = export_frame(md_df, n_chunks=2) - df_equals(md_df, exported_df) - - exported_df = export_frame(md_df, n_chunks=4) - df_equals(md_df, exported_df) - - exported_df = export_frame(md_df, n_chunks=40) - df_equals(md_df, exported_df) - - -def test_export_when_delayed_computations(): - """ - Test that export works properly when HdkOnNative has delayed computations. - - If there are delayed functions and export is required, it has to trigger the execution - first prior materializing protocol's buffers, so the buffers contain actual result - of the computations. 
- """ - # HDK can't import 'uint64' as well as booleans, so exclude them - # issue for bool: https://github.com/modin-project/modin/issues/4299 - data = get_data_of_all_types(has_nulls=True, exclude_dtypes=["uint64", "bool"]) - md_df = pd.DataFrame(data) - pd_df = pandas.DataFrame(data) - - md_res = md_df.fillna({"float32_null": 32.0, "float64_null": 64.0}) - pd_res = pd_df.fillna({"float32_null": 32.0, "float64_null": 64.0}) - assert ( - not md_res._query_compiler._modin_frame._has_arrow_table() - ), "There are no delayed computations for the frame" - - exported_df = export_frame(md_res) - df_equals(exported_df, pd_res) - - -@pytest.mark.parametrize("data_has_nulls", [True, False]) -def test_simple_import(data_has_nulls): - """Test that ``modin.pandas.utils.from_dataframe`` works properly.""" - data = get_data_of_all_types(data_has_nulls) - - modin_df_producer = pd.DataFrame(data) - internal_modin_df_producer = modin_df_producer.__dataframe__() - # Our configuration in pytest.ini requires that we explicitly catch all - # instances of defaulting to pandas, this one raises a warning on `.from_dataframe` - with warns_that_defaulting_to_pandas(): - modin_df_consumer = from_dataframe(modin_df_producer) - internal_modin_df_consumer = from_dataframe(internal_modin_df_producer) - - # TODO: the following assertions verify that `from_dataframe` doesn't return - # the same object untouched due to optimization branching, it actually should - # do so but the logic is not implemented yet, so the assertions are passing - # for now. It's required to replace the producer's type with a different one - # to consumer when we have some other implementation of the protocol as the - # assertions may start failing shortly. - assert modin_df_producer is not modin_df_consumer - assert internal_modin_df_producer is not internal_modin_df_consumer - assert ( - modin_df_producer._query_compiler._modin_frame - is not modin_df_consumer._query_compiler._modin_frame - ) - - df_equals(modin_df_producer, modin_df_consumer) - df_equals(modin_df_producer, internal_modin_df_consumer) - - -@pytest.mark.parametrize("data_has_nulls", [True, False]) -def test_zero_copy_export_for_primitives(data_has_nulls): - """Test that basic data types can be zero-copy exported from HdkOnNative dataframe.""" - data = get_data_of_all_types( - has_nulls=data_has_nulls, include_dtypes=["int", "uint", "float"] - ) - at = pa.Table.from_pydict(data) - - md_df = from_arrow(at) - protocol_df = md_df.__dataframe__(allow_copy=False) - - for i, col in enumerate(protocol_df.get_columns()): - col_arr, _ = primitive_column_to_ndarray(col) - - exported_ptr = col_arr.__array_interface__["data"][0] - producer_ptr = at.column(i).chunks[0].buffers()[-1].address - # Verify that the pointers of produce and exported objects point to the same data - assert producer_ptr == exported_ptr - - # Can't export `md_df` zero-copy no more as it has delayed 'fillna' operation - md_df = md_df.fillna({"float32": 32.0}) - non_zero_copy_protocol_df = md_df.__dataframe__(allow_copy=False) - - with pytest.raises(RuntimeError): - primitive_column_to_ndarray( - non_zero_copy_protocol_df.get_column_by_name("float32") - ) - - -def test_bitmask_chunking(): - """Test that making a virtual chunk in a middle of a byte of a bitmask doesn't cause problems.""" - at = pa.Table.from_pydict({"col": [True, False, True, True, False] * 5}) - assert at["col"].type.bit_width == 1 - - md_df = from_arrow(at) - # Column length is 25, n_chunks is 2, meaning that the split will occur in the middle - # of 
the second byte - exported_df = export_frame(md_df, n_chunks=2) - df_equals(md_df, exported_df) - - -@pytest.mark.parametrize("data_has_nulls", [True, False]) -@pytest.mark.parametrize("n_chunks", [2, 9]) -def test_buffer_of_chunked_at(data_has_nulls, n_chunks): - """Test that getting buffers of physically chunked column works properly.""" - data = get_data_of_all_types( - # For the simplicity of the test include only primitive types, so the test can use - # only one function to export a column instead of if-elsing to find a type-according one - has_nulls=data_has_nulls, - include_dtypes=["bool", "int", "uint", "float"], - ) - - pd_df = pandas.DataFrame(data) - pd_chunks = split_df_into_chunks(pd_df, n_chunks) - - chunked_at = pa.concat_tables([pa.Table.from_pandas(pd_df) for pd_df in pd_chunks]) - md_df = from_arrow(chunked_at) - - protocol_df = md_df.__dataframe__() - for i, col in enumerate(protocol_df.get_columns()): - assert col.num_chunks() > 1 - assert len(col._pyarrow_table.column(0).chunks) > 1 - - buffers = col.get_buffers() - data_buff, data_dtype = buffers["data"] - result = buffer_to_ndarray(data_buff, data_dtype, col.offset, col.size()) - result = set_nulls(result, col, buffers["validity"]) - - # Our configuration in pytest.ini requires that we explicitly catch all - # instances of defaulting to pandas, this one raises a warning on `.to_numpy()` - with warns_that_defaulting_to_pandas(): - reference = md_df.iloc[:, i].to_numpy() - - np.testing.assert_array_equal(reference, result) - - protocol_df = md_df.__dataframe__(allow_copy=False) - for i, col in enumerate(protocol_df.get_columns()): - assert col.num_chunks() > 1 - assert len(col._pyarrow_table.column(0).chunks) > 1 - - # Catch exception on attempt of doing a copy due to chunks combining - with pytest.raises(RuntimeError): - col.get_buffers() - - -def test_concat_chunks(): - """Regression test for https://github.com/modin-project/modin/issues/4366""" - modin_df = pd.DataFrame( - {"a": pd.Categorical(list("testdataforexchangedataframeprotocol"))} - ) - n_chunks = 2 - chunks = split_df_into_chunks(modin_df, n_chunks) - new_modin_df = pd.concat(chunks) - assert new_modin_df["a"].dtype.name == "category" - protocol_df = new_modin_df.__dataframe__() - df_col = protocol_df.get_column_by_name("a") - assert df_col.num_chunks() == n_chunks diff --git a/modin/tests/interchange/dataframe_protocol/hdk/utils.py b/modin/tests/interchange/dataframe_protocol/hdk/utils.py deleted file mode 100644 index 7af3b87600a..00000000000 --- a/modin/tests/interchange/dataframe_protocol/hdk/utils.py +++ /dev/null @@ -1,249 +0,0 @@ -# Licensed to Modin Development Team under one or more contributor license agreements. -# See the NOTICE file distributed with this work for additional information regarding -# copyright ownership. The Modin Development Team licenses this file to you under the -# Apache License, Version 2.0 (the "License"); you may not use this file except in -# compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under -# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific language -# governing permissions and limitations under the License. 
- -"""Utility function for testing HdkOnNative implementation for DataFrame exchange protocol.""" - -from typing import Dict - -import numpy as np -import pandas - -from modin.core.dataframe.pandas.interchange.dataframe_protocol.from_dataframe import ( - from_dataframe_to_pandas, - protocol_df_chunk_to_pandas, -) -from modin.tests.experimental.hdk_on_native.utils import ForceHdkImport - - -def split_df_into_chunks(df, n_chunks): - """ - Split passed DataFrame into `n_chunks` along row axis. - - Parameters - ---------- - df : DataFrame - DataFrame to split into chunks. - n_chunks : int - Number of chunks to split `df` into. - - Returns - ------- - list of DataFrames - """ - chunks = [] - for i in range(n_chunks): - start = i * len(df) // n_chunks - end = (i + 1) * len(df) // n_chunks - chunks.append(df.iloc[start:end]) - - return chunks - - -def export_frame(md_df, from_hdk=False, **kwargs): - """ - Construct ``pandas.DataFrame`` from ``modin.pandas.DataFrame`` using DataFrame exchange protocol. - - Parameters - ---------- - md_df : modin.pandas.DataFrame - DataFrame to convert to pandas. - from_hdk : bool, default: False - Whether to forcibly use data exported from HDK. If `True`, import DataFrame's - data into HDK and then export it back, so the origin for underlying `md_df` - data is HDK. - **kwargs : dict - Additional parameters to pass to the ``from_dataframe_to_pandas`` function. - - Returns - ------- - pandas.DataFrame - """ - if not from_hdk: - return from_dataframe_to_pandas_assert_chunking(md_df, **kwargs) - - with ForceHdkImport(md_df) as instance: - md_df_exported = instance.export_frames()[0] - exported_df = from_dataframe_to_pandas_assert_chunking(md_df_exported, **kwargs) - - return exported_df - - -def from_dataframe_to_pandas_assert_chunking(df, n_chunks=None, **kwargs): - """ - Build a ``pandas.DataFrame`` from a `__dataframe__` object splitting it into `n_chunks`. - - The function asserts that the `df` was split exactly into `n_chunks` before converting them to pandas. - - Parameters - ---------- - df : DataFrame - Object supporting the exchange protocol, i.e. `__dataframe__` method. - n_chunks : int, optional - Number of chunks to split `df`. - - Returns - ------- - pandas.DataFrame - """ - if n_chunks is None: - return from_dataframe_to_pandas(df, n_chunks=n_chunks, **kwargs) - - protocol_df = df.__dataframe__() - chunks = list(protocol_df.get_chunks(n_chunks)) - assert len(chunks) == n_chunks - - pd_chunks = [None] * len(chunks) - for i in range(len(chunks)): - pd_chunks[i] = protocol_df_chunk_to_pandas(chunks[i], **kwargs) - - pd_df = pandas.concat(pd_chunks, axis=0, ignore_index=True) - - index_obj = protocol_df.metadata.get( - "modin.index", protocol_df.metadata.get("pandas.index", None) - ) - if index_obj is not None: - pd_df.index = index_obj - - return pd_df - - -def get_data_of_all_types( - has_nulls=False, exclude_dtypes=None, include_dtypes=None -) -> Dict[str, np.ndarray]: - """ - Generate a dictionary containing every datatype that is supported by HDK implementation of the exchange protocol. - - Parameters - ---------- - has_nulls : bool, default: False - Whether to include columns containing null values. - exclude_dtypes : list, optional - List of type prefixes to exclude in the dictionary. For example, - passing ``["int", "float"]`` excludes all of the signed integer (``int16``, - ``int32``, ``int64``) and float (``float32``, ``float64``) types. - include_dtypes : list, optional - List of type prefixes to include in the dictionary. 
For example, - passing ``["int", "float"]`` will include ONLY signed integer (``int16``, - ``int32``, ``int64``) and float (``float32``, ``float64``) types. - - Returns - ------- - dict - Dictionary to pass to a DataFrame constructor. The keys are string column names - that are equal to the type name of the according column. Columns containing null - types have a ``"_null"`` suffix in their names. - """ - bool_data = {} - int_data = {} - uint_data = {} - float_data = {} - datetime_data = {} - string_data = {} - category_data = {} - - # bool - bool_data["bool"] = np.array([True, False, True, True] * 10, dtype=bool) - - # int - for width in (8, 16, 32, 64): - dtype = getattr(np, f"int{width}") - max_val, min_val = np.iinfo(dtype).max, np.iinfo(dtype).min - int_data[f"int{width}"] = np.array( - [max_val, max_val - 1, min_val + 1, min_val + 2] * 10, dtype=dtype - ) - - # uint - for width in (8, 16, 32, 64): - dtype = getattr(np, f"uint{width}") - max_val, min_val = np.iinfo(dtype).max, np.iinfo(dtype).min - uint_data[f"uint{width}"] = np.array( - [max_val, max_val - 1, min_val + 1, min_val + 2] * 10, dtype=dtype - ) - - # float - for width in (32, 64): - dtype = getattr(np, f"float{width}") - max_val, min_val = np.finfo(dtype).max, np.finfo(dtype).min - float_data[f"float{width}"] = np.array( - [max_val, max_val - 1, min_val + 1, min_val + 2] * 10, dtype=dtype - ) - if has_nulls: - float_data[f"float{width}_null"] = np.array( - [max_val, None, min_val + 1, min_val + 2] * 10, dtype=dtype - ) - - # datetime - for unit in ("s", "ms", "ns"): - datetime_data[f"datetime64[{unit}]"] = np.array( - [0, 1, 2, 3] * 10, dtype=np.dtype(f"datetime64[{unit}]") - ) - if has_nulls: - datetime_data[f"datetime64[{unit}]_null"] = np.array( - [0, None, 2, 3] * 10, dtype=np.dtype(f"datetime64[{unit}]") - ) - - # string - string_data["string"] = np.array( - # Test multi-byte characters as well to ensure that the chunking works correctly for them - ["English: test string", " ", "Chinese: 测试字符串", "Russian: тестовая строка"] - * 10 - ) - if has_nulls: - string_data["string_null"] = np.array( - [ - "English: test string", - None, - "Chinese: 测试字符串", - "Russian: тестовая строка", - ] - * 10 - ) - - # category - category_data["category_string"] = pandas.Categorical( - ["Sample", "te", " ", "xt"] * 10 - ) - # HDK does not support non-string categories - # category_data["category_int"] = pandas.Categorical([1, 2, 3, 4] * 10) - if has_nulls: - category_data["category_string_null"] = pandas.Categorical( - ["Sample", None, " ", "xt"] * 10 - ) - - data = { - **bool_data, - **int_data, - **uint_data, - **float_data, - **datetime_data, - **string_data, - **category_data, - } - - if include_dtypes is not None: - filtered_keys = ( - key - for key in data.keys() - if any(key.startswith(dtype) for dtype in include_dtypes) - ) - data = {key: data[key] for key in filtered_keys} - - if exclude_dtypes is not None: - filtered_keys = ( - key - for key in data.keys() - if not any(key.startswith(dtype) for dtype in exclude_dtypes) - ) - data = {key: data[key] for key in filtered_keys} - - return data diff --git a/modin/tests/pandas/dataframe/conftest.py b/modin/tests/pandas/dataframe/conftest.py deleted file mode 100644 index 8c62d03d34f..00000000000 --- a/modin/tests/pandas/dataframe/conftest.py +++ /dev/null @@ -1,42 +0,0 @@ -# Licensed to Modin Development Team under one or more contributor license agreements. -# See the NOTICE file distributed with this work for additional information regarding -# copyright ownership. 
The Modin Development Team licenses this file to you under the -# Apache License, Version 2.0 (the "License"); you may not use this file except in -# compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under -# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific language -# governing permissions and limitations under the License. - -import pytest - -from modin.config import StorageFormat - - -def pytest_collection_modifyitems(items): - if StorageFormat.get() == "Hdk": - for item in items: - if item.name in ( - "test_sum[data0-over_rows_int-skipna_True-True]", - "test_sum[data0-over_rows_str-skipna_True-True]", - ): - item.add_marker( - pytest.mark.xfail( - reason="https://github.com/intel-ai/hdk/issues/286" - ) - ) - elif item.name == "test_insert_dtypes[category-int_data]": - item.add_marker( - pytest.mark.xfail( - reason="Categorical columns are converted to string due to #1698" - ) - ) - elif item.name == "test_insert_dtypes[int32-float_nan_data]": - item.add_marker( - pytest.mark.xfail( - reason="HDK does not raise IntCastingNaNError on NaN to int cast" - ) - ) diff --git a/modin/tests/pandas/dataframe/test_binary.py b/modin/tests/pandas/dataframe/test_binary.py index 10dabbe32bc..3e17962a6cd 100644 --- a/modin/tests/pandas/dataframe/test_binary.py +++ b/modin/tests/pandas/dataframe/test_binary.py @@ -17,7 +17,7 @@ import pytest import modin.pandas as pd -from modin.config import Engine, NPartitions, StorageFormat +from modin.config import NPartitions, StorageFormat from modin.core.dataframe.pandas.partitioning.axis_partition import ( PandasDataframeAxisPartition, ) @@ -189,12 +189,7 @@ def test_math_alias(math_op, alias): @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_comparison(data, op, other, request): def operation(df): - df = getattr(df, op)(df if other == "as_left" else other) - if other == "as_left" and StorageFormat.get() == "Hdk": - # In case of comparison with a DataFrame, HDK returns - # a DataFrame with sorted columns. 
- df = df.sort_index(axis=1) - return df + return getattr(df, op)(df if other == "as_left" else other) expected_exception = None if "int_data" in request.node.callspec.id and other == "a": @@ -203,8 +198,6 @@ def operation(df): expected_exception = TypeError( "Invalid comparison between dtype=float64 and str" ) - if StorageFormat.get() == "Hdk": - pytest.xfail(reason="https://github.com/modin-project/modin/issues/7019") eval_general( *create_test_dfs(data), @@ -481,13 +474,7 @@ def test_non_commutative_multiply(): [ pytest.param([10, 20], id="int"), pytest.param([10, True], id="obj"), - pytest.param( - [True, True], - id="bool", - marks=pytest.mark.skipif( - condition=Engine.get() == "Native", reason="Fails on HDK" - ), - ), + pytest.param([True, True], id="bool"), pytest.param([3.5, 4.5], id="float"), ], ) @@ -496,22 +483,10 @@ def test_non_commutative_multiply(): [ pytest.param([10, 20], id="int"), pytest.param([10, True], id="obj"), - pytest.param( - [True, True], - id="bool", - marks=pytest.mark.skipif( - condition=Engine.get() == "Native", reason="Fails on HDK" - ), - ), + pytest.param([True, True], id="bool"), pytest.param([3.5, 4.5], id="float"), pytest.param(2, id="int scalar"), - pytest.param( - True, - id="bool scalar", - marks=pytest.mark.skipif( - condition=Engine.get() == "Native", reason="Fails on HDK" - ), - ), + pytest.param(True, id="bool scalar"), pytest.param(3.5, id="float scalar"), ], ) diff --git a/modin/tests/pandas/dataframe/test_default.py b/modin/tests/pandas/dataframe/test_default.py index 45ab3e2ec95..bad7e54031b 100644 --- a/modin/tests/pandas/dataframe/test_default.py +++ b/modin/tests/pandas/dataframe/test_default.py @@ -355,28 +355,17 @@ def test_corr_nans_in_different_partitions(self): @pytest.mark.parametrize("ddof", [1, 2, 4], ids=lambda x: f"ddof={x}") @pytest.mark.parametrize("backend", [None, "pyarrow"]) def test_cov(min_periods, ddof, backend): - # Modin result may slightly differ from pandas result - # due to floating pointing arithmetic. - if StorageFormat.get() == "Hdk": - - def comparator1(df1, df2): - modin_df_almost_equals_pandas(df1, df2, max_diff=0.0002) - - comparator2 = comparator1 - else: - comparator1 = df_equals - comparator2 = modin_df_almost_equals_pandas - eval_general( *create_test_dfs(test_data["int_data"], backend=backend), lambda df: df.cov(min_periods=min_periods, ddof=ddof), - comparator=comparator1, + comparator=df_equals, ) - + # Modin result may slightly differ from pandas result + # due to floating-point arithmetic. That's why we use `modin_df_almost_equals_pandas`. eval_general( *create_test_dfs(test_data["float_nan_data"], backend=backend), lambda df: df.cov(min_periods=min_periods), - comparator=comparator2, + comparator=modin_df_almost_equals_pandas, ) @@ -568,17 +557,8 @@ def test_last(): "value_vars", [lambda df: df.columns[-1], lambda df: df.columns[-4:], None] ) def test_melt(data, id_vars, value_vars): - if StorageFormat.get() == "Hdk": - # Drop NA and sort by all columns to make sure the order - # is identical to Pandas.
- def melt(df, *args, **kwargs): - df = df.melt(*args, **kwargs).dropna() - return df.sort_values(df.columns.tolist()) - - else: - - def melt(df, *args, **kwargs): - return df.melt(*args, **kwargs).sort_values(["variable", "value"]) + def melt(df, *args, **kwargs): + return df.melt(*args, **kwargs).sort_values(["variable", "value"]) eval_general( *create_test_dfs(data), @@ -590,8 +570,6 @@ def melt(df, *args, **kwargs): # Functional test for BUG:7206 def test_melt_duplicate_col_names(): - if StorageFormat.get() == "Hdk": - pass data = {"data": [[1, 2], [3, 4]], "columns": ["dupe", "dupe"]} def melt(df, *args, **kwargs): @@ -624,10 +602,7 @@ def test_pivot(data, index, columns, values, request): in request.node.callspec.id or "default-one_column-several_columns_index" in request.node.callspec.id or "default-one_column-one_column_index" in request.node.callspec.id - or ( - current_execution in ("BaseOnPython", "HdkOnNative") - and index is lib.no_default - ) + or (current_execution in ("BaseOnPython",) and index is lib.no_default) ): pytest.xfail(reason="https://github.com/modin-project/modin/issues/7010") @@ -1476,10 +1451,9 @@ def test_setattr_axes(): # In BaseOnPython, setting columns raises a warning because get_axis # defaults to pandas. warnings.simplefilter("error") - if StorageFormat.get() != "Hdk": # Not yet supported - #1766 - df.index = ["foo", "bar"] - # Check that ensure_index was called - pd.testing.assert_index_equal(df.index, pandas.Index(["foo", "bar"])) + df.index = ["foo", "bar"] + # Check that ensure_index was called + pd.testing.assert_index_equal(df.index, pandas.Index(["foo", "bar"])) df.columns = [9, 10] pd.testing.assert_index_equal(df.columns, pandas.Index([9, 10])) diff --git a/modin/tests/pandas/dataframe/test_indexing.py b/modin/tests/pandas/dataframe/test_indexing.py index d984fce252f..b3f1bed2dc9 100644 --- a/modin/tests/pandas/dataframe/test_indexing.py +++ b/modin/tests/pandas/dataframe/test_indexing.py @@ -21,7 +21,7 @@ from pandas._testing import ensure_clean import modin.pandas as pd -from modin.config import MinPartitionSize, NPartitions, StorageFormat +from modin.config import MinPartitionSize, NPartitions from modin.pandas.indexing import is_range_like from modin.pandas.testing import assert_index_equal from modin.tests.pandas.utils import ( @@ -1491,13 +1491,7 @@ def test_reset_index(data, test_async_reset_index): "data", [ test_data["int_data"], - pytest.param( - test_data["float_nan_data"], - marks=pytest.mark.xfail( - StorageFormat.get() == "Hdk", - reason="https://github.com/modin-project/modin/issues/2896", - ), - ), + test_data["float_nan_data"], ], ) def test_reset_index_multiindex_groupby(data): @@ -2215,10 +2209,6 @@ def test___setitem__(data): df_equals(modin_df, pandas_df) -@pytest.mark.xfail( - StorageFormat.get() == "Hdk", - reason="https://github.com/intel-ai/hdk/issues/165", -) def test___setitem__partitions_aligning(): # from issue #2390 modin_df = pd.DataFrame({"a": [1, 2, 3]}) @@ -2286,9 +2276,6 @@ def test___setitem__mask(): modin_df[array] = 20 -@pytest.mark.skipif( - StorageFormat.get() == "Hdk", reason="https://github.com/intel-ai/hdk/issues/165" -) @pytest.mark.parametrize( "data", [ diff --git a/modin/tests/pandas/dataframe/test_join_sort.py b/modin/tests/pandas/dataframe/test_join_sort.py index 2a33e0b860d..4e2c6517bb2 100644 --- a/modin/tests/pandas/dataframe/test_join_sort.py +++ b/modin/tests/pandas/dataframe/test_join_sort.py @@ -19,7 +19,7 @@ import pytest import modin.pandas as pd -from modin.config import Engine, 
NPartitions, RangePartitioning, StorageFormat +from modin.config import Engine, NPartitions, StorageFormat from modin.pandas.io import to_pandas from modin.tests.pandas.utils import ( arg_keys, @@ -72,9 +72,6 @@ def test_combine(data): ) -@pytest.mark.xfail( - StorageFormat.get() == "Hdk", reason="https://github.com/intel-ai/hdk/issues/264" -) @pytest.mark.parametrize( "test_data, test_data2", [ @@ -237,10 +234,6 @@ def test_join_6602(): teams.set_index("league_abbreviation").join(abbreviations.rename("league_name")) -@pytest.mark.skipif( - RangePartitioning.get() and StorageFormat.get() == "Hdk", - reason="Doesn't make sense for HDK", -) @pytest.mark.parametrize( "test_data, test_data2", [ @@ -682,29 +675,10 @@ def test_sort_values( if ascending is None and key is not None: pytest.skip("Pandas bug #41318") - # If index is preserved and `key` function is ``None``, - # it could be sorted along rows differently from pandas. - # The order of NA rows, sorted by HDK, is different (but still valid) - # from pandas. To make the index identical to pandas, we add the - # index names to 'by'. - by_index_names = None - if ( - StorageFormat.get() == "Hdk" - and not ignore_index - and key is None - and (axis == 0 or axis == "rows") - ): - by_index_names = [] if "multiindex" in by: index = generate_multiindex(len(data[list(data.keys())[0]]), nlevels=2) columns = generate_multiindex(len(data.keys()), nlevels=2) data = {columns[ind]: data[key] for ind, key in enumerate(data)} - if by_index_names is not None: - by_index_names.extend(index.names) - elif by_index_names is not None: - index = pd.RangeIndex(0, len(next(iter(data.values()))), name="test_idx") - columns = None - by_index_names.append(index.name) else: index = None columns = None @@ -728,9 +702,6 @@ def test_sort_values( else: raise Exception('Unknown "by" specifier:' + b) - if by_index_names is not None: - by_list.extend(by_index_names) - # Create "ascending" list if ascending in ["list_first_True", "list_first_False"]: start = 0 if ascending == "list_first_False" else 1 @@ -769,10 +740,6 @@ def test_sort_values_descending_with_only_two_bins(): ) -@pytest.mark.skipif( - StorageFormat.get() == "Hdk", - reason="https://github.com/modin-project/modin/issues/3941", -) @pytest.mark.parametrize("ignore_index", [True, False]) def test_sort_values_preserve_index_names(ignore_index): modin_df, pandas_df = create_test_dfs( diff --git a/modin/tests/pandas/dataframe/test_map_metadata.py b/modin/tests/pandas/dataframe/test_map_metadata.py index ab7a7fa4a31..9c6ed0cc2af 100644 --- a/modin/tests/pandas/dataframe/test_map_metadata.py +++ b/modin/tests/pandas/dataframe/test_map_metadata.py @@ -468,19 +468,7 @@ def test_astype_errors(errors): ) -@pytest.mark.parametrize( - "has_dtypes", - [ - pytest.param( - False, - marks=pytest.mark.xfail( - StorageFormat.get() == "Hdk", - reason="HDK does not support cases when `.dtypes` is None", - ), - ), - True, - ], -) +@pytest.mark.parametrize("has_dtypes", [False, True]) def test_astype_copy(has_dtypes): data = [1] modin_df, pandas_df = pd.DataFrame(data), pandas.DataFrame(data) @@ -563,11 +551,6 @@ def test_astype_category_large(): assert modin_result.dtypes.equals(pandas_result.dtypes) -@pytest.mark.xfail( - StorageFormat.get() == "Hdk", - reason="https://github.com/modin-project/modin/issues/6268", - strict=True, -) def test_astype_int64_to_astype_category_github_issue_6259(): eval_general( *create_test_dfs( @@ -613,21 +596,6 @@ def _get_lazy_proxy(): df = df.astype({"a": "category"}) return df.dtypes["a"], 
original_dtype, df - elif StorageFormat.get() == "Hdk": - import pyarrow as pa - - from modin.pandas.io import from_arrow - - at = pa.concat_tables( - [ - pa.Table.from_pandas(chunk.astype({"a": "category"})) - for chunk in chunks - ] - ) - assert len(at.column(0).chunks) == nchunks - - df = from_arrow(at) - return df.dtypes["a"], original_dtype, df else: raise NotImplementedError() @@ -675,9 +643,6 @@ def test_update_proxy_implicit(self): if StorageFormat.get() == "Pandas": assert lazy_proxy._parent is parent_frame - elif StorageFormat.get() == "Hdk": - arrow_table = parent_frame._partitions[0, 0].get() - assert lazy_proxy._parent is arrow_table else: raise NotImplementedError( f"The test is not implemented for {StorageFormat.get()} storage format" @@ -692,11 +657,6 @@ def test_update_proxy_implicit(self): # Make sure that the old proxy still pointing to the old parent assert lazy_proxy._parent is parent_frame assert new_lazy_proxy._parent is new_parent_frame - elif StorageFormat.get() == "Hdk": - new_arrow_table = new_parent_frame._partitions[0, 0].get() - # Make sure that the old proxy still pointing to the old parent - assert lazy_proxy._parent is arrow_table - assert new_lazy_proxy._parent is new_arrow_table else: raise NotImplementedError( f"The test is not implemented for {StorageFormat.get()} storage format" @@ -827,10 +787,6 @@ def comparator(df1, df2): ) -@pytest.mark.xfail( - StorageFormat.get() == "Hdk", - reason="HDK does not support columns with different types", -) def test_convert_dtypes_multiple_row_partitions(): # Column 0 should have string dtype modin_part1 = pd.DataFrame(["a"]).convert_dtypes() @@ -1345,14 +1301,12 @@ def test_insert(data): modin_df, pandas_df, col="Duplicate", value=lambda df: df[df.columns[0]] ) eval_insert(modin_df, pandas_df, col="Scalar", value=100) - if StorageFormat.get() != "Hdk": - # FIXME: https://github.com/modin-project/modin/issues/7027 - eval_insert( - pd.DataFrame(columns=list("ab")), - pandas.DataFrame(columns=list("ab")), - col="Series insert", - value=lambda df: df[df.columns[0]], - ) + eval_insert( + pd.DataFrame(columns=list("ab")), + pandas.DataFrame(columns=list("ab")), + col="Series insert", + value=lambda df: df[df.columns[0]], + ) eval_insert( modin_df, pandas_df, diff --git a/modin/tests/pandas/dataframe/test_reduce.py b/modin/tests/pandas/dataframe/test_reduce.py index d6f76d68507..1faaaca0107 100644 --- a/modin/tests/pandas/dataframe/test_reduce.py +++ b/modin/tests/pandas/dataframe/test_reduce.py @@ -17,11 +17,9 @@ import pytest import modin.pandas as pd -from modin.config import NPartitions, StorageFormat -from modin.pandas.testing import assert_series_equal +from modin.config import NPartitions from modin.tests.pandas.utils import ( arg_keys, - assert_dtypes_equal, axis_keys, axis_values, bool_arg_keys, @@ -105,10 +103,6 @@ def test_count_specific(numeric_only): ) -@pytest.mark.skipif( - StorageFormat.get() == "Hdk", - reason="https://github.com/intel-ai/hdk/issues/513", -) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_count_dtypes(data): modin_df, pandas_df = pd.DataFrame(data), pandas.DataFrame(data) @@ -293,16 +287,6 @@ def test_prod( @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) @pytest.mark.parametrize("data", [test_data["float_nan_data"]]) def test_sum(data, axis, skipna, is_transposed, request): - if ( - StorageFormat.get() == "Hdk" - and is_transposed - and skipna - and ( - "over_rows_int" in request.node.callspec.id - or "over_rows_str" in request.node.callspec.id 
- ) - ): - pytest.xfail(reason="https://github.com/modin-project/modin/issues/7028") eval_general( *create_test_dfs(data), lambda df: (df.T if is_transposed else df).sum( @@ -415,9 +399,6 @@ def test_reduce_specific(fn, numeric_only, axis): expected_exception = TypeError( f"'{operator}' not supported between instances of 'str' and 'float'" ) - if StorageFormat.get() == "Hdk": - # FIXME: https://github.com/modin-project/modin/issues/7030 - expected_exception = False else: # FIXME: https://github.com/modin-project/modin/issues/7030 expected_exception = False @@ -482,23 +463,9 @@ def test_value_counts_categorical(): modin_df, pandas_df = create_test_dfs( {"col1": data, "col2": data}, dtype="category" ) - - if StorageFormat.get() == "Hdk": - # The order of HDK categories is different from Pandas - # and, thus, index comparison fails. - def comparator(df1, df2): - # Perform our own non-strict version of dtypes equality check - assert_dtypes_equal(df1, df2) - assert_series_equal( - df1._to_pandas(), df2, check_index=False, check_dtype=False - ) - - else: - comparator = df_equals - eval_general( modin_df, pandas_df, lambda df: df.value_counts(), - comparator=comparator, + comparator=df_equals, ) diff --git a/modin/tests/pandas/dataframe/test_window.py b/modin/tests/pandas/dataframe/test_window.py index 449dde389b4..d7f1bc5d869 100644 --- a/modin/tests/pandas/dataframe/test_window.py +++ b/modin/tests/pandas/dataframe/test_window.py @@ -17,7 +17,7 @@ import pytest import modin.pandas as pd -from modin.config import NPartitions, StorageFormat +from modin.config import NPartitions from modin.tests.pandas.utils import ( arg_keys, axis_keys, @@ -25,7 +25,6 @@ bool_arg_keys, bool_arg_values, create_test_dfs, - default_to_pandas_ignore_string, df_equals, eval_general, int_arg_keys, @@ -47,9 +46,6 @@ # Force matplotlib to not use any Xwindows backend. 
matplotlib.use("Agg") -if StorageFormat.get() == "Hdk": - pytestmark = pytest.mark.filterwarnings(default_to_pandas_ignore_string) - @pytest.mark.parametrize("axis", [0, 1]) @pytest.mark.parametrize("skipna", [False, True]) @@ -182,10 +178,6 @@ def test_fillna(data, method, axis, limit): df_equals(modin_result, pandas_result) -@pytest.mark.skipif( - StorageFormat.get() == "Hdk", - reason="'datetime64[ns, pytz.FixedOffset(60)]' vs 'datetime64[ns, UTC+01:00]'", -) def test_fillna_sanity(): # with different dtype frame_data = [ diff --git a/modin/tests/pandas/test_concat.py b/modin/tests/pandas/test_concat.py index 19f9a8d21fc..3f2c6a0daf5 100644 --- a/modin/tests/pandas/test_concat.py +++ b/modin/tests/pandas/test_concat.py @@ -336,7 +336,7 @@ def test_concat_empty_df_series(): @pytest.mark.skipif( - StorageFormat.get() not in ("Hdk", "Base"), + StorageFormat.get() != "Base", reason="https://github.com/modin-project/modin/issues/5696", ) @pytest.mark.parametrize("col_type", [None, "str"]) diff --git a/modin/tests/pandas/test_general.py b/modin/tests/pandas/test_general.py index 3f3b4e74afc..6d32ac93312 100644 --- a/modin/tests/pandas/test_general.py +++ b/modin/tests/pandas/test_general.py @@ -19,7 +19,6 @@ from numpy.testing import assert_array_equal import modin.pandas as pd -from modin.config import StorageFormat from modin.pandas.io import to_pandas from modin.pandas.testing import assert_frame_equal from modin.tests.test_utils import warns_that_defaulting_to_pandas @@ -29,7 +28,6 @@ bool_arg_keys, bool_arg_values, create_test_dfs, - default_to_pandas_ignore_string, df_equals, eval_general, sort_if_range_partitioning, @@ -38,12 +36,9 @@ test_data_values, ) -if StorageFormat.get() == "Hdk": - pytestmark = pytest.mark.filterwarnings(default_to_pandas_ignore_string) -else: - pytestmark = pytest.mark.filterwarnings( - "default:`DataFrame.insert` for empty DataFrame is not currently supported.*:UserWarning" - ) +pytestmark = pytest.mark.filterwarnings( + "default:`DataFrame.insert` for empty DataFrame is not currently supported.*:UserWarning" +) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @@ -507,8 +502,8 @@ def test_pivot(): with pytest.raises(ValueError): pd.pivot(test_df["bar"], index="foo", columns="bar", values="baz") - if get_current_execution() != "BaseOnPython" and StorageFormat.get() != "Hdk": - # FIXME: Failed for some reason on 'BaseOnPython' and 'HDK' + if get_current_execution() != "BaseOnPython": + # FIXME: Failed for some reason on 'BaseOnPython' # https://github.com/modin-project/modin/issues/6240 df_equals( pd.pivot(test_df, columns="bar"), @@ -634,10 +629,6 @@ def test_unique(): assert modin_result.shape == pandas_result.shape -@pytest.mark.xfail( - StorageFormat.get() == "Hdk", - reason="https://github.com/modin-project/modin/issues/2896", -) @pytest.mark.parametrize("normalize, bins, dropna", [(True, 3, False)]) def test_value_counts(normalize, bins, dropna): # We sort indices for Modin and pandas result because of issue #1650 @@ -933,11 +924,7 @@ def test_default_to_pandas_warning_message(func, regex): def test_empty_dataframe(): df = pd.DataFrame(columns=["a", "b"]) - with ( - warns_that_defaulting_to_pandas() - if StorageFormat.get() != "Hdk" - else contextlib.nullcontext() - ): + with warns_that_defaulting_to_pandas(): df[(df.a == 1) & (df.b == 2)] diff --git a/modin/tests/pandas/test_groupby.py b/modin/tests/pandas/test_groupby.py index c10cba13b1d..46779e0dfc4 100644 --- a/modin/tests/pandas/test_groupby.py +++ 
b/modin/tests/pandas/test_groupby.py @@ -2633,7 +2633,7 @@ def compare(obj1, obj2): @pytest.mark.skipif( - get_current_execution() == "BaseOnPython" or StorageFormat.get() == "Hdk", + get_current_execution() == "BaseOnPython", reason="The test only make sense for partitioned executions", ) def test_groupby_with_virtual_partitions(): diff --git a/modin/tests/pandas/test_io.py b/modin/tests/pandas/test_io.py index 023e8c5a0a5..e1afab5a1bd 100644 --- a/modin/tests/pandas/test_io.py +++ b/modin/tests/pandas/test_io.py @@ -58,6 +58,7 @@ df_equals, dummy_decorator, eval_general, + eval_io, eval_io_from_str, generate_dataframe, get_unique_filename, @@ -70,14 +71,6 @@ from .utils import test_data as utils_test_data from .utils import time_parsing_csv_path -if StorageFormat.get() == "Hdk": - from modin.tests.experimental.hdk_on_native.utils import ( - align_datetime_dtypes, - eval_io, - ) -else: - from .utils import eval_io - if StorageFormat.get() == "Pandas": import modin.pandas as pd else: @@ -471,11 +464,8 @@ def test_read_csv_parsing_3( skipfooter, nrows, ): - xfail_case = ( - (false_values or true_values) - and Engine.get() != "Python" - and StorageFormat.get() != "Hdk" - ) + # TODO: Check #2446 as it was closed + xfail_case = (false_values or true_values) and Engine.get() != "Python" if xfail_case: pytest.xfail("modin and pandas dataframes differs - issue #2446") @@ -749,17 +739,6 @@ def test_read_csv_file_format( lineterminator=lineterminator, ) - if ( - (StorageFormat.get() == "Hdk") - and (escapechar is not None) - and (lineterminator is None) - and (thousands is None) - and (decimal == ".") - ): - with open(unique_filename, "r") as f: - if any(line.find(f',"{escapechar}') != -1 for _, line in enumerate(f)): - pytest.xfail("Tests with this character sequence fail due to #5649") - expected_exception = None if dialect is None: # FIXME: https://github.com/modin-project/modin/issues/7035 @@ -824,11 +803,8 @@ def test_read_csv_error_handling(self, on_bad_lines): # in that case exceptions are raised both by Modin and pandas # and tests pass raise_exception_case = on_bad_lines is not None - if ( - not raise_exception_case - and Engine.get() not in ["Python"] - and StorageFormat.get() != "Hdk" - ): + # TODO: Check #2500 as it was closed + if not raise_exception_case and Engine.get() not in ["Python"]: pytest.xfail("read_csv doesn't raise `bad lines` exceptions - issue #2500") eval_io( fn_name="read_csv", @@ -871,8 +847,6 @@ def test_python_engine_low_memory_except(self, low_memory): @pytest.mark.parametrize("delim_whitespace", [True, False]) def test_delim_whitespace(self, delim_whitespace, tmp_path): - if StorageFormat.get() == "Hdk" and delim_whitespace: - pytest.xfail(reason="https://github.com/modin-project/modin/issues/6999") str_delim_whitespaces = "col1 col2 col3 col4\n5 6 7 8\n9 10 11 12\n" unique_filename = get_unique_filename(data_dir=tmp_path) eval_io_from_str( @@ -976,13 +950,6 @@ def test_read_csv_parse_dates( expected_exception = ValueError( "Missing column provided to 'parse_dates': 'z'" ) - if ( - StorageFormat.get() == "Hdk" - and "names1-0-None-nonexistent_string_column-strict-None" - in request.node.callspec.id - ): - # FIXME: https://github.com/modin-project/modin/issues/7035 - expected_exception = False eval_io( fn_name="read_csv", expected_exception=expected_exception, @@ -1034,7 +1001,7 @@ def test_read_csv_skiprows_names(self, names, skiprows): def _has_pandas_fallback_reason(self): # The Python engine does not use custom IO dispatchers, so specialized error messages # 
won't appear - return Engine.get() != "Python" and StorageFormat.get() != "Hdk" + return Engine.get() != "Python" def test_read_csv_default_to_pandas(self): if self._has_pandas_fallback_reason(): @@ -1051,10 +1018,6 @@ def test_read_csv_url(self): fn_name="read_csv", # read_csv kwargs filepath_or_buffer="https://raw.githubusercontent.com/modin-project/modin/main/modin/tests/pandas/data/blah.csv", - # It takes about ~17Gb of RAM for HDK to import the whole table from this test - # because of too many (~1000) string columns in it. Taking a subset of columns - # to be able to run this test on low-RAM machines. - usecols=[0, 1, 2, 3] if StorageFormat.get() == "Hdk" else None, ) @pytest.mark.parametrize("nrows", [21, 5, None]) @@ -1072,7 +1035,7 @@ def test_read_csv_newlines_in_quotes(self, nrows, skiprows): filepath_or_buffer="modin/tests/pandas/data/newlines.csv", nrows=nrows, skiprows=skiprows, - cast_to_str=StorageFormat.get() != "Hdk", + cast_to_str=True, ) @pytest.mark.parametrize("skiprows", [None, 0, [], [1, 2], np.arange(0, 2)]) @@ -1128,9 +1091,6 @@ def test_read_csv_names_neq_num_cols(self, kwargs): def test_read_csv_wrong_path(self): expected_exception = FileNotFoundError(2, "No such file or directory") - if StorageFormat.get() == "Hdk": - # FIXME: https://github.com/modin-project/modin/issues/7035 - expected_exception = False eval_io( fn_name="read_csv", expected_exception=expected_exception, @@ -1231,11 +1191,6 @@ def wrapped_read_csv(file, method): pytest.csvs_names["test_read_csv_regular"], method="modin" ) - if StorageFormat.get() == "Hdk": - # Aligning DateTime dtypes because of the bug related to the `parse_dates` parameter: - # https://github.com/modin-project/modin/issues/3485 - modin_df, pandas_df = align_datetime_dtypes(modin_df, pandas_df) - df_equals(modin_df, pandas_df) @pytest.mark.parametrize( @@ -1413,9 +1368,6 @@ def wrapped_read_table(file, method): pandas_df = wrapped_read_table(unique_filename, method="pandas") modin_df = wrapped_read_table(unique_filename, method="modin") - if StorageFormat.get() == "Hdk": - modin_df, pandas_df = align_datetime_dtypes(modin_df, pandas_df) - df_equals(modin_df, pandas_df) def test_read_table_empty_frame(self, make_csv_file): @@ -2172,9 +2124,7 @@ def test_read_parquet_s3_with_column_partitioning( def test_read_parquet_relative_to_user_home(make_parquet_file): with ensure_clean(".parquet") as unique_filename: make_parquet_file(filename=unique_filename) - _check_relative_io( - "read_parquet", unique_filename, "path", storage_default=("Hdk",) - ) + _check_relative_io("read_parquet", unique_filename, "path") @pytest.mark.filterwarnings(default_to_pandas_ignore_string) @@ -2368,8 +2318,9 @@ def test_read_excel_all_sheets(self, make_excel_file): for key in pandas_df.keys(): df_equals(modin_df.get(key), pandas_df.get(key)) + # TODO: Check pandas gh-#39250 as it was fixed @pytest.mark.xfail( - Engine.get() != "Python" and StorageFormat.get() != "Hdk", + Engine.get() != "Python", reason="pandas throws the exception. See pandas issue #39250 for more info", ) @check_file_leaks @@ -2763,9 +2714,6 @@ def test_to_sql(self, tmp_path, make_sql_connection, index, conn_type): assert df_modin_sql.sort_index().equals(df_pandas_sql.sort_index()) -@pytest.mark.skipif( - StorageFormat.get() == "Hdk", reason="Missing optional dependency 'lxml'." 
-) @pytest.mark.filterwarnings(default_to_pandas_ignore_string) class TestHtml: def test_read_html(self, make_html_file): @@ -3200,9 +3148,6 @@ def test_to_pickle(self, tmp_path): df_equals(modin_df, recreated_modin_df) -@pytest.mark.skipif( - StorageFormat.get() == "Hdk", reason="Missing optional dependency 'lxml'." -) @pytest.mark.filterwarnings(default_to_pandas_ignore_string) class TestXml: def test_read_xml(self): @@ -3342,7 +3287,7 @@ def test_to_latex(): @pytest.mark.filterwarnings(default_to_pandas_ignore_string) def test_to_xml(): # `lxml` is a required dependency for `to_xml`, but optional for Modin. - # For some engines we do not install it (like for HDK). + # For some engines we do not install it. pytest.importorskip("lxml") modin_df, _ = create_test_dfs(TEST_DATA) assert modin_df.to_xml() == to_pandas(modin_df).to_xml() diff --git a/modin/tests/pandas/test_series.py b/modin/tests/pandas/test_series.py index be737e4c70d..b7be43c62dc 100644 --- a/modin/tests/pandas/test_series.py +++ b/modin/tests/pandas/test_series.py @@ -30,7 +30,9 @@ import modin.pandas as pd from modin.config import Engine, NPartitions, StorageFormat from modin.pandas.io import to_pandas -from modin.pandas.testing import assert_series_equal +from modin.tests.core.storage_formats.pandas.test_internals import ( + construct_modin_df_by_scheme, +) from modin.tests.test_utils import warns_that_defaulting_to_pandas from modin.utils import get_current_execution, try_cast_to_pandas @@ -44,7 +46,6 @@ agg_func_keys, agg_func_values, arg_keys, - assert_dtypes_equal, bool_arg_keys, bool_arg_values, categories_equals, @@ -87,11 +88,6 @@ test_string_list_data_values, ) -if StorageFormat.get() != "Hdk": - from modin.tests.core.storage_formats.pandas.test_internals import ( - construct_modin_df_by_scheme, - ) - # Our configuration in pytest.ini requires that we explicitly catch all # instances of defaulting to pandas, but some test modules, like this one, # have too many such instances. 
@@ -671,10 +667,6 @@ def test___str__(data): assert str(modin_series) == str(pandas_series) -@pytest.mark.skipif( - StorageFormat.get() == "Hdk", - reason="https://github.com/intel-ai/hdk/issues/272", -) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test___sub__(data): modin_series, pandas_series = create_test_series(data) @@ -1061,13 +1053,7 @@ def test_asof_large(where): "data", [ test_data["int_data"], - pytest.param( - test_data["float_nan_data"], - marks=pytest.mark.xfail( - StorageFormat.get() == "Hdk", - reason="HDK does not raise IntCastingNaNError", - ), - ), + test_data["float_nan_data"], ], ids=test_data_keys, ) @@ -1312,10 +1298,6 @@ def test_clip_sequence(request, data, bound_type): df_equals(modin_result, pandas_result) -@pytest.mark.skipif( - StorageFormat.get() == "Hdk", - reason="https://github.com/intel-ai/hdk/issues/271", -) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_combine(data): modin_series, _ = create_test_series(data) # noqa: F841 @@ -1323,10 +1305,6 @@ def test_combine(data): modin_series.combine(modin_series2, lambda s1, s2: s1 if s1 < s2 else s2) -@pytest.mark.skipif( - StorageFormat.get() == "Hdk", - reason="https://github.com/intel-ai/hdk/issues/271", -) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_combine_first(data): modin_series, pandas_series = create_test_series(data) @@ -1409,25 +1387,20 @@ def comparator(df1, df2): df_equals(df1, df2) df_equals(df1.dtypes, df2.dtypes) - if StorageFormat.get() != "Hdk": - # FIXME: HDK should also work in this case - eval_general( - modin_series, - pandas_series, - lambda ser: ser - + (modin_series if isinstance(ser, pd.Series) else pandas_series), - comparator=comparator, - ) + eval_general( + modin_series, + pandas_series, + lambda ser: ser + + (modin_series if isinstance(ser, pd.Series) else pandas_series), + comparator=comparator, + ) - if StorageFormat.get() != "Hdk": - # FIXME: HDK should also work in this case but - # since we deprecated it, we will just remove this branch - eval_general( - modin_series, - pandas_series, - lambda ser: ser > (ser + 1), - comparator=comparator, - ) + eval_general( + modin_series, + pandas_series, + lambda ser: ser > (ser + 1), + comparator=comparator, + ) eval_general( modin_series, @@ -1443,14 +1416,12 @@ def comparator(df1, df2): comparator=comparator, ) - if StorageFormat.get() != "Hdk": - # FIXME: HDK should also work in this case - eval_general( - modin_series, - pandas_series, - lambda ser: ser.fillna(0), - comparator=comparator, - ) + eval_general( + modin_series, + pandas_series, + lambda ser: ser.fillna(0), + comparator=comparator, + ) def test_pyarrow_array_retrieve(): @@ -1846,13 +1817,7 @@ def test_dtype(data): "timezone", [ pytest.param(None), - pytest.param( - "Europe/Berlin", - marks=pytest.mark.skipif( - StorageFormat.get() == "Hdk", - reason="HDK is unable to store TZ in the table schema", - ), - ), + pytest.param("Europe/Berlin"), ], ) def test_dt(timezone): @@ -1947,14 +1912,10 @@ def dt_with_empty_partition(lib): .dropna(axis=1) .squeeze(1) ) - # BaseOnPython ahd HDK had a single partition after the concat, and it + # BaseOnPython had a single partition after the concat, and it # maintains that partition after dropna and squeeze. In other execution modes, # the series should have two column partitions, one of which is empty. 
- if ( - isinstance(df, pd.DataFrame) - and get_current_execution() != "BaseOnPython" - and StorageFormat.get() != "Hdk" - ): + if isinstance(df, pd.DataFrame) and get_current_execution() != "BaseOnPython": assert df._query_compiler._modin_frame._partitions.shape == (1, 2) return df.dt.days @@ -2549,10 +2510,6 @@ def test_map(data, na_values): ) -@pytest.mark.xfail( - StorageFormat.get() == "Hdk", - reason="https://github.com/intel-ai/hdk/issues/542", -) def test_mask(): modin_series = pd.Series(np.arange(10)) m = modin_series % 3 == 0 @@ -3463,10 +3420,6 @@ def test_std(request, data, skipna, ddof): df_equals(modin_result, pandas_result) -@pytest.mark.skipif( - StorageFormat.get() == "Hdk", - reason="https://github.com/intel-ai/hdk/issues/272", -) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_sub(data): modin_series, pandas_series = create_test_series(data) @@ -3484,10 +3437,6 @@ def test_6782(): ) -@pytest.mark.skipif( - StorageFormat.get() == "Hdk", - reason="https://github.com/intel-ai/hdk/issues/272", -) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_subtract(data): modin_series, pandas_series = create_test_series(data) @@ -3899,13 +3848,7 @@ def test_update(data, other_data): [ pytest.param(None), pytest.param(False), - pytest.param( - True, - marks=pytest.mark.skipif( - StorageFormat.get() == "Hdk", - reason="https://github.com/modin-project/modin/issues/2896", - ), - ), + pytest.param(True), ], ) @pytest.mark.parametrize("ascending", [True, False]) @@ -3952,24 +3895,10 @@ def test_value_counts_categorical(): data = np.array(["a"] * 50000 + ["b"] * 10000 + ["c"] * 1000) random_state = np.random.RandomState(seed=42) random_state.shuffle(data) - - if StorageFormat.get() == "Hdk": - # The order of HDK categories is different from Pandas - # and, thus, index comparison fails. - def comparator(df1, df2): - # Perform our own non-strict version of dtypes equality check - assert_dtypes_equal(df1, df2) - assert_series_equal( - df1.modin.to_pandas(), df2, check_index=False, check_dtype=False - ) - - else: - comparator = df_equals - eval_general( *create_test_series(data, dtype="category"), lambda df: df.value_counts(), - comparator=comparator, + comparator=df_equals, ) @@ -4865,11 +4794,8 @@ def test_case_when(base, caselist): # 'base' and serieses from 'caselist' must have equal lengths, however in this test we want # to verify that 'case_when' works correctly even if partitioning of 'base' and 'caselist' isn't equal. - # HDK and BaseOnPython always use a single partition, thus skipping this test for them. - if ( - StorageFormat.get() != "Hdk" - and f"{StorageFormat.get()}On{Engine.get()}" != "BaseOnPython" - ): + # BaseOnPython always uses a single partition, thus skipping this test for them. 
+ if f"{StorageFormat.get()}On{Engine.get()}" != "BaseOnPython": modin_base_repart = construct_modin_df_by_scheme( base.to_frame(), partitioning_scheme={"row_lengths": [14, 14, 12], "column_widths": [1]}, @@ -4969,10 +4895,6 @@ def test_cat_ordered(data): assert modin_series.cat.ordered == pandas_series.cat.ordered -@pytest.mark.skipif( - StorageFormat.get() == "Hdk", - reason="HDK uses internal codes, that are different from Pandas", -) @pytest.mark.parametrize( "data", test_data_categorical_values, ids=test_data_categorical_keys ) @@ -4999,8 +4921,7 @@ def test_cat_codes_issue5650(set_min_partition_size): modin_df, pandas_df, lambda df: df["name"].cat.codes, - # https://github.com/modin-project/modin/issues/5973 - comparator_kwargs={"check_dtypes": StorageFormat.get() != "Hdk"}, + comparator_kwargs={"check_dtypes": True}, ) diff --git a/modin/tests/test_utils.py b/modin/tests/test_utils.py index c076fcd3856..bc478d957f9 100644 --- a/modin/tests/test_utils.py +++ b/modin/tests/test_utils.py @@ -357,10 +357,8 @@ def test_execute(): modin.utils.execute(pandas_df) mgr_cls.wait_partitions.assert_not_called() - # muke sure `trigger_hdk_import=True` doesn't broke anything - # when using other storage formats with patch.object(mgr_cls, "wait_partitions", new=Mock()): - modin.utils.execute(modin_df, trigger_hdk_import=True) + modin.utils.execute(modin_df) mgr_cls.wait_partitions.assert_called_once() # check several modin dataframes diff --git a/modin/utils.py b/modin/utils.py index a3ed1dc91a3..8305bc75da3 100644 --- a/modin/utils.py +++ b/modin/utils.py @@ -631,7 +631,7 @@ def try_cast_to_pandas(obj: Any, squeeze: bool = False) -> Any: return obj -def execute(*objs: Iterable[Any], trigger_hdk_import: bool = False) -> None: +def execute(*objs: Iterable[Any]) -> None: """ Trigger the lazy computations for each obj in `objs`, if any, and wait for them to complete. @@ -639,17 +639,12 @@ def execute(*objs: Iterable[Any], trigger_hdk_import: bool = False) -> None: ---------- *objs : Iterable[Any] A collection of objects to trigger lazy computations. - trigger_hdk_import : bool, default: False - Trigger import execution. Makes sense only for HDK storage format. - Safe to use with other storage formats. """ for obj in objs: if not hasattr(obj, "_query_compiler"): continue query_compiler = obj._query_compiler query_compiler.execute() - if trigger_hdk_import and hasattr(query_compiler, "force_import"): - query_compiler.force_import() def wrap_into_list(*args: Any, skipna: bool = True) -> List[Any]: @@ -794,18 +789,6 @@ def _get_modin_deps_info() -> Mapping[str, Optional[JSONSerializable]]: if version.parse(pkg.__version__) < pkg_version else "" ) - - try: - # We import ``DbWorker`` from this module since correct import of ``DbWorker`` itself - # from HDK is located in it with all the necessary options for dlopen. 
- from modin.experimental.core.execution.native.implementations.hdk_on_native.db_worker import ( # noqa - DbWorker, - ) - - result["hdk"] = "present" - except ImportError: - result["hdk"] = None - return result diff --git a/requirements/env_hdk.yml b/requirements/env_hdk.yml deleted file mode 100644 index f36ddf03efe..00000000000 --- a/requirements/env_hdk.yml +++ /dev/null @@ -1,49 +0,0 @@ -name: modin_on_hdk -channels: - - conda-forge -dependencies: - - pip - - # required dependencies - - pandas>=2.2,<2.3 - - numpy>=1.22.4 - - pyhdk==0.9 - - fsspec>=2022.11.0 - - packaging>=21.0 - - psutil>=5.8.0 - - # optional dependencies - - s3fs>=2022.11.0 - - openpyxl>=3.1.0 - - xlrd>=2.0.1 - - sqlalchemy>=2.0.0 - - scipy>=1.10.0 - - matplotlib>=3.6.3 - - xarray>=2022.12.0 - - pytables>=3.8.0 - - fastparquet>=2022.12.0 - # pandas isn't compatible with numexpr=2.8.5: https://github.com/modin-project/modin/issues/6469 - - numexpr<2.8.5 - - # dependencies for making release - - pygithub>=v1.58.0 - - # test dependencies - - coverage>=7.1.0 - - moto>=4.1.0 - - pytest>=7.3.2 - - pytest-cov>=4.0.0 - - pytest-xdist>=3.2.0 - - typing_extensions - - # code linters - - black>=24.1.0 - - flake8>=6.0.0 - - flake8-no-implicit-concat>=0.3.4 - - flake8-print>=5.0.0 - - mypy>=1.0.0 - - - pip: - - dataframe-api-compat>=0.2.7 - # The `numpydoc` version should match the version installed in the `lint-pydocstyle` job of the CI. - - numpydoc==1.6.0 diff --git a/scripts/doc_checker.py b/scripts/doc_checker.py index c2f07bdb92d..a56bd1aaec5 100644 --- a/scripts/doc_checker.py +++ b/scripts/doc_checker.py @@ -161,8 +161,7 @@ def check_spelling_words(doc: Validator) -> list: if not doc.raw_doc: return [] components = set( - ["Modin", "pandas", "NumPy", "Ray", "Dask"] - + ["PyArrow", "HDK", "XGBoost", "Plasma"] + ["Modin", "pandas", "NumPy", "Ray", "Dask"] + ["PyArrow", "XGBoost", "Plasma"] ) check_words = "|".join(x.lower() for x in components) @@ -543,11 +542,6 @@ def load_obj(name, old_load_obj=Validator._load_obj): Validator._load_obj = staticmethod(load_obj) - # for testing hdk-engine docs without `pyhdk` installation - sys.modules["pyhdk"] = Mock() - sys.modules["pyhdk"].__version__ = "999" - sys.modules["pyhdk.hdk"] = Mock() - sys.modules["pyhdk._sql"] = Mock() # enable docs testing on windows sys.getdlopenflags = Mock() sys.setdlopenflags = Mock()