From 6059ad70a4b36c51c7cd984307e1d2d7a122b0a7 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Sun, 14 Jan 2024 14:02:46 -0500 Subject: [PATCH] pantab 4.0 (#218) --- .github/workflows/unit-test.yml | 2 + .gitignore | 2 + CMakeLists.txt | 52 ++ CONTRIBUTING.md | 16 +- environment.yml | 4 +- meson.build | 33 - pantab/__init__.py | 113 +-- pantab/_compat.py | 7 - pantab/_hyper_util.py | 29 - pantab/_reader.py | 191 ++---- pantab/_types.py | 72 -- pantab/_writer.py | 242 +------ pantab/src/CMakeLists.txt | 30 + pantab/src/__init__.py | 0 pantab/src/cffi.h | 73 -- .../{numpy_datetime.c => numpy_datetime.cpp} | 0 pantab/src/pantab.c | 64 -- pantab/src/pantab.cpp | 619 +++++++++++++++++ pantab/src/reader.c | 204 ------ pantab/src/reader.h | 9 - pantab/src/tableauhyperapi.h | 83 --- pantab/src/type.c | 37 - pantab/src/type.h | 62 -- pantab/src/writer.c | 641 ------------------ pantab/src/writer.h | 10 - pantab/tests/conftest.py | 80 ++- pantab/tests/test_reader.py | 71 +- pantab/tests/test_roundtrip.py | 143 +--- pantab/tests/test_types.py | 16 - pantab/tests/test_writer.py | 146 +--- pyproject.toml | 37 +- 31 files changed, 934 insertions(+), 2154 deletions(-) create mode 100644 CMakeLists.txt delete mode 100644 meson.build delete mode 100644 pantab/_compat.py delete mode 100644 pantab/_hyper_util.py create mode 100644 pantab/src/CMakeLists.txt create mode 100644 pantab/src/__init__.py delete mode 100644 pantab/src/cffi.h rename pantab/src/{numpy_datetime.c => numpy_datetime.cpp} (100%) delete mode 100644 pantab/src/pantab.c create mode 100644 pantab/src/pantab.cpp delete mode 100644 pantab/src/reader.c delete mode 100644 pantab/src/reader.h delete mode 100644 pantab/src/tableauhyperapi.h delete mode 100644 pantab/src/type.c delete mode 100644 pantab/src/type.h delete mode 100644 pantab/src/writer.c delete mode 100644 pantab/src/writer.h delete mode 100644 pantab/tests/test_types.py diff --git a/.github/workflows/unit-test.yml b/.github/workflows/unit-test.yml index 979fd6b3..478cf60b 100644 --- a/.github/workflows/unit-test.yml +++ b/.github/workflows/unit-test.yml @@ -30,3 +30,5 @@ jobs: - name: Build wheels for ${{ matrix.os }} uses: pypa/cibuildwheel@v2.16.2 + env: + MACOSX_DEPLOYMENT_TARGET: "10.14" diff --git a/.gitignore b/.gitignore index f1d85790..326e5ba7 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,8 @@ .mypy_cache *.hyper hyper_db* +compile_commands.json +_deps ######################################### # Editor temporary/working/backup files # diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 00000000..f5fdb0b4 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,52 @@ +cmake_minimum_required(VERSION 3.18) +project(${SKBUILD_PROJECT_NAME} LANGUAGES C CXX) +set(CMAKE_C_STANDARD 17) +set(CMAKE_C_STANDARD_REQUIRED ON) +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +if (MSVC) +else() + add_compile_options(-Wall -Wextra) +endif() + +find_package(Python COMPONENTS Interpreter Development.Module NumPy REQUIRED) + +# Detect the installed nanobind package and import it into CMake +execute_process( + COMMAND "${Python_EXECUTABLE}" -m nanobind --cmake_dir + OUTPUT_STRIP_TRAILING_WHITESPACE OUTPUT_VARIABLE NB_DIR) +list(APPEND CMAKE_PREFIX_PATH "${NB_DIR}") +find_package(nanobind CONFIG REQUIRED) + +if(WIN32) + set(TABLEAU_DOWNLOAD_URL "https://downloads.tableau.com/tssoftware//tableauhyperapi-cxx-windows-x86_64-release-main.0.0.18441.r118d57bb.zip") +elseif(APPLE) + set(TABLEAU_DOWNLOAD_URL 
"https://downloads.tableau.com/tssoftware//tableauhyperapi-cxx-macos-x86_64-release-main.0.0.18441.r118d57bb.zip") +else() + set(TABLEAU_DOWNLOAD_URL "https://downloads.tableau.com/tssoftware//tableauhyperapi-cxx-linux-x86_64-release-main.0.0.18441.r118d57bb.zip") +endif() + +include(FetchContent) +FetchContent_Declare( + tableauhyperapi-cxx + URL "${TABLEAU_DOWNLOAD_URL}" +) + +FetchContent_MakeAvailable(tableauhyperapi-cxx) +list(APPEND CMAKE_PREFIX_PATH "${tableauhyperapi-cxx_SOURCE_DIR}/share/cmake") +find_package(tableauhyperapi-cxx CONFIG REQUIRED) + + +FetchContent_Declare(nanoarrow-project + GIT_REPOSITORY https://github.com/apache/arrow-nanoarrow.git + GIT_TAG apache-arrow-nanoarrow-0.3.0 +) +FetchContent_MakeAvailable(nanoarrow-project) + +if (PANTAB_USE_SANITIZERS) + add_compile_options(-fsanitize=address -fsanitize=undefined) + add_link_options(-fsanitize=address -fsanitize=undefined) +endif() + +add_subdirectory(pantab/src) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index e905d5d6..7e399c79 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -50,21 +50,7 @@ git checkout -b a-new-branch ### Building the Project -To install pantab, simply run: - -```sh -python -m pip install . -``` - -From the project root. Because pandas uses meson as a build backend, you can pass options (like building a debug version) via meson command line arguments: - -``` -python -m pip install . --config-settings=builddir="debug" --config-settings=setup-args="-Dbuildtype=debug" -``` - -At the moment editable installs are not supported. - -Please also note that the above will fail without a C compiler - if you don't have one installed check out the appropriate documentation from the [Python Developer Guide](https://devguide.python.org/setup/#compile-and-build) for your platform. +For an editable install of pantab you can simply run `pip install -ve .` from the project root. ### Creating tests and running the test suite diff --git a/environment.yml b/environment.yml index c3addc18..19679ea0 100644 --- a/environment.yml +++ b/environment.yml @@ -5,13 +5,15 @@ dependencies: - black - flake8 - isort - - meson-python - mypy + - nanobind - pandas + - pandas-stubs - pip - pyarrow - python - pytest + - scikit-build-core - sphinx - pre-commit - sphinx_rtd_theme diff --git a/meson.build b/meson.build deleted file mode 100644 index 70c06638..00000000 --- a/meson.build +++ /dev/null @@ -1,33 +0,0 @@ -project('pantab', 'c') - -py = import('python').find_installation(pure: false) - -incdir_numpy = run_command(py, - [ - '-c', - ''' -import os -import numpy as np -try: - # Check if include directory is inside the pandas dir - # e.g. 
a venv created inside the pandas dir - # If so, convert it to a relative path - incdir = os.path.relpath(np.get_include()) -except Exception: - incdir = np.get_include() -print(incdir) - ''' - ], - check: true -).stdout().strip() - -inc_np = include_directories(incdir_numpy) - -py.extension_module( - 'libpantab', - ['pantab/src/pantab.c', 'pantab/src/numpy_datetime.c', - 'pantab/src/reader.c', 'pantab/src/type.c', - 'pantab/src/writer.c'], - include_directories: [inc_np], - install: true -) diff --git a/pantab/__init__.py b/pantab/__init__.py index 26c0801f..384760a8 100644 --- a/pantab/__init__.py +++ b/pantab/__init__.py @@ -1,7 +1,5 @@ -__version__ = "3.0.3" +__version__ = "4.0.0rc" -import libpantab # type: ignore -from tableauhyperapi import __version__ as hyperapi_version from ._reader import frame_from_hyper, frame_from_hyper_query, frames_from_hyper from ._tester import test @@ -16,112 +14,3 @@ "frames_to_hyper", "test", ] - -# We link against HyperAPI in a fun way: In Python, we extract the function -# pointers directly from the Python HyperAPI. We pass those function pointers -# over to the C module which will then use those pointers to directly interact -# with HyperAPI. Furthermore, we check the function signatures to guard -# against API-breaking changes in HyperAPI. -# -# Directly using HyperAPI's C functions always was and still is discouraged and -# unsupported by Tableu. In particular, Tableau will not be able to provide -# official support for this hack. -# -# Because this is highly brittle, we try to make the error message as -# actionable as possible and guide users in the right direction. - -api_incompatibility_msg = """ -pantab is incompatible with version {} of Tableau Hyper API. Please upgrade -both `tableauhyperapi` and `pantab` to the latest version. 
See also -https://pantab.readthedocs.io/en/latest/caveats.html#tableauhyperapi-compatability -""".format( - hyperapi_version -) - -try: - from tableauhyperapi.impl.dll import ffi, lib -except ImportError as e: - raise NotImplementedError(api_incompatibility_msg) from e - - -def _check_compatibility(check, message): - if not check: - raise NotImplementedError(message + "\n" + api_incompatibility_msg) - - -def _get_hapi_function(name, sig): - _check_compatibility(hasattr(lib, name), f"function '{name}' missing") - f = getattr(lib, name) - func_type = ffi.typeof(f) - _check_compatibility( - func_type.kind == "function", - f"expected '{name}' to be a function, got {func_type.kind}", - ) - _check_compatibility( - func_type.cname == sig, - f"expected '{name}' to have the signature '{sig}', got '{func_type.cname}'", - ) - return f - - -libpantab.load_hapi_functions( - _get_hapi_function("hyper_decode_date", "hyper_date_components_t(*)(uint32_t)"), - _get_hapi_function("hyper_encode_date", "uint32_t(*)(hyper_date_components_t)"), - _get_hapi_function("hyper_decode_time", "hyper_time_components_t(*)(uint64_t)"), - _get_hapi_function("hyper_encode_time", "uint64_t(*)(hyper_time_components_t)"), - _get_hapi_function( - "hyper_inserter_buffer_add_null", - "struct hyper_error_t *(*)(struct hyper_inserter_buffer_t *)", - ), - _get_hapi_function( - "hyper_inserter_buffer_add_bool", - "struct hyper_error_t *(*)(struct hyper_inserter_buffer_t *, _Bool)", - ), - _get_hapi_function( - "hyper_inserter_buffer_add_int16", - "struct hyper_error_t *(*)(struct hyper_inserter_buffer_t *, int16_t)", - ), - _get_hapi_function( - "hyper_inserter_buffer_add_int32", - "struct hyper_error_t *(*)(struct hyper_inserter_buffer_t *, int32_t)", - ), - _get_hapi_function( - "hyper_inserter_buffer_add_int64", - "struct hyper_error_t *(*)(struct hyper_inserter_buffer_t *, int64_t)", - ), - _get_hapi_function( - "hyper_inserter_buffer_add_double", - "struct hyper_error_t *(*)(struct hyper_inserter_buffer_t *, double)", - ), - _get_hapi_function( - "hyper_inserter_buffer_add_binary", - ( - "struct hyper_error_t *(*)" - "(struct hyper_inserter_buffer_t *, uint8_t *, size_t)" - ), - ), - _get_hapi_function( - "hyper_inserter_buffer_add_raw", - ( - "struct hyper_error_t *(*)(struct hyper_inserter_buffer_t *" - ", uint8_t *, size_t)" - ), - ), - _get_hapi_function( - "hyper_rowset_get_next_chunk", - ( - "struct hyper_error_t *(*)(struct hyper_rowset_t *" - ", struct hyper_rowset_chunk_t * *)" - ), - ), - _get_hapi_function( - "hyper_destroy_rowset_chunk", "void(*)(struct hyper_rowset_chunk_t *)" - ), - _get_hapi_function( - "hyper_rowset_chunk_field_values", - ( - "void(*)(struct hyper_rowset_chunk_t *" - ", size_t *, size_t *, uint8_t * * *, size_t * *)" - ), - ), -) diff --git a/pantab/_compat.py b/pantab/_compat.py deleted file mode 100644 index fe03b20b..00000000 --- a/pantab/_compat.py +++ /dev/null @@ -1,7 +0,0 @@ -import pandas as pd -from pandas.util.version import parse - -PANDAS_120 = parse(pd.__version__) >= parse("1.2.0") -PANDAS_130 = parse(pd.__version__) >= parse("1.3.0") - -__all__ = ["PANDAS_120", "PANDAS_130"] diff --git a/pantab/_hyper_util.py b/pantab/_hyper_util.py deleted file mode 100644 index 98193233..00000000 --- a/pantab/_hyper_util.py +++ /dev/null @@ -1,29 +0,0 @@ -from contextlib import nullcontext -from typing import Optional - -import tableauhyperapi as tab_api - - -def ensure_hyper_process(hyper_process: Optional[tab_api.HyperProcess]): - """ - Spawns an adhoc HyperProcess if needed, i.e. 
if no existing HyperProcess is provided - - Usage: - ``` - with ensure_hyper_process() as h: - h.execute_query(...) - ``` - """ - if hyper_process is None: - return tab_api.HyperProcess(tab_api.Telemetry.DO_NOT_SEND_USAGE_DATA_TO_TABLEAU) - else: - # Wrap the HyperProcess into a nullcontext such that the `with` doesn't close - # the HyperProcess - return nullcontext(hyper_process) - - -def forbid_hyper_process(hyper_process: Optional[tab_api.HyperProcess]): - if hyper_process is not None: - raise ValueError( - "hyper_process parameter is useless because `Connection` is provided" - ) diff --git a/pantab/_reader.py b/pantab/_reader.py index 4ab43bc4..a85b62d7 100644 --- a/pantab/_reader.py +++ b/pantab/_reader.py @@ -3,169 +3,90 @@ import tempfile from typing import Dict, Optional, Union -import libpantab # type: ignore -import numpy as np import pandas as pd import tableauhyperapi as tab_api -import pantab._types as pantab_types -from pantab._hyper_util import ensure_hyper_process, forbid_hyper_process +import pantab.src.pantab as libpantab # type: ignore TableType = Union[str, tab_api.Name, tab_api.TableName] -def _read_query_result( - result: tab_api.Result, dtypes: Optional[Dict[str, str]], use_float_na: bool -) -> pd.DataFrame: - if dtypes is None: - dtypes = {} - # Construct data types from result - for column in result.schema.columns: - # `result.schema` does not provide nullability information. - # Lwt's err on the safe side and always assume they are nullable - nullability = tab_api.Nullability.NULLABLE - column_type = pantab_types._ColumnType(column.type, nullability) - try: - dtypes[column.name.unescaped] = pantab_types._get_pandas_type( - column_type - ) - except KeyError as e: - raise TypeError( - f"Column {column.name} has unsupported datatype {column.type} " - f"with nullability {column.nullability}" - ) from e - - # if the use_float_na flag is set to False - # then switch Float32/Float64 dtypes back to float32/float64 - # to support np.nan rather than pd.NA - if not use_float_na: - for column, col_type in dtypes.items(): - if col_type == "Float64": - dtypes[column] = "float64" - elif col_type == "Float32": - dtypes[column] = "float32" - - # Call native library to read tuples from result set - dtype_strs = tuple(dtypes.values()) - df = pd.DataFrame(libpantab.read_hyper_query(result._Result__cdata, dtype_strs)) - if df.empty: - return pd.DataFrame({col: pd.Series(dtype="object") for col in dtypes}) - df.columns = dtypes.keys() - # TODO: remove this hackery... 
- for k, v in dtypes.items(): - if v == "date": - dtypes[k] = "datetime64[ns]" - date_types = ["datetime64[ns, UTC]", "datetime64[ns]"] - for col in df.select_dtypes(include=date_types): - df[col] = df[col].dt.tz_localize(None) - for col in df.select_dtypes(exclude=date_types): - df[col] = df[col].astype(dtypes[col]) - - df = df.fillna(value=np.nan) # Replace any appearances of None - - return df - - -def _read_table( - *, connection: tab_api.Connection, table: TableType, use_float_na: bool -) -> pd.DataFrame: - if isinstance(table, str): - table = tab_api.TableName(table) - - table_def = connection.catalog.get_table_definition(table) - columns = table_def.columns - - dtypes: Dict[str, str] = {} - for column in columns: - column_type = pantab_types._ColumnType(column.type, column.nullability) - try: - dtypes[column.name.unescaped] = pantab_types._get_pandas_type(column_type) - except KeyError as e: - raise TypeError( - f"Column {column.name} has unsupported datatype {column.type} " - f"with nullability {column.nullability}" - ) from e - - query = f"SELECT * from {table}" - with connection.execute_query(query) as result: - return _read_query_result(result, dtypes, use_float_na) - - def frame_from_hyper( - source: Union[str, pathlib.Path, tab_api.Connection], + source: Union[str, pathlib.Path], *, table: TableType, - hyper_process: Optional[tab_api.HyperProcess] = None, - use_float_na: bool = False, ) -> pd.DataFrame: """See api.rst for documentation""" + if isinstance(table, (str, tab_api.Name)) or not table.schema_name: + table = tab_api.TableName("public", table) + + data, columns, dtypes = libpantab.read_from_hyper_table( + str(source), + table.schema_name.name.unescaped, # TODO: this probably allows injection + table.name.unescaped, + ) + df = pd.DataFrame(data, columns=columns) + dtype_map = {k: v for k, v in zip(columns, dtypes) if v != "datetime64[ns, UTC]"} + df = df.astype(dtype_map) + + tz_aware_columns = { + col for col, dtype in zip(columns, dtypes) if dtype == "datetime64[ns, UTC]" + } + for col in tz_aware_columns: + try: + df[col] = df[col].dt.tz_localize("UTC") + except AttributeError: # happens when df[col] is empty + df[col] = df[col].astype("datetime64[ns, UTC]") - if isinstance(source, tab_api.Connection): - forbid_hyper_process(hyper_process) - return _read_table(connection=source, table=table, use_float_na=use_float_na) - else: - with tempfile.TemporaryDirectory() as tmp_dir, ensure_hyper_process( - hyper_process - ) as hpe: - tmp_db = shutil.copy(source, tmp_dir) - with tab_api.Connection(hpe.endpoint, tmp_db) as connection: - return _read_table( - connection=connection, table=table, use_float_na=use_float_na - ) + return df def frames_from_hyper( - source: Union[str, pathlib.Path, tab_api.Connection], - *, - hyper_process: Optional[tab_api.HyperProcess] = None, - use_float_na: bool = False, + source: Union[str, pathlib.Path], ) -> Dict[tab_api.TableName, pd.DataFrame]: """See api.rst for documentation.""" result: Dict[TableType, pd.DataFrame] = {} - if isinstance(source, tab_api.Connection): - forbid_hyper_process(hyper_process) - connection = source - for schema in connection.catalog.get_schema_names(): - for table in connection.catalog.get_table_names(schema=schema): - result[table] = _read_table( - connection=connection, table=table, use_float_na=use_float_na - ) - else: - with tempfile.TemporaryDirectory() as tmp_dir, ensure_hyper_process( - hyper_process - ) as hpe: - tmp_db = shutil.copy(source, tmp_dir) - with tab_api.Connection(hpe.endpoint, tmp_db) as 
connection: - for schema in connection.catalog.get_schema_names(): - for table in connection.catalog.get_table_names(schema=schema): - result[table] = _read_table( - connection=connection, - table=table, - use_float_na=use_float_na, - ) + table_names = [] + with tempfile.TemporaryDirectory() as tmp_dir, tab_api.HyperProcess( + tab_api.Telemetry.DO_NOT_SEND_USAGE_DATA_TO_TABLEAU + ) as hpe: + tmp_db = shutil.copy(source, tmp_dir) + with tab_api.Connection(hpe.endpoint, tmp_db) as connection: + for schema in connection.catalog.get_schema_names(): + for table in connection.catalog.get_table_names(schema=schema): + table_names.append(table) + + for table in table_names: + result[table] = frame_from_hyper( + source=source, + table=table, + ) return result def frame_from_hyper_query( - source: Union[str, pathlib.Path, tab_api.Connection], + source: Union[str, pathlib.Path], query: str, *, hyper_process: Optional[tab_api.HyperProcess] = None, - use_float_na: bool = False, ) -> pd.DataFrame: """See api.rst for documentation.""" + # Call native library to read tuples from result set + df = pd.DataFrame(libpantab.read_from_hyper_query(str(source), query)) + data, columns, dtypes = libpantab.read_from_hyper_query(str(source), query) + df = pd.DataFrame(data, columns=columns) + dtype_map = {k: v for k, v in zip(columns, dtypes) if v != "datetime64[ns, UTC]"} + df = df.astype(dtype_map) + + tz_aware_columns = { + col for col, dtype in zip(columns, dtypes) if dtype == "datetime64[ns, UTC]" + } + for col in tz_aware_columns: + try: + df[col] = df[col].dt.tz_localize("UTC") + except AttributeError: # happens when df[col] is empty + df[col] = df[col].astype("datetime64[ns, UTC]") - if isinstance(source, tab_api.Connection): - forbid_hyper_process(hyper_process) - with source.execute_query(query) as result: - return _read_query_result(result, None, use_float_na) - else: - with tempfile.TemporaryDirectory() as tmp_dir, ensure_hyper_process( - hyper_process - ) as hpe: - tmp_db = shutil.copy(source, tmp_dir) - with tab_api.Connection(hpe.endpoint, tmp_db) as connection: - with connection.execute_query(query) as result: - return _read_query_result(result, None, use_float_na) + return df diff --git a/pantab/_types.py b/pantab/_types.py index 25aa7c13..3bbfc232 100644 --- a/pantab/_types.py +++ b/pantab/_types.py @@ -1,77 +1,5 @@ -import collections from typing import Union import tableauhyperapi as tab_api -import pantab._compat as compat - -# The Hyper API as of writing doesn't offer great hashability for column comparison -# so we create out namedtuple for that purpose -_ColumnType = collections.namedtuple("_ColumnType", ["type_", "nullability"]) - TableType = Union[str, tab_api.Name, tab_api.TableName] - -_column_types = { - "int16": _ColumnType(tab_api.SqlType.small_int(), tab_api.Nullability.NOT_NULLABLE), - "int32": _ColumnType(tab_api.SqlType.int(), tab_api.Nullability.NOT_NULLABLE), - "int64": _ColumnType(tab_api.SqlType.big_int(), tab_api.Nullability.NOT_NULLABLE), - "Int16": _ColumnType(tab_api.SqlType.small_int(), tab_api.Nullability.NULLABLE), - "Int32": _ColumnType(tab_api.SqlType.int(), tab_api.Nullability.NULLABLE), - "Int64": _ColumnType(tab_api.SqlType.big_int(), tab_api.Nullability.NULLABLE), - "float32": _ColumnType(tab_api.SqlType.double(), tab_api.Nullability.NULLABLE), - "float64": _ColumnType(tab_api.SqlType.double(), tab_api.Nullability.NULLABLE), - "bool": _ColumnType(tab_api.SqlType.bool(), tab_api.Nullability.NOT_NULLABLE), - "datetime64[ns]": _ColumnType( - 
tab_api.SqlType.timestamp(), tab_api.Nullability.NULLABLE - ), - "datetime64[ns, UTC]": _ColumnType( - tab_api.SqlType.timestamp_tz(), tab_api.Nullability.NULLABLE - ), - "timedelta64[ns]": _ColumnType( - tab_api.SqlType.interval(), tab_api.Nullability.NULLABLE - ), - "object": _ColumnType(tab_api.SqlType.text(), tab_api.Nullability.NULLABLE), -} - -_column_types["string"] = _ColumnType( - tab_api.SqlType.text(), tab_api.Nullability.NULLABLE -) -_column_types["boolean"] = _ColumnType( - tab_api.SqlType.bool(), tab_api.Nullability.NULLABLE -) - -if compat.PANDAS_120: - _column_types["Float32"] = _ColumnType( - tab_api.SqlType.double(), tab_api.Nullability.NULLABLE - ) - _column_types["Float64"] = _ColumnType( - tab_api.SqlType.double(), tab_api.Nullability.NULLABLE - ) - - -# Invert this, but exclude float32 as that does not roundtrip -_pandas_types = {v: k for k, v in _column_types.items() if k != "float32"} - -# Add things that we can't write to Hyper but can read -_pandas_types[ - _ColumnType(tab_api.SqlType.date(), tab_api.Nullability.NULLABLE) -] = "date" -_pandas_types[ - _ColumnType(tab_api.SqlType.double(), tab_api.Nullability.NOT_NULLABLE) -] = "float64" -_pandas_types[ - _ColumnType(tab_api.SqlType.text(), tab_api.Nullability.NOT_NULLABLE) -] = "string" - - -def _get_pandas_type(column_type: _ColumnType) -> str: - if column_type in _pandas_types: - return _pandas_types[column_type] - - if column_type.type_.tag == tab_api.TypeTag.VARCHAR: - return "string" - - raise KeyError( - f"Column has unsupported datatype {column_type.type_} " - f"with nullability {column_type.nullability}" - ) diff --git a/pantab/_writer.py b/pantab/_writer.py index f5674626..0068979b 100644 --- a/pantab/_writer.py +++ b/pantab/_writer.py @@ -1,39 +1,15 @@ -import itertools -import os import pathlib import shutil import tempfile import uuid -from typing import Dict, List, Optional, Sequence, Tuple, Union +from typing import Dict, Optional, Union -import libpantab # type: ignore -import numpy as np import pandas as pd +import pyarrow as pa import tableauhyperapi as tab_api -import pantab._compat as compat import pantab._types as pantab_types -from pantab._hyper_util import ensure_hyper_process - - -def _pandas_to_tableau_type(typ: str) -> pantab_types._ColumnType: - try: - return pantab_types._column_types[typ] - except KeyError: - raise TypeError("Conversion of '{}' dtypes not supported!".format(typ)) - - -def _timedelta_to_interval(td: pd.Timedelta) -> Optional[tab_api.Interval]: - """Converts a pandas Timedelta to tableau Hyper API implementation.""" - if pd.isnull(td): - return None - - days = td.days - without_days = td - pd.Timedelta(days=days) - total_seconds = int(without_days.total_seconds()) - microseconds = total_seconds * 1_000_000 - - return tab_api.Interval(months=0, days=days, microseconds=microseconds) +import pantab.src.pantab as libpantab # type: ignore def _validate_table_mode(table_mode: str) -> None: @@ -41,193 +17,18 @@ def _validate_table_mode(table_mode: str) -> None: raise ValueError("'table_mode' must be either 'w' or 'a'") -def _assert_columns_equal( - left: Sequence[tab_api.TableDefinition.Column], - right: Sequence[tab_api.TableDefinition.Column], -) -> None: - """ - Helper function to validate if sequences of columns are equal. - - The TableauHyperAPI as of 0.0.8953 does not implement equality operations - for Column instances, hence the need for this. 
- """ - - class DummyColumn: - """Dummy class to match items needed for str repr of columns.""" - - @property - def name(self): - return None - - @property - def type(self): - return None - - @property - def nullability(self): - return None - - for c1, c2 in itertools.zip_longest(left, right, fillvalue=DummyColumn()): - if c1.name != c2.name or c1.type != c2.type or c1.nullability != c2.nullability: - break # go to error handler - else: - return None # everything matched up, so bail out - - c1_str = ", ".join( - f"(Name={x.name}, Type={x.type}, Nullability={x.nullability})" for x in left - ) - c2_str = ", ".join( - f"(Name={x.name}, Type={x.type}, Nullability={x.nullability})" for x in right - ) - - raise TypeError(f"Mismatched column definitions: {c1_str} != {c2_str}") - - -def _maybe_convert_timedelta(df: pd.DataFrame) -> Tuple[pd.DataFrame, Tuple[str, ...]]: - """ - Hyper uses a different storage format than pandas / Python for timedeltas. - - Ultimately this should be pushed to the C extension, but doesn't look to fully work - at the moment anyway so keep in Python until complete. - """ - orig_dtypes = tuple(map(str, df.dtypes)) - deltas = df.select_dtypes(include=["timedelta64[ns]"]) - - if deltas.empty: - pass - else: - df = df.copy() - - for index, (_, content) in enumerate(df.items()): - if content.dtype == "timedelta64[ns]": - df.iloc[:, index] = content.apply(_timedelta_to_interval) - - return df, orig_dtypes - - -def _maybe_convert_utctimestamp(df: pd.DataFrame) -> pd.DataFrame: - """ - Hyper implements a subset of postgres and doesn't implement timezone-aware datetimes - Thus, we localize to timezone-naive - """ - for utc_col in df.select_dtypes("datetime64[ns, UTC]"): - df[utc_col] = df[utc_col].dt.tz_convert(None) - return df - - -def _insert_frame( - df: pd.DataFrame, - *, - connection: tab_api.Connection, - table: pantab_types.TableType, - table_mode: str, - use_parquet: bool, -) -> None: - _validate_table_mode(table_mode) - - if isinstance(table, str): - table = tab_api.TableName(table) - - # Populate insertion mechanisms dependent on column types - column_types: List[pantab_types._ColumnType] = [] - columns: List[tab_api.TableDefinition.Column] = [] - for col_name, dtype in df.dtypes.items(): - column_type = _pandas_to_tableau_type(dtype.name) - column_types.append(column_type) - columns.append( - tab_api.TableDefinition.Column( - name=col_name, - type=column_type.type_, - nullability=column_type.nullability, - ) - ) - - # Sanity check for existing table structures - if table_mode == "a" and connection.catalog.has_table(table): - table_def = connection.catalog.get_table_definition(table) - _assert_columns_equal(columns, table_def.columns) - else: # New table, potentially new schema - table_def = tab_api.TableDefinition(table) - - for column, column_type in zip(columns, column_types): - table_def.add_column(column) - - if isinstance(table, tab_api.TableName) and table.schema_name: - connection.catalog.create_schema_if_not_exists(table.schema_name) - - connection.catalog.create_table_if_not_exists(table_def) - - if not use_parquet: - null_mask = np.ascontiguousarray(pd.isnull(df)) - # Special handling for conversions - df, dtypes = _maybe_convert_timedelta(df) - - with tab_api.Inserter(connection, table_def) as inserter: - if compat.PANDAS_130: - df = _maybe_convert_utctimestamp(df) - libpantab.write_to_hyper(df, null_mask, inserter._buffer, dtypes) - else: - libpantab.write_to_hyper_legacy( - df.itertuples(index=False, name=None), - null_mask, - inserter._buffer, - 
df.shape[1], - dtypes, - ) - inserter.execute() - else: - if any(x.name == "timedelta64[ns]" for x in df.dtypes): - raise ValueError( - "Writing timedelta values with use_parquet=True is not yet supported." - ) - - import pyarrow as pa - import pyarrow.parquet as pq - - tbl = pa.Table.from_pandas(df) - non_nullable = {"int16", "int32", "int64", "bool"} - new_fields = [] - for field, dtype in zip(tbl.schema, df.dtypes): - if dtype.name in non_nullable: - new_fields.append( - pa.field(name=field.name, type=field.type, nullable=False) - ) - else: - new_fields.append(field) - - new_schema = pa.schema(new_fields) - tbl = tbl.cast(new_schema) - - # Windows can't read and write a NamedTemporaryFile in one pass - with tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) as tmp: - pq.write_table(tbl, tmp) - - connection.execute_command( - f"COPY {table} FROM '{tmp.name}' WITH (FORMAT 'parquet')" - ) - - try: - os.unlink(tmp.name) - except FileNotFoundError: - pass - - def frame_to_hyper( df: pd.DataFrame, database: Union[str, pathlib.Path], *, table: pantab_types.TableType, table_mode: str = "w", - hyper_process: Optional[tab_api.HyperProcess] = None, - use_parquet: bool = False, ) -> None: """See api.rst for documentation""" frames_to_hyper( {table: df}, database, table_mode, - hyper_process=hyper_process, - use_parquet=use_parquet, ) @@ -237,29 +38,28 @@ def frames_to_hyper( table_mode: str = "w", *, hyper_process: Optional[tab_api.HyperProcess] = None, - use_parquet: bool = False, ) -> None: """See api.rst for documentation.""" _validate_table_mode(table_mode) - with ensure_hyper_process(hyper_process) as hpe: - tmp_db = pathlib.Path(tempfile.gettempdir()) / f"{uuid.uuid4()}.hyper" + tmp_db = pathlib.Path(tempfile.gettempdir()) / f"{uuid.uuid4()}.hyper" + + if table_mode == "a" and pathlib.Path(database).exists(): + shutil.copy(database, tmp_db) + + def convert_to_table_name(table: pantab_types.TableType): + # nanobind expects a tuple of (schema, table) strings + if isinstance(table, (str, tab_api.Name)) or not table.schema_name: + table = tab_api.TableName("public", table) - if table_mode == "a" and pathlib.Path(database).exists(): - shutil.copy(database, tmp_db) + return (table.schema_name.name.unescaped, table.name.unescaped) - with tab_api.Connection( - hpe.endpoint, tmp_db, tab_api.CreateMode.CREATE_IF_NOT_EXISTS - ) as connection: - for table, df in dict_of_frames.items(): - _insert_frame( - df, - connection=connection, - table=table, - table_mode=table_mode, - use_parquet=use_parquet, - ) + data = { + convert_to_table_name(key): pa.Table.from_pandas(val) + for key, val in dict_of_frames.items() + } + libpantab.write_to_hyper(data, path=str(tmp_db), table_mode=table_mode) - # In Python 3.9+ we can just pass the path object, but due to bpo 32689 - # and subsequent typeshed changes it is easier to just pass as str for now - shutil.move(str(tmp_db), database) + # In Python 3.9+ we can just pass the path object, but due to bpo 32689 + # and subsequent typeshed changes it is easier to just pass as str for now + shutil.move(str(tmp_db), database) diff --git a/pantab/src/CMakeLists.txt b/pantab/src/CMakeLists.txt new file mode 100644 index 00000000..4fc881ad --- /dev/null +++ b/pantab/src/CMakeLists.txt @@ -0,0 +1,30 @@ +nanobind_add_module(pantab NOMINSIZE pantab.cpp numpy_datetime.cpp) +target_include_directories(pantab PUBLIC ${Python_NumPy_INCLUDE_DIRS}) +target_link_libraries(pantab + PRIVATE Tableau::tableauhyperapi-cxx + PRIVATE nanoarrow +) +set_target_properties(nanoarrow + 
PROPERTIES POSITION_INDEPENDENT_CODE + ON) + +install(TARGETS pantab + LIBRARY DESTINATION ${SKBUILD_PROJECT_NAME}/src) + +if(WIN32) + set(HYPERAPI_LIB_NAME "tableauhyperapi.lib") + set(HYPERAPI_BIN_LOC "bin/hyper") +elseif(APPLE) + set(HYPERAPI_LIB_NAME "libtableauhyperapi.dylib") + set(HYPERAPI_BIN_LOC "lib/hyper") +else() + set(HYPERAPI_LIB_NAME "libtableauhyperapi.so") + set(HYPERAPI_BIN_LOC "lib/hyper") +endif() + +# Auditwheel doesn't know how to handle the cmake dependencies +# so we manually install here and exclude from auditwheel +install(FILES ${tableauhyperapi-cxx_SOURCE_DIR}/lib/${HYPERAPI_LIB_NAME} + DESTINATION ${SKBUILD_PROJECT_NAME}/src) +install(DIRECTORY "${tableauhyperapi-cxx_SOURCE_DIR}/${HYPERAPI_BIN_LOC}/" + DESTINATION ${SKBUILD_PROJECT_NAME}/src/hyper) diff --git a/pantab/src/__init__.py b/pantab/src/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/pantab/src/cffi.h b/pantab/src/cffi.h deleted file mode 100644 index 4e4f0cdb..00000000 --- a/pantab/src/cffi.h +++ /dev/null @@ -1,73 +0,0 @@ -/* This header file is copied directly from cffi to allow interaction -with cffi C-level objects without including the entire library. - -cffi is licensed under the MIT license, with originaly copyright included -below: - -Except when otherwise stated (look for LICENSE files in directories or -information at the beginning of each file) all software and -documentation is licensed as follows: - - The MIT License - - Permission is hereby granted, free of charge, to any person - obtaining a copy of this software and associated documentation - files (the "Software"), to deal in the Software without - restriction, including without limitation the rights to use, - copy, modify, merge, publish, distribute, sublicense, and/or - sell copies of the Software, and to permit persons to whom the - Software is furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included - in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER - DEALINGS IN THE SOFTWARE. -*/ - -#ifndef PANTAB_CFFI_H -#define PANTAB_CFFI_H - -#define PY_SSIZE_T_CLEAN -#include - -typedef struct _ctypedescr { - PyObject_VAR_HEAD - - struct _ctypedescr *ct_itemdescr; /* ptrs and arrays: the item type */ - PyObject *ct_stuff; /* structs: dict of the fields - arrays: ctypedescr of the ptr type - function: tuple(abi, ctres, ctargs..) - enum: pair {"name":x},{x:"name"} - ptrs: lazily, ctypedescr of array */ - void *ct_extra; /* structs: first field (not a ref!) 
- function types: cif_description - primitives: prebuilt "cif" object */ - - PyObject *ct_weakreflist; /* weakref support */ - - PyObject *ct_unique_key; /* key in unique_cache (a string, but not - human-readable) */ - - Py_ssize_t ct_size; /* size of instances, or -1 if unknown */ - Py_ssize_t ct_length; /* length of arrays, or -1 if unknown; - or alignment of primitive and struct types; - always -1 for pointers */ - int ct_flags; /* CT_xxx flags */ - - int ct_name_position; /* index in ct_name of where to put a var name */ - char ct_name[1]; /* string, e.g. "int *" for pointers to ints */ -} CTypeDescrObject; - -typedef struct { - PyObject_HEAD CTypeDescrObject *c_type; - char *c_data; - PyObject *c_weakreflist; -} CDataObject; - -#endif diff --git a/pantab/src/numpy_datetime.c b/pantab/src/numpy_datetime.cpp similarity index 100% rename from pantab/src/numpy_datetime.c rename to pantab/src/numpy_datetime.cpp diff --git a/pantab/src/pantab.c b/pantab/src/pantab.c deleted file mode 100644 index 8a9e06b4..00000000 --- a/pantab/src/pantab.c +++ /dev/null @@ -1,64 +0,0 @@ -#define PY_SSIZE_T_CLEAN -#include -#define PY_ARRAY_UNIQUE_SYMBOL PANTAB_ARRAY_API -#include - -#include "cffi.h" -#include "reader.h" -#include "tableauhyperapi.h" -#include "writer.h" - -// Function pointers, initialized by `load_hapi_functions` function -#define C(RET, NAME, ARGS) RET(*NAME) ARGS = NULL; -HYPERAPI_FUNCTIONS(C) -#undef C - -static PyObject *load_hapi_functions(PyObject *Py_UNUSED(dummy), - PyObject *args) { - bool ok; -#define C(RET, NAME, ARGS) PyObject *NAME##_arg; - HYPERAPI_FUNCTIONS(C) -#undef C - const char *formatStr = -#define C(RET, NAME, ARGS) "O" - HYPERAPI_FUNCTIONS(C) -#undef C - ; - - ok = PyArg_ParseTuple(args, formatStr -#define C(RET, NAME, ARGS) , &NAME##_arg - HYPERAPI_FUNCTIONS(C) -#undef C - ); - if (!ok) - return NULL; - - // TODO: check that we get an instance of CDataObject; else will - // segfault -#define C(RET, NAME, ARGS) \ - NAME = (RET(*) ARGS)(((CDataObject *)NAME##_arg)->c_data); - HYPERAPI_FUNCTIONS(C) -#undef C - - Py_RETURN_NONE; -} - -static PyMethodDef methods[] = { - {"load_hapi_functions", load_hapi_functions, METH_VARARGS, - "Initializes the HyperAPI functions used by pantab."}, - {"write_to_hyper_legacy", write_to_hyper_legacy, METH_VARARGS, - "Legacy method to Write a numpy array to a hyper file."}, - {"write_to_hyper", write_to_hyper, METH_VARARGS, - "Writes a dataframe array to a hyper file."}, - {"read_hyper_query", read_hyper_query, METH_VARARGS, - "Reads a hyper query from a given connection."}, - {NULL, NULL, 0, NULL}}; - -static struct PyModuleDef pantabmodule = {.m_base = PyModuleDef_HEAD_INIT, - .m_name = "libpantab", - .m_methods = methods}; - -PyMODINIT_FUNC PyInit_libpantab(void) { - import_array(); - return PyModule_Create(&pantabmodule); -} diff --git a/pantab/src/pantab.cpp b/pantab/src/pantab.cpp new file mode 100644 index 00000000..6cdac616 --- /dev/null +++ b/pantab/src/pantab.cpp @@ -0,0 +1,619 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "numpy_datetime.h" + +namespace nb = nanobind; + +using Dtype = std::tuple; + +enum TimeUnit { SECOND, MILLI, MICRO, NANO }; + +static hyperapi::SqlType hyperTypeFromArrowSchema(struct ArrowSchema *schema, + ArrowError *error) { + struct ArrowSchemaView schema_view; + if (ArrowSchemaViewInit(&schema_view, schema, error) != 0) { + throw 
std::runtime_error("Issue converting to hyper type: " + + std::string(error->message)); + } + + switch (schema_view.type) { + case NANOARROW_TYPE_INT16: + return hyperapi::SqlType::smallInt(); + case NANOARROW_TYPE_INT32: + return hyperapi::SqlType::integer(); + case NANOARROW_TYPE_INT64: + return hyperapi::SqlType::bigInt(); + case NANOARROW_TYPE_FLOAT: + case NANOARROW_TYPE_DOUBLE: + return hyperapi::SqlType::doublePrecision(); + case NANOARROW_TYPE_BOOL: + return hyperapi::SqlType::boolean(); + case NANOARROW_TYPE_STRING: + case NANOARROW_TYPE_LARGE_STRING: + return hyperapi::SqlType::text(); + case NANOARROW_TYPE_TIMESTAMP: + if (std::strcmp("", schema_view.timezone)) { + return hyperapi::SqlType::timestampTZ(); + } else { + return hyperapi::SqlType::timestamp(); + } + default: + throw std::invalid_argument("Unsupported Arrow type: " + + std::to_string(schema_view.type)); + } +} + +class InsertHelper { +public: + InsertHelper(std::shared_ptr inserter, + const struct ArrowArray *chunk, const struct ArrowSchema *schema, + struct ArrowError *error, int64_t column_position) + : inserter_(inserter), chunk_(chunk), schema_(schema), error_(error), + column_position_(column_position) {} + + virtual ~InsertHelper() {} + + void Init() { + struct ArrowSchema *child_schema = schema_->children[column_position_]; + + if (ArrowArrayViewInitFromSchema(&array_view_, child_schema, error_) != 0) { + throw std::runtime_error("Could not construct insert helper: " + + std::string{error_->message}); + } + + if (ArrowArrayViewSetArray(&array_view_, chunk_->children[column_position_], + error_) != 0) { + throw std::runtime_error("Could not set array view: " + + std::string{error_->message}); + } + } + + virtual void insertValueAtIndex(size_t) {} + +protected: + std::shared_ptr inserter_; + const struct ArrowArray *chunk_; + const struct ArrowSchema *schema_; + struct ArrowError *error_; + const int64_t column_position_; + struct ArrowArrayView array_view_; +}; + +template class PrimitiveInsertHelper : public InsertHelper { +public: + using InsertHelper::InsertHelper; + + void insertValueAtIndex(size_t idx) override { + if (ArrowArrayViewIsNull(&array_view_, idx)) { + // MSVC on cibuildwheel doesn't like this templated optional + // inserter_->add(std::optional{std::nullopt}); + hyperapi::internal::ValueInserter{*inserter_}.addNull(); + return; + } + constexpr size_t elem_size = sizeof(T); + T result; + memcpy(&result, + array_view_.buffer_views[1].data.as_uint8 + (idx * elem_size), + elem_size); + inserter_->add(result); + } +}; + +template class Utf8InsertHelper : public InsertHelper { +public: + using InsertHelper::InsertHelper; + + void insertValueAtIndex(size_t idx) override { + if (ArrowArrayViewIsNull(&array_view_, idx)) { + // MSVC on cibuildwheel doesn't like this templated optional + // inserter_->add(std::optional{std::nullopt}); + hyperapi::internal::ValueInserter{*inserter_}.addNull(); + return; + } + + struct ArrowBufferView buffer_view = + ArrowArrayViewGetBytesUnsafe(&array_view_, idx); + auto result = std::string{buffer_view.data.as_char, + static_cast(buffer_view.size_bytes)}; + inserter_->add(result); + } +}; + +template +class TimestampInsertHelper : public InsertHelper { +public: + using InsertHelper::InsertHelper; + + void insertValueAtIndex(size_t idx) override { + constexpr size_t elem_size = sizeof(int64_t); + if (ArrowArrayViewIsNull(&array_view_, idx)) { + // MSVC on cibuildwheel doesn't like this templated optional + // inserter_->add(std::optional{std::nullopt}); + 
hyperapi::internal::ValueInserter{*inserter_}.addNull(); + return; + } + int64_t value; + + memcpy(&value, + array_view_.buffer_views[1].data.as_uint8 + (idx * elem_size), + elem_size); + + // using timestamp_t = + // typename std::conditional::type; + + // TODO: need overflow checks here + npy_datetimestruct dts; + PyArray_DatetimeMetaData meta; + if constexpr (TU == TimeUnit::SECOND) { + meta = {NPY_FR_s, 1}; + } else if constexpr (TU == TimeUnit::MILLI) { + meta = {NPY_FR_ms, 1}; + } else if constexpr (TU == TimeUnit::MICRO) { + meta = {NPY_FR_us, 1}; + } else if constexpr (TU == TimeUnit::NANO) { + // we assume pandas is ns here but should check format + meta = {NPY_FR_ns, 1}; + } + + int ret = convert_datetime_to_datetimestruct(&meta, value, &dts); + if (ret != 0) { + throw std::invalid_argument("could not convert datetime value "); + } + hyperapi::Date dt{static_cast(dts.year), + static_cast(dts.month), + static_cast(dts.day)}; + hyperapi::Time time{static_cast(dts.hour), + static_cast(dts.min), + static_cast(dts.sec), dts.us}; + + if constexpr (TZAware) { + hyperapi::OffsetTimestamp ts{dt, time, std::chrono::minutes{0}}; + inserter_->add(ts); + + } else { + hyperapi::Timestamp ts{dt, time}; + inserter_->add(ts); + } + } +}; + +static std::unique_ptr +makeInsertHelper(std::shared_ptr inserter, + struct ArrowArray *chunk, struct ArrowSchema *schema, + struct ArrowError *error, int64_t column_position) { + // TODO: we should provide the full dtype here not just format string, so + // boolean fields can determine whether they are bit or byte masks + + // right now we pass false as the template paramter to the + // PrimitiveInsertHelper as that is all pandas generates; other libraries may + // need the true variant + struct ArrowSchemaView schema_view; + if (ArrowSchemaViewInit(&schema_view, schema->children[column_position], + error) != 0) { + throw std::runtime_error("Issue generating insert helper: " + + std::string(error->message)); + } + + switch (schema_view.type) { + case NANOARROW_TYPE_INT16: + return std::unique_ptr(new PrimitiveInsertHelper( + inserter, chunk, schema, error, column_position)); + case NANOARROW_TYPE_INT32: + return std::unique_ptr(new PrimitiveInsertHelper( + inserter, chunk, schema, error, column_position)); + case NANOARROW_TYPE_INT64: + return std::unique_ptr(new PrimitiveInsertHelper( + inserter, chunk, schema, error, column_position)); + case NANOARROW_TYPE_FLOAT: + return std::unique_ptr(new PrimitiveInsertHelper( + inserter, chunk, schema, error, column_position)); + case NANOARROW_TYPE_DOUBLE: + return std::unique_ptr(new PrimitiveInsertHelper( + inserter, chunk, schema, error, column_position)); + case NANOARROW_TYPE_BOOL: + return std::unique_ptr(new PrimitiveInsertHelper( + inserter, chunk, schema, error, column_position)); + case NANOARROW_TYPE_STRING: + case NANOARROW_TYPE_LARGE_STRING: + return std::unique_ptr(new Utf8InsertHelper( + inserter, chunk, schema, error, column_position)); + case NANOARROW_TYPE_TIMESTAMP: + switch (schema_view.time_unit) { + case NANOARROW_TIME_UNIT_SECOND: + if (std::strcmp("", schema_view.timezone)) { + return std::unique_ptr( + new TimestampInsertHelper( + inserter, chunk, schema, error, column_position)); + } else { + return std::unique_ptr( + new TimestampInsertHelper( + inserter, chunk, schema, error, column_position)); + } + case NANOARROW_TIME_UNIT_MILLI: + if (std::strcmp("", schema_view.timezone)) { + return std::unique_ptr( + new TimestampInsertHelper( + inserter, chunk, schema, error, column_position)); + } 
else { + return std::unique_ptr( + new TimestampInsertHelper( + inserter, chunk, schema, error, column_position)); + } + case NANOARROW_TIME_UNIT_MICRO: + if (std::strcmp("", schema_view.timezone)) { + return std::unique_ptr( + new TimestampInsertHelper( + inserter, chunk, schema, error, column_position)); + } else { + return std::unique_ptr( + new TimestampInsertHelper( + inserter, chunk, schema, error, column_position)); + } + case NANOARROW_TIME_UNIT_NANO: + if (std::strcmp("", schema_view.timezone)) { + return std::unique_ptr( + new TimestampInsertHelper( + inserter, chunk, schema, error, column_position)); + } else { + return std::unique_ptr( + new TimestampInsertHelper( + inserter, chunk, schema, error, column_position)); + } + } + throw std::runtime_error( + "This code block should not be hit - contact a developer"); + default: + throw std::invalid_argument("makeInsertHelper: Unsupported Arrow type: " + + std::to_string(schema_view.type)); + } +} + +using SchemaAndTableName = std::tuple; + +void write_to_hyper( + const std::map &dict_of_exportable, + const std::string &path, const std::string &table_mode) { + hyperapi::HyperProcess hyper{ + hyperapi::Telemetry::DoNotSendUsageDataToTableau}; + + // TODO: we don't have separate table / database create modes in the API + // but probably should; for now we infer this from table mode + const auto createMode = table_mode == "w" + ? hyperapi::CreateMode::CreateAndReplace + : hyperapi::CreateMode::CreateIfNotExists; + + hyperapi::Connection connection{hyper.getEndpoint(), path, createMode}; + const hyperapi::Catalog &catalog = connection.getCatalog(); + + for (auto const &[schema_and_table, exportable] : dict_of_exportable) { + const auto hyper_schema = std::get<0>(schema_and_table); + const auto hyper_table = std::get<1>(schema_and_table); + auto arrow_c_stream = nb::getattr(exportable, "__arrow_c_stream__")(); + + PyObject *obj = arrow_c_stream.ptr(); + if (!PyCapsule_CheckExact(obj)) { + throw std::invalid_argument("Object does not provide capsule"); + } + auto c_stream = static_cast( + PyCapsule_GetPointer(obj, "arrow_array_stream")); + auto stream = nanoarrow::UniqueArrayStream{c_stream}; + + struct ArrowSchema schema; + if (stream->get_schema(stream.get(), &schema) != 0) { + std::string error_msg{stream->get_last_error(stream.get())}; + throw std::runtime_error("Could not read from arrow schema:" + error_msg); + } + + struct ArrowError error; + auto names_vec = std::vector{}; + std::vector hyper_columns; + + for (int64_t i = 0; i < schema.n_children; i++) { + const auto hypertype = + hyperTypeFromArrowSchema(schema.children[i], &error); + const auto name = std::string{schema.children[i]->name}; + names_vec.push_back(name); + + // Almost all arrow types are nullable + hyper_columns.push_back(hyperapi::TableDefinition::Column{ + name, hypertype, hyperapi::Nullability::Nullable}); + } + + hyperapi::TableName table_name{hyper_schema, hyper_table}; + hyperapi::TableDefinition tableDef{table_name, hyper_columns}; + catalog.createSchemaIfNotExists(*table_name.getSchemaName()); + if (table_mode == "w") { + catalog.createTable(tableDef); + } else if (table_mode == "a") { + catalog.createTableIfNotExists(tableDef); + } + auto inserter = std::make_shared(connection, tableDef); + + struct ArrowArray chunk; + int errcode; + while ((errcode = stream->get_next(stream.get(), &chunk) == 0) && + chunk.release != NULL) { + const int nrows = chunk.length; + if (nrows < 0) { + throw std::runtime_error("Unexpected array length < 0"); + } + + std::vector> 
insert_helpers; + for (int64_t i = 0; i < schema.n_children; i++) { + // the lifetime of the inserthelper cannot exceed that of chunk or + // schema this is implicit; we should make this explicit + auto insert_helper = + makeInsertHelper(inserter, &chunk, &schema, &error, i); + + insert_helper->Init(); + insert_helpers.push_back(std::move(insert_helper)); + } + + for (int64_t row_idx = 0; row_idx < nrows; row_idx++) { + for (const auto &insert_helper : insert_helpers) { + insert_helper->insertValueAtIndex(row_idx); + } + inserter->endRow(); + } + } + + inserter->execute(); + } +} + +class ReadHelper { +public: + ReadHelper() {} + virtual ~ReadHelper() {} + virtual nb::object Read(const hyperapi::Value &) { return nb::none(); } +}; + +class IntegralReadHelper : public ReadHelper { + nb::object Read(const hyperapi::Value &value) { + if (value.isNull()) { + return nb::none(); + } + return nb::int_(value.get()); + } +}; + +class FloatReadHelper : public ReadHelper { + nb::object Read(const hyperapi::Value &value) { + if (value.isNull()) { + return nb::none(); + } + return nb::float_(value.get()); + } +}; + +class BooleanReadHelper : public ReadHelper { + nb::object Read(const hyperapi::Value &value) { + // TODO: bool support added in nanobind >= 1..9.0 + // return nb::bool_(value.get()); + if (value.isNull()) { + return nb::none(); + } + return nb::int_(value.get()); + } +}; + +class StringReadHelper : public ReadHelper { + nb::object Read(const hyperapi::Value &value) { + if (value.isNull()) { + return nb::none(); + } + return nb::str(value.get().c_str()); + } +}; + +class DateReadHelper : public ReadHelper { + nb::object Read(const hyperapi::Value &value) { + if (value.isNull()) { + return nb::none(); + } + + const auto hyper_date = value.get(); + const auto year = hyper_date.getYear(); + const auto month = hyper_date.getMonth(); + const auto day = hyper_date.getDay(); + + PyObject *result = PyDate_FromDate(year, month, day); + if (result == nullptr) { + throw std::invalid_argument("could not parse date"); + } + return nb::object(result, nb::detail::steal_t{}); + } +}; + +template class DatetimeReadHelper : public ReadHelper { + nb::object Read(const hyperapi::Value &value) { + if (value.isNull()) { + return nb::none(); + } + + using timestamp_t = + typename std::conditional::type; + const auto hyper_ts = value.get(); + const auto hyper_date = hyper_ts.getDate(); + const auto hyper_time = hyper_ts.getTime(); + const auto year = hyper_date.getYear(); + const auto month = hyper_date.getMonth(); + const auto day = hyper_date.getDay(); + const auto hour = hyper_time.getHour(); + const auto min = hyper_time.getMinute(); + const auto sec = hyper_time.getSecond(); + const auto usec = hyper_time.getMicrosecond(); + + PyObject *result = + PyDateTime_FromDateAndTime(year, month, day, hour, min, sec, usec); + if (result == nullptr) { + throw std::invalid_argument("could not parse timestamp"); + } + return nb::object(result, nb::detail::steal_t{}); + } +}; + +static std::unique_ptr makeReadHelper(hyperapi::SqlType sqltype) { + if ((sqltype == hyperapi::SqlType::smallInt()) || + (sqltype == hyperapi::SqlType::integer()) || + (sqltype == hyperapi::SqlType::bigInt())) { + return std::unique_ptr(new IntegralReadHelper()); + } else if (sqltype == hyperapi::SqlType::doublePrecision()) { + return std::unique_ptr(new FloatReadHelper()); + } else if ((sqltype == hyperapi::SqlType::text())) { + return std::unique_ptr(new StringReadHelper()); + } else if (sqltype == hyperapi::SqlType::boolean()) { + return 
std::unique_ptr(new BooleanReadHelper()); + } else if (sqltype == hyperapi::SqlType::date()) { + return std::unique_ptr(new DateReadHelper()); + } else if (sqltype == hyperapi::SqlType::timestamp()) { + return std::unique_ptr(new DatetimeReadHelper()); + } else if (sqltype == hyperapi::SqlType::timestampTZ()) { + return std::unique_ptr(new DatetimeReadHelper()); + } + + throw nb::type_error(("cannot read sql type: " + sqltype.toString()).c_str()); +} + +static std::string pandasDtypeFromHyper(const hyperapi::SqlType &sqltype) { + if (sqltype == hyperapi::SqlType::smallInt()) { + return "int16[pyarrow]"; + } else if (sqltype == hyperapi::SqlType::integer()) { + return "int32[pyarrow]"; + } else if (sqltype == hyperapi::SqlType::bigInt()) { + return "int64[pyarrow]"; + } else if (sqltype == hyperapi::SqlType::doublePrecision()) { + return "double[pyarrow]"; + } else if (sqltype == hyperapi::SqlType::text()) { + return "string[pyarrow]"; + } else if (sqltype == hyperapi::SqlType::boolean()) { + return "boolean[pyarrow]"; + } else if (sqltype == hyperapi::SqlType::timestamp()) { + return "timestamp[us][pyarrow]"; + } else if (sqltype == hyperapi::SqlType::timestampTZ()) { + return "timestamp[us, UTC][pyarrow]"; + } else if (sqltype == hyperapi::SqlType::date()) { + return "date32[pyarrow]"; + } + + throw nb::type_error( + ("unimplemented pandas dtype for type: " + sqltype.toString()).c_str()); +} + +using ColumnNames = std::vector; +using ResultBody = std::vector>; +// In a future version of pantab it would be nice to not require pandas dtypes +// However, the current reader just creates PyObjects and loses that information +// when passing back to the Python runtime; hence the explicit passing +using PandasDtypes = std::vector; +/// +/// read_from_hyper_query is slightly different than read_from_hyper_table +/// because the former detects a schema from the hyper Result object +/// which does not hold nullability information +/// +std::tuple +read_from_hyper_query(const std::string &path, const std::string &query) { + std::vector> result; + hyperapi::HyperProcess hyper{ + hyperapi::Telemetry::DoNotSendUsageDataToTableau}; + hyperapi::Connection connection(hyper.getEndpoint(), path); + + std::vector columnNames; + std::vector pandasDtypes; + std::vector> read_helpers; + + hyperapi::Result hyperResult = connection.executeQuery(query); + const auto resultSchema = hyperResult.getSchema(); + for (const auto &column : resultSchema.getColumns()) { + read_helpers.push_back(makeReadHelper(column.getType())); + auto name = column.getName().getUnescaped(); + columnNames.push_back(name); + + // the query result set does not tell us if columns are nullable or not + auto const sqltype = column.getType(); + pandasDtypes.push_back(pandasDtypeFromHyper(sqltype)); + } + for (const hyperapi::Row &row : hyperResult) { + std::vector rowdata; + size_t column_idx = 0; + for (const hyperapi::Value &value : row) { + const auto &read_helper = read_helpers[column_idx]; + rowdata.push_back(read_helper->Read(value)); + column_idx++; + } + result.push_back(rowdata); + } + + return std::make_tuple(result, columnNames, pandasDtypes); +} + +std::tuple +read_from_hyper_table(const std::string &path, const std::string &schema, + const std::string &table) { + std::vector> result; + hyperapi::HyperProcess hyper{ + hyperapi::Telemetry::DoNotSendUsageDataToTableau}; + hyperapi::Connection connection(hyper.getEndpoint(), path); + hyperapi::TableName extractTable{schema, table}; + const hyperapi::Catalog &catalog = 
connection.getCatalog(); + const hyperapi::TableDefinition tableDef = + catalog.getTableDefinition(extractTable); + + std::vector columnNames; + std::vector pandasDtypes; + std::vector> read_helpers; + + for (auto &column : tableDef.getColumns()) { + read_helpers.push_back(makeReadHelper(column.getType())); + auto name = column.getName().getUnescaped(); + columnNames.push_back(name); + + auto const sqltype = column.getType(); + pandasDtypes.push_back(pandasDtypeFromHyper(sqltype)); + } + + hyperapi::Result hyperResult = + connection.executeQuery("SELECT * FROM " + extractTable.toString()); + for (const hyperapi::Row &row : hyperResult) { + std::vector rowdata; + size_t column_idx = 0; + for (const hyperapi::Value &value : row) { + const auto &read_helper = read_helpers[column_idx]; + rowdata.push_back(read_helper->Read(value)); + column_idx++; + } + result.push_back(rowdata); + } + + return std::make_tuple(result, columnNames, pandasDtypes); +} + +NB_MODULE(pantab, m) { + m.def("write_to_hyper", &write_to_hyper, nb::arg("dict_of_exportable"), + nb::arg("path"), nb::arg("table_mode")) + .def("read_from_hyper_query", &read_from_hyper_query, nb::arg("path"), + nb::arg("query")) + .def("read_from_hyper_table", &read_from_hyper_table, nb::arg("path"), + nb::arg("schema"), nb::arg("table")); + PyDateTime_IMPORT; +} diff --git a/pantab/src/reader.c b/pantab/src/reader.c deleted file mode 100644 index 5f7e1131..00000000 --- a/pantab/src/reader.c +++ /dev/null @@ -1,204 +0,0 @@ -#include "cffi.h" -#include "type.h" -#include - -static PyObject *cls_timedelta = NULL; - -// the pointer to size is only used if receiving a character array -static PyObject *read_value(const uint8_t *value, DTYPE dtype, - const size_t *size) { - if (PyErr_CheckSignals()) { - return NULL; - } - - switch (dtype) { - case INT16_: - case INT16NA: - return PyLong_FromLong(*((int16_t *)value)); - case INT32_: - case INT32NA: - return PyLong_FromLong(*((int32_t *)value)); - case INT64_: - case INT64NA: - return PyLong_FromLongLong(*((int64_t *)value)); - - case BOOLEAN: - case BOOLEANNA: - return PyBool_FromLong(*value); - - case FLOAT32_: - case FLOAT64_: - case FLOAT32NA: - case FLOAT64NA: - return PyFloat_FromDouble(*((double *)value)); - - case STRING: - case OBJECT: - return PyUnicode_FromStringAndSize((const char *)value, *size); - - case DATE: { - hyper_date_components_t date = hyper_decode_date(*((hyper_date_t *)value)); - return PyDate_FromDate(date.year, date.month, date.day); - } - - case DATETIME64_NS: - case DATETIME64_NS_UTC: { - hyper_time_t val = *((hyper_time_t *)value); - - hyper_date_t encoded_date = - (hyper_date_t)(val / (hyper_time_t)MICROSECONDS_PER_DAY); - hyper_time_t encoded_time = val % (hyper_time_t)MICROSECONDS_PER_DAY; - hyper_date_components_t date = hyper_decode_date(encoded_date); - hyper_time_components_t time = hyper_decode_time(encoded_time); - - return PyDateTime_FromDateAndTime(date.year, date.month, date.day, - time.hour, time.minute, time.second, - time.microsecond); - } - - case TIMEDELTA64_NS: { - // Unfortunately PyDelta_FromDSU and the pandas Timedelta class - // are not compatible in signature, particularly when it comes - // to handling negative days. 
As such, we construct the pandas - // object instead of using the CPython API - - if (cls_timedelta == NULL) { - PyObject *mod_pandas = PyImport_ImportModule("pandas"); - if (mod_pandas == NULL) { - return NULL; - } - - cls_timedelta = PyObject_GetAttrString(mod_pandas, "Timedelta"); - Py_DECREF(mod_pandas); - if (cls_timedelta == NULL) { - return NULL; - } - } - - py_interval interval = *((py_interval *)value); - if (interval.months != 0) { - PyObject *errMsg = - PyUnicode_FromFormat("Cannot read Intervals with month components."); - PyErr_SetObject(PyExc_ValueError, errMsg); - Py_DECREF(errMsg); - return NULL; - } - - PyObject *kwargs = PyDict_New(); - if (kwargs == NULL) - return NULL; - - PyDict_SetItemString(kwargs, "days", PyLong_FromLongLong(interval.days)); - PyDict_SetItemString(kwargs, "microseconds", - PyLong_FromLongLong(interval.microseconds)); - PyObject *dummy = PyTuple_New(0); // need this for PyObject_Call - - PyObject *td = PyObject_Call(cls_timedelta, dummy, kwargs); - Py_DECREF(dummy); - Py_DECREF(kwargs); - - return td; - } - - default: { - PyObject *errMsg = PyUnicode_FromFormat("Invalid dtype: \"%s\""); - PyErr_SetObject(PyExc_ValueError, errMsg); - Py_DECREF(errMsg); - return NULL; - } - } -} - -PyObject *read_hyper_query(PyObject *Py_UNUSED(dummy), PyObject *args) { - int ok; - PyObject *row = NULL, *resultObj; - PyTupleObject *dtypes; - hyper_rowset_t *rowset; - hyper_rowset_chunk_t *chunk; - hyper_error_t *hyper_err; - size_t num_cols, num_rows; - const uint8_t *const *values; - const size_t *sizes; - - PyDateTime_IMPORT; - - ok = PyArg_ParseTuple(args, "OO!", &resultObj, &PyTuple_Type, &dtypes); - if (!ok) - return NULL; - - // TODO: check that we get an instance of CDataObject; else will segfault - rowset = (hyper_rowset_t *)((CDataObject *)resultObj)->c_data; - - // TODO: we need to free these somewhere as these currently leak... - DTYPE *enumeratedDtypes = makeEnumeratedDtypes(dtypes); - if (enumeratedDtypes == NULL) - return NULL; - - PyObject *result = PyList_New(0); - if (result == NULL) { - return NULL; - } - - // Iterate over each result chunk - while (1) { - - hyper_err = hyper_rowset_get_next_chunk(rowset, &chunk); - if (hyper_err) { - goto ERROR_CLEANUP; - } - - if (chunk == NULL) { - break; // No more to parse - } - - hyper_rowset_chunk_field_values(chunk, &num_cols, &num_rows, &values, - &sizes); - - // For each row inside the chunk... - for (size_t i = 0; i < num_rows; i++) { - row = PyTuple_New(num_cols); - if (row == NULL) { - goto ERROR_CLEANUP; - } - - // For each column inside the row... 
- for (size_t j = 0; j < num_cols; j++) { - PyObject *val; - if (*values == NULL) { - val = Py_None; - Py_INCREF(val); - } else { - DTYPE dtype = enumeratedDtypes[j]; - val = read_value(*values, dtype, sizes); - } - - values++, sizes++; - - if (val == NULL) { - goto ERROR_CLEANUP; - } - - PyTuple_SET_ITEM(row, j, val); - } - - int ret = PyList_Append(result, row); - if (ret != 0) { - goto ERROR_CLEANUP; - } - } - hyper_destroy_rowset_chunk(chunk); - } - - Py_XDECREF(cls_timedelta); - - return result; - -ERROR_CLEANUP: - Py_XDECREF(row); - Py_XDECREF(result); - Py_XDECREF(cls_timedelta); - if (chunk != NULL) - hyper_destroy_rowset_chunk(chunk); - - return NULL; -} diff --git a/pantab/src/reader.h b/pantab/src/reader.h deleted file mode 100644 index 00646067..00000000 --- a/pantab/src/reader.h +++ /dev/null @@ -1,9 +0,0 @@ -#ifndef PANTAB_READER_H -#define PANTAB_READER_H - -#define PY_SSIZE_T_CLEAN -#include - -PyObject *read_hyper_query(PyObject *Py_UNUSED(dummy), PyObject *args); - -#endif diff --git a/pantab/src/tableauhyperapi.h b/pantab/src/tableauhyperapi.h deleted file mode 100644 index 0d252d3b..00000000 --- a/pantab/src/tableauhyperapi.h +++ /dev/null @@ -1,83 +0,0 @@ -/* This file is a modified port of the lib_h.py file provided by Tableau. - -The original copyright notice is included below for reference. - -# ----------------------------------------------------------------------------- -# -# This file is the copyrighted property of Tableau Software and is protected -# by registered patents and other applicable U.S. and international laws and -# regulations. -# -# Unlicensed use of the contents of this file is prohibited. Please refer to -# the NOTICES.txt file for further details. -# -# ----------------------------------------------------------------------------- -*/ - -#ifndef PANTAB_HYPER_API -#define PANTAB_HYPER_API - -#include -#include -#include - -typedef uint32_t hyper_date_t; -typedef struct { - int32_t year; - int16_t month; - int16_t day; -} hyper_date_components_t; -typedef uint64_t hyper_time_t; -typedef struct { - int8_t hour; - int8_t minute; - int8_t second; - int32_t microsecond; -} hyper_time_components_t; - -typedef struct hyper_error_t hyper_error_t; -typedef struct hyper_inserter_buffer_t hyper_inserter_buffer_t; -typedef struct hyper_rowset_t hyper_rowset_t; -typedef struct hyper_rowset_chunk_t hyper_rowset_chunk_t; - -#define HYPERAPI_FUNCTIONS(C) \ - C(hyper_date_components_t, hyper_decode_date, (hyper_date_t date)) \ - C(hyper_date_t, hyper_encode_date, (hyper_date_components_t components)) \ - C(hyper_time_components_t, hyper_decode_time, (hyper_time_t time)) \ - C(hyper_time_t, hyper_encode_time, (hyper_time_components_t components)) \ - C(hyper_error_t *, hyper_inserter_buffer_add_null, \ - (hyper_inserter_buffer_t * buffer)) \ - C(hyper_error_t *, hyper_inserter_buffer_add_bool, \ - (hyper_inserter_buffer_t * buffer, bool value)) \ - C(hyper_error_t *, hyper_inserter_buffer_add_int16, \ - (hyper_inserter_buffer_t * buffer, int16_t value)) \ - C(hyper_error_t *, hyper_inserter_buffer_add_int32, \ - (hyper_inserter_buffer_t * buffer, int32_t value)) \ - C(hyper_error_t *, hyper_inserter_buffer_add_int64, \ - (hyper_inserter_buffer_t * buffer, int64_t value)) \ - C(hyper_error_t *, hyper_inserter_buffer_add_double, \ - (hyper_inserter_buffer_t * buffer, double value)) \ - C(hyper_error_t *, hyper_inserter_buffer_add_binary, \ - (hyper_inserter_buffer_t * buffer, const uint8_t *value, size_t size)) \ - C(hyper_error_t *, hyper_inserter_buffer_add_raw, \ - 
(hyper_inserter_buffer_t * buffer, const uint8_t *value, size_t size)) \ - C(hyper_error_t *, hyper_rowset_get_next_chunk, \ - (hyper_rowset_t * rowset, hyper_rowset_chunk_t * *rowset_chunk)) \ - C(void, hyper_destroy_rowset_chunk, \ - (const hyper_rowset_chunk_t *rowset_chunk)) \ - C(void, hyper_rowset_chunk_field_values, \ - (hyper_rowset_chunk_t * rowset_chunk, size_t * col_count, \ - size_t * row_count, const uint8_t *const *values[], \ - const size_t *sizes[])) - -#define C(RET, NAME, ARGS) extern RET(*NAME) ARGS; -HYPERAPI_FUNCTIONS(C) -#undef C - -// custom addition from the Python binding; mistmatch with C API -typedef struct { - int64_t microseconds; - int32_t days; - int32_t months; -} py_interval; -#endif diff --git a/pantab/src/type.c b/pantab/src/type.c deleted file mode 100644 index 83a6b869..00000000 --- a/pantab/src/type.c +++ /dev/null @@ -1,37 +0,0 @@ -#include "type.h" - -static DTYPE stringToDtype(const char *str) { - for (Py_ssize_t i = 0; - i < (Py_ssize_t)(sizeof(dtype_map) / sizeof(dtype_map[0])); i++) { - if (strcmp(str, dtype_map[i].str) == 0) { - return dtype_map[i].dtype; - } - } - - return UNKNOWN; -} - -// Caller is responsible for returned object -DTYPE *makeEnumeratedDtypes(PyTupleObject *obj) { - Py_ssize_t len = PyTuple_GET_SIZE(obj); - DTYPE *result = malloc(len * sizeof(DTYPE)); - - for (Py_ssize_t i = 0; i < len; i++) { - PyObject *dtypeObj = PyTuple_GET_ITEM(obj, i); - const char *dtypeStr = PyUnicode_AsUTF8(dtypeObj); - DTYPE dtype = stringToDtype(dtypeStr); - - if (dtype == UNKNOWN) { - free(result); - PyObject *errMsg = - PyUnicode_FromFormat("Unknown dtype: \"%s\"\n", dtypeStr); - PyErr_SetObject(PyExc_TypeError, errMsg); - Py_DECREF(errMsg); - return NULL; - } - - result[i] = dtype; - } - - return result; -} diff --git a/pantab/src/type.h b/pantab/src/type.h deleted file mode 100644 index eca39cb1..00000000 --- a/pantab/src/type.h +++ /dev/null @@ -1,62 +0,0 @@ -#ifndef PANTAB -#define PANTAB - -#define PY_SSIZE_T_CLEAN -#include "tableauhyperapi.h" -#include -#include - -#define MICROSECONDS_PER_DAY \ - (INT64_C(24) * INT64_C(60) * INT64_C(60) * INT64_C(1000000)) - -typedef enum { - INT16_ = 1, - INT32_, - INT64_, - INT16NA = 6, - INT32NA, - INT64NA, - FLOAT32_ = 11, - FLOAT64_, - FLOAT32NA, - FLOAT64NA, - BOOLEAN = 50, - BOOLEANNA, - DATETIME64_NS = 100, - DATETIME64_NS_UTC, - DATE, - TIMEDELTA64_NS = 200, - OBJECT = 220, - STRING, - UNKNOWN = 255 -} DTYPE; - -static const struct { - DTYPE dtype; - const char *str; -} dtype_map[] = {{INT16_, "int16"}, - {INT32_, "int32"}, - {INT64_, "int64"}, - {INT16NA, "Int16"}, - {INT32NA, "Int32"}, - {INT64NA, "Int64"}, - {FLOAT32_, "float32"}, - {FLOAT64_, "float64"}, - {FLOAT32NA, "Float32"}, - {FLOAT64NA, "Float64"}, - {BOOLEAN, "bool"}, - {BOOLEANNA, "boolean"}, - {DATETIME64_NS, "datetime64[ns]"}, - {DATETIME64_NS_UTC, "datetime64[ns, UTC]"}, - {DATE, "date"}, // TODO: this isn't actually a dtype - {TIMEDELTA64_NS, "timedelta64[ns]"}, - {STRING, "string"}, - {OBJECT, "object"}}; - -// creates an enumeration from a tuple of strings, -// so ("int16", "int32") -> [INT16_, INT32_] -// caller is responsible for freeing memory -// returns NULL on failure -DTYPE *makeEnumeratedDtypes(PyTupleObject *obj); - -#endif diff --git a/pantab/src/writer.c b/pantab/src/writer.c deleted file mode 100644 index d615cf38..00000000 --- a/pantab/src/writer.c +++ /dev/null @@ -1,641 +0,0 @@ -#include "cffi.h" -#include "type.h" -#include -#define NO_IMPORT_ARRAY -#define PY_ARRAY_UNIQUE_SYMBOL PANTAB_ARRAY_API -#include 
"numpy_datetime.h" -#include -#include - -/* -Creates an array of NpyIter structs in the same order as the arrays supplied. - -Caller is responsible for freeing memory. Returns NULL on error -*/ -static NpyIter **initiateIters(PyObject *arrList) { - NpyIter **npyIters = - PyObject_Malloc(sizeof(NpyIter *) * PyObject_Length(arrList)); - if (npyIters == NULL) { - PyErr_NoMemory(); - return NULL; - } - - for (Py_ssize_t i = 0; i < PyObject_Length(arrList); i++) { - PyArrayObject *arr = (PyArrayObject *)PyList_GET_ITEM(arrList, i); - - // Check contents of each numpy array - NpyIter *iter = NpyIter_New(arr, NPY_ITER_READONLY | NPY_ITER_REFS_OK, - NPY_KEEPORDER, NPY_NO_CASTING, NULL); - - // TODO: do we need to check NpyIter_IterationNeedsAPI(iter) anywhere? - // Applicable because of NPY_ITER_REFS_OK flags - if (iter == NULL) { - if (i > 0) { - while (--i) { - NpyIter_Deallocate(npyIters[i]); - } - } - - PyErr_NoMemory(); - return NULL; - } - - npyIters[i] = iter; - } - - return npyIters; -} - -/* Initiate iters outside of any loop for performance. - Caller is responsible for releasing memory. - - Returns NULL on error -*/ -static NpyIter_IterNextFunc **initiateIterNextFuncs(NpyIter **npyIters, - Py_ssize_t len) { - NpyIter_IterNextFunc **npyIterNextFuncs = - PyObject_Malloc(sizeof(NpyIter_IterNextFunc *) * len); - if (npyIterNextFuncs == NULL) { - PyErr_NoMemory(); - return NULL; - } - - for (Py_ssize_t i = 0; i < len; i++) { - NpyIter_IterNextFunc *func = NpyIter_GetIterNext(npyIters[i], NULL); - if (func == NULL) { - return NULL; - } - - npyIterNextFuncs[i] = func; - } - - return npyIterNextFuncs; -} - -static char ***initiateDataPtrs(NpyIter **npyIters, Py_ssize_t len) { - char ***dataptrs = PyObject_Malloc(sizeof(char **) * len); - if (dataptrs == NULL) { - PyErr_NoMemory(); - return NULL; - } - - for (Py_ssize_t i = 0; i < len; i++) { - char **dataptr = NpyIter_GetDataPtrArray(npyIters[i]); - if (dataptr == NULL) { - return NULL; - } - - dataptrs[i] = dataptr; - } - - return dataptrs; -} - -/* -Free an array of numpy array iterators. 
- -TODO: dynamically calculate how many to free rather than require length as arg -*/ -static void freeIters(NpyIter **iters, Py_ssize_t length) { - for (Py_ssize_t i = 0; i < length; i++) { - NpyIter_Deallocate(iters[i]); - } -} - -static hyper_error_t *writeNonNullData(char **dataptr, DTYPE dtype, - hyper_inserter_buffer_t *insertBuffer, - Py_ssize_t row, Py_ssize_t col) { - hyper_error_t *result; - switch (dtype) { - case INT16_: { - int16_t **ptr = (int16_t **)dataptr; - int16_t val = **ptr; - result = hyper_inserter_buffer_add_int16(insertBuffer, val); - break; - } - case INT16NA: { - PyObject ***ptr = (PyObject ***)dataptr; - // The fact that NA datatypes are stored as objects is a bit - // unfortunate for sizing, as the CPython API only exposes - // Long / LongLong data types - PyObject *obj = **ptr; - long val = PyLong_AsLong(obj); - result = hyper_inserter_buffer_add_int16(insertBuffer, (int16_t)val); - break; - } - case INT32_: { - int32_t **ptr = (int32_t **)dataptr; - int32_t val = **ptr; - result = hyper_inserter_buffer_add_int32(insertBuffer, val); - break; - } - case INT32NA: { - PyObject ***ptr = (PyObject ***)dataptr; - PyObject *obj = **ptr; - long val = PyLong_AsLong(obj); - result = hyper_inserter_buffer_add_int32(insertBuffer, val); - break; - } - case INT64_: { - int64_t **ptr = (int64_t **)dataptr; - int64_t val = **ptr; - result = hyper_inserter_buffer_add_int64(insertBuffer, val); - break; - } - case INT64NA: { - PyObject ***ptr = (PyObject ***)dataptr; - PyObject *obj = **ptr; - long long val = PyLong_AsLongLong(obj); - result = hyper_inserter_buffer_add_int64(insertBuffer, val); - break; - } - case FLOAT32_: { - float_t **ptr = (float_t **)dataptr; - float_t val = **ptr; - result = hyper_inserter_buffer_add_double(insertBuffer, val); - break; - } - case FLOAT64_: { - double_t **ptr = (double_t **)dataptr; - double_t val = **ptr; - result = hyper_inserter_buffer_add_double(insertBuffer, val); - break; - } - case FLOAT32NA: - case FLOAT64NA: { - PyObject ***ptr = (PyObject ***)dataptr; - PyObject *obj = **ptr; - double_t val = PyFloat_AsDouble(obj); - result = hyper_inserter_buffer_add_double(insertBuffer, val); - break; - } - case BOOLEAN: { - npy_bool **ptr = (npy_bool **)dataptr; - npy_bool val = **ptr; - result = hyper_inserter_buffer_add_bool(insertBuffer, val); - break; - } - case BOOLEANNA: { - PyObject ***ptr = (PyObject ***)dataptr; - PyObject *obj = **ptr; - int val = obj == Py_True; - result = hyper_inserter_buffer_add_bool(insertBuffer, val); - break; - } - case DATETIME64_NS: - case DATETIME64_NS_UTC: { - npy_datetime **ptr = (npy_datetime **)dataptr; - npy_datetime val = **ptr; - - npy_datetimestruct dts; - - // TODO: here we are using dummy metadata, but ideally - // should get from array in case pandas ever allows for - // different precision datetimes - PyArray_DatetimeMetaData meta = {.base = NPY_FR_ns, .num = 1}; - int ret = convert_datetime_to_datetimestruct(&meta, val, &dts); - if (ret != 0) { - PyObject *errMsg = - PyUnicode_FromFormat("Failed to convert numpy datetime"); - PyErr_SetObject(PyExc_RuntimeError, errMsg); - Py_DECREF(errMsg); - return NULL; - } - - hyper_date_components_t date_components = { - .year = dts.year, .month = dts.month, .day = dts.day}; - - hyper_time_components_t time_components = {.hour = dts.hour, - .minute = dts.min, - .second = dts.sec, - .microsecond = dts.us}; - - hyper_date_t date = hyper_encode_date(date_components); - hyper_time_t time = hyper_encode_time(time_components); - - // TODO: Tableau uses typedefs 
for unsigned 32 / 64 integers for - // date and time respectively, but stores as int64; here we cast - // explicitly but should probably bounds check for overflow as well - int64_t ms = (int64_t)time + (int64_t)date * MICROSECONDS_PER_DAY; - - result = hyper_inserter_buffer_add_int64(insertBuffer, ms); - break; - } - case TIMEDELTA64_NS: { - PyObject ***ptr = (PyObject ***)dataptr; - PyObject *data = **ptr; - - // TODO: Add error message for failed attribute access - PyObject *us = PyObject_GetAttrString(data, "microseconds"); - if (us == NULL) { - return NULL; - } - PyObject *days = PyObject_GetAttrString(data, "days"); - if (days == NULL) { - Py_DECREF(us); - return NULL; - } - - PyObject *months = PyObject_GetAttrString(data, "months"); - if (months == NULL) { - Py_DECREF(us); - Py_DECREF(days); - return NULL; - } - - py_interval interval = {.microseconds = PyLong_AsLongLong(us), - .days = PyLong_AsLong(days), - .months = PyLong_AsLong(months)}; - - // TODO: it appears there is some buffer packing being done, though - // not sure this actually works in Tableau - result = hyper_inserter_buffer_add_raw( - insertBuffer, (const unsigned char *)&interval, sizeof(py_interval)); - Py_DECREF(us); - Py_DECREF(days); - Py_DECREF(months); - break; - } - case STRING: - case OBJECT: { - PyObject ***ptr = (PyObject ***)dataptr; - PyObject *obj = **ptr; - if (dtype == OBJECT) { - // N.B. all other dtypes in pandas are well defined, but object is - // really anything For purposes of Tableau these need to be strings, - // so error out if not In the future should enforce StringDtype from - // pandas once released (1.0.0) - if (!PyUnicode_Check(obj)) { - PyObject *errMsg = PyUnicode_FromFormat( - "Invalid value \"%R\" found (row %zd column %zd)", obj, row, col); - PyErr_SetObject(PyExc_TypeError, errMsg); - Py_DECREF(errMsg); - return NULL; - } - } - Py_ssize_t len; - // TODO: CPython uses a const char* buffer but Hyper accepts - // const unsigned char* - is this always safe? - const unsigned char *buf = - (const unsigned char *)PyUnicode_AsUTF8AndSize(obj, &len); - result = hyper_inserter_buffer_add_binary(insertBuffer, buf, len); - break; - } - default: { - PyObject *errMsg = PyUnicode_FromFormat("Invalid dtype: \"%s\""); - PyErr_SetObject(PyExc_ValueError, errMsg); - Py_DECREF(errMsg); - return NULL; - } - } - - return result; -} - -// TODO: Make error handling consistent. Right now errors occur if -// 1. The return value is non-NULL OR -// 2. 
PyErr is set within this function -static hyper_error_t * -writeNonNullDataLegacy(PyObject *data, DTYPE dtype, - hyper_inserter_buffer_t *insertBuffer, Py_ssize_t row, - Py_ssize_t col) { - hyper_error_t *result; - switch (dtype) { - case INT16_: - case INT16NA: { - int16_t val = (int16_t)PyLong_AsLong(data); - result = hyper_inserter_buffer_add_int16(insertBuffer, val); - break; - } - case INT32_: - case INT32NA: { - int32_t val = (int32_t)PyLong_AsLong(data); - result = hyper_inserter_buffer_add_int32(insertBuffer, val); - break; - } - case INT64_: - case INT64NA: { - int64_t val = (int64_t)PyLong_AsLongLong(data); - result = hyper_inserter_buffer_add_int64(insertBuffer, val); - break; - } - case FLOAT32_: - case FLOAT64_: - case FLOAT32NA: - case FLOAT64NA: { - double val = PyFloat_AsDouble(data); - result = hyper_inserter_buffer_add_double(insertBuffer, val); - break; - } - case BOOLEAN: - case BOOLEANNA: { - if (PyObject_IsTrue(data)) { - result = hyper_inserter_buffer_add_bool(insertBuffer, 1); - } else { - result = hyper_inserter_buffer_add_bool(insertBuffer, 0); - } - break; - } - case DATETIME64_NS: - case DATETIME64_NS_UTC: { - hyper_date_components_t date_components = { - .year = PyDateTime_GET_YEAR(data), - .month = PyDateTime_GET_MONTH(data), - .day = PyDateTime_GET_DAY(data)}; - - hyper_time_components_t time_components = { - .hour = PyDateTime_DATE_GET_HOUR(data), - .minute = PyDateTime_DATE_GET_MINUTE(data), - .second = PyDateTime_DATE_GET_SECOND(data), - .microsecond = PyDateTime_DATE_GET_MICROSECOND(data)}; - - hyper_date_t date = hyper_encode_date(date_components); - hyper_time_t time = hyper_encode_time(time_components); - - // TODO: Tableau uses typedefs for unsigned 32 / 64 integers for - // date and time respectively, but stores as int64; here we cast - // explicitly but should probably bounds check for overflow as well - int64_t val = (int64_t)time + (int64_t)date * MICROSECONDS_PER_DAY; - - result = hyper_inserter_buffer_add_int64(insertBuffer, val); - break; - } - case TIMEDELTA64_NS: { - // TODO: Add error message for failed attribute access - PyObject *us = PyObject_GetAttrString(data, "microseconds"); - if (us == NULL) { - return NULL; - } - PyObject *days = PyObject_GetAttrString(data, "days"); - if (days == NULL) { - Py_DECREF(us); - return NULL; - } - - PyObject *months = PyObject_GetAttrString(data, "months"); - if (months == NULL) { - Py_DECREF(us); - Py_DECREF(days); - return NULL; - } - - py_interval interval = {.microseconds = PyLong_AsLongLong(us), - .days = PyLong_AsLong(days), - .months = PyLong_AsLong(months)}; - - // TODO: it appears there is some buffer packing being done, though - // not sure this actually works in Tableau - result = hyper_inserter_buffer_add_raw( - insertBuffer, (const unsigned char *)&interval, sizeof(py_interval)); - Py_DECREF(us); - Py_DECREF(days); - Py_DECREF(months); - break; - } - case STRING: - case OBJECT: { - if (dtype == OBJECT) { - // N.B. 
all other dtypes in pandas are well defined, but object is - // really anything For purposes of Tableau these need to be strings, - // so error out if not In the future should enforce StringDtype from - // pandas once released (1.0.0) - if (!PyUnicode_Check(data)) { - PyObject *errMsg = PyUnicode_FromFormat( - "Invalid value \"%R\" found (row %zd column %zd)", data, row, col); - PyErr_SetObject(PyExc_TypeError, errMsg); - Py_DECREF(errMsg); - return NULL; - } - } - Py_ssize_t len; - // TODO: CPython uses a const char* buffer but Hyper accepts - // const unsigned char* - is this always safe? - const unsigned char *buf = - (const unsigned char *)PyUnicode_AsUTF8AndSize(data, &len); - result = hyper_inserter_buffer_add_binary(insertBuffer, buf, len); - break; - } - default: { - PyObject *errMsg = PyUnicode_FromFormat("Invalid dtype: \"%s\""); - PyErr_SetObject(PyExc_ValueError, errMsg); - Py_DECREF(errMsg); - return NULL; - } - } - - return result; -} - -// This function gets performance by sacrificing bounds checking -// Particulary no checking happens that the length of each iterable -// in data matches the length of the callables supplied at every step -// in the process,though note that this is critical! -// If this doesn't hold true behavior is undefined -PyObject *write_to_hyper_legacy(PyObject *Py_UNUSED(dummy), PyObject *args) { - int ok; - PyObject *data, *iterator, *row, *val, *dtypes, *null_mask, *insertBufferObj; - Py_ssize_t row_counter, ncols; - hyper_inserter_buffer_t *insertBuffer; - hyper_error_t *result; - Py_buffer buf; - - PyDateTime_IMPORT; - - // TOOD: Find better way to accept buffer pointer than putting in long - ok = PyArg_ParseTuple(args, "OOOnO!", &data, &null_mask, &insertBufferObj, - &ncols, &PyTuple_Type, &dtypes); - if (!ok) - return NULL; - - if (!PyIter_Check(data)) { - PyErr_SetString(PyExc_TypeError, "First argument must be iterable"); - return NULL; - } - - if (!PyObject_CheckBuffer(null_mask)) { - PyErr_SetString(PyExc_TypeError, - "Second argument must support buffer protocol"); - return NULL; - } - - // TODO: check that we get an instance of CDataObject; else will segfault - insertBuffer = - (hyper_inserter_buffer_t *)((CDataObject *)insertBufferObj)->c_data; - - iterator = PyObject_GetIter(data); - if (iterator == NULL) - return NULL; - - if (PyObject_GetBuffer(null_mask, &buf, PyBUF_CONTIG_RO | PyBUF_FORMAT) < 0) { - Py_DECREF(iterator); - return NULL; - } - - if (buf.ndim != 2) { - Py_DECREF(iterator); - PyBuffer_Release(&buf); - PyErr_SetString(PyExc_ValueError, "null_mask must be 2D"); - return NULL; - } - - if (strncmp(buf.format, "?", 1) != 0) { - Py_DECREF(iterator); - PyBuffer_Release(&buf); - PyErr_SetString(PyExc_ValueError, "null_mask must be boolean"); - return NULL; - } - - DTYPE *enumerated_dtypes = makeEnumeratedDtypes((PyTupleObject *)dtypes); - row_counter = 0; - Py_ssize_t item_counter = - 0; // Needed as pointer arith doesn't work for void * buf - while ((row = PyIter_Next(iterator))) { - // TODO: Add validation that the total length of all elements - // matches the length of the null buffer, otherwise wrong data - // is returned - for (Py_ssize_t i = 0; i < ncols; i++) { - if (((uint8_t *)buf.buf)[item_counter++] == 1) { - result = hyper_inserter_buffer_add_null(insertBuffer); - } else { - val = PyTuple_GET_ITEM(row, i); - result = writeNonNullDataLegacy(val, enumerated_dtypes[i], insertBuffer, - row_counter, i); - } - - if ((result != NULL) || (PyErr_Occurred())) { - free(enumerated_dtypes); - Py_DECREF(row); - Py_DECREF(iterator); 
- PyBuffer_Release(&buf); - return NULL; - } - } - Py_DECREF(row); - row_counter += 1; - } - - free(enumerated_dtypes); - Py_DECREF(iterator); - PyBuffer_Release(&buf); - - if (PyErr_Occurred()) - return NULL; - - Py_RETURN_NONE; -} - -PyObject *write_to_hyper(PyObject *Py_UNUSED(dummy), PyObject *args) { - int ok, success = 1; - PyObject *df, *dtypes, *null_mask, *insertBufferObj; - hyper_inserter_buffer_t *insertBuffer; - hyper_error_t *result; - Py_buffer buf; - - // TOOD: Find better way to accept buffer pointer than putting in long - ok = PyArg_ParseTuple(args, "OOOO!", &df, &null_mask, &insertBufferObj, - &PyTuple_Type, &dtypes); - if (!ok) - return NULL; - - if (!PyObject_CheckBuffer(null_mask)) { - PyErr_SetString(PyExc_TypeError, - "Second argument must support buffer protocol"); - return NULL; - } - - // TODO: check that we get an instance of CDataObject; else will segfault - insertBuffer = - (hyper_inserter_buffer_t *)((CDataObject *)insertBufferObj)->c_data; - - if (PyObject_GetBuffer(null_mask, &buf, PyBUF_CONTIG_RO | PyBUF_FORMAT) < 0) { - return NULL; - } - - if (buf.ndim != 2) { - PyBuffer_Release(&buf); - PyErr_SetString(PyExc_ValueError, "null_mask must be 2D"); - return NULL; - } - - if (strncmp(buf.format, "?", 1) != 0) { - PyBuffer_Release(&buf); - PyErr_SetString(PyExc_ValueError, "null_mask must be boolean"); - return NULL; - } - - DTYPE *enumerated_dtypes = makeEnumeratedDtypes((PyTupleObject *)dtypes); - - PyObject *mgr = PyObject_GetAttrString(df, "_mgr"); - if (mgr == NULL) { - PyBuffer_Release(&buf); - free(enumerated_dtypes); - return NULL; - } - - PyObject *arrList = PyObject_GetAttrString(mgr, "column_arrays"); - Py_DECREF(mgr); - if (arrList == NULL) { - PyBuffer_Release(&buf); - free(enumerated_dtypes); - return NULL; - } - - Py_ssize_t rowcount = PyObject_Length(df); - Py_ssize_t colcount = PyObject_Length(arrList); - Py_ssize_t bufPos; - NpyIter **npyIters = initiateIters(arrList); - Py_DECREF(arrList); - - if (npyIters == NULL) { - PyBuffer_Release(&buf); - free(enumerated_dtypes); - return NULL; - } - NpyIter_IterNextFunc **npyIterNextFuncs = - initiateIterNextFuncs(npyIters, colcount); - if (npyIterNextFuncs == NULL) { - success = 0; - goto CLEANUP; - } - - char ***dataptrs = initiateDataPtrs(npyIters, colcount); - if (dataptrs == NULL) { - success = 0; - goto CLEANUP; - } - - NpyIter *iter; - NpyIter_IterNextFunc *iternext; - char **dataptr; - - for (Py_ssize_t rowIndex = 0; rowIndex < rowcount; rowIndex++) { - for (Py_ssize_t colIndex = 0; colIndex < colcount; colIndex++) { - bufPos = (rowIndex * colcount) + colIndex; - iter = npyIters[colIndex]; - iternext = npyIterNextFuncs[colIndex]; - dataptr = dataptrs[colIndex]; - if (((uint8_t *)buf.buf)[bufPos] == 1) { - result = hyper_inserter_buffer_add_null(insertBuffer); - } else { - result = writeNonNullData(dataptr, enumerated_dtypes[colIndex], - insertBuffer, rowIndex, colIndex); - } - iternext(iter); - - if ((result != NULL) || (PyErr_Occurred())) { - success = 0; - goto CLEANUP; - } - } - } - -CLEANUP: - freeIters(npyIters, colcount); - free(enumerated_dtypes); - PyBuffer_Release(&buf); - - if (success) - Py_RETURN_NONE; - else - return NULL; -} diff --git a/pantab/src/writer.h b/pantab/src/writer.h deleted file mode 100644 index 545a0e3f..00000000 --- a/pantab/src/writer.h +++ /dev/null @@ -1,10 +0,0 @@ -#ifndef PANTAB_WRITER_H -#define PANTAB_WRITER_H - -#define PY_SSIZE_T_CLEAN -#include - -PyObject *write_to_hyper_legacy(PyObject *Py_UNUSED(dummy), PyObject *args); -PyObject 
*write_to_hyper(PyObject *Py_UNUSED(dummy), PyObject *args); - -#endif diff --git a/pantab/tests/conftest.py b/pantab/tests/conftest.py index 91f41057..3cf4116f 100644 --- a/pantab/tests/conftest.py +++ b/pantab/tests/conftest.py @@ -5,12 +5,8 @@ import pytest import tableauhyperapi as tab_api -import pantab._compat as compat - -@pytest.fixture -def df(): - """Fixture to use which should contain all data types.""" +def get_basic_dataframe(): df = pd.DataFrame( [ [ @@ -22,10 +18,13 @@ def df(): 3, 4.0, 5.0, + 1.0, + 2.0, + True, True, pd.to_datetime("2018-01-01"), pd.to_datetime("2018-01-01", utc=True), - pd.Timedelta("1 days 2 hours 3 minutes 4 seconds"), + "foo", "foo", np.iinfo(np.int16).min, np.iinfo(np.int32).min, @@ -43,10 +42,13 @@ def df(): np.nan, 9.0, 10.0, + 1.0, + 2.0, + False, False, pd.to_datetime("1/1/19"), pd.to_datetime("2019-01-01", utc=True), - pd.Timedelta("-1 days 2 hours 3 minutes 4 seconds"), + "bar", "bar", np.iinfo(np.int16).max, np.iinfo(np.int32).max, @@ -64,11 +66,14 @@ def df(): np.nan, np.nan, np.nan, + pd.NA, + pd.NA, False, - pd.NaT, + pd.NA, pd.NaT, pd.NaT, np.nan, + pd.NA, 0, 0, 0, @@ -86,11 +91,14 @@ def df(): "Int64", "float32", "float64", + "Float32", + "Float64", "bool", + "boolean", "datetime64", "datetime64_utc", - "timedelta64", "object", + "string", "int16_limits", "int32_limits", "int64_limits", @@ -110,11 +118,14 @@ def df(): "Int64": "Int64", "float32": np.float32, "float64": np.float64, + "Float32": "Float32", + "Float64": "Float64", "bool": bool, + "boolean": "boolean", "datetime64": "datetime64[ns]", "datetime64_utc": "datetime64[ns, UTC]", - "timedelta64": "timedelta64[ns]", "object": "object", + "string": "string", "int16_limits": np.int16, "int32_limits": np.int32, "int64_limits": np.int64, @@ -124,13 +135,46 @@ def df(): } ) - df["boolean"] = pd.Series([True, False, pd.NA], dtype="boolean") - df["string"] = pd.Series(["foo", "bar", pd.NA], dtype="string") + return df - if compat.PANDAS_120: - df["Float32"] = pd.Series([1.0, 2.0, pd.NA], dtype="Float32") - df["Float64"] = pd.Series([1.0, 2.0, pd.NA], dtype="Float64") +@pytest.fixture +def df(): + """Fixture to use which should contain all data types.""" + return get_basic_dataframe() + + +@pytest.fixture +def roundtripped(): + """Roundtripped DataFrames should use arrow dtypes by default""" + df = get_basic_dataframe() + df = df.astype( + { + "int16": "int16[pyarrow]", + "int32": "int32[pyarrow]", + "int64": "int64[pyarrow]", + "Int16": "int16[pyarrow]", + "Int32": "int32[pyarrow]", + "Int64": "int64[pyarrow]", + "float32": "double[pyarrow]", + "float64": "double[pyarrow]", + "Float32": "double[pyarrow]", + "Float64": "double[pyarrow]", + "bool": "boolean[pyarrow]", + "boolean": "boolean[pyarrow]", + "datetime64": "timestamp[us][pyarrow]", + "datetime64_utc": "timestamp[us, UTC][pyarrow]", + # "timedelta64": "timedelta64[ns]", + "object": "string[pyarrow]", + "int16_limits": "int16[pyarrow]", + "int32_limits": "int32[pyarrow]", + "int64_limits": "int64[pyarrow]", + "float32_limits": "double[pyarrow]", + "float64_limits": "double[pyarrow]", + "non-ascii": "string[pyarrow]", + "string": "string[pyarrow]", + } + ) return df @@ -164,9 +208,3 @@ def table_name(request): def datapath(): """Location of data files in test folder.""" return pathlib.Path(__file__).parent / "data" - - -@pytest.fixture(params=[False, True]) -def use_parquet(request): - """Whether to use parquet for intermediate file storage.""" - return request.param diff --git a/pantab/tests/test_reader.py 
b/pantab/tests/test_reader.py index 377b29d7..466089e0 100644 --- a/pantab/tests/test_reader.py +++ b/pantab/tests/test_reader.py @@ -1,5 +1,3 @@ -from sqlite3 import connect - import pandas as pd import pandas.testing as tm import pytest @@ -27,57 +25,10 @@ def test_reports_unsupported_type(datapath): would be string columns. This led to very fascinating failures. """ db_path = datapath / "geography.hyper" - with pytest.raises( - TypeError, match=r"Column \"x\" has unsupported datatype GEOGRAPHY" - ): + with pytest.raises(TypeError, match=r"GEOGRAPHY"): pantab.frame_from_hyper(db_path, table="test") -def test_months_in_interval_raises(df, tmp_hyper, monkeypatch): - # Monkeypatch a new constructor that hard codes months - def __init__(self, months: int, days: int, microseconds: int): - self.months = 1 - self.days = days - self.microseconds = microseconds - - monkeypatch.setattr(pantab._writer.tab_api.Interval, "__init__", __init__) - pantab.frame_to_hyper(df, tmp_hyper, table="test") - with pytest.raises( - ValueError, match=r"Cannot read Intervals with month components\." - ): - pantab.frame_from_hyper(tmp_hyper, table="test") - - with pytest.raises( - ValueError, match=r"Cannot read Intervals with month components\." - ): - pantab.frames_from_hyper(tmp_hyper) - - -def test_error_on_first_column(df, tmp_hyper, monkeypatch): - """ - We had a defect due to which pantab segfaulted when an error occured in one of - the first two columns. This test case is a regression test against that. - """ - - # Monkeypatch a new constructor that hard codes months - def __init__(self, months: int, days: int, microseconds: int): - self.months = 1 - self.days = days - self.microseconds = microseconds - - monkeypatch.setattr(pantab._writer.tab_api.Interval, "__init__", __init__) - - df = pd.DataFrame( - [[pd.Timedelta("1 days 2 hours 3 minutes 4 seconds")]], columns=["timedelta64"] - ).astype({"timedelta64": "timedelta64[ns]"}) - pantab.frame_to_hyper(df, tmp_hyper, table="test") - - with pytest.raises( - ValueError, match=r"Cannot read Intervals with month components\." 
- ): - pantab.frame_from_hyper(tmp_hyper, table="test") - - def test_read_non_roundtrippable(datapath): result = pantab.frame_from_hyper( datapath / "dates.hyper", table=TableName("Extract", "Extract") @@ -85,7 +36,7 @@ def test_read_non_roundtrippable(datapath): expected = pd.DataFrame( [["1900-01-01", "2000-01-01"], [pd.NaT, "2050-01-01"]], columns=["Date1", "Date2"], - dtype="datetime64[ns]", + dtype="date32[day][pyarrow]", ) tm.assert_frame_equal(result, expected) @@ -99,7 +50,12 @@ def test_reads_non_writeable(datapath): [["row1", 1.0], ["row2", 2.0]], columns=["Non-Nullable String", "Non-Nullable Float"], ) - expected["Non-Nullable String"] = expected["Non-Nullable String"].astype("string") + expected["Non-Nullable Float"] = expected["Non-Nullable Float"].astype( + "double[pyarrow]" + ) + expected["Non-Nullable String"] = expected["Non-Nullable String"].astype( + "string[pyarrow]" + ) tm.assert_frame_equal(result, expected) @@ -111,22 +67,21 @@ def test_read_query(df, tmp_hyper): result = pantab.frame_from_hyper_query(tmp_hyper, query) expected = pd.DataFrame([[1, "_2"], [6, "_7"], [0, "_0"]], columns=["i", "_i2"]) - expected = expected.astype({"i": "Int16", "_i2": "string"}) + expected = expected.astype({"i": "int16[pyarrow]", "_i2": "string[pyarrow]"}) tm.assert_frame_equal(result, expected) -def test_empty_read_query(df: pd.DataFrame, tmp_hyper): +def test_empty_read_query(df: pd.DataFrame, roundtripped, tmp_hyper): """ red-green for empty query results """ # sql cols need to base case insensitive & unique - df = df[pd.Series(df.columns).apply(lambda s: s.lower()).drop_duplicates()] - conn = connect(":memory:") table_name = "test" - df.to_sql(name=table_name, con=conn, index=False) pantab.frame_to_hyper(df, tmp_hyper, table=table_name) query = f"SELECT * FROM {table_name} limit 0" - expected = pd.read_sql_query(query, conn) + expected = pd.DataFrame(columns=df.columns) + expected = expected.astype(roundtripped.dtypes) + result = pantab.frame_from_hyper_query(tmp_hyper, query) tm.assert_frame_equal(result, expected) diff --git a/pantab/tests/test_roundtrip.py b/pantab/tests/test_roundtrip.py index 1d23c0dc..32c9f065 100644 --- a/pantab/tests/test_roundtrip.py +++ b/pantab/tests/test_roundtrip.py @@ -1,61 +1,28 @@ -from pathlib import Path - -import numpy as np import pandas as pd import pandas.testing as tm -import pytest -from tableauhyperapi import Connection, CreateMode, HyperProcess, TableName, Telemetry +from tableauhyperapi import TableName import pantab -def assert_roundtrip_equal(result, expected): - """Compat helper for comparing round-tripped results.""" - - expected["object"] = expected["object"].astype("string") - expected["non-ascii"] = expected["non-ascii"].astype("string") - expected["datetime64_utc"] = expected["datetime64_utc"].dt.tz_localize(None) - - tm.assert_frame_equal(result, expected) - - -def test_basic(df, tmp_hyper, table_name, table_mode): +def test_basic(df, roundtripped, tmp_hyper, table_name, table_mode): # Write twice; depending on mode this should either overwrite or duplicate entries pantab.frame_to_hyper(df, tmp_hyper, table=table_name, table_mode=table_mode) pantab.frame_to_hyper(df, tmp_hyper, table=table_name, table_mode=table_mode) result = pantab.frame_from_hyper(tmp_hyper, table=table_name) - expected = df.copy() - expected["float32"] = expected["float32"].astype(np.float64) - expected["Float32"] = expected["Float32"].astype(np.float64) - expected["Float64"] = expected["Float64"].astype(np.float64) - + expected = roundtripped if table_mode 
== "a": expected = pd.concat([expected, expected]).reset_index(drop=True) - assert_roundtrip_equal(result, expected) + # TODO: somehow concat turns string[pyarrow] into string python + for col in ("object", "non-ascii", "string"): + expected[col] = expected[col].astype("string[pyarrow]") - -def test_use_float_na_flag(df, tmp_hyper, table_name): - pantab.frame_to_hyper(df, tmp_hyper, table=table_name) - result = pantab.frame_from_hyper(tmp_hyper, table=table_name, use_float_na=False) - expected = df.copy() - expected["float32"] = expected["float32"].astype(np.float64) - expected["Float32"] = expected["Float32"].astype(np.float64) - expected["Float64"] = expected["Float64"].astype(np.float64) - assert_roundtrip_equal(result, expected) - - result = pantab.frame_from_hyper(tmp_hyper, table=table_name, use_float_na=True) - expected = df.copy() - expected["float32"] = expected["float32"].astype("Float64") - expected["float64"] = expected["float64"].astype("Float64") - expected["float32_limits"] = expected["float32_limits"].astype("Float64") - expected["float64_limits"] = expected["float64_limits"].astype("Float64") - expected["Float32"] = expected["Float32"].astype("Float64") - assert_roundtrip_equal(result, expected) + tm.assert_frame_equal(result, expected) -def test_multiple_tables(df, tmp_hyper, table_name, table_mode): +def test_multiple_tables(df, roundtripped, tmp_hyper, table_name, table_mode): # Write twice; depending on mode this should either overwrite or duplicate entries pantab.frames_to_hyper( {table_name: df, "table2": df}, tmp_hyper, table_mode=table_mode @@ -65,100 +32,18 @@ def test_multiple_tables(df, tmp_hyper, table_name, table_mode): ) result = pantab.frames_from_hyper(tmp_hyper) - expected = df.copy() - expected["float32"] = expected["float32"].astype(np.float64) - expected["Float32"] = expected["Float32"].astype(np.float64) - expected["Float64"] = expected["Float64"].astype(np.float64) + expected = roundtripped if table_mode == "a": expected = pd.concat([expected, expected]).reset_index(drop=True) + # TODO: somehow concat turns string[pyarrow] into string python + for col in ("object", "non-ascii", "string"): + expected[col] = expected[col].astype("string[pyarrow]") + # some test trickery here if not isinstance(table_name, TableName) or table_name.schema_name is None: table_name = TableName("public", table_name) assert set(result.keys()) == set((table_name, TableName("public", "table2"))) for val in result.values(): - assert_roundtrip_equal(val, expected) - - -def test_roundtrip_with_external_hyper_process(df, tmp_hyper): - default_log_path = Path.cwd() / "hyperd.log" - if default_log_path.exists(): - default_log_path.unlink() - - # By passing in a pre-spawned HyperProcess, one can e.g. 
avoid creating a log file - parameters = {"log_config": ""} - with HyperProcess( - Telemetry.DO_NOT_SEND_USAGE_DATA_TO_TABLEAU, parameters=parameters - ) as hyper: - # test frame_to_hyper/frame_from_hyper - pantab.frame_to_hyper(df, tmp_hyper, table="test", hyper_process=hyper) - result = pantab.frame_from_hyper(tmp_hyper, table="test", hyper_process=hyper) - expected = df.copy() - expected["float32"] = expected["float32"].astype(np.float64) - expected["Float32"] = expected["Float32"].astype(np.float64) - expected["Float64"] = expected["Float64"].astype(np.float64) - assert_roundtrip_equal(result, expected) - - # test frame_from_hyper_query - result = pantab.frame_from_hyper_query( - tmp_hyper, "SELECT * FROM test", hyper_process=hyper - ) - assert result.size == df.size - - # test frames_to_hyper/frames_from_hyper - pantab.frames_to_hyper( - {"test2": df, "test": df}, tmp_hyper, hyper_process=hyper - ) - result = pantab.frames_from_hyper(tmp_hyper, hyper_process=hyper) - assert set(result.keys()) == set( - (TableName("public", "test"), TableName("public", "test2")) - ) - - for val in result.values(): - assert_roundtrip_equal(val, expected) - - assert not default_log_path.exists() - - -def test_roundtrip_with_external_hyper_connection(df, tmp_hyper): - with HyperProcess(Telemetry.DO_NOT_SEND_USAGE_DATA_TO_TABLEAU) as hyper: - pantab.frames_to_hyper( - {"test": df, "test2": df}, tmp_hyper, hyper_process=hyper - ) - - with Connection(hyper.endpoint, tmp_hyper, CreateMode.NONE) as connection: - result = pantab.frame_from_hyper(connection, table="test") - expected = df.copy() - expected["float32"] = expected["float32"].astype(np.float64) - expected["Float32"] = expected["Float32"].astype(np.float64) - expected["Float64"] = expected["Float64"].astype(np.float64) - assert_roundtrip_equal(result, expected) - - result = pantab.frame_from_hyper_query(connection, "SELECT * FROM test") - assert result.size == df.size - - result = pantab.frames_from_hyper(connection) - assert set(result.keys()) == set( - (TableName("public", "test"), TableName("public", "test2")) - ) - for val in result.values(): - assert_roundtrip_equal(val, expected) - - -def test_external_hyper_connection_and_process_error(df, tmp_hyper): - with HyperProcess(Telemetry.DO_NOT_SEND_USAGE_DATA_TO_TABLEAU) as hyper: - with Connection(hyper.endpoint, tmp_hyper, CreateMode.CREATE) as connection: - expected_msg = ( - "hyper_process parameter is useless because `Connection` is provided" - ) - with pytest.raises(ValueError, match=expected_msg): - pantab.frame_from_hyper(connection, table="test", hyper_process=hyper) - - with pytest.raises(ValueError, match=expected_msg): - pantab.frame_from_hyper_query( - connection, "SELECT * FROM test", hyper_process=hyper - ) - - with pytest.raises(ValueError, match=expected_msg): - pantab.frames_from_hyper(connection, hyper_process=hyper) + tm.assert_frame_equal(val, expected) diff --git a/pantab/tests/test_types.py b/pantab/tests/test_types.py deleted file mode 100644 index 6612651d..00000000 --- a/pantab/tests/test_types.py +++ /dev/null @@ -1,16 +0,0 @@ -import pytest -import tableauhyperapi as tab_api - -import pantab._types - - -@pytest.mark.parametrize( - "nullability", [tab_api.Nullability.NULLABLE, tab_api.Nullability.NOT_NULLABLE] -) -def test_read_varchar_type(nullability): - """ - Test that we can read a VARCHAR column from Hyper. 
- """ - vchar = tab_api.SqlType.varchar(255) - vchar_column = pantab._types._ColumnType(vchar, nullability) - assert pantab._types._get_pandas_type(vchar_column) == "string" diff --git a/pantab/tests/test_writer.py b/pantab/tests/test_writer.py index 922322e0..8ca2cee0 100644 --- a/pantab/tests/test_writer.py +++ b/pantab/tests/test_writer.py @@ -1,140 +1,78 @@ import re from datetime import datetime, timezone -import numpy as np import pandas as pd import pytest -import tableauhyperapi as tab_api from tableauhyperapi import Connection, CreateMode, HyperProcess, Telemetry import pantab -def test_bad_table_mode_raises(df, tmp_hyper, use_parquet): - if use_parquet: - pytest.importorskip("pyarrow") - df = df.drop(columns=["timedelta64"]) - +def test_bad_table_mode_raises(df, tmp_hyper): msg = "'table_mode' must be either 'w' or 'a'" with pytest.raises(ValueError, match=msg): pantab.frame_to_hyper( - df, tmp_hyper, table="test", table_mode="x", use_parquet=use_parquet + df, + tmp_hyper, + table="test", + table_mode="x", ) with pytest.raises(ValueError, match=msg): pantab.frames_to_hyper({"a": df}, tmp_hyper, table_mode="x") -def test_append_mode_raises_column_mismatch(df, tmp_hyper, table_name, use_parquet): - if use_parquet: - pytest.importorskip("pyarrow") - df = df.drop(columns=["timedelta64"]) - - pantab.frame_to_hyper(df, tmp_hyper, table=table_name, use_parquet=use_parquet) - - df = df.drop("object", axis=1) - msg = "^Mismatched column definitions:" - with pytest.raises(TypeError, match=msg): - pantab.frame_to_hyper( - df, tmp_hyper, table=table_name, table_mode="a", use_parquet=use_parquet - ) - - -def test_append_mode_raises_column_dtype_mismatch( - df, tmp_hyper, table_name, use_parquet -): - if use_parquet: - pytest.importorskip("pyarrow") - df = df.drop(columns=["timedelta64"]) - - pantab.frame_to_hyper(df, tmp_hyper, table=table_name, use_parquet=use_parquet) +@pytest.mark.parametrize("new_dtype", ["int64", float]) +def test_append_mode_raises_column_dtype_mismatch(new_dtype, df, tmp_hyper, table_name): + pantab.frame_to_hyper(df, tmp_hyper, table=table_name) - df["int16"] = df["int16"].astype(np.int64) - msg = "^Mismatched column definitions:" - with pytest.raises(TypeError, match=msg): - pantab.frame_to_hyper( - df, tmp_hyper, table=table_name, table_mode="a", use_parquet=use_parquet - ) + df["int16"] = df["int16"].astype(new_dtype) + # TODO: a better error message from hyper would be nice here + # seems like a limitation of hyper api + msg = "" + with pytest.raises(RuntimeError, match=msg): + pantab.frame_to_hyper(df, tmp_hyper, table=table_name, table_mode="a") -def test_failed_write_doesnt_overwrite_file( - df, tmp_hyper, monkeypatch, table_mode, use_parquet -): - if use_parquet: - pytest.importorskip("pyarrow") - df = df.drop(columns=["timedelta64"]) - +def test_failed_write_doesnt_overwrite_file(df, tmp_hyper, monkeypatch, table_mode): pantab.frame_to_hyper( - df, tmp_hyper, table="test", table_mode=table_mode, use_parquet=use_parquet + df, + tmp_hyper, + table="test", + table_mode=table_mode, ) last_modified = tmp_hyper.stat().st_mtime - # Let's patch the Inserter to fail on creation - def failure(*args, **kwargs): - raise ValueError("dummy failure") - - if use_parquet: - pytest.importorskip("pyarrow") - pytest.skip("TODO: should figure out patching here") - # monkeypatch.setattr(pantab._writer.pq, "write_table", failure, raising=True) - else: - monkeypatch.setattr(pantab._writer.tab_api, "Inserter", failure, raising=True) + # Pick a dtype we know will fail + 
df["should_fail"] = pd.Series([tuple((1, 2))]) # Try out our write methods - with pytest.raises(ValueError, match="dummy failure"): - pantab.frame_to_hyper( - df, tmp_hyper, table="test", table_mode=table_mode, use_parquet=use_parquet - ) - pantab.frames_to_hyper( - {"test": df}, tmp_hyper, table_mode=table_mode, use_parquet=use_parquet - ) + with pytest.raises(Exception): + pantab.frame_to_hyper(df, tmp_hyper, table="test", table_mode=table_mode) + pantab.frames_to_hyper({"test": df}, tmp_hyper, table_mode=table_mode) # Neither should not update file stats assert last_modified == tmp_hyper.stat().st_mtime -def test_duplicate_columns_raises(tmp_hyper, use_parquet): +def test_duplicate_columns_raises(tmp_hyper): df = pd.DataFrame([[1, 1]], columns=[1, 1]) - with pytest.raises( - tab_api.hyperexception.HyperException, - match="column '1' specified more than once", - ): - pantab.frame_to_hyper(df, tmp_hyper, table="foo", use_parquet=use_parquet) - - with pytest.raises( - tab_api.hyperexception.HyperException, - match="column '1' specified more than once", - ): - pantab.frames_to_hyper({"test": df}, tmp_hyper, use_parquet=use_parquet) - - -@pytest.mark.parametrize("dtype", ["UInt64", "datetime64[ns, US/Eastern]"]) -def test_unsupported_dtype_raises(dtype, tmp_hyper, use_parquet): - df = pd.DataFrame([[1]], dtype=dtype) - - msg = re.escape(f"Conversion of '{dtype}' dtypes not supported!") - with pytest.raises(TypeError, match=msg): - pantab.frame_to_hyper(df, tmp_hyper, table="test", use_parquet=use_parquet) - - -def test_bad_value_gives_clear_message(tmp_hyper): - df = pd.DataFrame([[{"a": "b"}]], columns=["a"]) + msg = r"Duplicate column names found: \[1, 1\]" + with pytest.raises(ValueError, match=msg): + pantab.frame_to_hyper(df, tmp_hyper, table="foo") - msg = r"Invalid value \"{'a': 'b'}\" found \(row 0 column 0\)" + with pytest.raises(ValueError, match=msg): + pantab.frames_to_hyper({"test": df}, tmp_hyper) - with pytest.raises(TypeError, match=msg): - pantab.frame_to_hyper(df, tmp_hyper, table="test") +def test_unsupported_dtype_raises(tmp_hyper): + df = pd.DataFrame([[pd.Timedelta("1D")]]) -def test_use_parquet_with_timedelta_raises(df, tmp_hyper): - msg = "Writing timedelta values with use_parquet=True is not yet supported." 
+    msg = re.escape("Unsupported Arrow type")
     with pytest.raises(ValueError, match=msg):
-        pantab.frame_to_hyper(df, tmp_hyper, table="test", use_parquet=True)
+        pantab.frame_to_hyper(df, tmp_hyper, table="test")


-@pytest.mark.skipif(
-    not pantab._compat.PANDAS_130, reason="bug is specifically with >=pandas 1.3"
-)
 def test_utc_bug(tmp_hyper):
     """
     Red-Green for UTC bug
@@ -154,19 +92,3 @@ def test_utc_bug(tmp_hyper):
     expected: {df.utc_time}
     actual: {[c[0] for c in resp]}
     """
-
-
-@pytest.mark.skipif(
-    not pantab._compat.PANDAS_130, reason="bug is specifically with >=pandas 1.3"
-)
-def test_maybe_convert_utc(tmp_hyper):
-    """
-    timezone aware is not supported, thus we ensure timezone naive
-    """
-    df = pd.DataFrame(
-        {"utc_time": [datetime.now(timezone.utc), pd.Timestamp("today", tz="UTC")]}
-    )
-    assert not df.select_dtypes("datetime64[ns, UTC]").empty
-    df = pantab._writer._maybe_convert_utctimestamp(df)
-    assert df.select_dtypes("datetime64[ns, UTC]").empty
-    assert not df.select_dtypes("datetime64[ns]").empty
diff --git a/pyproject.toml b/pyproject.toml
index 9324a45c..a8316704 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,10 +1,14 @@
 [build-system]
-build-backend="mesonpy"
-requires=["meson-python", "tableauhyperapi", "oldest-supported-numpy"]
+requires = [
+    "scikit-build-core",
+    "nanobind",
+    "oldest-supported-numpy",  # only needed for datetime
+]
+build-backend = "scikit_build_core.build"
 
 [project]
 name = "pantab"
-version = "3.0.3"
+version = "4.0.0rc"
 description = "Converts pandas DataFrames into Tableau Hyper Extracts and back"
 license = {file = "LICENSE.txt"}
 readme = "README.md"
@@ -24,9 +28,12 @@ classifiers = [
 keywords = ["tableau", "visualization", "pandas", "dataframe"]
 
 dependencies = [
-    "pandas>=1.0.0",
+    "pandas>=2.0.0",
     "tableauhyperapi>=0.0.14567",
     "numpy",
+    # in the future we need not require pyarrow as pandas implements the
+    # PyCapsule interface. See pandas PR #56587
+    "pyarrow>=14.0.0",
 ]
 
 [project.urls]
@@ -48,10 +55,8 @@ testpaths = [
 ]
 
 [tool.mypy]
-ignore_missing_imports = true
-
-[tool.mypy.overrides]
-module = ["numpy", "pandas.*", "pytest", "setuptools", "tableauhyperapi.*", "pyarrow.*"]
+[[tool.mypy.overrides]]
+module = ["tableauhyperapi.*"]
 ignore_missing_imports = true
 
 [tool.isort]
 include_trailing_comma = true
 line_length = 88
 multi_line_output = 3
 known_first_party = "pantab"
-known_third_party = "libpantab"
 
 [tool.cibuildwheel]
 build = "cp39-*64 cp310-*64 cp311-*64 cp312-*64"
 skip = "*musllinux*"
-test-command = "pytest {project}/pantab/tests"
+test-command = "pytest --import-mode=importlib {project}/pantab/tests"
 test-requires = ["pytest"]
 
 [tool.ruff]
 line-length = 88
+
+[tool.cibuildwheel.linux]
+repair-wheel-command = """
+auditwheel repair -w {dest_dir} {wheel} --exclude libtableauhyperapi.so
+"""
+
+[tool.cibuildwheel.macos]
+# --ignore-missing-dependencies is risky, but there didn't seem to be a good way
+# of convincing delocate-wheel that it is OK not to bundle the libtableauhyperapi
+# library; delocate 0.10.6 has an exclude option that *may* be helpful
+repair-wheel-command = """
+delocate-listdeps {wheel} && delocate-wheel --ignore-missing-dependencies --require-archs {delocate_archs} -w {dest_dir} {wheel}
+"""