diff --git a/doc/source/development/contributing_codebase.rst b/doc/source/development/contributing_codebase.rst
index 73bc756de9302..a2417616cf80a 100644
--- a/doc/source/development/contributing_codebase.rst
+++ b/doc/source/development/contributing_codebase.rst
@@ -214,7 +214,8 @@ With custom types and inference this is not always possible so exceptions are ma
 pandas-specific types
 ~~~~~~~~~~~~~~~~~~~~~
 
-Commonly used types specific to pandas will appear in `pandas._typing `_ and you should use these where applicable. This module is private for now but ultimately this should be exposed to third party libraries who want to implement type checking against pandas.
+Commonly used types specific to pandas will appear in `pandas._typing `__ and you should use these where applicable. This module is private and is meant for pandas development.
+Types that are meant for user consumption should be exposed in `pandas.api.typing.aliases `__ and ideally added to the `pandas-stubs `__ project.
 
 For example, quite a few functions in pandas accept a ``dtype`` argument. This can be expressed as a string like ``"object"``, a ``numpy.dtype`` like ``np.int64`` or even a pandas ``ExtensionDtype`` like ``pd.CategoricalDtype``. Rather than burden the user with having to constantly annotate all of those options, this can simply be imported and reused from the pandas._typing module
diff --git a/doc/source/reference/aliases.rst b/doc/source/reference/aliases.rst
new file mode 100644
index 0000000000000..8fd9ed3649c0b
--- /dev/null
+++ b/doc/source/reference/aliases.rst
@@ -0,0 +1,96 @@
+{{ header }}
+
+.. _api.typing.aliases:
+
+======================================
+pandas typing aliases
+======================================
+
+**************
+Typing aliases
+**************
+
+.. currentmodule:: pandas.api.typing.aliases
+
+The typing declarations in ``pandas/_typing.py`` are considered private and are
+used by pandas developers for type checking of the pandas code base. Users are
+strongly encouraged to use the ``pandas-stubs`` package, which contains the
+officially supported type declarations for pandas users.
+The aliases below are documented for users who wish to use these declarations
+in their own Python code that calls pandas or handles objects returned by pandas.
+
+.. warning::
+
+    The definitions and use cases of these aliases may change without notice in any major, minor, or patch release of pandas.
+
+Each of the aliases listed in the table below can be imported from :py:mod:`pandas.api.typing.aliases`.
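+
+For example, an alias can be imported and used as an annotation in code that
+calls pandas. This is a minimal sketch; ``describe_value`` below is a
+hypothetical user-defined function, not part of pandas:
+
+.. code-block:: python
+
+    from pandas.api.typing.aliases import Scalar
+
+    def describe_value(value: Scalar) -> str:
+        # Scalar covers the scalar types that a non-object Series can hold
+        return f"{type(value).__name__}: {value!r}"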
+
+==================================== ================================================================
+Alias                                Meaning
+==================================== ================================================================
+:py:type:`AggFuncType`               Type of functions that can be passed to :meth:`agg` methods
+:py:type:`AlignJoin`                 Argument type for ``join`` in :meth:`DataFrame.join`
+:py:type:`AnyAll`                    Argument type for ``how`` in :meth:`dropna`
+:py:type:`AnyArrayLike`              Used to represent :class:`ExtensionArray`, ``numpy`` arrays, :class:`Index` and :class:`Series`
+:py:type:`ArrayLike`                 Used to represent :class:`ExtensionArray` and ``numpy`` arrays
+:py:type:`AstypeArg`                 Argument type in :meth:`astype`
+:py:type:`Axes`                      :py:type:`AnyArrayLike` plus sequences (not strings) and ``range``
+:py:type:`Axis`                      Argument type for ``axis`` in many methods
+:py:type:`CSVEngine`                 Argument type for ``engine`` in :meth:`pandas.read_csv`
+:py:type:`ColspaceArgType`           Argument type for ``col_space`` in :meth:`DataFrame.to_html`
+:py:type:`CompressionOptions`        Argument type for ``compression`` in all I/O output methods except :meth:`DataFrame.to_parquet`
+:py:type:`CorrelationMethod`         Argument type for ``method`` in :meth:`corr`
+:py:type:`DropKeep`                  Argument type for ``keep`` in :meth:`drop_duplicates`
+:py:type:`Dtype`                     Types as objects that can be used to specify dtypes
+:py:type:`DtypeArg`                  Argument type for ``dtype`` in various methods
+:py:type:`DtypeBackend`              Argument type for ``dtype_backend`` in various methods
+:py:type:`DtypeObj`                  NumPy dtypes and extension dtypes
+:py:type:`ExcelWriterIfSheetExists`  Argument type for ``if_sheet_exists`` in :class:`ExcelWriter`
+:py:type:`ExcelWriterMergeCells`     Argument type for ``merge_cells`` in :meth:`to_excel`
+:py:type:`FilePath`                  Type of paths for files for I/O methods
+:py:type:`FillnaOptions`             Argument type for ``method`` in various methods where NA values are filled
+:py:type:`FloatFormatType`           Argument type for ``float_format`` in :meth:`to_string`
+:py:type:`FormattersType`            Argument type for ``formatters`` in :meth:`to_string`
+:py:type:`FromDictOrient`            Argument type for ``orient`` in :meth:`DataFrame.from_dict`
+:py:type:`HTMLFlavors`               Argument type for ``flavor`` in :meth:`pandas.read_html`
+:py:type:`IgnoreRaise`               Argument type for ``errors`` in multiple methods
+:py:type:`IndexLabel`                Argument type for ``level`` in multiple methods
+:py:type:`InterpolateOptions`        Argument type for ``method`` in :meth:`interpolate`
+:py:type:`JSONEngine`                Argument type for ``engine`` in :meth:`read_json`
+:py:type:`JSONSerializable`          Return type of the callable passed as ``default_handler`` in :meth:`to_json`
+:py:type:`JoinHow`                   Argument type for ``how`` in :meth:`pandas.merge_ordered` and for ``join`` in :meth:`Series.align`
+:py:type:`JoinValidate`              Argument type for ``validate`` in :meth:`DataFrame.join`
+:py:type:`MergeHow`                  Argument type for ``how`` in :meth:`merge`
+:py:type:`MergeValidate`             Argument type for ``validate`` in :meth:`merge`
+:py:type:`NaPosition`                Argument type for ``na_position`` in :meth:`sort_index` and :meth:`sort_values`
+:py:type:`NsmallestNlargestKeep`     Argument type for ``keep`` in :meth:`nlargest` and :meth:`nsmallest`
+:py:type:`OpenFileErrors`            Argument type for ``errors`` in :meth:`to_hdf` and :meth:`to_csv`
+:py:type:`Ordered`                   Return type for :py:attr:`ordered` in :class:`CategoricalDtype` and :class:`Categorical`
+:py:type:`ParquetCompressionOptions` Argument type for ``compression`` in :meth:`DataFrame.to_parquet`
+:py:type:`QuantileInterpolation`     Argument type for ``interpolation`` in :meth:`quantile`
+:py:type:`ReadBuffer`                Additional argument type corresponding to buffers for various file reading methods
+:py:type:`ReadCsvBuffer`             Additional argument type corresponding to buffers for :meth:`pandas.read_csv`
+:py:type:`ReadPickleBuffer`          Additional argument type corresponding to buffers for :meth:`pandas.read_pickle`
+:py:type:`ReindexMethod`             Argument type for ``method`` in :meth:`reindex`
+:py:type:`Scalar`                    Types that can be stored in :class:`Series` with non-object dtype
+:py:type:`SequenceNotStr`            Used for arguments that require sequences, but not plain strings
+:py:type:`SliceType`                 Argument types for ``start`` and ``end`` in :meth:`Index.slice_locs`
+:py:type:`SortKind`                  Argument type for ``kind`` in :meth:`sort_index` and :meth:`sort_values`
+:py:type:`StorageOptions`            Argument type for ``storage_options`` in various file output methods
+:py:type:`Suffixes`                  Argument type for ``suffixes`` in :meth:`merge`, :meth:`compare` and :meth:`merge_ordered`
+:py:type:`TakeIndexer`               Argument type for ``indexer`` and ``indices`` in :meth:`take`
+:py:type:`TimeAmbiguous`             Argument type for ``ambiguous`` in time operations
+:py:type:`TimeGrouperOrigin`         Argument type for ``origin`` in :meth:`resample` and :class:`TimeGrouper`
+:py:type:`TimeNonexistent`           Argument type for ``nonexistent`` in time operations
+:py:type:`TimeUnit`                  Return type of :py:attr:`unit` and argument type for the ``unit`` and ``date_unit`` arguments
+:py:type:`TimedeltaConvertibleTypes` Argument type for ``offset`` in :meth:`resample`, ``halflife`` in :meth:`ewm` and ``start`` and ``end`` in :meth:`pandas.timedelta_range`
+:py:type:`TimestampConvertibleTypes` Argument type for ``origin`` in :meth:`resample` and :meth:`pandas.to_datetime`
+:py:type:`ToStataByteorder`          Argument type for ``byteorder`` in :meth:`DataFrame.to_stata`
+:py:type:`ToTimestampHow`            Argument type for ``how`` in :meth:`to_timestamp` and ``convention`` in :meth:`resample`
+:py:type:`UpdateJoin`                Argument type for ``join`` in :meth:`DataFrame.update`
+:py:type:`UsecolsArgType`            Argument type for ``usecols`` in :meth:`pandas.read_clipboard`, :meth:`pandas.read_csv` and :meth:`pandas.read_excel`
+:py:type:`WindowingRankType`         Argument type for ``method`` in :meth:`rank` in rolling and expanding window operations
+:py:type:`WriteBuffer`               Additional argument type corresponding to buffers for various file output methods
+:py:type:`WriteExcelBuffer`          Additional argument type corresponding to buffers for :meth:`to_excel`
+:py:type:`XMLParsers`                Argument type for ``parser`` in :meth:`DataFrame.to_xml` and :meth:`pandas.read_xml`
+==================================== ================================================================
diff --git a/doc/source/reference/index.rst b/doc/source/reference/index.rst
index 639bac4d40b70..ec9e3c1bad476 100644
--- a/doc/source/reference/index.rst
+++ b/doc/source/reference/index.rst
@@ -55,6 +55,7 @@ to be stable.
     extensions
     testing
     missing_value
+    aliases
 
 .. This is to prevent warnings in the doc build. We don't want to encourage
 .. these methods.
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index 3c83f2a9758c1..280f995532c6f 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -220,6 +220,7 @@ Other enhancements
 - Implemented :meth:`Series.str.isascii` and :meth:`Series.str.isascii` (:issue:`59091`)
 - Improve the resulting dtypes in :meth:`DataFrame.where` and :meth:`DataFrame.mask` with :class:`ExtensionDtype` ``other`` (:issue:`62038`)
 - Improved deprecation message for offset aliases (:issue:`60820`)
+- Many type aliases are now exposed in the new submodule :py:mod:`pandas.api.typing.aliases` (:issue:`55231`)
 - Multiplying two :class:`DateOffset` objects will now raise a ``TypeError`` instead of a ``RecursionError`` (:issue:`59442`)
 - Restore support for reading Stata 104-format and enable reading 103-format dta files (:issue:`58554`)
 - Support passing a :class:`Iterable[Hashable]` input to :meth:`DataFrame.drop_duplicates` (:issue:`59237`)
diff --git a/pandas/_typing.py b/pandas/_typing.py
index 0a6653f05e59a..2799dac51370e 100644
--- a/pandas/_typing.py
+++ b/pandas/_typing.py
@@ -133,8 +133,27 @@ def __reversed__(self) -> Iterator[_T_co]: ...
 
 PythonScalar: TypeAlias = str | float | bool
 DatetimeLikeScalar: TypeAlias = Union["Period", "Timestamp", "Timedelta"]
-PandasScalar: TypeAlias = Union["Period", "Timestamp", "Timedelta", "Interval"]
-Scalar: TypeAlias = PythonScalar | PandasScalar | np.datetime64 | np.timedelta64 | date
+
+# aligned with pandas-stubs - typical scalars found in Series. Explicitly leaves
+# out object
+_IndexIterScalar: TypeAlias = Union[
+    str,
+    bytes,
+    date,
+    datetime,
+    timedelta,
+    np.datetime64,
+    np.timedelta64,
+    bool,
+    int,
+    float,
+    "Timestamp",
+    "Timedelta",
+]
+Scalar: TypeAlias = Union[
+    _IndexIterScalar, "Interval", complex, np.integer, np.floating, np.complexfloating
+]
+
 IntStrT = TypeVar("IntStrT", bound=int | str)
 
 # timestamp and timedelta convertible types
@@ -312,6 +331,9 @@ def closed(self) -> bool:
 CompressionOptions: TypeAlias = (
     Literal["infer", "gzip", "bz2", "zip", "xz", "zstd", "tar"] | CompressionDict | None
 )
+ParquetCompressionOptions: TypeAlias = (
+    Literal["snappy", "gzip", "brotli", "lz4", "zstd"] | None
+)
 
 # types in DataFrameFormatter
 FormattersType: TypeAlias = (
diff --git a/pandas/api/typing/aliases.py b/pandas/api/typing/aliases.py
new file mode 100644
index 0000000000000..c027a20585600
--- /dev/null
+++ b/pandas/api/typing/aliases.py
@@ -0,0 +1,135 @@
+from pandas._typing import (
+    AggFuncType,
+    AlignJoin,
+    AnyAll,
+    AnyArrayLike,
+    ArrayLike,
+    AstypeArg,
+    Axes,
+    Axis,
+    ColspaceArgType,
+    CompressionOptions,
+    CorrelationMethod,
+    CSVEngine,
+    DropKeep,
+    Dtype,
+    DtypeArg,
+    DtypeBackend,
+    DtypeObj,
+    ExcelWriterIfSheetExists,
+    ExcelWriterMergeCells,
+    FilePath,
+    FillnaOptions,
+    FloatFormatType,
+    FormattersType,
+    FromDictOrient,
+    HTMLFlavors,
+    IgnoreRaise,
+    IndexLabel,
+    InterpolateOptions,
+    JoinHow,
+    JoinValidate,
+    JSONEngine,
+    JSONSerializable,
+    MergeHow,
+    MergeValidate,
+    NaPosition,
+    NsmallestNlargestKeep,
+    OpenFileErrors,
+    Ordered,
+    ParquetCompressionOptions,
+    QuantileInterpolation,
+    ReadBuffer,
+    ReadCsvBuffer,
+    ReadPickleBuffer,
+    ReindexMethod,
+    Scalar,
+    SequenceNotStr,
+    SliceType,
+    SortKind,
+    StorageOptions,
+    Suffixes,
+    TakeIndexer,
+    TimeAmbiguous,
+    TimedeltaConvertibleTypes,
+    TimeGrouperOrigin,
+    TimeNonexistent,
+    TimestampConvertibleTypes,
+    TimeUnit,
+    ToStataByteorder,
+    ToTimestampHow,
+    UpdateJoin,
+    UsecolsArgType,
+    WindowingRankType,
+    WriteBuffer,
+    WriteExcelBuffer,
+    XMLParsers,
+)
+
+__all__ = [
+    "AggFuncType",
+    "AlignJoin",
+    "AnyAll",
+    "AnyArrayLike",
+    "ArrayLike",
+    "AstypeArg",
+    "Axes",
+    "Axis",
+    "CSVEngine",
+    "ColspaceArgType",
+    "CompressionOptions",
+    "CorrelationMethod",
+    "DropKeep",
+    "Dtype",
+    "DtypeArg",
+    "DtypeBackend",
+    "DtypeObj",
+    "ExcelWriterIfSheetExists",
+    "ExcelWriterMergeCells",
+    "FilePath",
+    "FillnaOptions",
+    "FloatFormatType",
+    "FormattersType",
+    "FromDictOrient",
+    "HTMLFlavors",
+    "IgnoreRaise",
+    "IndexLabel",
+    "InterpolateOptions",
+    "JSONEngine",
+    "JSONSerializable",
+    "JoinHow",
+    "JoinValidate",
+    "MergeHow",
+    "MergeValidate",
+    "NaPosition",
+    "NsmallestNlargestKeep",
+    "OpenFileErrors",
+    "Ordered",
+    "ParquetCompressionOptions",
+    "QuantileInterpolation",
+    "ReadBuffer",
+    "ReadCsvBuffer",
+    "ReadPickleBuffer",
+    "ReindexMethod",
+    "Scalar",
+    "SequenceNotStr",
+    "SliceType",
+    "SortKind",
+    "StorageOptions",
+    "Suffixes",
+    "TakeIndexer",
+    "TimeAmbiguous",
+    "TimeGrouperOrigin",
+    "TimeNonexistent",
+    "TimeUnit",
+    "TimedeltaConvertibleTypes",
+    "TimestampConvertibleTypes",
+    "ToStataByteorder",
+    "ToTimestampHow",
+    "UpdateJoin",
+    "UsecolsArgType",
+    "WindowingRankType",
+    "WriteBuffer",
+    "WriteExcelBuffer",
+    "XMLParsers",
+]
diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
index a3634e370cfc3..ccc80368b4106 100644
--- a/pandas/core/dtypes/cast.py
+++ b/pandas/core/dtypes/cast.py
@@ -599,7 +599,7 @@ def _maybe_promote(dtype: np.dtype, fill_value=np.nan):
         dtype = np.dtype(np.object_)
 
     elif issubclass(dtype.type, np.integer):
-        if not np_can_cast_scalar(fill_value, dtype):  # type: ignore[arg-type]
+        if not np_can_cast_scalar(fill_value, dtype):
             # upcast to prevent overflow
             mst = np.min_scalar_type(fill_value)
             dtype = np.promote_types(dtype, mst)
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index c1f8be1381b23..eafe1567dd156 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -237,6 +237,7 @@
         MutableMappingT,
         NaPosition,
         NsmallestNlargestKeep,
+        ParquetCompressionOptions,
         PythonFuncType,
         QuantileInterpolation,
         ReadBuffer,
@@ -2859,7 +2860,7 @@ def to_parquet(
         path: None = ...,
         *,
         engine: Literal["auto", "pyarrow", "fastparquet"] = ...,
-        compression: str | None = ...,
+        compression: ParquetCompressionOptions = ...,
         index: bool | None = ...,
         partition_cols: list[str] | None = ...,
         storage_options: StorageOptions = ...,
@@ -2872,7 +2873,7 @@ def to_parquet(
         path: FilePath | WriteBuffer[bytes],
         *,
         engine: Literal["auto", "pyarrow", "fastparquet"] = ...,
-        compression: str | None = ...,
+        compression: ParquetCompressionOptions = ...,
         index: bool | None = ...,
         partition_cols: list[str] | None = ...,
         storage_options: StorageOptions = ...,
@@ -2885,7 +2886,7 @@ def to_parquet(
         path: FilePath | WriteBuffer[bytes] | None = None,
         *,
         engine: Literal["auto", "pyarrow", "fastparquet"] = "auto",
-        compression: str | None = "snappy",
+        compression: ParquetCompressionOptions = "snappy",
         index: bool | None = None,
         partition_cols: list[str] | None = None,
         storage_options: StorageOptions | None = None,
diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py
index 037d255d848ba..1a3f8cc046066 100644
--- a/pandas/io/parquet.py
+++ b/pandas/io/parquet.py
@@ -43,6 +43,7 @@
     from pandas._typing import (
         DtypeBackend,
         FilePath,
+        ParquetCompressionOptions,
         ReadBuffer,
         StorageOptions,
         WriteBuffer,
@@ -175,7 +176,7 @@ def write(
         self,
         df: DataFrame,
         path: FilePath | WriteBuffer[bytes],
-        compression: str | None = "snappy",
+        compression: ParquetCompressionOptions = "snappy",
         index: bool | None = None,
         storage_options: StorageOptions | None = None,
         partition_cols: list[str] | None = None,
@@ -411,7 +412,7 @@ def to_parquet(
     df: DataFrame,
     path: FilePath | WriteBuffer[bytes] | None = None,
     engine: str = "auto",
-    compression: str | None = "snappy",
+    compression: ParquetCompressionOptions = "snappy",
     index: bool | None = None,
     storage_options: StorageOptions | None = None,
    partition_cols: list[str] | None = None,
diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py
index 49c5daee9fc43..ff5e60d700644 100644
--- a/pandas/tests/api/test_api.py
+++ b/pandas/tests/api/test_api.py
@@ -13,6 +13,7 @@
     types as api_types,
     typing as api_typing,
 )
+from pandas.api.typing import aliases as api_aliases
 
 
 class Base:
@@ -277,6 +278,7 @@ class TestApi(Base):
         "TimedeltaIndexResamplerGroupby",
         "TimeGrouper",
         "Window",
+        "aliases",
     ]
     allowed_api_types = [
         "is_any_real_numeric_dtype",
@@ -344,6 +346,73 @@ class TestApi(Base):
         "ExtensionScalarOpsMixin",
     ]
     allowed_api_executors = ["BaseExecutionEngine"]
+    allowed_api_aliases = [
+        "AggFuncType",
+        "AlignJoin",
+        "AnyAll",
+        "AnyArrayLike",
+        "ArrayLike",
+        "AstypeArg",
+        "Axes",
+        "Axis",
+        "CSVEngine",
+        "ColspaceArgType",
+        "CompressionOptions",
+        "CorrelationMethod",
+        "DropKeep",
+        "Dtype",
+        "DtypeArg",
+        "DtypeBackend",
+        "DtypeObj",
+        "ExcelWriterIfSheetExists",
+        "ExcelWriterMergeCells",
+        "FilePath",
+        "FillnaOptions",
+        "FloatFormatType",
+        "FormattersType",
+        "FromDictOrient",
+        "HTMLFlavors",
+        "IgnoreRaise",
+        "IndexLabel",
+        "InterpolateOptions",
+        "JSONEngine",
+        "JSONSerializable",
+        "JoinHow",
+        "JoinValidate",
+        "MergeHow",
+        "MergeValidate",
+        "NaPosition",
+        "NsmallestNlargestKeep",
+        "OpenFileErrors",
+        "Ordered",
+        "ParquetCompressionOptions",
+        "QuantileInterpolation",
+        "ReadBuffer",
+        "ReadCsvBuffer",
+        "ReadPickleBuffer",
+        "ReindexMethod",
+        "Scalar",
+        "SequenceNotStr",
+        "SliceType",
+        "SortKind",
+        "StorageOptions",
+        "Suffixes",
+        "TakeIndexer",
+        "TimeAmbiguous",
+        "TimeGrouperOrigin",
+        "TimeNonexistent",
+        "TimeUnit",
+        "TimedeltaConvertibleTypes",
+        "TimestampConvertibleTypes",
+        "ToStataByteorder",
+        "ToTimestampHow",
+        "UpdateJoin",
+        "UsecolsArgType",
+        "WindowingRankType",
+        "WriteBuffer",
+        "WriteExcelBuffer",
+        "XMLParsers",
+    ]
 
     def test_api(self):
         self.check(api, self.allowed_api_dirs)
@@ -366,6 +435,9 @@ def test_api_extensions(self):
     def test_api_executors(self):
         self.check(api_executors, self.allowed_api_executors)
 
+    def test_api_typing_aliases(self):
+        self.check(api_aliases, self.allowed_api_aliases)
+
 
 class TestErrors(Base):
     def test_errors(self):
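
Illustrative usage note (not part of the patch): with ``ParquetCompressionOptions``
exported from ``pandas.api.typing.aliases``, downstream code can reuse the same
literal type that ``DataFrame.to_parquet`` now accepts. The helper below is a
hypothetical sketch, not pandas API:

    from pandas import DataFrame
    from pandas.api.typing.aliases import ParquetCompressionOptions

    def save_frame(
        df: DataFrame, compression: ParquetCompressionOptions = "zstd"
    ) -> None:
        # the annotation lets a type checker reject unsupported codecs
        df.to_parquet("data.parquet", compression=compression)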