diff --git a/cpp/submodules/parquet-testing b/cpp/submodules/parquet-testing index c7cf1374cf2..4cb3cff24c9 160000 --- a/cpp/submodules/parquet-testing +++ b/cpp/submodules/parquet-testing @@ -1 +1 @@ -Subproject commit c7cf1374cf284c0c73024cd1437becea75558bf8 +Subproject commit 4cb3cff24c965fb329cdae763eabce47395a68a0 diff --git a/docs/source/python/api.rst b/docs/source/python/api.rst index c0e8af0c661..302f7715fd1 100644 --- a/docs/source/python/api.rst +++ b/docs/source/python/api.rst @@ -39,3 +39,15 @@ API Reference api/dataset api/cuda api/misc + api/tensors + +************* +Tensors +************* + +.. _toc.tensors: + +.. toctree:: + :maxdepth: 2 + + \ No newline at end of file diff --git a/docs/source/python/api/tables.rst b/docs/source/python/api/tables.rst index 48cc67eb667..9ed8a717a7a 100644 --- a/docs/source/python/api/tables.rst +++ b/docs/source/python/api/tables.rst @@ -55,12 +55,7 @@ Dataframe Interchange Protocol interchange.from_dataframe -.. _api.tensor: +See Also +-------- -Tensors -------- - -.. autosummary:: - :toctree: ../generated/ - - Tensor +For information about tensors, refer to :doc:`tensors` diff --git a/docs/source/python/api/tensors.rst b/docs/source/python/api/tensors.rst new file mode 100644 index 00000000000..9c788c90767 --- /dev/null +++ b/docs/source/python/api/tensors.rst @@ -0,0 +1,74 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. 
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. currentmodule:: pyarrow + +.. _api.tensor: + +Tensors +======= + +PyArrow supports both dense and sparse tensors. Dense tensors store all data values explicitly, while sparse tensors store only the non-zero elements and their locations, which makes them efficient to store and compute with when most elements are zero. + +Dense Tensors +------------- + +.. autosummary:: + :toctree: ../generated/ + + Tensor + +Sparse Tensors +-------------- + +PyArrow supports the following sparse tensor formats: + +.. autosummary:: + :toctree: ../generated/ + + SparseCOOTensor + SparseCSRMatrix + SparseCSCMatrix + SparseCSFTensor + +SparseCOOTensor +^^^^^^^^^^^^^^^ + +The ``SparseCOOTensor`` represents a sparse tensor in Coordinate (COO) format, where each non-zero element is stored together with its coordinates (one index per dimension). + +For detailed examples, see :ref:`data/SparseCOOTensor`. + +SparseCSRMatrix +^^^^^^^^^^^^^^^ + +The ``SparseCSRMatrix`` represents a sparse matrix in Compressed Sparse Row (CSR) format. This format is useful for matrix-vector multiplication. + +For detailed examples, see :ref:`data/SparseCSRMatrix`. + +SparseCSCMatrix +^^^^^^^^^^^^^^^ + +The ``SparseCSCMatrix`` represents a sparse matrix in Compressed Sparse Column (CSC) format, where data is stored by columns. + +For detailed examples, see :ref:`data/SparseCSCMatrix`. + +SparseCSFTensor +^^^^^^^^^^^^^^^ + +The ``SparseCSFTensor`` represents a sparse tensor in Compressed Sparse Fiber (CSF) format, which is a generalization of the CSR format for higher dimensions. + +For detailed examples, see :ref:`data/SparseCSFTensor`. 
\ No newline at end of file diff --git a/docs/source/python/data.rst b/docs/source/python/data.rst index 4a0f2af6d48..db2c1860299 100644 --- a/docs/source/python/data.rst +++ b/docs/source/python/data.rst @@ -561,6 +561,135 @@ schema without having to get any of the batches.:: It can also be sent between languages using the :ref:`C stream interface `. +Sparse Tensor Classes +===================== + +SparseCOOTensor +--------------- + +The ``SparseCOOTensor`` represents a sparse tensor in Coordinate (COO) format, where non-zero elements are stored as tuples of row and column indices. + +Example Usage: +^^^^^^^^^^^^^^ + +.. code-block:: python + + >>> import pyarrow as pa + >>> indices = [ + ... pa.array([0, 1]), + ... pa.array([1, 0]) + ... ] + >>> data = pa.array([1, 2]) + >>> shape = (2, 3) + + >>> tensor = pa.SparseCOOTensor.from_numpy(indices, data, shape) + >>> print(tensor) + + + +SparseCSRMatrix +--------------- + +``SparseCSRMatrix`` represents a sparse matrix in Compressed Sparse Row (CSR) format, where non-zero elements are stored in a compressed manner using arrays for data, indices, and indptr. + +Example Usage: +^^^^^^^^^^^^^^ + +.. code-block:: python + + >>> import pyarrow as pa + >>> data = pa.array([1, 2, 3]) + >>> indptr = pa.array([0, 2, 3]) + >>> indices = pa.array([0, 2, 1]) + >>> shape = (2, 3) + >>> sparse_matrix = pa.SparseCSRMatrix.from_numpy(data, indptr, indices, shape) + >>> print(sparse_matrix) + + + +SparseCSCMatrix +--------------- + +``SparseCSCMatrix`` represents a sparse matrix in Compressed Sparse Column (CSC) format, where non-zero elements are stored in a compressed manner using arrays for data, indices, and indptr. + +Example Usage: +^^^^^^^^^^^^^^ + +.. 
code-block:: python + + >>> import pyarrow as pa + >>> data = pa.array([4, 5, 6]) + >>> indptr = pa.array([0, 1, 3]) + >>> indices = pa.array([0, 2, 1]) + >>> shape = (3, 2) + + >>> sparse_matrix = pa.SparseCSCMatrix.from_numpy(data, indptr, indices, shape) + >>> print(sparse_matrix) + + + +SparseCSFTensor +--------------- + +``SparseCSFTensor`` represents a sparse tensor in Compressed Sparse Fiber (CSF) format, optimized for multi-dimensional sparse data storage. + +Example Usage: +^^^^^^^^^^^^^^ + +.. code-block:: python + + >>> import pyarrow as pa + >>> data = pa.array([1, 2, 3]) + >>> indices = [ + ... pa.array([0, 0, 1]), + ... pa.array([0, 1, 2]), + ... ] + >>> shape = (2, 3) + + >>> sparse_tensor = pa.SparseCSFTensor.from_numpy(data, indices, shape) + >>> print(sparse_tensor) + + + +Conversion of RecordBatch to Tensor +----------------------------------- + +Each array of the ``RecordBatch`` has its own contiguous memory that is not necessarily +adjacent to other arrays. A different memory structure that is used in machine learning +libraries is a two-dimensional array (also called a 2-dim tensor or a matrix) which takes +only one contiguous block of memory. + +For this reason, there is a function ``pyarrow.RecordBatch.to_tensor()`` available +to efficiently convert tabular columnar data into a tensor. + +Data types supported in this conversion are unsigned, signed integer, and float +types. Currently, only column-major conversion is supported. + +Example Usage: +^^^^^^^^^^^^^^ + +.. code-block:: python + + >>> import pyarrow as pa + >>> arr1 = [1, 2, 3, 4, 5] + >>> arr2 = [10, 20, 30, 40, 50] + >>> batch = pa.RecordBatch.from_arrays( + ... [ + ... pa.array(arr1, type=pa.uint16()), + ... pa.array(arr2, type=pa.int16()), + ... ], ["a", "b"] + ... 
) + >>> batch.to_tensor() + + type: int32 + shape: (5, 2) + strides: (4, 20) + >>> batch.to_tensor().to_numpy() + array([[ 1, 10], + [ 2, 20], + [ 3, 30], + [ 4, 40], + [ 5, 50]], dtype=int32) Conversion of RecordBatch to Tensor ----------------------------------- diff --git a/filtered_rat.txt b/filtered_rat.txt new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/benchmarks/parquet.py b/python/benchmarks/parquet.py index 1ad26027527..6b54e1c48e5 100644 --- a/python/benchmarks/parquet.py +++ b/python/benchmarks/parquet.py @@ -18,6 +18,7 @@ import numpy as np import pyarrow as pa + try: import pyarrow.parquet as pq except ImportError: @@ -34,7 +35,7 @@ def setup(self): num_cols = 10 unique_values = np.array([rands(value_size) for - i in range(nuniques)], dtype='O') + _ in range(nuniques)], dtype='O') values = unique_values[np.random.randint(0, nuniques, size=length)] self.table = pa.table([pa.array(values) for i in range(num_cols)], names=['f{}'.format(i) for i in range(num_cols)]) @@ -58,7 +59,7 @@ def time_convert_pandas_and_write_binary_table(self): def generate_dict_strings(string_size, nunique, length, random_order=True): - uniques = np.array([rands(string_size) for i in range(nunique)], dtype='O') + uniques = np.array([rands(string_size) for _ in range(nunique)], dtype='O') if random_order: indices = np.random.randint(0, nunique, size=length).astype('i4') else: @@ -71,7 +72,7 @@ def generate_dict_table(num_cols, string_size, nunique, length, data = generate_dict_strings(string_size, nunique, length, random_order=random_order) return pa.table([ - data for i in range(num_cols) + data for _ in range(num_cols) ], names=['f{}'.format(i) for i in range(num_cols)]) diff --git a/python/pyarrow/tensor.pxi b/python/pyarrow/tensor.pxi index 3e0c63c18fc..0a5db26f265 100644 --- a/python/pyarrow/tensor.pxi +++ b/python/pyarrow/tensor.pxi @@ -610,7 +610,20 @@ shape: {0.shape}""".format(self) cdef class SparseCSRMatrix(_Weakrefable): """ - A sparse CSR 
matrix. + SparseCSRMatrix represents a sparse matrix in Compressed Sparse Row (CSR) format. + + Example: + >>> import pyarrow as pa + >>> import numpy as np + >>> data = np.array([1, 2, 3]) + >>> indptr = np.array([0, 2, 3]) + >>> indices = np.array([0, 2, 1]) + >>> shape = (2, 3) + >>> tensor = pa.SparseCSRMatrix.from_numpy(data, indptr, indices, shape) + >>> print(tensor) + + type: int64 + shape: (2, 3) """ def __init__(self): diff --git a/rat.txt b/rat.txt new file mode 100644 index 00000000000..e69de29bb2d diff --git a/testing b/testing index 4d209492d51..ad82a736c17 160000 --- a/testing +++ b/testing @@ -1 +1 @@ -Subproject commit 4d209492d514c2d3cb2d392681b9aa00e6d8da1c +Subproject commit ad82a736c170e97b7c8c035ebd8a801c17eec170