
Commit 1825a4a

chore: add num_proc argument to Dataset.to_sql
1 parent cf647ab


src/datasets/arrow_dataset.py

Lines changed: 7 additions & 1 deletion
```diff
@@ -5299,6 +5299,7 @@ def to_sql(
         name: str,
         con: Union[str, "sqlalchemy.engine.Connection", "sqlalchemy.engine.Engine", "sqlite3.Connection"],
         batch_size: Optional[int] = None,
+        num_proc: Optional[int] = None,
         **sql_writer_kwargs,
     ) -> int:
         """Exports the dataset to a SQL database.
@@ -5311,6 +5312,11 @@ def to_sql(
             batch_size (`int`, *optional*):
                 Size of the batch to load in memory and write at once.
                 Defaults to `datasets.config.DEFAULT_MAX_BATCH_SIZE`.
+            num_proc (`int`, *optional*):
+                Number of processes for multiprocessing. By default, it doesn't
+                use multiprocessing. `batch_size` in this case defaults to
+                `datasets.config.DEFAULT_MAX_BATCH_SIZE` but feel free to make it 5x or 10x of the default
+                value if you have sufficient compute power.
             **sql_writer_kwargs (additional keyword arguments):
                 Parameters to pass to pandas's [`pandas.DataFrame.to_sql`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_sql.html).

@@ -5341,7 +5347,7 @@ def to_sql(
         # Dynamic import to avoid circular dependency
         from .io.sql import SqlDatasetWriter

-        return SqlDatasetWriter(self, name, con, batch_size=batch_size, **sql_writer_kwargs).write()
+        return SqlDatasetWriter(self, name, con, batch_size=batch_size, num_proc=num_proc, **sql_writer_kwargs).write()

     def _estimate_nbytes(self) -> int:
         dataset_nbytes = self.data.nbytes
```
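For context, here is what the new argument looks like at the call site. This is a minimal usage sketch, not part of the commit: the dataset contents, the table name `users`, and the SQLite URI are illustrative, and the string form of `con` assumes SQLAlchemy is installed.

```python
from datasets import Dataset

# Toy data; the table name and database URI are hypothetical.
ds = Dataset.from_dict({"id": [1, 2, 3], "text": ["a", "b", "c"]})

# Previous behavior (still the default): single-process write.
ds.to_sql("users", "sqlite:///example.db", if_exists="replace")

# New in this commit: distribute batches across 4 worker processes.
# Per the docstring, raising batch_size to 5-10x the default can help
# when compute is plentiful.
ds.to_sql("users", "sqlite:///example.db", if_exists="replace", num_proc=4, batch_size=10_000)
```

`if_exists="replace"` is one of the `sql_writer_kwargs` forwarded to `pandas.DataFrame.to_sql`, so the second call overwrites the table instead of erroring because it already exists.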

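Note that the commit only threads `num_proc` through to `SqlDatasetWriter`; the writer's internals are not part of this diff. As a rough mental model only (a hypothetical sketch, not the actual library code), a batch-parallel SQL writer could look like the following, with each worker opening its own connection because database handles generally cannot be shared across processes:

```python
import sqlite3
from multiprocessing import Pool

DB_PATH = "example.db"   # hypothetical path
BATCH_SIZE = 100         # stands in for datasets' batch_size

def write_batch(batch):
    # Fresh connection per task: connections don't survive pickling.
    # SQLite serializes writers internally, so the parallel gain is mostly
    # in client-side batch preparation; the timeout reduces the chance of
    # "database is locked" errors under concurrent writes.
    with sqlite3.connect(DB_PATH, timeout=30) as con:
        con.executemany("INSERT INTO users VALUES (?, ?)", batch)

if __name__ == "__main__":
    rows = [(i, f"row-{i}") for i in range(1_000)]  # toy data
    with sqlite3.connect(DB_PATH) as con:
        con.execute("CREATE TABLE IF NOT EXISTS users (id INTEGER, text TEXT)")
    batches = [rows[i : i + BATCH_SIZE] for i in range(0, len(rows), BATCH_SIZE)]
    with Pool(processes=4) as pool:  # 4 plays the role of num_proc
        pool.map(write_batch, batches)
```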