From b9b5317d78b96667eb86622f6de83bce20a799fe Mon Sep 17 00:00:00 2001
From: Howard Yen
Date: Tue, 1 Apr 2025 13:25:29 -0400
Subject: [PATCH 1/4] support for loading tsv and csv

---
 datatools/load.py | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/datatools/load.py b/datatools/load.py
index a3ec02f..754d1be 100644
--- a/datatools/load.py
+++ b/datatools/load.py
@@ -26,6 +26,18 @@ def load_from_hub(path: str):
     return load_dataset(path, name=(name[0] if name else None), split=(split[0] if split else None))
 
 
+def load_csv(path: Union[Path, str], tsv: bool = False):
+    from datasets import load_dataset
+    if tsv:
+        return load_dataset("csv", data_files=path, delimiter="\t")['train']
+    else:
+        return load_dataset("csv", data_files=path)['train']
+
+
+def load_tsv(path: Union[Path, str]):
+    return load_csv(path, tsv=True)
+
+
 def load_hf_dataset(path: Union[Path, str], input_type: str):
     from datasets import load_from_disk, Dataset
     path = str(path)
@@ -35,6 +47,8 @@ def load_hf_dataset(path: Union[Path, str], input_type: str):
         "arrow": Dataset.from_file,
         "parquet": Dataset.from_parquet,
         "hub": load_from_hub,
+        "csv": load_csv,
+        "tsv": load_tsv,
     }[input_type](path)
 
 
@@ -54,7 +68,7 @@ def load(*input_paths: List[Union[Path, str]], options: Optional[LoadOptions] =
     # Best guess from file extension
     # Iterate over suffixes in reverse order to handle cases like .jsonl.zst
     for suffix in path.suffixes[::-1]:
-        if suffix in [".arrow", ".parquet", ".npy", ".jsonl"]:
+        if suffix in [".arrow", ".parquet", ".npy", ".jsonl", ".tsv", ".csv"]:
             input_type = suffix[1:]
             break
 
@@ -64,7 +78,7 @@ def load(*input_paths: List[Union[Path, str]], options: Optional[LoadOptions] =
         return JsonlDataset(input_paths)
     elif input_type == "npy":
         return np.concatenate([np.load(path) for path in input_paths])
-    elif input_type in {"hf", "arrow", "parquet", "hub"}:
+    elif input_type in {"hf", "arrow", "parquet", "hub", "csv", "tsv"}:
         from datasets import concatenate_datasets
         return concatenate_datasets([load_hf_dataset(path, input_type) for path in input_paths])
     else:

From 6b88c068ce695e7898b44d2f8f2c71c4a6bd3464 Mon Sep 17 00:00:00 2001
From: Howard Yen
Date: Tue, 1 Apr 2025 16:16:33 -0400
Subject: [PATCH 2/4] update tsv

---
 datatools/load.py | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/datatools/load.py b/datatools/load.py
index 754d1be..8d428c2 100644
--- a/datatools/load.py
+++ b/datatools/load.py
@@ -26,18 +26,14 @@ def load_from_hub(path: str):
     return load_dataset(path, name=(name[0] if name else None), split=(split[0] if split else None))
 
 
-def load_csv(path: Union[Path, str], tsv: bool = False):
+def load_csv(path: Union[Path, str]):
     from datasets import load_dataset
-    if tsv:
+    if "tsv" in path:
         return load_dataset("csv", data_files=path, delimiter="\t")['train']
     else:
         return load_dataset("csv", data_files=path)['train']
 
 
-def load_tsv(path: Union[Path, str]):
-    return load_csv(path, tsv=True)
-
-
 def load_hf_dataset(path: Union[Path, str], input_type: str):
     from datasets import load_from_disk, Dataset
     path = str(path)
@@ -48,7 +44,7 @@ def load_hf_dataset(path: Union[Path, str], input_type: str):
         "parquet": Dataset.from_parquet,
         "hub": load_from_hub,
         "csv": load_csv,
-        "tsv": load_tsv,
+        "tsv": load_csv,
     }[input_type](path)
 
 

From c3ae514051e27f7c78d41b11820b81fd319c09d1 Mon Sep 17 00:00:00 2001
From: Howard Yen
Date: Wed, 2 Apr 2025 09:18:12 -0400
Subject: [PATCH 3/4] add documentation in readme

---
 README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/README.md b/README.md
index cc981d2..5694cc6 100644
--- a/README.md
+++ b/README.md
@@ -15,6 +15,8 @@ Clone this repo and install via `pip install -e .` or install from pypi via `pip
 
 ###### `load(path, load_options)`
 Loads the dataset at the path _**tries to infer what format it is in**_ (e.g., compressed json, pyarrow, MDS, ...) based on clues from the file format and directory structure
 
+To load datasets hosted on the Hugging Face Hub, you can use the `hub` input type and specify the dataset as `path/to/dataset>name#split`. For example, `load("tatsu-lab/alpaca_eval>alpaca_eval#eval")` is equivalent to `datasets.load_dataset("tatsu-lab/alpaca_eval", "alpaca_eval", split="eval")`.
+
 ###### `process(input_dataset, process_fn, output_path, process_options)`
 Processes an input dataset and writes the results to disk. It supports:

From 5d9b303dea71e8bec9a821fec3a1e14f1f13b2d7 Mon Sep 17 00:00:00 2001
From: Howard Yen
Date: Wed, 2 Apr 2025 09:23:08 -0400
Subject: [PATCH 4/4] fix csv

---
 datatools/load.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/datatools/load.py b/datatools/load.py
index 8d428c2..1245d88 100644
--- a/datatools/load.py
+++ b/datatools/load.py
@@ -28,6 +28,10 @@ def load_from_hub(path: str):
 def load_csv(path: Union[Path, str]):
     from datasets import load_dataset
+    if isinstance(path, Path):
+        # HY: load_dataset expects a string path
+        path = str(path)
+
     if "tsv" in path:
         return load_dataset("csv", data_files=path, delimiter="\t")['train']
     else:
         return load_dataset("csv", data_files=path)['train']
@@ -67,6 +71,9 @@ def load(*input_paths: List[Union[Path, str]], options: Optional[LoadOptions] =
         if suffix in [".arrow", ".parquet", ".npy", ".jsonl", ".tsv", ".csv"]:
             input_type = suffix[1:]
             break
+        elif not path.exists():
+            # HY: if the path does not exist (not a file or directory), we assume it should be loaded from hub
+            input_type = "hub"
 
     if input_type == "mosaic":
         return LocalDatasets(input_paths)
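
A minimal usage sketch of the CSV/TSV support added in this series (not part of the patches; the file names are hypothetical, and it assumes the series is applied and the `datasets` library is installed):

```python
from datatools.load import load

# The input type is inferred from the file extension: .csv files are read
# with the default comma delimiter, .tsv files with delimiter="\t".
csv_data = load("data/examples.csv")  # hypothetical local file
tsv_data = load("data/examples.tsv")  # hypothetical local file

# The result behaves like a Hugging Face Dataset.
print(len(csv_data), csv_data.column_names)
```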