From d49829c9dc449f583b2ce75032aea7a2028fffd0 Mon Sep 17 00:00:00 2001
From: Kauna <16511995+klei22@users.noreply.github.com>
Date: Sat, 18 Oct 2025 17:05:24 -0700
Subject: [PATCH 1/3] Add translation-aware parquet dataset utility

---
 .../utils/get_translation_parquet_dataset.py | 145 ++++++++++++++++++
 1 file changed, 145 insertions(+)
 create mode 100644 data/template/utils/get_translation_parquet_dataset.py

diff --git a/data/template/utils/get_translation_parquet_dataset.py b/data/template/utils/get_translation_parquet_dataset.py
new file mode 100644
index 0000000000..59e86d247a
--- /dev/null
+++ b/data/template/utils/get_translation_parquet_dataset.py
@@ -0,0 +1,145 @@
+"""Utilities for flattening translation columns in Parquet datasets."""
+from __future__ import annotations
+
+import argparse
+import json
+import os
+from typing import Iterable, Sequence, Tuple
+
+from .get_parquet_dataset import convert_to_json, download_file, find_parquet_links
+
+
+def emit_translation_items(
+    json_path: str,
+    output_path: str,
+    language_prefixes: Sequence[Tuple[str, str]],
+) -> None:
+    """Emit flattened translation rows from ``json_path`` into ``output_path``.
+
+    Parameters
+    ----------
+    json_path:
+        Path to the JSON file produced from a Parquet shard.
+    output_path:
+        File where the flattened text should be appended.
+    language_prefixes:
+        Ordered collection of (language, prefix) tuples. Each translation entry
+        writes one line per language using the associated prefix when the
+        translation text is present.
+    """
+    if not language_prefixes:
+        return
+
+    with open(json_path, "r", encoding="utf-8") as handle:
+        records = json.load(handle)
+
+    if not isinstance(records, list):
+        return
+
+    os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
+
+    with open(output_path, "a", encoding="utf-8") as out_handle:
+        for record in records:
+            translation = record.get("translation")
+            if not isinstance(translation, dict):
+                continue
+
+            segments = []
+            for language, prefix in language_prefixes:
+                text = translation.get(language)
+                if not text:
+                    continue
+                segments.append(f"{prefix}{text}")
+
+            if segments:
+                out_handle.write("\n".join(segments) + "\n\n")
+
+
+def download_translation_dataset(
+    url: str,
+    output_text_file: str,
+    language_prefixes: Sequence[Tuple[str, str]],
+    append: bool = False,
+) -> None:
+    """Download, convert, and flatten translation datasets from ``url``.
+
+    The function downloads all Parquet files advertised at ``url`` (typically a
+    Hugging Face dataset folder), converts them to JSON if necessary, and emits
+    flattened text records to ``output_text_file`` using the provided language
+    prefixes.
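+
+    A minimal sketch of the expected call shape (the URL and prefixes here
+    are illustrative, not defaults)::
+
+        download_translation_dataset(
+            "https://huggingface.co/datasets/<org>/<name>/tree/main/data",
+            "input.txt",
+            [("en", "EN: "), ("hi", "HI: ")],
+        )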
+    """
+    parquet_links = find_parquet_links(url)
+    download_dir = "./downloaded_parquets"
+    json_dir = "./json_output"
+    os.makedirs(download_dir, exist_ok=True)
+    os.makedirs(json_dir, exist_ok=True)
+
+    if not append:
+        open(output_text_file, "w", encoding="utf-8").close()
+
+    for link in parquet_links:
+        file_name = link.split("/")[-1].split("?")[0]
+        parquet_path = os.path.join(download_dir, file_name)
+        json_path = os.path.join(json_dir, file_name.replace(".parquet", ".json"))
+
+        if not os.path.exists(parquet_path):
+            download_file(link, parquet_path)
+
+        convert_to_json(parquet_path, json_path)
+        emit_translation_items(json_path, output_text_file, language_prefixes)
+
+
+def parse_language_prefixes(prefix_args: Iterable[Tuple[str, str]]) -> Sequence[Tuple[str, str]]:
+    """Validate and normalize CLI ``--prefix`` arguments."""
+    prefixes: list[Tuple[str, str]] = []
+    for language, prefix in prefix_args:
+        if not language:
+            raise ValueError("Language code for --prefix cannot be empty")
+        prefixes.append((language, prefix))
+    return prefixes
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(
+        description=(
+            "Download Europarl-style translation Parquet files and emit prefixed text."
+        )
+    )
+    parser.add_argument(
+        "--url",
+        required=True,
+        help="Dataset folder URL listing the Parquet shards (e.g. Hugging Face tree view).",
+    )
+    parser.add_argument(
+        "-o",
+        "--output",
+        default="input.txt",
+        help="Where to write the flattened text output.",
+    )
+    parser.add_argument(
+        "--prefix",
+        nargs=2,
+        action="append",
+        metavar=("LANG", "PREFIX"),
+        required=True,
+        help="Language/prefix pairs like --prefix bg 'BG: ' --prefix cs 'CS: '.",
+    )
+    parser.add_argument(
+        "--append",
+        action="store_true",
+        help="Append to the output file instead of overwriting it.",
+    )
+    args = parser.parse_args()
+
+    language_prefixes = parse_language_prefixes(args.prefix)
+    download_translation_dataset(
+        args.url,
+        args.output,
+        language_prefixes,
+        append=args.append,
+    )
+
+
+if __name__ == "__main__":
+    main()

From 2e7a0bf7d66d0a42af45280b8538c5252a1dec5c Mon Sep 17 00:00:00 2001
From: kauna
Date: Sun, 19 Oct 2025 03:36:00 +0000
Subject: [PATCH 2/3] Add new parquet parser for common translation format

---
 data/template/utils/get_translation_parquet_dataset.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/data/template/utils/get_translation_parquet_dataset.py b/data/template/utils/get_translation_parquet_dataset.py
index 59e86d247a..3231ac4917 100644
--- a/data/template/utils/get_translation_parquet_dataset.py
+++ b/data/template/utils/get_translation_parquet_dataset.py
@@ -6,7 +6,7 @@
 import os
 from typing import Iterable, Sequence, Tuple
 
-from .get_parquet_dataset import convert_to_json, download_file, find_parquet_links
+from get_parquet_dataset import convert_to_json, download_file, find_parquet_links
 
 
 def emit_translation_items(

From 27bb3b0555d293073127fd644cdf3dd1e55ff326 Mon Sep 17 00:00:00 2001
From: kauna
Date: Sun, 19 Oct 2025 03:58:35 +0000
Subject: [PATCH 3/3] Add english-hindi dataset

---
 data/iitb-english-hindi/README.md      | 68 ++++++++++++++++++++++++++
 data/iitb-english-hindi/get_dataset.sh |  9 ++++
 data/iitb-english-hindi/prepare.py     |  1 +
 data/iitb-english-hindi/utils          |  1 +
 4 files changed, 79 insertions(+)
 create mode 100644 data/iitb-english-hindi/README.md
 create mode 100644 data/iitb-english-hindi/get_dataset.sh
 create mode 120000 data/iitb-english-hindi/prepare.py
 create mode 120000 data/iitb-english-hindi/utils
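
A minimal usage sketch for this dataset folder (equivalent to the
get_dataset.sh added below, minus the record-separating newline that the
script bakes into its first prefix):

    cd data/iitb-english-hindi
    python utils/get_translation_parquet_dataset.py \
        --url "https://huggingface.co/datasets/cfilt/iitb-english-hindi/tree/main/data" \
        --prefix en 'EN: ' \
        --prefix hi 'HI: ' \
        --output input.txt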
diff --git a/data/iitb-english-hindi/README.md b/data/iitb-english-hindi/README.md
new file mode 100644
index 0000000000..661ddea973
--- /dev/null
+++ b/data/iitb-english-hindi/README.md
@@ -0,0 +1,68 @@
+# IITB English–Hindi Parallel Corpus (cfilt/iitb-english-hindi)
+
+### Dataset Overview
+
+The **IIT Bombay English-Hindi Parallel Corpus** is a large-scale bilingual
+dataset created by the **Center for Indian Language Technology (CFILT)** at IIT
+Bombay. It contains **1.66 million English–Hindi sentence pairs** collected
+from multiple open sources and curated over several years for **machine
+translation and linguistic research**.
+
+| Field                 | Value                                                           |
+| --------------------- | --------------------------------------------------------------- |
+| **Dataset name**      | `cfilt/iitb-english-hindi`                                      |
+| **Languages**         | English (`en`), Hindi (`hi`)                                    |
+| **Modality**          | Text (parallel corpus)                                          |
+| **Format**            | Parquet                                                         |
+| **Size**              | ~190 MB (≈ 1.66 M rows)                                         |
+| **Splits**            | `train`, `validation`, `test`                                   |
+| **License**           | [CC BY-NC 4.0](https://creativecommons.org/licenses/by-nc/4.0/) |
+| **Hugging Face page** | 🔗 [https://huggingface.co/datasets/cfilt/iitb-english-hindi](https://huggingface.co/datasets/cfilt/iitb-english-hindi) |
+| **Official site**     | [http://www.cfilt.iitb.ac.in/iitb_parallel](http://www.cfilt.iitb.ac.in/iitb_parallel) |
+
+---
+
+### 🧠 Example Record
+
+```json
+{
+  "en": "Give your application an accessibility workout",
+  "hi": "अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें"
+}
+```
+
+---
+
+🔗 [IITB-English-Hindi-PC GitHub](https://github.com/cfiltnlp/IITB-English-Hindi-PC)
+
+---
+
+### 🧩 Typical Uses
+
+* English↔Hindi machine translation
+* Bilingual lexicon extraction
+* Cross-lingual representation learning
+* Evaluation of translation quality metrics (BLEU, chrF, etc.)
+
+---
+
+### 🧾 Citation
+
+If you use this dataset, please cite:
+
+> **Anoop Kunchukuttan, Pratik Mehta, Pushpak Bhattacharyya**
+> *The IIT Bombay English–Hindi Parallel Corpus*
+> *Proceedings of the Eleventh International Conference on Language Resources and Evaluation (LREC 2018)*, Miyazaki, Japan.
+
+```bibtex
+@inproceedings{kunchukuttan-etal-2018-iit,
+    title = {The IIT Bombay English-Hindi Parallel Corpus},
+    author = {Kunchukuttan, Anoop and Mehta, Pratik and Bhattacharyya, Pushpak},
+    booktitle = {Proceedings of the Eleventh International Conference on Language Resources and Evaluation (LREC 2018)},
+    year = {2018},
+    address = {Miyazaki, Japan},
+    publisher = {European Language Resources Association (ELRA)},
+    url = {https://aclanthology.org/L18-1548}
+}
+```
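+
+---
+
+### 🛠 Usage Sketch
+
+Running the included `get_dataset.sh` downloads the Parquet shards and
+flattens every record into prefixed lines in `input.txt`. With the prefixes
+set in that script, the example record above should come out roughly as:
+
+```text
+EN: Give your application an accessibility workout
+HI: अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें
+```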
+
diff --git a/data/iitb-english-hindi/get_dataset.sh b/data/iitb-english-hindi/get_dataset.sh
new file mode 100644
index 0000000000..1d1dcf599e
--- /dev/null
+++ b/data/iitb-english-hindi/get_dataset.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+URL="https://huggingface.co/datasets/cfilt/iitb-english-hindi/tree/main/data"
+
+python utils/get_translation_parquet_dataset.py \
+  --url "$URL" \
+  --prefix en $'\nEN: ' \
+  --prefix hi $'HI: ' \
+  --output input.txt
+
diff --git a/data/iitb-english-hindi/prepare.py b/data/iitb-english-hindi/prepare.py
new file mode 120000
index 0000000000..713f6b0012
--- /dev/null
+++ b/data/iitb-english-hindi/prepare.py
@@ -0,0 +1 @@
+../template/prepare.py
\ No newline at end of file
diff --git a/data/iitb-english-hindi/utils b/data/iitb-english-hindi/utils
new file mode 120000
index 0000000000..ea6a0ddd72
--- /dev/null
+++ b/data/iitb-english-hindi/utils
@@ -0,0 +1 @@
+../template/utils
\ No newline at end of file