From d49829c9dc449f583b2ce75032aea7a2028fffd0 Mon Sep 17 00:00:00 2001
From: Kauna <16511995+klei22@users.noreply.github.com>
Date: Sat, 18 Oct 2025 17:05:24 -0700
Subject: [PATCH 1/3] Add translation-aware parquet dataset utility

---
 .../utils/get_translation_parquet_dataset.py | 145 ++++++++++++++++++
 1 file changed, 145 insertions(+)
 create mode 100644 data/template/utils/get_translation_parquet_dataset.py

diff --git a/data/template/utils/get_translation_parquet_dataset.py b/data/template/utils/get_translation_parquet_dataset.py
new file mode 100644
index 0000000000..59e86d247a
--- /dev/null
+++ b/data/template/utils/get_translation_parquet_dataset.py
@@ -0,0 +1,145 @@
+"""Utilities for flattening translation columns in Parquet datasets."""
+from __future__ import annotations
+
+import argparse
+import json
+import os
+from typing import Iterable, Sequence, Tuple
+
+from .get_parquet_dataset import convert_to_json, download_file, find_parquet_links
+
+
+def emit_translation_items(
+    json_path: str,
+    output_path: str,
+    language_prefixes: Sequence[Tuple[str, str]],
+) -> None:
+    """Emit flattened translation rows from ``json_path`` into ``output_path``.
+
+    Parameters
+    ----------
+    json_path:
+        Path to the JSON file produced from a Parquet shard.
+    output_path:
+        File where the flattened text should be appended.
+    language_prefixes:
+        Ordered collection of (language, prefix) tuples. Each translation entry
+        writes one line per language using the associated prefix when the
+        translation text is present.
+    """
+    if not language_prefixes:
+        return
+
+    with open(json_path, "r", encoding="utf-8") as handle:
+        records = json.load(handle)
+
+    if not isinstance(records, list):
+        return
+
+    os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
+
+    with open(output_path, "a", encoding="utf-8") as out_handle:
+        for record in records:
+            translation = record.get("translation")
+            if not isinstance(translation, dict):
+                continue
+
+            segments = []
+            for language, prefix in language_prefixes:
+                text = translation.get(language)
+                if not text:
+                    continue
+                segments.append(f"{prefix}{text}")
+
+            if segments:
+                out_handle.write("\n".join(segments) + "\n\n")
+
+
+def download_translation_dataset(
+    url: str,
+    output_text_file: str,
+    language_prefixes: Sequence[Tuple[str, str]],
+    append: bool = False,
+) -> None:
+    """Download, convert, and flatten translation datasets from ``url``.
+
+    The function downloads all Parquet files advertised at ``url`` (typically a
+    Hugging Face dataset folder), converts them to JSON if necessary, and emits
+    flattened text records to ``output_text_file`` using the provided language
+    prefixes.
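+
+    A minimal sketch of the expected call shape (the URL and prefixes here
+    are illustrative, not defaults)::
+
+        download_translation_dataset(
+            "https://huggingface.co/datasets/<org>/<name>/tree/main/data",
+            "input.txt",
+            [("en", "EN: "), ("hi", "HI: ")],
+        )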
+    """
+    parquet_links = find_parquet_links(url)
+    download_dir = "./downloaded_parquets"
+    json_dir = "./json_output"
+    os.makedirs(download_dir, exist_ok=True)
+    os.makedirs(json_dir, exist_ok=True)
+
+    if not append:
+        open(output_text_file, "w", encoding="utf-8").close()
+
+    for link in parquet_links:
+        file_name = link.split("/")[-1].split("?")[0]
+        parquet_path = os.path.join(download_dir, file_name)
+        json_path = os.path.join(json_dir, file_name.replace(".parquet", ".json"))
+
+        if not os.path.exists(parquet_path):
+            download_file(link, parquet_path)
+
+        convert_to_json(parquet_path, json_path)
+        emit_translation_items(json_path, output_text_file, language_prefixes)
+
+
+def parse_language_prefixes(prefix_args: Iterable[Tuple[str, str]]) -> Sequence[Tuple[str, str]]:
+    """Validate and normalize CLI ``--prefix`` arguments."""
+    prefixes: list[Tuple[str, str]] = []
+    for language, prefix in prefix_args:
+        if not language:
+            raise ValueError("Language code for --prefix cannot be empty")
+        prefixes.append((language, prefix))
+    return prefixes
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(
+        description=(
+            "Download Europarl-style translation Parquet files and emit prefixed text."
+        )
+    )
+    parser.add_argument(
+        "--url",
+        required=True,
+        help="Dataset folder URL listing the Parquet shards (e.g. Hugging Face tree view).",
+    )
+    parser.add_argument(
+        "-o",
+        "--output",
+        default="input.txt",
+        help="Where to write the flattened text output.",
+    )
+    parser.add_argument(
+        "--prefix",
+        nargs=2,
+        action="append",
+        metavar=("LANG", "PREFIX"),
+        required=True,
+        help="Language/prefix pairs like --prefix bg 'BG: ' --prefix cs 'CS: '.",
+    )
+    parser.add_argument(
+        "--append",
+        action="store_true",
+        help="Append to the output file instead of overwriting it.",
+    )
+    args = parser.parse_args()
+
+    language_prefixes = parse_language_prefixes(args.prefix)
+    download_translation_dataset(
+        args.url,
+        args.output,
+        language_prefixes,
+        append=args.append,
+    )
+
+
+if __name__ == "__main__":
+    main()

From 2e7a0bf7d66d0a42af45280b8538c5252a1dec5c Mon Sep 17 00:00:00 2001
From: kauna
Date: Sun, 19 Oct 2025 03:36:00 +0000
Subject: [PATCH 2/3] Add new parquet parser for common translation format

---
 data/template/utils/get_translation_parquet_dataset.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/data/template/utils/get_translation_parquet_dataset.py b/data/template/utils/get_translation_parquet_dataset.py
index 59e86d247a..3231ac4917 100644
--- a/data/template/utils/get_translation_parquet_dataset.py
+++ b/data/template/utils/get_translation_parquet_dataset.py
@@ -6,7 +6,7 @@
 import os
 from typing import Iterable, Sequence, Tuple
 
-from .get_parquet_dataset import convert_to_json, download_file, find_parquet_links
+from get_parquet_dataset import convert_to_json, download_file, find_parquet_links
 
 
 def emit_translation_items(

From 27bb3b0555d293073127fd644cdf3dd1e55ff326 Mon Sep 17 00:00:00 2001
From: kauna
Date: Sun, 19 Oct 2025 03:58:35 +0000
Subject: [PATCH 3/3] Add english-hindi dataset

---
 data/iitb-english-hindi/README.md      | 68 ++++++++++++++++++++++++++
 data/iitb-english-hindi/get_dataset.sh |  9 ++++
 data/iitb-english-hindi/prepare.py     |  1 +
 data/iitb-english-hindi/utils          |  1 +
 4 files changed, 79 insertions(+)
 create mode 100644 data/iitb-english-hindi/README.md
 create mode 100644 data/iitb-english-hindi/get_dataset.sh
 create mode 120000 data/iitb-english-hindi/prepare.py
 create mode 120000 data/iitb-english-hindi/utils
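
A minimal usage sketch for this dataset folder (equivalent to the
get_dataset.sh added below, minus the record-separating newline that the
script bakes into its first prefix):

    cd data/iitb-english-hindi
    python utils/get_translation_parquet_dataset.py \
        --url "https://huggingface.co/datasets/cfilt/iitb-english-hindi/tree/main/data" \
        --prefix en 'EN: ' \
        --prefix hi 'HI: ' \
        --output input.txt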
diff --git a/data/iitb-english-hindi/README.md b/data/iitb-english-hindi/README.md
new file mode 100644
index 0000000000..661ddea973
--- /dev/null
+++ b/data/iitb-english-hindi/README.md
@@ -0,0 +1,68 @@
+# IITB English–Hindi Parallel Corpus (cfilt/iitb-english-hindi)
+
+### Dataset Overview
+
+The **IIT Bombay English-Hindi Parallel Corpus** is a large-scale bilingual
+dataset created by the **Center for Indian Language Technology (CFILT)** at IIT
+Bombay. It contains **1.66 million English–Hindi sentence pairs** collected
+from multiple open sources and curated over several years for **machine
+translation and linguistic research**.
+
+| Field                 | Value                                                           |
+| --------------------- | --------------------------------------------------------------- |
+| **Dataset name**      | `cfilt/iitb-english-hindi`                                      |
+| **Languages**         | English (`en`), Hindi (`hi`)                                    |
+| **Modality**          | Text (parallel corpus)                                          |
+| **Format**            | Parquet                                                         |
+| **Size**              | ~190 MB (≈ 1.66 M rows)                                         |
+| **Splits**            | `train`, `validation`, `test`                                   |
+| **License**           | [CC BY-NC 4.0](https://creativecommons.org/licenses/by-nc/4.0/) |
+| **Hugging Face page** | 🔗 [https://huggingface.co/datasets/cfilt/iitb-english-hindi](https://huggingface.co/datasets/cfilt/iitb-english-hindi) |
+| **Official site**     | [http://www.cfilt.iitb.ac.in/iitb_parallel](http://www.cfilt.iitb.ac.in/iitb_parallel) |
+
+---
+
+### 🧠 Example Record
+
+```json
+{
+  "en": "Give your application an accessibility workout",
+  "hi": "अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें"
+}
+```
+
+---
+
+🔗 [IITB-English-Hindi-PC GitHub](https://github.com/cfiltnlp/IITB-English-Hindi-PC)
+
+---
+
+### 🧩 Typical Uses
+
+* English↔Hindi machine translation
+* Bilingual lexicon extraction
+* Cross-lingual representation learning
+* Evaluation of translation quality metrics (BLEU, chrF, etc.)
+
+---
+
+### 🧾 Citation
+
+If you use this dataset, please cite:
+
+> **Anoop Kunchukuttan, Pratik Mehta, Pushpak Bhattacharyya**
+> *The IIT Bombay English–Hindi Parallel Corpus*
+> *Proceedings of the Eleventh International Conference on Language Resources and Evaluation (LREC 2018)*, Miyazaki, Japan.
+
+```bibtex
+@inproceedings{kunchukuttan-etal-2018-iit,
+    title = {The IIT Bombay English-Hindi Parallel Corpus},
+    author = {Kunchukuttan, Anoop and Mehta, Pratik and Bhattacharyya, Pushpak},
+    booktitle = {Proceedings of the Eleventh International Conference on Language Resources and Evaluation (LREC 2018)},
+    year = {2018},
+    address = {Miyazaki, Japan},
+    publisher = {European Language Resources Association (ELRA)},
+    url = {https://aclanthology.org/L18-1548}
+}
+```
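+
+---
+
+### 🛠 Usage Sketch
+
+Running the included `get_dataset.sh` downloads the Parquet shards and
+flattens every record into prefixed lines in `input.txt`. With the prefixes
+set in that script, the example record above should come out roughly as:
+
+```text
+EN: Give your application an accessibility workout
+HI: अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें
+```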
+
diff --git a/data/iitb-english-hindi/get_dataset.sh b/data/iitb-english-hindi/get_dataset.sh
new file mode 100644
index 0000000000..1d1dcf599e
--- /dev/null
+++ b/data/iitb-english-hindi/get_dataset.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+URL="https://huggingface.co/datasets/cfilt/iitb-english-hindi/tree/main/data"
+
+python utils/get_translation_parquet_dataset.py \
+  --url "$URL" \
+  --prefix en $'\nEN: ' \
+  --prefix hi $'HI: ' \
+  --output input.txt
+
diff --git a/data/iitb-english-hindi/prepare.py b/data/iitb-english-hindi/prepare.py
new file mode 120000
index 0000000000..713f6b0012
--- /dev/null
+++ b/data/iitb-english-hindi/prepare.py
@@ -0,0 +1 @@
+../template/prepare.py
\ No newline at end of file
diff --git a/data/iitb-english-hindi/utils b/data/iitb-english-hindi/utils
new file mode 120000
index 0000000000..ea6a0ddd72
--- /dev/null
+++ b/data/iitb-english-hindi/utils
@@ -0,0 +1 @@
+../template/utils
\ No newline at end of file