Skip to content
This repository was archived by the owner on Jul 23, 2024. It is now read-only.
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions src/genbench/tasks/bias_amplified_splits/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
from genbench import TaskDict


class BiasAmplifiedSplits(TaskDict):
    """Parent task grouping the bias-amplified-splits subtasks.

    This is only a TaskDict container: each subtask (e.g. mnli, wanli, qqp)
    carries its own config and task implementation.
    """

    pass
25 changes: 25 additions & 0 deletions src/genbench/tasks/bias_amplified_splits/config.jsonnet
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
// GenBench config for the Bias-amplified Splits task family.
// The family is a TaskDict; the actual resplitting is defined per subtask.
{
name: 'Bias-amplified Splits',

description: 'In Bias-amplified Splits, we argue that to promote models robust to subtle biases, we should in fact *amplify* dataset biases during evaluation, instead of attempting to balance and remove them. We take existing datasets and extract a bias-amplified training set and an anti-biased test set from the original splits. For resplitting, we use a novel clustering-based approach to detect anti-biased minority examples.',

// Tags used to index the task in the GenBench collection.
keywords: [
'non-i.i.d. generalization',
'dataset biases',
'minority examples',
'robustness'
],

authors: [
'Yuval Reif',
'Roy Schwartz'
],

// Order in which the per-dataset subtasks are listed/run.
subtasks_order: [
'mnli',
'mnli_prompt_based',
'wanli',
'wanli_prompt_based',
'qqp',
],
}
37 changes: 37 additions & 0 deletions src/genbench/tasks/bias_amplified_splits/doc.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
## Motivation
NLP models often rely on superficial cues known as *dataset biases* to achieve impressive performance, and can fail on examples where these biases do not hold.
Recent work sought to develop robust, unbiased models by filtering *biased* examples from training sets. We argue that such filtering can obscure the true capabilities of models to overcome biases, which might never be removed in full from the dataset.

We propose that in order to drive the development of models robust to subtle biases, dataset biases should in fact be **amplified** in the training set.
We introduce **bias-amplified splits**, an evaluation framework defined by a *bias-amplified* training set and an *anti-biased* test set, both automatically extracted from existing datasets.
To extract *biased* and *anti-biased* examples, we use a novel clustering-based approach for detecting minority examples in the dataset.

## Examples
Bias-amplified splits can be created for various tasks. Each of the sub-tasks is for a different dataset -- see examples in each sub-task's documentation.

## Data Source
We extract bias-amplified splits from existing datasets. All datasets can be obtained via the Hugging Face dataset hub. See references and license details in each of the subtasks.

## Limitations and Bias
TODO: *Note any known limitations or biases that the Bias-amplified Splits has, with links and references if possible.*

## Citation
```
@inproceedings{reif-schwartz-2023-fighting,
title = "Fighting Bias With Bias: Promoting Model Robustness by Amplifying Dataset Biases",
author = "Reif, Yuval and
Schwartz, Roy",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2023",
month = jul,
year = "2023",
address = "Toronto, Canada",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.findings-acl.833",
pages = "13169--13189",
abstract = "NLP models often rely on superficial cues known as dataset biases to achieve impressive performance, and can fail on examples where these biases do not hold. Recent work sought to develop robust, unbiased models by filtering biased examples from training sets. In this work, we argue that such filtering can obscure the true capabilities of models to overcome biases, which might never be removed in full from the dataset. We suggest that in order to drive the development of models robust to subtle biases, dataset biases should be amplified in the training set. We introduce an evaluation framework defined by a bias-amplified training set and an anti-biased test set, both automatically extracted from existing datasets. Experiments across three notions of bias, four datasets and two models show that our framework is substantially more challenging for models than the original data splits, and even more challenging than hand-crafted challenge sets. Our evaluation framework can use any existing dataset, even those considered obsolete, to test model robustness. We hope our work will guide the development of robust models that do not rely on superficial biases and correlations. To this end, we publicly release our code and data.",
}
```

## Further References
TODO: *Add any useful further references.*

Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Empty file.
53 changes: 53 additions & 0 deletions src/genbench/tasks/bias_amplified_splits/mnli/config.jsonnet
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
// GenBench config for the MultiNLI (MNLI) bias-amplified splits subtask.
{
name: 'Bias-amplified Splits (MultiNLI)',

description: 'We take MultiNLI (MNLI) and extract a bias-amplified training set and an anti-biased test set for it from the original splits. For resplitting, we use a novel clustering-based approach to detect anti-biased minority examples.',

keywords: [
'non-i.i.d. generalization',
'dataset biases',
'minority examples',
'robustness'
],

authors: [
'Yuval Reif',
'Roy Schwartz'
],

// Source data: the `mnli` subset of GLUE on the Hugging Face hub,
// pinned to a specific dataset-repo commit for reproducibility.
data_source: {
type: 'hf',
hf_id: ['glue', 'mnli'],
git_commit_sha: 'fd8e86499fa5c264fcaad392a8f49ddf58bf4037',
},

has_validation_set: true,
has_train_set: true,

task_type: 'multiple_choice',

// Mapping from source-dataset fields onto the GenBench example schema.
field_mapping: {
input: 'sentence_pair',
target: 'label',
},

// Defines the bias-amplified train / anti-biased test resplit.
split_file: 'split.jsonnet',

evaluation_metrics: [
{
hf_id: 'accuracy',
git_commit_sha: '34d6add55811828baef83e0d7c6826e2193f7b6a',
best_score: 1.0
}
],

preparation_strategies: {
// Finetuning the model on the task's train-set and then evaluating
// on the task's test-set. This is suitable for models such as RoBERTa, BERT, etc.,
// but can be used for LLMs as well.
finetuning: {
objective: 'maximum_likelihood',
// ... other model-agnostic finetuning options ...
}
},
}
60 changes: 60 additions & 0 deletions src/genbench/tasks/bias_amplified_splits/mnli/doc.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# Bias-amplified Splits (MultiNLI)

## Abstract
NLP models often rely on superficial cues known as *dataset biases* to achieve impressive performance, and can fail on examples where these biases do not hold.
Recent work sought to develop robust, unbiased models by filtering *biased* examples from training sets. In this work, we argue that such filtering can obscure the true capabilities of models to overcome biases, which might never be removed in full from the dataset.
We suggest that in order to drive the development of models robust to subtle biases, dataset biases should be *amplified* in the training set. We introduce an evaluation framework defined by a *bias-amplified* training set and an *anti-biased* test set, both automatically extracted from existing datasets.
Experiments across three notions of *bias*, four datasets and two models show that our framework is substantially more challenging for models than the original data splits, and even more challenging than hand-crafted challenge sets.
Our evaluation framework can use any existing dataset, even those considered obsolete, to test model robustness. We hope our work will guide the development of robust models that do not rely on superficial biases and correlations. To this end, we publicly release our code and data.

## Examples
Given a premise sentence and a hypothesis sentence, the task is to predict whether the premise entails the hypothesis (entailment), contradicts the hypothesis (contradiction), or neither (neutral). For instance:
```
premise: Your gift is appreciated by each and every student who will benefit from your generosity.
hypothesis: The gift is appreciated by some students, but not those by who did not receive it.
label: entailment

premise: Your gift is appreciated by each and every student who will benefit from your generosity.
hypothesis: The generous offering is intendend to make the students' lives harder.
label: contradiction

premise: Your gift is appreciated by each and every student who will benefit from your generosity.
hypothesis: Hundreds of students will benefit from your generosity.
label: neutral
```

## Usage
This task can be used for regular finetuning. Afterwards, accuracy is used to score models' predictions.
Here is a small code snippet that illustrates how to load the dataset, and evaluate the accuracy of a single item:

```
from genbench import load_task
from genbench.api import PreparationStrategy

task = load_task("bias_amplified_splits:mnli")
ds = task.get_prepared_datasets(
PreparationStrategy.FINETUNING,
shot_list=[0]
)
task.evaluate_predictions(
predictions=[{"target": 1}],
gold=[ds["test"][0]],
)
```

## Data Source
The data can be obtained from `glue/mnli` via the Hugging Face dataset hub.
MultiNLI is licensed under an OANC license (see https://cims.nyu.edu/~sbowman/multinli/paper.pdf).

## Limitations and Bias
TODO: *Note any known limitations or biases that the Bias-amplified Splits (MultiNLI) has, with links and references if possible.*

## GenBench Eval card
- ***Generalisation type*** The generalisation type evaluated is 'robustness'.
- ***Motivation*** It is designed to better understand how models can overcome difficulties that are intrinsically hard for them, without further taking into account external factors of practicalities, fairness, etc.
- ***Shift source*** Because all the data is natural but the splitting dimension is not, the shift source is 'partitioned natural'.
- ***Shift locus*** This test can be used to evaluate a finetuned model (or a model trained from scratch).
- ***Shift type*** The shift type is a covariate one, considering that we do not modify the output distribution.


![GenBench Eval Card](GenBench_eval_card.jpg)
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
transformers
datasets
evaluate
5 changes: 5 additions & 0 deletions src/genbench/tasks/bias_amplified_splits/mnli/split.jsonnet

Large diffs are not rendered by default.

75 changes: 75 additions & 0 deletions src/genbench/tasks/bias_amplified_splits/mnli/task.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
from typing import Any, Dict, Mapping

from datasets import Dataset

from genbench import Task
from genbench.api import DatasetSplit
from genbench.task import resplit_data_source
from genbench.utils.file import load_jsonnet
from genbench.utils.tasks import get_task_dir


class BiasAmplifiedSplitsMnli(Task):
    """Bias-amplified splits subtask for MultiNLI (MNLI).

    Loads the source MNLI data, re-splits it into a bias-amplified train set
    and an anti-biased test set according to the task's split file, and
    formats every example into the GenBench ``input``/``target`` schema.
    """

    def format_example(self, example: Dict[str, Any]) -> Dict[str, Any]:
        """Map a raw MNLI example onto the fields the task type expects.

        The premise and hypothesis are joined with a ``</s>`` separator into a
        single ``input`` string, and the original integer ``label`` is kept as
        the ``target``.

        Args:
            example: A key-value mapping for one example from the source
                dataset; must provide ``premise``, ``hypothesis``, ``label``.

        Returns:
            A dictionary with the ``input`` and ``target`` fields used by the
            task.
        """
        return {
            "input": f"{example['premise']} </s> {example['hypothesis']}",
            "target": example["label"],
        }

    def get_datasets_raw(self) -> Mapping[DatasetSplit, Dataset]:
        """Load, re-split, and format the MNLI source data.

        Returns:
            A mapping from split name (e.g. ``train``, ``validation``,
            ``test``) to the formatted ``datasets.Dataset`` for that split.
            Every example carries ``input``/``target`` fields plus a
            per-split ``_genbench_idx`` id.
        """
        data_source = self._load_data_source()

        # The split-file test ("test_split_file") expects "train",
        # "validation", and "test" keys in the split definition, but MNLI only
        # ships validation_matched/mismatched and test_matched/mismatched.
        # Alias the matched splits so those keys exist before re-splitting.
        data_source["validation"] = data_source["validation_matched"]
        data_source["test"] = data_source["test_matched"]

        if self.config.split_file is not None:
            split_file_path = get_task_dir(self.root_task_id, self.subtask_id) / self.config.split_file
            splitting_info = load_jsonnet(split_file_path)
            data_source = resplit_data_source(data_source, splitting_info)

        output = {}
        for split in sorted(data_source):
            dataset = data_source[split]
            output[split] = dataset.map(
                self.format_example,
                num_proc=self.dataset_format_num_proc,
                batched=self.dataset_format_batched,
                desc=f"Formatting `{split}` examples",
            )
            # Sanity check: formatting must have produced the required fields.
            assert all(f in output[split].column_names for f in ("input", "target"))

        # Assign a stable per-split id to each example.
        for split in sorted(output):
            output[split] = output[split].map(
                lambda example, idx: {"_genbench_idx": idx},
                with_indices=True,
                num_proc=self.dataset_format_num_proc,
                batched=False,
                desc=f"Assigning id to `{split}` examples",
            )

        return output
Loading