Skip to content
This repository was archived by the owner on Jul 23, 2024. It is now read-only.
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions src/genbench/tasks/bias_amplified_splits/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
from genbench import TaskDict


class BiasAmplifiedSplits(TaskDict):
    """Parent task grouping the bias-amplified-splits subtasks.

    This is only a TaskDict container: each subtask (e.g. mnli, wanli, qqp)
    carries its own config and task implementation.
    """

    pass
25 changes: 25 additions & 0 deletions src/genbench/tasks/bias_amplified_splits/config.jsonnet
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
// GenBench config for the Bias-amplified Splits task family.
// The family is a TaskDict; the actual resplitting is defined per subtask.
{
name: 'Bias-amplified Splits',

description: 'In Bias-amplified Splits, we argue that to promote models robust to subtle biases, we should in fact *amplify* dataset biases during evaluation, instead of attempting to balance and remove them. We take existing datasets and extract a bias-amplified training set and an anti-biased test set from the original splits. For resplitting, we use a novel clustering-based approach to detect anti-biased minority examples.',

// Tags used to index the task in the GenBench collection.
keywords: [
'non-i.i.d. generalization',
'dataset biases',
'minority examples',
'robustness'
],

authors: [
'Yuval Reif',
'Roy Schwartz'
],

// Order in which the per-dataset subtasks are listed/run.
subtasks_order: [
'mnli',
'mnli_prompt_based',
'wanli',
'wanli_prompt_based',
'qqp',
],
}
37 changes: 37 additions & 0 deletions src/genbench/tasks/bias_amplified_splits/doc.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
## Motivation
NLP models often rely on superficial cues known as *dataset biases* to achieve impressive performance, and can fail on examples where these biases do not hold.
Recent work sought to develop robust, unbiased models by filtering *biased* examples from training sets. We argue that such filtering can obscure the true capabilities of models to overcome biases, which might never be removed in full from the dataset.

We propose that in order to drive the development of models robust to subtle biases, dataset biases should in fact be **amplified** in the training set.
We introduce **bias-amplified splits**, an evaluation framework defined by a *bias-amplified* training set and an *anti-biased* test set, both automatically extracted from existing datasets.
To extract *biased* and *anti-biased* examples, we use a novel clustering-based approach for detecting minority examples in the dataset.

## Examples
Bias-amplified splits can be created for various tasks. Each of the sub-tasks is for a different dataset -- see examples in each sub-task's documentation.

## Data Source
We extract bias-amplified splits from existing datasets. All datasets can be obtained via the Hugging Face dataset hub. See references and license details in each of the subtasks.

## Limitations and Bias
TODO: *Note any known limitations or biases that the Bias-amplified Splits has, with links and references if possible.*

## Citation
```
@inproceedings{reif-schwartz-2023-fighting,
title = "Fighting Bias With Bias: Promoting Model Robustness by Amplifying Dataset Biases",
author = "Reif, Yuval and
Schwartz, Roy",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2023",
month = jul,
year = "2023",
address = "Toronto, Canada",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.findings-acl.833",
pages = "13169--13189",
abstract = "NLP models often rely on superficial cues known as dataset biases to achieve impressive performance, and can fail on examples where these biases do not hold. Recent work sought to develop robust, unbiased models by filtering biased examples from training sets. In this work, we argue that such filtering can obscure the true capabilities of models to overcome biases, which might never be removed in full from the dataset. We suggest that in order to drive the development of models robust to subtle biases, dataset biases should be amplified in the training set. We introduce an evaluation framework defined by a bias-amplified training set and an anti-biased test set, both automatically extracted from existing datasets. Experiments across three notions of bias, four datasets and two models show that our framework is substantially more challenging for models than the original data splits, and even more challenging than hand-crafted challenge sets. Our evaluation framework can use any existing dataset, even those considered obsolete, to test model robustness. We hope our work will guide the development of robust models that do not rely on superficial biases and correlations. To this end, we publicly release our code and data.",
}
```

## Further References
TODO: *Add any useful further references.*

Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Empty file.
53 changes: 53 additions & 0 deletions src/genbench/tasks/bias_amplified_splits/mnli/config.jsonnet
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
// GenBench config for the MultiNLI (MNLI) bias-amplified splits subtask.
{
name: 'Bias-amplified Splits (MultiNLI)',

description: 'We take MultiNLI (MNLI) and extract a bias-amplified training set and an anti-biased test set for it from the original splits. For resplitting, we use a novel clustering-based approach to detect anti-biased minority examples.',

keywords: [
'non-i.i.d. generalization',
'dataset biases',
'minority examples',
'robustness'
],

authors: [
'Yuval Reif',
'Roy Schwartz'
],

// Source data: the `mnli` subset of GLUE on the Hugging Face hub,
// pinned to a specific dataset-repo commit for reproducibility.
data_source: {
type: 'hf',
hf_id: ['glue', 'mnli'],
git_commit_sha: 'fd8e86499fa5c264fcaad392a8f49ddf58bf4037',
},

has_validation_set: true,
has_train_set: true,

task_type: 'multiple_choice',

// Mapping from source-dataset fields onto the GenBench example schema.
field_mapping: {
input: 'sentence_pair',
target: 'label',
},

// Defines the bias-amplified train / anti-biased test resplit.
split_file: 'split.jsonnet',

evaluation_metrics: [
{
hf_id: 'accuracy',
git_commit_sha: '34d6add55811828baef83e0d7c6826e2193f7b6a',
best_score: 1.0
}
],

preparation_strategies: {
// Finetuning the model on the task's train-set and then evaluating
// on the task's test-set. This is suitable for models such as RoBERTa, BERT, etc.,
// but can be used for LLMs as well.
finetuning: {
objective: 'maximum_likelihood',
// ... other model-agnostic finetuning options ...
}
},
}
60 changes: 60 additions & 0 deletions src/genbench/tasks/bias_amplified_splits/mnli/doc.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# Bias-amplified Splits (MultiNLI)

## Abstract
NLP models often rely on superficial cues known as *dataset biases* to achieve impressive performance, and can fail on examples where these biases do not hold.
Recent work sought to develop robust, unbiased models by filtering *biased* examples from training sets. In this work, we argue that such filtering can obscure the true capabilities of models to overcome biases, which might never be removed in full from the dataset.
We suggest that in order to drive the development of models robust to subtle biases, dataset biases should be *amplified* in the training set. We introduce an evaluation framework defined by a *bias-amplified* training set and an *anti-biased* test set, both automatically extracted from existing datasets.
Experiments across three notions of *bias*, four datasets and two models show that our framework is substantially more challenging for models than the original data splits, and even more challenging than hand-crafted challenge sets.
Our evaluation framework can use any existing dataset, even those considered obsolete, to test model robustness. We hope our work will guide the development of robust models that do not rely on superficial biases and correlations. To this end, we publicly release our code and data.

## Examples
Given a premise sentence and a hypothesis sentence, the task is to predict whether the premise entails the hypothesis (entailment), contradicts the hypothesis (contradiction), or neither (neutral). For instance:
```
premise: Your gift is appreciated by each and every student who will benefit from your generosity.
hypothesis: The gift is appreciated by some students, but not those by who did not receive it.
label: entailment

premise: Your gift is appreciated by each and every student who will benefit from your generosity.
hypothesis: The generous offering is intendend to make the students' lives harder.
label: contradiction

premise: Your gift is appreciated by each and every student who will benefit from your generosity.
hypothesis: Hundreds of students will benefit from your generosity.
label: neutral
```

## Usage
This task can be used for regular finetuning. Afterwards, accuracy is used to score models' predictions.
Here is a small code snippet that illustrates how to load the dataset, and evaluate the accuracy of a single item:

```
from genbench import load_task
from genbench.api import PreparationStrategy

task = load_task("bias_amplified_splits:mnli")
ds = task.get_prepared_datasets(
PreparationStrategy.FINETUNING,
shot_list=[0]
)
task.evaluate_predictions(
predictions=[{"target": 1}],
gold=[ds["test"][0]],
)
```

## Data Source
The data can be obtained from `glue/mnli` via the Hugging Face dataset hub.
MultiNLI is licensed under an OANC license (see https://cims.nyu.edu/~sbowman/multinli/paper.pdf).

## Limitations and Bias
TODO: *Note any known limitations or biases that the Bias-amplified Splits (MultiNLI) has, with links and references if possible.*

## GenBench Eval card
- ***Generalisation type*** The generalisation type evaluated is 'robustness'.
- ***Motivation*** It is designed to better understand how models can overcome difficulties that are intrinsically hard for them, without further taking into account external factors of practicalities, fairness, etc.
- ***Shift source*** Because all the data is natural but the splitting dimension is not, the shift source is 'partitioned natural'.
- ***Shift locus*** This test can be used to evaluate a finetuned model (or a model trained from scratch).
- ***Shift type*** The shift type is a covariate one, considering that we do not modify the output distribution.


![GenBench Eval Card](GenBench_eval_card.jpg)
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
transformers
datasets
evaluate
5 changes: 5 additions & 0 deletions src/genbench/tasks/bias_amplified_splits/mnli/split.jsonnet

Large diffs are not rendered by default.

75 changes: 75 additions & 0 deletions src/genbench/tasks/bias_amplified_splits/mnli/task.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
from typing import Any, Dict, Mapping

from datasets import Dataset

from genbench import Task
from genbench.api import DatasetSplit
from genbench.task import resplit_data_source
from genbench.utils.file import load_jsonnet
from genbench.utils.tasks import get_task_dir


class BiasAmplifiedSplitsMnli(Task):
    """Bias-amplified splits subtask for MultiNLI (MNLI).

    Loads the source MNLI data, re-splits it into a bias-amplified train set
    and an anti-biased test set according to the task's split file, and
    formats every example into the GenBench ``input``/``target`` schema.
    """

    def format_example(self, example: Dict[str, Any]) -> Dict[str, Any]:
        """Map a raw MNLI example onto the fields the task type expects.

        The premise and hypothesis are joined with a ``</s>`` separator into a
        single ``input`` string, and the original integer ``label`` is kept as
        the ``target``.

        Args:
            example: A key-value mapping for one example from the source
                dataset; must provide ``premise``, ``hypothesis``, ``label``.

        Returns:
            A dictionary with the ``input`` and ``target`` fields used by the
            task.
        """
        return {
            "input": f"{example['premise']} </s> {example['hypothesis']}",
            "target": example["label"],
        }

    def get_datasets_raw(self) -> Mapping[DatasetSplit, Dataset]:
        """Load, re-split, and format the MNLI source data.

        Returns:
            A mapping from split name (e.g. ``train``, ``validation``,
            ``test``) to the formatted ``datasets.Dataset`` for that split.
            Every example carries ``input``/``target`` fields plus a
            per-split ``_genbench_idx`` id.
        """
        data_source = self._load_data_source()

        # The split-file test ("test_split_file") expects "train",
        # "validation", and "test" keys in the split definition, but MNLI only
        # ships validation_matched/mismatched and test_matched/mismatched.
        # Alias the matched splits so those keys exist before re-splitting.
        data_source["validation"] = data_source["validation_matched"]
        data_source["test"] = data_source["test_matched"]

        if self.config.split_file is not None:
            split_file_path = get_task_dir(self.root_task_id, self.subtask_id) / self.config.split_file
            splitting_info = load_jsonnet(split_file_path)
            data_source = resplit_data_source(data_source, splitting_info)

        output = {}
        for split in sorted(data_source):
            dataset = data_source[split]
            output[split] = dataset.map(
                self.format_example,
                num_proc=self.dataset_format_num_proc,
                batched=self.dataset_format_batched,
                desc=f"Formatting `{split}` examples",
            )
            # Sanity check: formatting must have produced the required fields.
            assert all(f in output[split].column_names for f in ("input", "target"))

        # Assign a stable per-split id to each example.
        for split in sorted(output):
            output[split] = output[split].map(
                lambda example, idx: {"_genbench_idx": idx},
                with_indices=True,
                num_proc=self.dataset_format_num_proc,
                batched=False,
                desc=f"Assigning id to `{split}` examples",
            )

        return output
Loading