diff --git a/src/genbench/tasks/slayqa/__init__.py b/src/genbench/tasks/slayqa/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/genbench/tasks/slayqa/config.jsonnet b/src/genbench/tasks/slayqa/config.jsonnet new file mode 100644 index 0000000..df13305 --- /dev/null +++ b/src/genbench/tasks/slayqa/config.jsonnet @@ -0,0 +1,54 @@ +{ + name: 'slayqa', + + // @TODO: Add a description of the task + description: 'SlayQA aims to measure generalisation in social reasoning where agents are referred to with gender-neutral and/or neo-pronouns.', + + // @TODO: Add a list of keywords that describe the task + keywords: [ + 'social reasoning', + 'neo-pronouns', + 'inclusivity', + 'fairness', + ], + + authors: [ + 'Bastian Bunzeck', + + ], + + data_source: { + type: 'hf', + hf_id: 'bbunzeck/slayqa', + git_commit_sha: '63cc322aa7a18e288f95e165cef06850c1b064dc', + }, + + has_validation_set: false, + has_train_set: false, + + task_type: 'multiple_choice', + + + evaluation_metrics: [ + { + hf_id: 'accuracy', + best_score: 1.0, + git_commit_sha: '34d6add55811828baef83e0d7c6826e2193f7b6a', + }, + ], + + preparation_strategies: { + // A recipe for preparing the model to perform the task by configuring its prompt. + // This recipe is suitable for generative LMs such as GPT-3, OPT, T5, etc. + // We provide a few options for configuring the prompt. But, the task creator can + // also provide a custom prompt preparation in the task's Python class. 
+    prompt_based_testing: {
+      prompt_builder: {
+        instruction_zero_shot: '',
+        instruction_few_shot: '',
+        input_prefix: '',
+        output_prefix: '',
+      }
+    },
+  },
+}
\ No newline at end of file
diff --git a/src/genbench/tasks/slayqa/doc.md b/src/genbench/tasks/slayqa/doc.md
new file mode 100644
index 0000000..1870dc9
--- /dev/null
+++ b/src/genbench/tasks/slayqa/doc.md
@@ -0,0 +1,66 @@
+# SlayQA
+
+We present SlayQA: Social Linguistics Analytics Yielding Queer Agents, a novel benchmark set derived from the existing [Social IQa (SiQA) dataset](https://doi.org/10.18653/v1/D19-1454). It contains a situation description (the context), social reasoning questions and three prospective answers, where all context-question-answer pairs include at least two acts of pronoun-based reference and gender-neutral pronouns. Because SlayQA systematically replaces established, gendered pronouns with gender-affirming neo-pronouns, it is a more inclusive dataset that better reflects the diversity of human identities. This also marks a significant linguistic distribution shift in comparison to common pre-training corpora like C4 or Dolma. Consequently, SlayQA aims to assess how well language models are able to generalize to novel linguistic structures.
+
+## Abstract
+TBD, not yet finalised.
+
+## Examples
+
+The tasks are social reasoning problems that involve the selection of a correct answer:
+
+Context + question:
+- Taylor called Sydney to ask if ae would like to go out. What does Taylor need to do before this?
+ +Answer options: +- have a phone +- give Taylor an answer +- check es email + +## Usage + +``` +# Load the task +task = genbench.load_task('slayqa') +dataset = task.get_datasets_raw() + +# dataset['test'] contains the individual entries: +# 'input' has combined context and question +# 'target_options' contains three possible answers +# 'target' contains index of gold standard answer + +# My predictions should be a list of dicts with predicted 'target' values + +# add your own prediction code here :) + +# or: naive baseline +my_predictions = [] +for i in range(len(dataset['test'])): + temp_dict = {} + temp_dict['target'] = 1 + my_predictions.append(temp_dict) + +# Evaluate +# The evaluate_predictions method compares the 'target' entry for a list of prediction dicts and the gold standard from the data set +task.evaluate_predictions(predictions = my_predictions, gold = dataset['test']) +``` + +## Data Source +Data set hosted at [Hugging Face hub](https://huggingface.co/datasets/bbunzeck/slayqa). + +## Limitations and Bias +Our study focuses on neo-pronouns. In addition to these, other gender-inclusive options exist, e.g. nounself pronouns, emojiself pronouns, numberself pronouns or nameself pronouns (see [Lauscher et al. 2022](https://aclanthology.org/2022.coling-1.105)). We opted to focus on neo-pronouns because we wanted to assess a limited and (somewhat) established set of gender-inclusive options, but further studies should also try to evaluate even more diverse options. + +## GenBench eval card +**Motivation**: This test is designed to assess whether LMs can generalise to novel pronouns in human-like ways (cognitive motivation) and whether they succeed in doing so with gender-neutral neo-pronouns (fairness motivation) in a social reasoning setting. + +**Generalisation type**: Because we only included examples with at least two coreference chains, its performance reflects compositional generalisation. 
+ +**Shift type**: As the test samples differ in their pronoun frequencies from the training corpus, we assume that the shift is a covariate shift. + +**Shift source**: The distribution shift was artificially generated by systematically replacing gendered pronouns with neo-pronouns. + +**Shift locus**: The test data was altered to differ from the pretraining data, and as we assess the capabilities without further fine-tuning, the shift locus is located between the pretraining corpora and our test data. + + +![GenBench Eval Card](eval_card.png) diff --git a/src/genbench/tasks/slayqa/eval_card.png b/src/genbench/tasks/slayqa/eval_card.png new file mode 100644 index 0000000..433fdd7 Binary files /dev/null and b/src/genbench/tasks/slayqa/eval_card.png differ diff --git a/src/genbench/tasks/slayqa/task.py b/src/genbench/tasks/slayqa/task.py new file mode 100644 index 0000000..0999fdc --- /dev/null +++ b/src/genbench/tasks/slayqa/task.py @@ -0,0 +1,26 @@ +from typing import Any, Dict + +from genbench import Task + + +# @Task.register("SlayQA") +class SlayQA(Task): + """slayqa""" + + def format_example(self, example: Dict[str, Any]) -> Dict[str, Any]: + """Perform preprocessing/formatting on an example-level. + Map `context`, `question` to input, 'correct' to target and answerA, answerB and answerCC to target_options. + + Args: + example: A dictionary containing key-value pairs for an example from the SlayQA dataset. + + + Returns: + A dictionary containing key-value pairs for the formatted example. + """ + input = f"{example['context']} {example['question']}" + target = int(example["correct"]) - 1 + answerA = example["answerA"] + answerB = example["answerB"] + answerC = example["answerC"] + return {"input": input, "target": int(target), "target_options": [answerA, answerB, answerC]}