diff --git a/src/genbench/tasks/shifted_pauq/__init__.py b/src/genbench/tasks/shifted_pauq/__init__.py new file mode 100644 index 0000000..eacc664 --- /dev/null +++ b/src/genbench/tasks/shifted_pauq/__init__.py @@ -0,0 +1,5 @@ +from genbench import TaskDict + + +class ShiftedPauq(TaskDict): + pass diff --git a/src/genbench/tasks/shifted_pauq/config.jsonnet b/src/genbench/tasks/shifted_pauq/config.jsonnet new file mode 100644 index 0000000..bb19030 --- /dev/null +++ b/src/genbench/tasks/shifted_pauq/config.jsonnet @@ -0,0 +1,27 @@ +{ + name: 'Shifted Pauq', + + description: 'Shifted Pauq aims to measure compositional generalization in task of text2sql on different full shift splits.', + + keywords: [ + 'compositional generalization', + 'text2sql', + 'distribution shift', + 'multilingual' + ], + + authors: [ + 'Somov Oleg', + 'Dmietrieva Ekaterina', + 'Tutubalina Elena', + ], + + subtasks_order: [ + 'ru_os_split', + 'en_os_split', + 'ru_trl_split', + 'en_trl_split', + 'ru_tsl_split', + 'en_tsl_split' + ], +} \ No newline at end of file diff --git a/src/genbench/tasks/shifted_pauq/doc.md b/src/genbench/tasks/shifted_pauq/doc.md new file mode 100644 index 0000000..6fc66e9 --- /dev/null +++ b/src/genbench/tasks/shifted_pauq/doc.md @@ -0,0 +1,187 @@ +In this work, we propose a custom split of a _Text-to-SQL_ dataset in Russian (PAUQ [1]) and English (Spider [2]) that assesses compositional generalization in _text-to-query_ models. + +## Motivation +PAUQ [1] is the Russian version of the Spider [2]. We have fixed the original queries and database content. +We have also translated both questions and queries to Russian language and updated database content with Russian entities. 
+In this evaluation we want to explore the ability of finetuned language models to cope with distribution shift +in the field of semantic parsing on different generated splits - target query length (see **ru_trl_split**, **en_trl_split**, **ru_tsl_split**, **en_tsl_split**), +target maximum compound divergence (will be submitted later) in contrast to an i.i.d. generated split (see **ru_os_split**, **en_os_split**). + + +**GenBench taxonomy values for the experiments:** + +Motivation: Practical + +Generalisation type: Compositional, Structural, Across language + +Shift type: Full + +Shift source: Naturally occurring + +Shift locus: train--test, Finetune train--test + + +Additional details from PAUQ abstract [1]: We construct and complement a Spider dataset for the Russian language, thus creating the first publicly available text-to-SQL dataset in Russian. While examining dataset components—NL questions, SQL queries, and database content—we identify limitations of the existing database structure, fill out missing values for tables and add new requests for underrepresented categories. To conduct the experiment, we adapt baseline models RAT-SQL and BRIDGE and provide in-depth query component analysis. Both models demonstrate strong single language results and improved accuracy with multilingual training on the target language. + +baseline scores of a sequence-to-sequence model on our splits: TODO + +The size of the datasets: please refer to [1,2] for details + + + +## Examples +Each sample follows the original Spider structure. 
+**English example** +``` +{'id': 'TS_0014', + 'db_id': 'department_management', + 'source': 'spider-train', + 'type': 'train', + 'query': "SELECT T3.born_state FROM department AS T1 JOIN management AS T2 ON T1.department_id = T2.department_id JOIN head AS T3 ON T2.head_id = T3.head_id WHERE T1.name = 'Treasury' INTERSECT SELECT T3.born_state FROM department AS T1 JOIN management AS T2 ON T1.department_id = T2.department_id JOIN head AS T3 ON T2.head_id = T3.head_id WHERE T1.name = 'Homeland Security';", + 'question': "List the states where both the secretary of 'Treasury' department and the secretary of 'Homeland Security' were born.", + 'sql': {'from': {'table_units': [['table_unit', 0], + ['table_unit', 2], + ['table_unit', 1]], + 'conds': [[False, 2, [0, [0, 1, False], None], [0, 11, False], None], + 'and', + [False, 2, [0, [0, 12, False], None], [0, 7, False], None]]}, + 'select': [False, [[0, [0, [0, 9, False], None]]]], + 'where': [[False, 2, [0, [0, 2, False], None], '"Treasury"', None]], + 'groupBy': [], + 'having': [], + 'orderBy': [], + ...}, + 'question_toks': ['List', + 'the', + 'states', + 'where', + 'both', + 'the', + 'secretary', + ...], + 'query_toks': ['select', + 't3.born_state', + 'from', + 'department', + 'as', + 't1', + ...], + 'query_toks_no_values': ['select', + 't3.born_state', + 'from', + 'department', + ...], + 'masked_query': 'SELECT T3.born_state FROM department AS T1 JOIN management AS T2 ON T1.department_id = T2.department_id JOIN head AS T3 ON T2.head_id = T3.head_id WHERE T1.name = TEXT_VALUE_1 INTERSECT SELECT T3.born_state FROM department AS T1 JOIN management AS T2 ON T1.department_id = T2.department_id JOIN head AS T3 ON T2.head_id = T3.head_id WHERE T1.name = TEXT_VALUE_2'} +``` + +**Russian example** + +``` +{'id': 'TS_0014', + 'db_id': 'department_management', + 'source': 'spider-train', + 'type': 'train', + 'query': "SELECT T3.born_state FROM department AS T1 JOIN management AS T2 ON T1.department_id = T2.department_id JOIN 
head AS T3 ON T2.head_id = T3.head_id WHERE T1.name = 'Управление общего учёта' INTERSECT SELECT T3.born_state FROM department AS T1 JOIN management AS T2 ON T1.department_id = T2.department_id JOIN head AS T3 ON T2.head_id = T3.head_id WHERE T1.name = 'Бюджетная служба Конгресса';", + 'question': 'Перечислите штаты, в которых родились и секретарь управления общего учёта, и секретарь бюджетной службы Конгресса.', + 'sql': {'from': {'table_units': [['table_unit', 0], + ['table_unit', 2], + ['table_unit', 1]], + 'conds': [[False, 2, [0, [0, 1, False], None], [0, 11, False], None], + 'and', + [False, 2, [0, [0, 12, False], None], [0, 7, False], None]]}, + 'select': [False, [[0, [0, [0, 9, False], None]]]], + 'where': [[False, + 2, + [0, [0, 2, False], None], + '"Управление общего учёта"', + None]], + 'groupBy': [], + 'having': [], + 'orderBy': [], + 'limit': None, + 'intersect': {'from': {'table_units': [['table_unit', 0], + ['table_unit', 2], + ['table_unit', 1]], + 'conds': [[False, 2, [0, [0, 1, False], None], [0, 11, False], None], + 'and', + [False, 2, [0, [0, 12, False], None], [0, 7, False], None]]}, + 'select': [False, [[0, [0, [0, 9, False], None]]]], + 'where': [[False, + 2, + [0, [0, 2, False], None], + '"Бюджетная служба Конгресса"', + None]], + ...}, + 'question_toks': ['Перечислите', + 'штаты', + ',', + 'в', + 'которых', + 'родились', + ...], + 'query_toks': ['select', + 't3.born_state', + 'from', + 'department', + 'as', + 't1', + 'join', + 'management', + ...], + 'query_toks_no_values': ['select', + 't3.born_state', + 'from', + 'department', + 'as', + 't1', + 'join', + ...], + 'masked_query': 'SELECT T3.born_state FROM department AS T1 JOIN management AS T2 ON T1.department_id = T2.department_id JOIN head AS T3 ON T2.head_id = T3.head_id WHERE T1.name = TEXT_VALUE_1 INTERSECT SELECT T3.born_state FROM department AS T1 JOIN management AS T2 ON T1.department_id = T2.department_id JOIN head AS T3 ON T2.head_id = T3.head_id WHERE T1.name = TEXT_VALUE_2'} 
+``` + +Each query is supported by a corresponding database for measuring execution match. + +## Data Source +Spider was annotated by 11 Yale students. In PAUQ, we have refined it and translated with experts help. Please refer to [1,2] + +## Limitations and Bias +*Note any known limitations or biases that the Shifted Pauq has, with links and references if possible.* + +from [1]: PAUQ is an adaptation of the Spider dataset to Russian language, it indeed inherits most of Spider's limitations. First of all, the data is still `artificial' which means that it was created by a limited number of people specifically for training and evaluating text-to-SQL models, thus it lacks the diversity and complexity of natural data formed by questions that people formulate in order to get the desired information from the database. For instance, the real-world data contain NL queries that require common sense knowledge which can't be extracted directly from the database; ambiguous questions allowing various ways of interpretation that are quite frequent and queries with window functions that make the process easier and more convenient, -- all of these aren't included in the Spider dataset, as well as in PAUQ. Another limitation concerns evaluation metric -- exact match, which is the most commonly used to evaluate text-to-SQL models performance. However, the metric is too strict and prone to false negative results. 
+ + +## Citation +[1] [https://aclanthology.org/2022.findings-emnlp.175.pdf](https://aclanthology.org/2022.findings-emnlp.175.pdf) +``` +@inproceedings{pauq, + title = "{PAUQ}: Text-to-{SQL} in {R}ussian", + author = "Bakshandaeva, Daria and + Somov, Oleg and + Dmitrieva, Ekaterina and + Davydova, Vera and + Tutubalina, Elena", + booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2022", + month = dec, + year = "2022", + address = "Abu Dhabi, United Arab Emirates", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/2022.findings-emnlp.175", + pages = "2355--2376", + abstract = "Semantic parsing is an important task that allows to democratize human-computer interaction. One of the most popular text-to-SQL datasets with complex and diverse natural language (NL) questions and SQL queries is Spider. We construct and complement a Spider dataset for Russian, thus creating the first publicly available text-to-SQL dataset for this language. While examining its components - NL questions, SQL queries and databases content - we identify limitations of the existing database structure, fill out missing values for tables and add new requests for underrepresented categories. We select thirty functional test sets with different features that can be used for the evaluation of neural models{'} abilities. To conduct the experiments, we adapt baseline architectures RAT-SQL and BRIDGE and provide in-depth query component analysis. On the target language, both models demonstrate strong results with monolingual training and improved accuracy in multilingual scenario. In this paper, we also study trade-offs between machine-translated and manually-created NL queries. 
At present, Russian text-to-SQL is lacking in datasets as well as trained models, and we view this work as an important step towards filling this gap.", +} +``` +[2] [https://aclanthology.org/D18-1425/](https://aclanthology.org/D18-1425/) +``` +@inproceedings{spider, + title={Spider: A Large-Scale Human-Labeled Dataset for Complex and Cross-Domain Semantic Parsing and Text-to-SQL Task}, + author={Yu, Tao and Zhang, Rui and Yang, Kai and Yasunaga, Michihiro and Wang, Dongxu and Li, Zifan and Ma, James and Li, Irene and Yao, Qingning and Roman, Shanelle and others}, + booktitle={Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing}, + pages={3911--3921}, + year={2018} +} +``` + + + +## Further References +*Add any useful further references.* diff --git a/src/genbench/tasks/shifted_pauq/en_os_split/__init__.py b/src/genbench/tasks/shifted_pauq/en_os_split/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/genbench/tasks/shifted_pauq/en_os_split/config.jsonnet b/src/genbench/tasks/shifted_pauq/en_os_split/config.jsonnet new file mode 100644 index 0000000..62e3260 --- /dev/null +++ b/src/genbench/tasks/shifted_pauq/en_os_split/config.jsonnet @@ -0,0 +1,46 @@ +{ + name: 'Shifted Pauq (en_os_split)', + + description: 'Shifted Pauq in English (en_os_split) aims to measure model ability to generate SQL queries from English input with splits generated in i.i.d. 
strategy.', + + keywords: [ + 'text2sql', + 'iid distribution', + ], + + authors: [ + 'Somov Oleg', + 'Dmietrieva Ekaterina', + 'Tutubalina Elena', + ], + + data_source: { + type: 'hf', + hf_id: ['composite/pauq', 'en_os'], + git_commit_sha: '63e3e9329f785d097f4746618737d69530d1cdb4', + }, + + has_validation_set: false, + has_train_set: true, + + task_type: 'free_form', + + field_mapping: { + input: 'question', + target: 'query' + }, + + evaluation_metrics: [ + { + hf_id: 'evaluate-metric/exact_match', + best_score: 1.0, + git_commit_sha: '8e612716f2b1b08d23b0b2d7aa667d2f38eb989e', + } + ], + + preparation_strategies: { + finetuning: { + objective: 'maximum_likelihood', + } + } +} \ No newline at end of file diff --git a/src/genbench/tasks/shifted_pauq/en_os_split/doc.md b/src/genbench/tasks/shifted_pauq/en_os_split/doc.md new file mode 100644 index 0000000..fd5a54b --- /dev/null +++ b/src/genbench/tasks/shifted_pauq/en_os_split/doc.md @@ -0,0 +1,83 @@ +# Shifted PAUQ -- Text-to-SQL (en_pauq_iid_split) + +## Abstract +This is original split from Spider dataset in English [1]. We will use this split in addition to the **ru_os_split** split to measure generalisation across languages. 
+ +## Examples +``` +{'id': 'TS_0014', + 'db_id': 'department_management', + 'source': 'spider-train', + 'type': 'train', + 'query': "SELECT T3.born_state FROM department AS T1 JOIN management AS T2 ON T1.department_id = T2.department_id JOIN head AS T3 ON T2.head_id = T3.head_id WHERE T1.name = 'Treasury' INTERSECT SELECT T3.born_state FROM department AS T1 JOIN management AS T2 ON T1.department_id = T2.department_id JOIN head AS T3 ON T2.head_id = T3.head_id WHERE T1.name = 'Homeland Security';", + 'question': "List the states where both the secretary of 'Treasury' department and the secretary of 'Homeland Security' were born.", + 'sql': {'from': {'table_units': [['table_unit', 0], + ['table_unit', 2], + ['table_unit', 1]], + 'conds': [[False, 2, [0, [0, 1, False], None], [0, 11, False], None], + 'and', + [False, 2, [0, [0, 12, False], None], [0, 7, False], None]]}, + 'select': [False, [[0, [0, [0, 9, False], None]]]], + 'where': [[False, 2, [0, [0, 2, False], None], '"Treasury"', None]], + 'groupBy': [], + 'having': [], + 'orderBy': [], + ...}, + 'question_toks': ['List', + 'the', + 'states', + 'where', + 'both', + 'the', + 'secretary', + ...], + 'query_toks': ['select', + 't3.born_state', + 'from', + 'department', + 'as', + 't1', + ...], + 'query_toks_no_values': ['select', + 't3.born_state', + 'from', + 'department', + ...], + 'masked_query': 'SELECT T3.born_state FROM department AS T1 JOIN management AS T2 ON T1.department_id = T2.department_id JOIN head AS T3 ON T2.head_id = T3.head_id WHERE T1.name = TEXT_VALUE_1 INTERSECT SELECT T3.born_state FROM department AS T1 JOIN management AS T2 ON T1.department_id = T2.department_id JOIN head AS T3 ON T2.head_id = T3.head_id WHERE T1.name = TEXT_VALUE_2'} +``` + +## Usage +*Describe how to load your task and what is required for evaluation, if anything.* + +Please refer to the **ru_os_split** card. 
+ +## Data Source +*Describe the data source for this Shifted PAUQ (en_pauq_iid_split).* + +Please refer to the **ru_os_split** card. + +## Limitations and Bias + +Please refer to the **ru_os_split** card. + +## GenBench Eval card +*Describe what kind of generalisation your task is evaluating, and include a [genbench eval card](https://genbench.org/eval_cards/) for +your task*. + +**GenBench taxonomy values for the experiments:** + +Same as in the **ru_os_split** card with addition: + +Generalisation type: generalisation across languages (from English to Russian) + +## References +[1] [https://aclanthology.org/D18-1425/](https://aclanthology.org/D18-1425/) +``` +@inproceedings{spider, + title={Spider: A Large-Scale Human-Labeled Dataset for Complex and Cross-Domain Semantic Parsing and Text-to-SQL Task}, + author={Yu, Tao and Zhang, Rui and Yang, Kai and Yasunaga, Michihiro and Wang, Dongxu and Li, Zifan and Ma, James and Li, Irene and Yao, Qingning and Roman, Shanelle and others}, + booktitle={Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing}, + pages={3911--3921}, + year={2018} +} +``` diff --git a/src/genbench/tasks/shifted_pauq/en_os_split/task.py b/src/genbench/tasks/shifted_pauq/en_os_split/task.py new file mode 100644 index 0000000..d612424 --- /dev/null +++ b/src/genbench/tasks/shifted_pauq/en_os_split/task.py @@ -0,0 +1,5 @@ +from genbench import Task + + +class ShiftedPauqEnOsSplit(Task): + pass diff --git a/src/genbench/tasks/shifted_pauq/en_trl_split/__init__.py b/src/genbench/tasks/shifted_pauq/en_trl_split/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/genbench/tasks/shifted_pauq/en_trl_split/config.jsonnet b/src/genbench/tasks/shifted_pauq/en_trl_split/config.jsonnet new file mode 100644 index 0000000..d40a2fc --- /dev/null +++ b/src/genbench/tasks/shifted_pauq/en_trl_split/config.jsonnet @@ -0,0 +1,47 @@ +{ + name: 'Shifted Pauq (en_trl_split)', + + description: 'Shifted Pauq 
(en_trl_split) aims to measure compositional generalization on a text2sql dataset with splits based on target length. Input language is English. Short query templates are in test, while long query templates are in train. To measure compositional distribution shift, it was made certain that all test tokens of target queries are present in the train set.', + + keywords: [ + 'text2sql', + 'label shift', + 'compositional generalization', + ], + + authors: [ + 'Somov Oleg', + 'Dmietrieva Ekaterina', + 'Tutubalina Elena', + ], + + data_source: { + type: 'hf', + hf_id: ['composite/pauq', 'en_trl'], + git_commit_sha: '63e3e9329f785d097f4746618737d69530d1cdb4', + }, + + has_validation_set: false, + has_train_set: true, + + task_type: 'free_form', + + field_mapping: { + input: 'question', + target: 'query' + }, + + evaluation_metrics: [ + { + hf_id: 'evaluate-metric/exact_match', + best_score: 1.0, + git_commit_sha: '8e612716f2b1b08d23b0b2d7aa667d2f38eb989e' + } + ], + + preparation_strategies: { + finetuning: { + objective: 'maximum_likelihood', + } + } +} \ No newline at end of file diff --git a/src/genbench/tasks/shifted_pauq/en_trl_split/doc.md b/src/genbench/tasks/shifted_pauq/en_trl_split/doc.md new file mode 100644 index 0000000..5f227c4 --- /dev/null +++ b/src/genbench/tasks/shifted_pauq/en_trl_split/doc.md @@ -0,0 +1,87 @@ +# Shifted PAUQ -- Text-to-SQL (en_trl_split) + +## Abstract +This is a length-based split based on the token length of the target query template in English. + +We will use this split in addition to the **ru_trl_split** split to measure generalisation across languages. + + In order to measure compositional generalization we have verified that all SQL test tokens are present in train. 
+ +## Examples +``` +{'id': 'TS_0014', + 'db_id': 'department_management', + 'source': 'spider-train', + 'type': 'train', + 'query': "SELECT T3.born_state FROM department AS T1 JOIN management AS T2 ON T1.department_id = T2.department_id JOIN head AS T3 ON T2.head_id = T3.head_id WHERE T1.name = 'Treasury' INTERSECT SELECT T3.born_state FROM department AS T1 JOIN management AS T2 ON T1.department_id = T2.department_id JOIN head AS T3 ON T2.head_id = T3.head_id WHERE T1.name = 'Homeland Security';", + 'question': "List the states where both the secretary of 'Treasury' department and the secretary of 'Homeland Security' were born.", + 'sql': {'from': {'table_units': [['table_unit', 0], + ['table_unit', 2], + ['table_unit', 1]], + 'conds': [[False, 2, [0, [0, 1, False], None], [0, 11, False], None], + 'and', + [False, 2, [0, [0, 12, False], None], [0, 7, False], None]]}, + 'select': [False, [[0, [0, [0, 9, False], None]]]], + 'where': [[False, 2, [0, [0, 2, False], None], '"Treasury"', None]], + 'groupBy': [], + 'having': [], + 'orderBy': [], + ...}, + 'question_toks': ['List', + 'the', + 'states', + 'where', + 'both', + 'the', + 'secretary', + ...], + 'query_toks': ['select', + 't3.born_state', + 'from', + 'department', + 'as', + 't1', + ...], + 'query_toks_no_values': ['select', + 't3.born_state', + 'from', + 'department', + ...], + 'masked_query': 'SELECT T3.born_state FROM department AS T1 JOIN management AS T2 ON T1.department_id = T2.department_id JOIN head AS T3 ON T2.head_id = T3.head_id WHERE T1.name = TEXT_VALUE_1 INTERSECT SELECT T3.born_state FROM department AS T1 JOIN management AS T2 ON T1.department_id = T2.department_id JOIN head AS T3 ON T2.head_id = T3.head_id WHERE T1.name = TEXT_VALUE_2'} +``` + +## Usage +*Describe how to load your task and what is required for evaluation, if anything.* + +Please refer to the **ru_trl_split** card. 
+ +## Data Source +*Describe the data source for this Shifted Pauq (en_trl_split).* + +Please refer to the **ru_trl_split** card. + +## Limitations and Bias +*Note any known limitations or biases that the Shifted Pauq (en_trl_split) has, with links and references if possible.* + +Please refer to the **ru_trl_split** card. + +## GenBench Eval card +*Describe what kind of generalisation your task is evaluating, and include a [genbench eval card](https://genbench.org/eval_cards/) for your task*. + +**GenBench taxonomy values for the experiments:** + +Same as in the **ru_trl_split** card with addition: + +Generalisation type: generalisation across languages (from English to Russian) + +## References +[1] [https://aclanthology.org/D18-1425/](https://aclanthology.org/D18-1425/) +``` +@inproceedings{spider, + title={Spider: A Large-Scale Human-Labeled Dataset for Complex and Cross-Domain Semantic Parsing and Text-to-SQL Task}, + author={Yu, Tao and Zhang, Rui and Yang, Kai and Yasunaga, Michihiro and Wang, Dongxu and Li, Zifan and Ma, James and Li, Irene and Yao, Qingning and Roman, Shanelle and others}, + booktitle={Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing}, + pages={3911--3921}, + year={2018} +} +``` diff --git a/src/genbench/tasks/shifted_pauq/en_trl_split/task.py b/src/genbench/tasks/shifted_pauq/en_trl_split/task.py new file mode 100644 index 0000000..67d2876 --- /dev/null +++ b/src/genbench/tasks/shifted_pauq/en_trl_split/task.py @@ -0,0 +1,5 @@ +from genbench import Task + + +class ShiftedPauqEnTRLSplit(Task): + pass diff --git a/src/genbench/tasks/shifted_pauq/en_tsl_split/__init__.py b/src/genbench/tasks/shifted_pauq/en_tsl_split/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/genbench/tasks/shifted_pauq/en_tsl_split/config.jsonnet b/src/genbench/tasks/shifted_pauq/en_tsl_split/config.jsonnet new file mode 100644 index 0000000..8520db3 --- /dev/null +++ 
b/src/genbench/tasks/shifted_pauq/en_tsl_split/config.jsonnet @@ -0,0 +1,47 @@ +{ + name: 'Shifted Pauq (en_tsl_split)', + + description: 'Shifted Pauq (en_tsl_split) aims to measure compositional generalization of on text2sql dataset with splits based on target length. Input language is English. Long query templates are in test, while short query templates are in train. To measure compostional distribution, it was made certain, that all of test tokens of target queries are present in train set.', + + keywords: [ + 'text2sql', + 'label shift', + 'compositional generalization', + ], + + authors: [ + 'Somov Oleg', + 'Dmietrieva Ekaterina', + 'Tutubalina Elena', + ], + + data_source: { + type: 'hf', + hf_id: ['composite/pauq', 'en_tsl'], + git_commit_sha: '63e3e9329f785d097f4746618737d69530d1cdb4', + }, + + has_validation_set: false, + has_train_set: true, + + task_type: 'free_form', + + field_mapping: { + input: 'question', + target: 'query' + }, + + evaluation_metrics: [ + { + hf_id: 'evaluate-metric/exact_match', + best_score: 1.0, + git_commit_sha: '8e612716f2b1b08d23b0b2d7aa667d2f38eb989e' + } + ], + + preparation_strategies: { + finetuning: { + objective: 'maximum_likelihood', + } + } +} \ No newline at end of file diff --git a/src/genbench/tasks/shifted_pauq/en_tsl_split/doc.md b/src/genbench/tasks/shifted_pauq/en_tsl_split/doc.md new file mode 100644 index 0000000..9274a80 --- /dev/null +++ b/src/genbench/tasks/shifted_pauq/en_tsl_split/doc.md @@ -0,0 +1,87 @@ +# # Shifted PAUQ -- Text-to-SQL (en_tsl_split) + +## Abstract +This is length based split based on the token length of target query template in English. + +We will use this split in addition to the **ru_tsl_split** split to measure generalisation across languages. + + In order to measure compositional generalization we have verified that all SQL test tokens are present in train. 
+ +## Examples +``` +{'id': 'TS_0014', + 'db_id': 'department_management', + 'source': 'spider-train', + 'type': 'train', + 'query': "SELECT T3.born_state FROM department AS T1 JOIN management AS T2 ON T1.department_id = T2.department_id JOIN head AS T3 ON T2.head_id = T3.head_id WHERE T1.name = 'Treasury' INTERSECT SELECT T3.born_state FROM department AS T1 JOIN management AS T2 ON T1.department_id = T2.department_id JOIN head AS T3 ON T2.head_id = T3.head_id WHERE T1.name = 'Homeland Security';", + 'question': "List the states where both the secretary of 'Treasury' department and the secretary of 'Homeland Security' were born.", + 'sql': {'from': {'table_units': [['table_unit', 0], + ['table_unit', 2], + ['table_unit', 1]], + 'conds': [[False, 2, [0, [0, 1, False], None], [0, 11, False], None], + 'and', + [False, 2, [0, [0, 12, False], None], [0, 7, False], None]]}, + 'select': [False, [[0, [0, [0, 9, False], None]]]], + 'where': [[False, 2, [0, [0, 2, False], None], '"Treasury"', None]], + 'groupBy': [], + 'having': [], + 'orderBy': [], + ...}, + 'question_toks': ['List', + 'the', + 'states', + 'where', + 'both', + 'the', + 'secretary', + ...], + 'query_toks': ['select', + 't3.born_state', + 'from', + 'department', + 'as', + 't1', + ...], + 'query_toks_no_values': ['select', + 't3.born_state', + 'from', + 'department', + ...], + 'masked_query': 'SELECT T3.born_state FROM department AS T1 JOIN management AS T2 ON T1.department_id = T2.department_id JOIN head AS T3 ON T2.head_id = T3.head_id WHERE T1.name = TEXT_VALUE_1 INTERSECT SELECT T3.born_state FROM department AS T1 JOIN management AS T2 ON T1.department_id = T2.department_id JOIN head AS T3 ON T2.head_id = T3.head_id WHERE T1.name = TEXT_VALUE_2'} +``` + +## Usage +*Describe how to load your task and what is required for evaluation, if anything.* + +Please refer to the **ru_tsl_split** card. 
+ +## Data Source +*Describe the data source for this Shifted Pauq (en_tsl_split).* + +Please refer to the **ru_tsl_split** card. + +## Limitations and Bias +*Note any known limitations or biases that the Shifted Pauq (en_tsl_split) has, with links and references if possible.* + +Please refer to the **ru_tsl_split** card. + +## GenBench Eval card +*Describe what kind of generalisation your task is evaluating, and include a [genbench eval card](https://genbench.org/eval_cards/) for your task*. + +**GenBench taxonomy values for the experiments:** + +Same as in the **ru_tsl_split** card with addition: + +Generalisation type: generalisation across languages (from English to Russian) + +## References +[1] [https://aclanthology.org/D18-1425/](https://aclanthology.org/D18-1425/) +``` +@inproceedings{spider, + title={Spider: A Large-Scale Human-Labeled Dataset for Complex and Cross-Domain Semantic Parsing and Text-to-SQL Task}, + author={Yu, Tao and Zhang, Rui and Yang, Kai and Yasunaga, Michihiro and Wang, Dongxu and Li, Zifan and Ma, James and Li, Irene and Yao, Qingning and Roman, Shanelle and others}, + booktitle={Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing}, + pages={3911--3921}, + year={2018} +} +``` diff --git a/src/genbench/tasks/shifted_pauq/en_tsl_split/task.py b/src/genbench/tasks/shifted_pauq/en_tsl_split/task.py new file mode 100644 index 0000000..9976385 --- /dev/null +++ b/src/genbench/tasks/shifted_pauq/en_tsl_split/task.py @@ -0,0 +1,5 @@ +from genbench import Task + + +class ShiftedPauqEnTSLSplit(Task): + pass diff --git a/src/genbench/tasks/shifted_pauq/ru_os_split/__init__.py b/src/genbench/tasks/shifted_pauq/ru_os_split/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/genbench/tasks/shifted_pauq/ru_os_split/config.jsonnet b/src/genbench/tasks/shifted_pauq/ru_os_split/config.jsonnet new file mode 100644 index 0000000..44d8447 --- /dev/null +++ 
b/src/genbench/tasks/shifted_pauq/ru_os_split/config.jsonnet @@ -0,0 +1,47 @@ +{ + name: 'Shifted Pauq (ru_os_split)', + + description: 'Shifted Pauq in Russian (ru_os_split) aims to measure model ability to generate SQL queries from Russian input with splits generated in an i.i.d. strategy.', + + keywords: [ + 'text2sql', + 'iid distribution', + 'multilingual' + ], + + authors: [ + 'Somov Oleg', + 'Dmietrieva Ekaterina', + 'Tutubalina Elena', + ], + + data_source: { + type: 'hf', + hf_id: ['composite/pauq', 'ru_os'], + git_commit_sha: '63e3e9329f785d097f4746618737d69530d1cdb4', + }, + + has_validation_set: false, + has_train_set: true, + + task_type: 'free_form', + + field_mapping: { + input: 'question', + target: 'query' + }, + + evaluation_metrics: [ + { + hf_id: 'evaluate-metric/exact_match', + best_score: 1.0, + git_commit_sha: '8e612716f2b1b08d23b0b2d7aa667d2f38eb989e' + } + ], + + preparation_strategies: { + finetuning: { + objective: 'maximum_likelihood', + } + } +} \ No newline at end of file diff --git a/src/genbench/tasks/shifted_pauq/ru_os_split/doc.md b/src/genbench/tasks/shifted_pauq/ru_os_split/doc.md new file mode 100644 index 0000000..8c39f87 --- /dev/null +++ b/src/genbench/tasks/shifted_pauq/ru_os_split/doc.md @@ -0,0 +1,128 @@ +# Shifted PAUQ -- Text-to-SQL (ru_pauq_iid_split) + +## Abstract +This is an original split of the Russian _Text-to-SQL_ dataset, PAUQ [1], which is under the assumption of independently and identically distributed (i.i.d.) train and test data. We provide this split in addition to the **ru_trl_split** and **ru_tsl_split** splits. 
+ +baseline scores of a sequence-to-sequence model such as BRIDGE: please refer to [1] for details + +the size of the dataset: please refer to [1] for details + +## Examples +``` +{'id': 'TS_0014', + 'db_id': 'department_management', + 'source': 'spider-train', + 'type': 'train', + 'query': "SELECT T3.born_state FROM department AS T1 JOIN management AS T2 ON T1.department_id = T2.department_id JOIN head AS T3 ON T2.head_id = T3.head_id WHERE T1.name = 'Управление общего учёта' INTERSECT SELECT T3.born_state FROM department AS T1 JOIN management AS T2 ON T1.department_id = T2.department_id JOIN head AS T3 ON T2.head_id = T3.head_id WHERE T1.name = 'Бюджетная служба Конгресса';", + 'question': 'Перечислите штаты, в которых родились и секретарь управления общего учёта, и секретарь бюджетной службы Конгресса.', + 'sql': {'from': {'table_units': [['table_unit', 0], + ['table_unit', 2], + ['table_unit', 1]], + 'conds': [[False, 2, [0, [0, 1, False], None], [0, 11, False], None], + 'and', + [False, 2, [0, [0, 12, False], None], [0, 7, False], None]]}, + 'select': [False, [[0, [0, [0, 9, False], None]]]], + 'where': [[False, + 2, + [0, [0, 2, False], None], + '"Управление общего учёта"', + None]], + 'groupBy': [], + 'having': [], + 'orderBy': [], + 'limit': None, + 'intersect': {'from': {'table_units': [['table_unit', 0], + ['table_unit', 2], + ['table_unit', 1]], + 'conds': [[False, 2, [0, [0, 1, False], None], [0, 11, False], None], + 'and', + [False, 2, [0, [0, 12, False], None], [0, 7, False], None]]}, + 'select': [False, [[0, [0, [0, 9, False], None]]]], + 'where': [[False, + 2, + [0, [0, 2, False], None], + '"Бюджетная служба Конгресса"', + None]], + ...}, + 'question_toks': ['Перечислите', + 'штаты', + ',', + 'в', + 'которых', + 'родились', + ...], + 'query_toks': ['select', + 't3.born_state', + 'from', + 'department', + 'as', + 't1', + 'join', + 'management', + ...], + 'query_toks_no_values': ['select', + 't3.born_state', + 'from', + 'department', + 'as', + 't1', + 
'join', + ...], + 'masked_query': 'SELECT T3.born_state FROM department AS T1 JOIN management AS T2 ON T1.department_id = T2.department_id JOIN head AS T3 ON T2.head_id = T3.head_id WHERE T1.name = TEXT_VALUE_1 INTERSECT SELECT T3.born_state FROM department AS T1 JOIN management AS T2 ON T1.department_id = T2.department_id JOIN head AS T3 ON T2.head_id = T3.head_id WHERE T1.name = TEXT_VALUE_2'} +``` + +## Usage +*Describe how to load your task and what is required for evaluation, if anything.* + +For evaluation, we will use a sequence-to-sequence architecture. Evaluation metric is exact match. Please refer to config.jsonnet for details for now. + + +## Data Source +*Describe the data source for this Shifted Pauq (ru_os_split).* + +The PAUQ statistics can be found in [1] + +Train/test data for the ru_os split is available at https://huggingface.co/datasets/composite/pauq + +## Limitations and Bias +*Note any known limitations or biases that the Shifted Pauq (ru_pauq_target_length_split) has, with links and references if possible.* + +from [1]: PAUQ is an adaptation of the Spider dataset to Russian language, it indeed inherits most of Spider's limitations. First of all, the data is still `artificial' which means that it was created by a limited number of people specifically for training and evaluating text-to-SQL models, thus it lacks the diversity and complexity of natural data formed by questions that people formulate in order to get the desired information from the database. For instance, the real-world data contain NL queries that require common sense knowledge which can't be extracted directly from the database; ambiguous questions allowing various ways of interpretation that are quite frequent and queries with window functions that make the process easier and more convenient, -- all of these aren't included in the Spider dataset, as well as in PAUQ. 
Another limitation concerns evaluation metric -- exact match, which is the most commonly used to evaluate text-to-SQL models performance. However, the metric is too strict and prone to false negative results. + + +## GenBench Eval card +*Describe what kind of generalisation your task is evaluating, and include a [genbench eval card](https://genbench.org/eval_cards/) for your task*. + +**GenBench taxonomy values for the experiments:** + +Motivation: Practical + +Generalisation type: Compositional, Structural + +Shift type: Covariate + +Shift source: Naturally occuring + +Shift locus: train--test, Finetune train--test + +## Citation +[1] [https://aclanthology.org/2022.findings-emnlp.175.pdf](https://aclanthology.org/2022.findings-emnlp.175.pdf) +``` +@inproceedings{pauq, + title = "{PAUQ}: Text-to-{SQL} in {R}ussian", + author = "Bakshandaeva, Daria and + Somov, Oleg and + Dmitrieva, Ekaterina and + Davydova, Vera and + Tutubalina, Elena", + booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2022", + month = dec, + year = "2022", + address = "Abu Dhabi, United Arab Emirates", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/2022.findings-emnlp.175", + pages = "2355--2376", + abstract = "Semantic parsing is an important task that allows to democratize human-computer interaction. One of the most popular text-to-SQL datasets with complex and diverse natural language (NL) questions and SQL queries is Spider. We construct and complement a Spider dataset for Russian, thus creating the first publicly available text-to-SQL dataset for this language. While examining its components - NL questions, SQL queries and databases content - we identify limitations of the existing database structure, fill out missing values for tables and add new requests for underrepresented categories. We select thirty functional test sets with different features that can be used for the evaluation of neural models{'} abilities. 
To conduct the experiments, we adapt baseline architectures RAT-SQL and BRIDGE and provide in-depth query component analysis. On the target language, both models demonstrate strong results with monolingual training and improved accuracy in multilingual scenario. In this paper, we also study trade-offs between machine-translated and manually-created NL queries. At present, Russian text-to-SQL is lacking in datasets as well as trained models, and we view this work as an important step towards filling this gap.",
+}
+```
+
diff --git a/src/genbench/tasks/shifted_pauq/ru_os_split/task.py b/src/genbench/tasks/shifted_pauq/ru_os_split/task.py
new file mode 100644
index 0000000..5c70a67
--- /dev/null
+++ b/src/genbench/tasks/shifted_pauq/ru_os_split/task.py
@@ -0,0 +1,5 @@
+from genbench import Task
+
+
+class ShiftedPauqRuOsSplit(Task):
+    pass
diff --git a/src/genbench/tasks/shifted_pauq/ru_trl_split/__init__.py b/src/genbench/tasks/shifted_pauq/ru_trl_split/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/genbench/tasks/shifted_pauq/ru_trl_split/config.jsonnet b/src/genbench/tasks/shifted_pauq/ru_trl_split/config.jsonnet
new file mode 100644
index 0000000..0d58f70
--- /dev/null
+++ b/src/genbench/tasks/shifted_pauq/ru_trl_split/config.jsonnet
@@ -0,0 +1,47 @@
+{
+  name: 'Shifted Pauq (ru_trl_split)',
+
+  description: 'Shifted Pauq (ru_trl_split) aims to measure compositional generalization on a text2sql dataset with splits based on target length. Input language is Russian. Short query templates are in test, while long query templates are in train. 
To measure compositional distribution, it was made certain that all test tokens of target queries are present in the train set.',
+
+  keywords: [
+    'text2sql',
+    'label shift',
+    'compositional generalization',
+  ],
+
+  authors: [
+    'Somov Oleg',
+    'Dmitrieva Ekaterina',
+    'Tutubalina Elena',
+  ],
+
+  data_source: {
+    type: 'hf',
+    hf_id: ['composite/pauq', 'ru_trl'],
+    git_commit_sha: '63e3e9329f785d097f4746618737d69530d1cdb4',
+  },
+
+  has_validation_set: false,
+  has_train_set: true,
+
+  task_type: 'free_form',
+
+  field_mapping: {
+    input: 'question',
+    target: 'query'
+  },
+
+  evaluation_metrics: [
+    {
+      hf_id: 'evaluate-metric/exact_match',
+      best_score: 1.0,
+      git_commit_sha: '8e612716f2b1b08d23b0b2d7aa667d2f38eb989e'
+    }
+  ],
+
+  preparation_strategies: {
+    finetuning: {
+      objective: 'maximum_likelihood',
+    }
+  }
+}
\ No newline at end of file
diff --git a/src/genbench/tasks/shifted_pauq/ru_trl_split/doc.md b/src/genbench/tasks/shifted_pauq/ru_trl_split/doc.md
new file mode 100644
index 0000000..a41fbe7
--- /dev/null
+++ b/src/genbench/tasks/shifted_pauq/ru_trl_split/doc.md
@@ -0,0 +1,119 @@
+# Shifted PAUQ -- Text-to-SQL (ru_trl_split)
+*A brief explanation of the task, its motivation and a description of the submission data. Minimally, this should include: what generalisation property(ies) the task is testing for (GenBench taxonomy values); what assumptions are made about the training data; the size of the dataset; baseline scores. If it already contains this information, feel free to copy the abstract of the accompanying paper.*
+
+## Abstract
+In this work, we propose a custom split of the Russian _Text-to-SQL_ dataset, PAUQ [1], that assesses compositional generalization in _text-to-query_ models.
+The proposed split, ru_trl_split, is a length-based split based on token length, i.e. datasets' items are separated by length such that the test set contains examples of different lengths than those in the train set. 
In this setup, we measure generalization to simpler queries. +In order to measure compositional generalization, we have verified that all SQL test tokens are present in train. + +* Baseline scores of a sequence-to-sequence model: TODO +* The size of the dataset: please refer to [1] for details +* The size of the split: TODO + +## Examples +``` +{'id': 'TS_0014', + 'db_id': 'department_management', + 'source': 'spider-train', + 'type': 'train', + 'query': "SELECT T3.born_state FROM department AS T1 JOIN management AS T2 ON T1.department_id = T2.department_id JOIN head AS T3 ON T2.head_id = T3.head_id WHERE T1.name = 'Управление общего учёта' INTERSECT SELECT T3.born_state FROM department AS T1 JOIN management AS T2 ON T1.department_id = T2.department_id JOIN head AS T3 ON T2.head_id = T3.head_id WHERE T1.name = 'Бюджетная служба Конгресса';", + 'question': 'Перечислите штаты, в которых родились и секретарь управления общего учёта, и секретарь бюджетной службы Конгресса.', + 'sql': {'from': {'table_units': [['table_unit', 0], + ['table_unit', 2], + ['table_unit', 1]], + 'conds': [[False, 2, [0, [0, 1, False], None], [0, 11, False], None], + 'and', + [False, 2, [0, [0, 12, False], None], [0, 7, False], None]]}, + 'select': [False, [[0, [0, [0, 9, False], None]]]], + 'where': [[False, + 2, + [0, [0, 2, False], None], + '"Управление общего учёта"', + None]], + 'groupBy': [], + 'having': [], + 'orderBy': [], + 'limit': None, + 'intersect': {'from': {'table_units': [['table_unit', 0], + ['table_unit', 2], + ['table_unit', 1]], + 'conds': [[False, 2, [0, [0, 1, False], None], [0, 11, False], None], + 'and', + [False, 2, [0, [0, 12, False], None], [0, 7, False], None]]}, + 'select': [False, [[0, [0, [0, 9, False], None]]]], + 'where': [[False, + 2, + [0, [0, 2, False], None], + '"Бюджетная служба Конгресса"', + None]], + ...}, + 'question_toks': ['Перечислите', + 'штаты', + ',', + 'в', + 'которых', + 'родились', + ...], + 'query_toks': ['select', + 't3.born_state', + 
'from', + 'department', + 'as', + 't1', + 'join', + 'management', + ...], + 'query_toks_no_values': ['select', + 't3.born_state', + 'from', + 'department', + 'as', + 't1', + 'join', + ...], + 'masked_query': 'SELECT T3.born_state FROM department AS T1 JOIN management AS T2 ON T1.department_id = T2.department_id JOIN head AS T3 ON T2.head_id = T3.head_id WHERE T1.name = TEXT_VALUE_1 INTERSECT SELECT T3.born_state FROM department AS T1 JOIN management AS T2 ON T1.department_id = T2.department_id JOIN head AS T3 ON T2.head_id = T3.head_id WHERE T1.name = TEXT_VALUE_2'} +``` + +## Usage +*Describe how to load your task and what is required for evaluation, if anything.* + +For evaluation, we will use a sequence-to-sequence architecture. Evaluation metric is exact match. Please refer to config.jsonnet for details for now. + + +## Data Source +*Describe the data source for this Shifted Pauq (ru_trl_split).* + +The PAUQ statistics can be found in [1] + +Train/test data for the ru_trl_split split is available at https://huggingface.co/datasets/composite/pauq + +## Limitations and Bias +*Note any known limitations or biases that the Shifted Pauq (ru_trl_split) has, with links and references if possible.* + +Our research explores distribution shift, i.e. data drift using ru_trl_split, and investigates how they affect the model. Specifically, we examine the scenario where test tokens are present in the training set and do not explore the more challenging case of modeling unseen tokens. + +Big language models such as Codex, a 175B GPT model further fine-tuned on code, are out of the scope of this study. + +## GenBench Eval card +*Describe what kind of generalisation your task is evaluating, and include a [genbench eval card](https://genbench.org/eval_cards/) for your task*. 
+
+### GenBench taxonomy values for the experiments:
+
+* **Motivation:** Practical
+* **Generalisation type:** Compositional, Structural
+* **Shift type:** Covariate
+* **Shift source:** Naturally occurring
+* **Shift locus:** train--test, Finetune train--test
+
+
+## References
+[1] PAUQ: Text-to-SQL in Russian
+```
+@inproceedings{pauq,
+  title={PAUQ: Text-to-SQL in Russian},
+  author={Bakshandaeva, Daria and Somov, Oleg and Dmitrieva, Ekaterina and Davydova, Vera and Tutubalina, Elena},
+  booktitle={Findings of the Association for Computational Linguistics: EMNLP 2022},
+  pages={2355--2376},
+  year={2022}
+}
+```
diff --git a/src/genbench/tasks/shifted_pauq/ru_trl_split/task.py b/src/genbench/tasks/shifted_pauq/ru_trl_split/task.py
new file mode 100644
index 0000000..d50e880
--- /dev/null
+++ b/src/genbench/tasks/shifted_pauq/ru_trl_split/task.py
@@ -0,0 +1,5 @@
+from genbench import Task
+
+
+class ShiftedPauqRuTRLSplit(Task):
+    pass
diff --git a/src/genbench/tasks/shifted_pauq/ru_tsl_split/__init__.py b/src/genbench/tasks/shifted_pauq/ru_tsl_split/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/genbench/tasks/shifted_pauq/ru_tsl_split/config.jsonnet b/src/genbench/tasks/shifted_pauq/ru_tsl_split/config.jsonnet
new file mode 100644
index 0000000..90ef79b
--- /dev/null
+++ b/src/genbench/tasks/shifted_pauq/ru_tsl_split/config.jsonnet
@@ -0,0 +1,47 @@
+{
+  name: 'Shifted Pauq (ru_tsl_split)',
+
+  description: 'Shifted Pauq (ru_tsl_split) aims to measure compositional generalization on a text2sql dataset with splits based on target length. Input language is Russian. Long query templates are in test, while short query templates are in train. 
To measure compositional distribution, it was made certain that all test tokens of target queries are present in the train set.',
+
+  keywords: [
+    'text2sql',
+    'label shift',
+    'compositional generalization',
+  ],
+
+  authors: [
+    'Somov Oleg',
+    'Dmitrieva Ekaterina',
+    'Tutubalina Elena',
+  ],
+
+  data_source: {
+    type: 'hf',
+    hf_id: ['composite/pauq', 'ru_tsl'],
+    git_commit_sha: '63e3e9329f785d097f4746618737d69530d1cdb4',
+  },
+
+  has_validation_set: false,
+  has_train_set: true,
+
+  task_type: 'free_form',
+
+  field_mapping: {
+    input: 'question',
+    target: 'query'
+  },
+
+  evaluation_metrics: [
+    {
+      hf_id: 'evaluate-metric/exact_match',
+      best_score: 1.0,
+      git_commit_sha: '8e612716f2b1b08d23b0b2d7aa667d2f38eb989e'
+    }
+  ],
+
+  preparation_strategies: {
+    finetuning: {
+      objective: 'maximum_likelihood',
+    }
+  }
+}
\ No newline at end of file
diff --git a/src/genbench/tasks/shifted_pauq/ru_tsl_split/doc.md b/src/genbench/tasks/shifted_pauq/ru_tsl_split/doc.md
new file mode 100644
index 0000000..45c4a33
--- /dev/null
+++ b/src/genbench/tasks/shifted_pauq/ru_tsl_split/doc.md
@@ -0,0 +1,119 @@
+# Shifted PAUQ -- Text-to-SQL (ru_tsl_split)
+*A brief explanation of the task, its motivation and a description of the submission data. Minimally, this should include: what generalisation property(ies) the task is testing for (GenBench taxonomy values); what assumptions are made about the training data; the size of the dataset; baseline scores. If it already contains this information, feel free to copy the abstract of the accompanying paper.*
+
+## Abstract
+In this work, we propose a custom split of the Russian _Text-to-SQL_ dataset, PAUQ [1], that assesses compositional generalization in _text-to-query_ models.
+The proposed split, ru_tsl_split, is a length-based split based on token length, i.e. datasets' items are separated by length such that the test set contains examples of different lengths than those in the train set. 
In this setup, we measure generalization to more complex queries. +In order to measure compositional generalization, we have verified that all SQL test tokens are present in train. + +* Baseline scores of a sequence-to-sequence model: TODO +* The size of the dataset: please refer to [1] for details +* The size of the split: TODO + +## Examples +``` +{'id': 'TS_0014', + 'db_id': 'department_management', + 'source': 'spider-train', + 'type': 'train', + 'query': "SELECT T3.born_state FROM department AS T1 JOIN management AS T2 ON T1.department_id = T2.department_id JOIN head AS T3 ON T2.head_id = T3.head_id WHERE T1.name = 'Управление общего учёта' INTERSECT SELECT T3.born_state FROM department AS T1 JOIN management AS T2 ON T1.department_id = T2.department_id JOIN head AS T3 ON T2.head_id = T3.head_id WHERE T1.name = 'Бюджетная служба Конгресса';", + 'question': 'Перечислите штаты, в которых родились и секретарь управления общего учёта, и секретарь бюджетной службы Конгресса.', + 'sql': {'from': {'table_units': [['table_unit', 0], + ['table_unit', 2], + ['table_unit', 1]], + 'conds': [[False, 2, [0, [0, 1, False], None], [0, 11, False], None], + 'and', + [False, 2, [0, [0, 12, False], None], [0, 7, False], None]]}, + 'select': [False, [[0, [0, [0, 9, False], None]]]], + 'where': [[False, + 2, + [0, [0, 2, False], None], + '"Управление общего учёта"', + None]], + 'groupBy': [], + 'having': [], + 'orderBy': [], + 'limit': None, + 'intersect': {'from': {'table_units': [['table_unit', 0], + ['table_unit', 2], + ['table_unit', 1]], + 'conds': [[False, 2, [0, [0, 1, False], None], [0, 11, False], None], + 'and', + [False, 2, [0, [0, 12, False], None], [0, 7, False], None]]}, + 'select': [False, [[0, [0, [0, 9, False], None]]]], + 'where': [[False, + 2, + [0, [0, 2, False], None], + '"Бюджетная служба Конгресса"', + None]], + ...}, + 'question_toks': ['Перечислите', + 'штаты', + ',', + 'в', + 'которых', + 'родились', + ...], + 'query_toks': ['select', + 't3.born_state', + 
'from', + 'department', + 'as', + 't1', + 'join', + 'management', + ...], + 'query_toks_no_values': ['select', + 't3.born_state', + 'from', + 'department', + 'as', + 't1', + 'join', + ...], + 'masked_query': 'SELECT T3.born_state FROM department AS T1 JOIN management AS T2 ON T1.department_id = T2.department_id JOIN head AS T3 ON T2.head_id = T3.head_id WHERE T1.name = TEXT_VALUE_1 INTERSECT SELECT T3.born_state FROM department AS T1 JOIN management AS T2 ON T1.department_id = T2.department_id JOIN head AS T3 ON T2.head_id = T3.head_id WHERE T1.name = TEXT_VALUE_2'} +``` + +## Usage +*Describe how to load your task and what is required for evaluation, if anything.* + +For evaluation, we will use a sequence-to-sequence architecture. Evaluation metric is exact match. Please refer to config.jsonnet for details for now. + + +## Data Source +*Describe the data source for this Shifted Pauq (ru_tsl_split).* + +The PAUQ statistics can be found in [1] + +Train/test data for the ru_tsl_split split is available at https://huggingface.co/datasets/composite/pauq + +## Limitations and Bias +*Note any known limitations or biases that the Shifted Pauq (ru_tsl_split) has, with links and references if possible.* + +Our research explores distribution shift, i.e. data drift using ru_tsl_split, and investigates how they affect the model. Specifically, we examine the scenario where test tokens are present in the training set and do not explore the more challenging case of modeling unseen tokens. + +Big language models such as Codex, a 175B GPT model further fine-tuned on code, are out of the scope of this study. + +## GenBench Eval card +*Describe what kind of generalisation your task is evaluating, and include a [genbench eval card](https://genbench.org/eval_cards/) for your task*. 
+
+### GenBench taxonomy values for the experiments:
+
+* **Motivation:** Practical
+* **Generalisation type:** Compositional, Structural
+* **Shift type:** Covariate
+* **Shift source:** Naturally occurring
+* **Shift locus:** train--test, Finetune train--test
+
+
+## References
+[1] PAUQ: Text-to-SQL in Russian
+```
+@inproceedings{pauq,
+  title={PAUQ: Text-to-SQL in Russian},
+  author={Bakshandaeva, Daria and Somov, Oleg and Dmitrieva, Ekaterina and Davydova, Vera and Tutubalina, Elena},
+  booktitle={Findings of the Association for Computational Linguistics: EMNLP 2022},
+  pages={2355--2376},
+  year={2022}
+}
+```
diff --git a/src/genbench/tasks/shifted_pauq/ru_tsl_split/task.py b/src/genbench/tasks/shifted_pauq/ru_tsl_split/task.py
new file mode 100644
index 0000000..9ad79c3
--- /dev/null
+++ b/src/genbench/tasks/shifted_pauq/ru_tsl_split/task.py
@@ -0,0 +1,5 @@
+from genbench import Task
+
+
+class ShiftedPauqRuTSLSplit(Task):
+    pass