
Commit 1621640

Authored and committed by KennethEnevoldsen and silky1708
fix: Add BRIGHT (long) and fix bug in TaskResult.filter_and_validate() (embeddings-benchmark#2041)
* fix: Add BRIGHT Long

  Fixes embeddings-benchmark#1978

* fix: Add BRIGHT(long)
* fix bug in task results
* updated bright
* updated tests for TaskResults
1 parent cc4a794 commit 1621640

File tree: 4 files changed, +83 −14 lines changed


mteb/benchmarks/benchmarks.py (+23 −3)

@@ -1060,9 +1060,7 @@
 
 BRIGHT = Benchmark(
     name="BRIGHT",
-    tasks=get_tasks(
-        tasks=["BrightRetrieval"],
-    ),
+    tasks=get_tasks(tasks=["BrightRetrieval"], eval_splits=["standard"]),
     description="""BRIGHT: A Realistic and Challenging Benchmark for Reasoning-Intensive Retrieval.
     BRIGHT is the first text retrieval
     benchmark that requires intensive reasoning to retrieve relevant documents with
@@ -1079,6 +1077,28 @@
 }""",
 )
 
+
+BRIGHT_LONG = Benchmark(
+    name="BRIGHT (long)",
+    tasks=get_tasks(tasks=["BrightRetrieval"], eval_splits=["long"]),
+    description="""BRIGHT: A Realistic and Challenging Benchmark for Reasoning-Intensive Retrieval.
+    BRIGHT is the first text retrieval
+    benchmark that requires intensive reasoning to retrieve relevant documents with
+    a dataset consisting of 1,384 real-world queries spanning diverse domains, such as
+    economics, psychology, mathematics, and coding. These queries are drawn from
+    naturally occurring and carefully curated human data.
+
+    This is the long version of the benchmark, which only filter longer documents.
+    """,
+    reference="https://brightbenchmark.github.io/",
+    citation="""@article{su2024bright,
+  title={Bright: A realistic and challenging benchmark for reasoning-intensive retrieval},
+  author={Su, Hongjin and Yen, Howard and Xia, Mengzhou and Shi, Weijia and Muennighoff, Niklas and Wang, Han-yu and Liu, Haisu and Shi, Quan and Siegel, Zachary S and Tang, Michael and others},
+  journal={arXiv preprint arXiv:2407.12883},
+  year={2024}
+}""",
+)
+
 CODE_RAG = Benchmark(
     name="CodeRAG",
     tasks=get_tasks(
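
For context (not part of the commit): a minimal sketch of how the new benchmark might be run, assuming the public mteb API where benchmarks are looked up by name via mteb.get_benchmark and evaluated through mteb.MTEB; the model name below is only a placeholder.

import mteb

# Look up the benchmark added in this commit by its registered name.
benchmark = mteb.get_benchmark("BRIGHT (long)")

# benchmark.tasks is BrightRetrieval restricted to the "long" eval split,
# mirroring the get_tasks(..., eval_splits=["long"]) call in the diff above.
evaluation = mteb.MTEB(tasks=benchmark.tasks)

# Placeholder model; any model loadable via mteb.get_model should work.
model = mteb.get_model("sentence-transformers/all-MiniLM-L6-v2")
results = evaluation.run(model)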

mteb/load_results/task_results.py (+1 −1)

@@ -525,7 +525,7 @@ def validate_and_filter_scores(self, task: AbsTask | None = None) -> TaskResult:
         if task is None:
             task = get_task(self.task_name)
 
-        splits = task.metadata.eval_splits
+        splits = task.eval_splits
         hf_subsets = task.hf_subsets
         hf_subsets = set(hf_subsets)
 
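
This one-line change matters because get_tasks(..., eval_splits=...) narrows the splits on the task object itself, while task.metadata.eval_splits keeps listing every split the dataset defines, so filtering on the metadata silently ignored the restriction. A small illustrative sketch (not from the commit; the exact split list in the metadata is an assumption):

import mteb

# BrightRetrieval restricted to the "long" split, as the BRIGHT (long) benchmark does.
task = mteb.get_tasks(tasks=["BrightRetrieval"], eval_splits=["long"])[0]

# The metadata still advertises all splits defined for the task (assumed to
# include "standard" and "long"), whereas task.eval_splits reflects the filtering
# and is what validate_and_filter_scores reads after this fix.
print(task.metadata.eval_splits)
print(task.eval_splits)  # expected: ["long"]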

mteb/tasks/Retrieval/eng/BrightRetrieval.py (−1)

@@ -50,7 +50,6 @@ class BrightRetrieval(MultilingualTask, AbsTaskRetrieval):
         domains=["Non-fiction", "Written"],
         task_subtypes=["Article retrieval"],
         license="cc-by-4.0",
-        socioeconomic_status="low",
         annotations_creators="derived",
         dialect=[],
         sample_creation="found",

tests/test_load_results/test_mteb_results.py → tests/test_load_results/test_task_results.py (+59 −9)

@@ -34,7 +34,6 @@ class DummyTask(AbsTask):
         annotations_creators="derived",
         dialect=[],
         bibtex_citation="",
-        descriptive_stats={},
         modalities=["text"],
         sample_creation="created",
     )
@@ -48,11 +47,11 @@ def _evaluate_subset(self, **kwargs):
     def _calculate_metrics_from_split(
         self, split: str, hf_subset: str | None = None, compute_overall=False
     ) -> dict[str, float]:
-        pass
+        return {}
 
 
-def test_mteb_results():
-    """Test TaskResult class (this is the same as the example in the docstring)"""
+@pytest.fixture()
+def task_result():
     scores = {
         "train": {
             "en-de": {
@@ -66,13 +65,19 @@ def test_mteb_results():
 
     evaluation_time = 100
 
-    mteb_results = TaskResult.from_task_results(
+    return TaskResult.from_task_results(
         task=DummyTask(), scores=scores, evaluation_time=evaluation_time
     )
 
-    assert mteb_results.get_score() == 0.55
-    assert mteb_results.get_score(languages=["eng"]) == 0.55
-    assert mteb_results.get_score(languages=["fra"]) == 0.6
+
+def test_task_results_get_score(task_result: TaskResult):
+    """Test TaskResult class (this is the same as the example in the docstring)"""
+    assert task_result.get_score() == 0.55
+    assert task_result.get_score(languages=["eng"]) == 0.55
+    assert task_result.get_score(languages=["fra"]) == 0.6
+
+
+def test_task_results_to_dict(task_result: TaskResult):
     dict_repr = {
         "dataset_revision": "1.0",
         "task_name": "dummy_task",
@@ -94,7 +99,52 @@ def test_mteb_results():
            ]
        },
    }
-    assert mteb_results.to_dict() == dict_repr
+    assert task_result.to_dict() == dict_repr
+
+
+def test_task_results_validate_and_filter():
+    scores = {
+        "train": {
+            "en-de": {
+                "main_score": 0.5,
+            },
+            "en-fr": {
+                "main_score": 0.6,
+            },
+        },
+        "test": {
+            "en-de": {
+                "main_score": 0.3,
+            },
+            "en-fr": {
+                "main_score": 0.4,
+            },
+        },
+    }
+
+    evaluation_time = 100
+
+    res = TaskResult.from_task_results(
+        task=DummyTask(), scores=scores, evaluation_time=evaluation_time
+    )
+
+    task = DummyTask()
+    task._eval_splits = ["train", "test"]
+    res1 = res.validate_and_filter_scores(task=task)
+
+    assert res1.scores.keys() == {"train", "test"}
+    assert res1.get_score() == (0.5 + 0.6 + 0.3 + 0.4) / 4
+
+    task._eval_splits = ["test"]
+    res2 = res.validate_and_filter_scores(task=task)
+    assert res2.scores.keys() == {"test"}
+    assert res2.get_score() == (0.3 + 0.4) / 2  # only test scores
+
+    task.hf_subsets = ["en-de"]
+    task._eval_splits = ["train", "test"]
+    res3 = res.validate_and_filter_scores(task=task)
+    assert res3.scores.keys() == {"train", "test"}
+    assert res3.get_score() == (0.5 + 0.3) / 2  # only en-de scores
 
 
 @pytest.mark.parametrize(
