
Commit 1621640

Authored and committed by KennethEnevoldsen and silky1708
fix: Add BRIGHT (long) and fix bug in TaskResult.filter_and_validate() (embeddings-benchmark#2041)
* fix: Add BRIGHT Long

  Fixes embeddings-benchmark#1978

* fix: Add BRIGHT(long)
* fix bug in task results
* updated bright
* updated tests for TaskResults
1 parent cc4a794 commit 1621640

File tree: 4 files changed, +83 −14 lines changed


mteb/benchmarks/benchmarks.py (+23 −3)

@@ -1060,9 +1060,7 @@
 
 BRIGHT = Benchmark(
     name="BRIGHT",
-    tasks=get_tasks(
-        tasks=["BrightRetrieval"],
-    ),
+    tasks=get_tasks(tasks=["BrightRetrieval"], eval_splits=["standard"]),
     description="""BRIGHT: A Realistic and Challenging Benchmark for Reasoning-Intensive Retrieval.
     BRIGHT is the first text retrieval
     benchmark that requires intensive reasoning to retrieve relevant documents with
@@ -1079,6 +1077,28 @@
 }""",
 )
 
+
+BRIGHT_LONG = Benchmark(
+    name="BRIGHT (long)",
+    tasks=get_tasks(tasks=["BrightRetrieval"], eval_splits=["long"]),
+    description="""BRIGHT: A Realistic and Challenging Benchmark for Reasoning-Intensive Retrieval.
+    BRIGHT is the first text retrieval
+    benchmark that requires intensive reasoning to retrieve relevant documents with
+    a dataset consisting of 1,384 real-world queries spanning diverse domains, such as
+    economics, psychology, mathematics, and coding. These queries are drawn from
+    naturally occurring and carefully curated human data.
+
+    This is the long version of the benchmark, which only filter longer documents.
+    """,
+    reference="https://brightbenchmark.github.io/",
+    citation="""@article{su2024bright,
+  title={Bright: A realistic and challenging benchmark for reasoning-intensive retrieval},
+  author={Su, Hongjin and Yen, Howard and Xia, Mengzhou and Shi, Weijia and Muennighoff, Niklas and Wang, Han-yu and Liu, Haisu and Shi, Quan and Siegel, Zachary S and Tang, Michael and others},
+  journal={arXiv preprint arXiv:2407.12883},
+  year={2024}
+}""",
+)
+
 CODE_RAG = Benchmark(
     name="CodeRAG",
     tasks=get_tasks(
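
For context (not part of the commit): a minimal sketch of how the new benchmark might be run, assuming the public mteb API where benchmarks are looked up by name via mteb.get_benchmark and evaluated through mteb.MTEB; the model name below is only a placeholder.

import mteb

# Look up the benchmark added in this commit by its registered name.
benchmark = mteb.get_benchmark("BRIGHT (long)")

# benchmark.tasks is BrightRetrieval restricted to the "long" eval split,
# mirroring the get_tasks(..., eval_splits=["long"]) call in the diff above.
evaluation = mteb.MTEB(tasks=benchmark.tasks)

# Placeholder model; any model loadable via mteb.get_model should work.
model = mteb.get_model("sentence-transformers/all-MiniLM-L6-v2")
results = evaluation.run(model)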

mteb/load_results/task_results.py (+1 −1)

@@ -525,7 +525,7 @@ def validate_and_filter_scores(self, task: AbsTask | None = None) -> TaskResult:
         if task is None:
             task = get_task(self.task_name)
 
-        splits = task.metadata.eval_splits
+        splits = task.eval_splits
         hf_subsets = task.hf_subsets
         hf_subsets = set(hf_subsets)
 
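
This one-line change matters because get_tasks(..., eval_splits=...) narrows the splits on the task object itself, while task.metadata.eval_splits keeps listing every split the dataset defines, so filtering on the metadata silently ignored the restriction. A small illustrative sketch (not from the commit; the exact split list in the metadata is an assumption):

import mteb

# BrightRetrieval restricted to the "long" split, as the BRIGHT (long) benchmark does.
task = mteb.get_tasks(tasks=["BrightRetrieval"], eval_splits=["long"])[0]

# The metadata still advertises all splits defined for the task (assumed to
# include "standard" and "long"), whereas task.eval_splits reflects the filtering
# and is what validate_and_filter_scores reads after this fix.
print(task.metadata.eval_splits)
print(task.eval_splits)  # expected: ["long"]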

mteb/tasks/Retrieval/eng/BrightRetrieval.py (−1)

@@ -50,7 +50,6 @@ class BrightRetrieval(MultilingualTask, AbsTaskRetrieval):
         domains=["Non-fiction", "Written"],
         task_subtypes=["Article retrieval"],
         license="cc-by-4.0",
-        socioeconomic_status="low",
         annotations_creators="derived",
         dialect=[],
         sample_creation="found",

tests/test_load_results/test_mteb_results.py → tests/test_load_results/test_task_results.py (+59 −9)

@@ -34,7 +34,6 @@ class DummyTask(AbsTask):
         annotations_creators="derived",
         dialect=[],
         bibtex_citation="",
-        descriptive_stats={},
         modalities=["text"],
         sample_creation="created",
     )
@@ -48,11 +47,11 @@ def _evaluate_subset(self, **kwargs):
     def _calculate_metrics_from_split(
         self, split: str, hf_subset: str | None = None, compute_overall=False
     ) -> dict[str, float]:
-        pass
+        return {}
 
 
-def test_mteb_results():
-    """Test TaskResult class (this is the same as the example in the docstring)"""
+@pytest.fixture()
+def task_result():
     scores = {
         "train": {
             "en-de": {
@@ -66,13 +65,19 @@ def test_mteb_results():
 
     evaluation_time = 100
 
-    mteb_results = TaskResult.from_task_results(
+    return TaskResult.from_task_results(
         task=DummyTask(), scores=scores, evaluation_time=evaluation_time
     )
 
-    assert mteb_results.get_score() == 0.55
-    assert mteb_results.get_score(languages=["eng"]) == 0.55
-    assert mteb_results.get_score(languages=["fra"]) == 0.6
+
+def test_task_results_get_score(task_result: TaskResult):
+    """Test TaskResult class (this is the same as the example in the docstring)"""
+    assert task_result.get_score() == 0.55
+    assert task_result.get_score(languages=["eng"]) == 0.55
+    assert task_result.get_score(languages=["fra"]) == 0.6
+
+
+def test_task_results_to_dict(task_result: TaskResult):
     dict_repr = {
         "dataset_revision": "1.0",
         "task_name": "dummy_task",
@@ -94,7 +99,52 @@ def test_mteb_results():
            ]
        },
    }
-    assert mteb_results.to_dict() == dict_repr
+    assert task_result.to_dict() == dict_repr
+
+
+def test_task_results_validate_and_filter():
+    scores = {
+        "train": {
+            "en-de": {
+                "main_score": 0.5,
+            },
+            "en-fr": {
+                "main_score": 0.6,
+            },
+        },
+        "test": {
+            "en-de": {
+                "main_score": 0.3,
+            },
+            "en-fr": {
+                "main_score": 0.4,
+            },
+        },
+    }
+
+    evaluation_time = 100
+
+    res = TaskResult.from_task_results(
+        task=DummyTask(), scores=scores, evaluation_time=evaluation_time
+    )
+
+    task = DummyTask()
+    task._eval_splits = ["train", "test"]
+    res1 = res.validate_and_filter_scores(task=task)
+
+    assert res1.scores.keys() == {"train", "test"}
+    assert res1.get_score() == (0.5 + 0.6 + 0.3 + 0.4) / 4
+
+    task._eval_splits = ["test"]
+    res2 = res.validate_and_filter_scores(task=task)
+    assert res2.scores.keys() == {"test"}
+    assert res2.get_score() == (0.3 + 0.4) / 2  # only test scores
+
+    task.hf_subsets = ["en-de"]
+    task._eval_splits = ["train", "test"]
+    res3 = res.validate_and_filter_scores(task=task)
+    assert res3.scores.keys() == {"train", "test"}
+    assert res3.get_score() == (0.5 + 0.3) / 2  # only en-de scores
 
 
 @pytest.mark.parametrize(
