Skip to content

Commit

Permalink
Merge pull request #3288 from flairNLP/extend-german-ler-dataset
Browse files Browse the repository at this point in the history
Extend German LER Dataset
  • Loading branch information
alanakbik committed Aug 8, 2023
2 parents 4fe6ead + f1a51ab commit 1590089
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 3 deletions.
9 changes: 6 additions & 3 deletions flair/datasets/sequence_labeling.py
Original file line number Diff line number Diff line change
Expand Up @@ -2577,7 +2577,6 @@ def __init__(
:param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
to point to a different folder but typically this should not be necessary.
:param in_memory: If True, keeps dataset in memory giving speedups in training. Not recommended due to heavy RAM usage.
:param document_as_sequence: If True, all sentences of a document are read into a single Sentence object
"""
base_path = flair.cache_root / "datasets" if not base_path else Path(base_path)

Expand All @@ -2591,13 +2590,17 @@ def __init__(

# download data if necessary
ler_path = "https://raw.githubusercontent.com/elenanereiss/Legal-Entity-Recognition/master/data/"
cached_path(f"{ler_path}ler.conll", Path("datasets") / dataset_name)

for split in ["train", "dev", "test"]:
cached_path(f"{ler_path}ler_{split}.conll", Path("datasets") / dataset_name)

super().__init__(
data_folder,
columns,
in_memory=in_memory,
train_file="ler.conll",
train_file="ler_train.conll",
dev_file="ler_dev.conll",
test_file="ler_test.conll",
**corpusargs,
)

Expand Down
9 changes: 9 additions & 0 deletions tests/test_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -799,6 +799,15 @@ def check_number_sentences(reference: int, actual: int, split_name: str):
check_number_sentences(len(corpus.dev), stats["dev"], "dev")


def test_german_ler_corpus(tasks_base_path):
    """Smoke-test the NER_GERMAN_LEGAL corpus loader and verify split sizes.

    NOTE(review): this downloads the real dataset over the network on first
    run (ler_train/dev/test.conll are fetched via cached_path) — it is an
    integration test, not a unit test, and may be slow or flaky offline.
    """
    # Instantiating the corpus triggers download + parsing of all three splits.
    corpus = flair.datasets.NER_GERMAN_LEGAL()

    # Number of instances per dataset split are taken from https://huggingface.co/datasets/elenanereiss/german-ler
    assert len(corpus.train) == 53384, "Mismatch in number of sentences for train split"
    assert len(corpus.dev) == 6666, "Mismatch in number of sentences for dev split"
    assert len(corpus.test) == 6673, "Mismatch in number of sentences for test split"


def test_multi_file_jsonl_corpus_should_use_label_type(tasks_base_path):
corpus = MultiFileJsonlCorpus(
train_files=[tasks_base_path / "jsonl/train.jsonl"],
Expand Down

0 comments on commit 1590089

Please sign in to comment.