Skip to content

Commit

Permalink
Merge pull request #3288 from flairNLP/extend-german-ler-dataset
Browse files Browse the repository at this point in the history
Extend German LER Dataset
  • Loading branch information
alanakbik committed Aug 8, 2023
2 parents 4fe6ead + f1a51ab commit 1590089
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 3 deletions.
9 changes: 6 additions & 3 deletions flair/datasets/sequence_labeling.py
Original file line number Diff line number Diff line change
Expand Up @@ -2577,7 +2577,6 @@ def __init__(
:param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
to point to a different folder but typically this should not be necessary.
:param in_memory: If True, keeps dataset in memory giving speedups in training. Not recommended due to heavy RAM usage.
:param document_as_sequence: If True, all sentences of a document are read into a single Sentence object
"""
base_path = flair.cache_root / "datasets" if not base_path else Path(base_path)

Expand All @@ -2591,13 +2590,17 @@ def __init__(

# download data if necessary
ler_path = "https://raw.githubusercontent.com/elenanereiss/Legal-Entity-Recognition/master/data/"
cached_path(f"{ler_path}ler.conll", Path("datasets") / dataset_name)

for split in ["train", "dev", "test"]:
cached_path(f"{ler_path}ler_{split}.conll", Path("datasets") / dataset_name)

super().__init__(
data_folder,
columns,
in_memory=in_memory,
train_file="ler.conll",
train_file="ler_train.conll",
dev_file="ler_dev.conll",
test_file="ler_test.conll",
**corpusargs,
)

Expand Down
9 changes: 9 additions & 0 deletions tests/test_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -799,6 +799,15 @@ def check_number_sentences(reference: int, actual: int, split_name: str):
check_number_sentences(len(corpus.dev), stats["dev"], "dev")


def test_german_ler_corpus(tasks_base_path):
    """Smoke-test the NER_GERMAN_LEGAL corpus loader and verify split sizes.

    NOTE(review): this downloads the real dataset over the network on first
    run (ler_train/dev/test.conll are fetched via cached_path) — it is an
    integration test, not a unit test, and may be slow or flaky offline.
    """
    # Instantiating the corpus triggers download + parsing of all three splits.
    corpus = flair.datasets.NER_GERMAN_LEGAL()

    # Number of instances per dataset split are taken from https://huggingface.co/datasets/elenanereiss/german-ler
    assert len(corpus.train) == 53384, "Mismatch in number of sentences for train split"
    assert len(corpus.dev) == 6666, "Mismatch in number of sentences for dev split"
    assert len(corpus.test) == 6673, "Mismatch in number of sentences for test split"


def test_multi_file_jsonl_corpus_should_use_label_type(tasks_base_path):
corpus = MultiFileJsonlCorpus(
train_files=[tasks_base_path / "jsonl/train.jsonl"],
Expand Down

0 comments on commit 1590089

Please sign in to comment.