From d8fe0b55d07504bcd067ea71df8b05de517ab8ac Mon Sep 17 00:00:00 2001
From: Stefan Schweter
Date: Tue, 23 May 2023 23:44:30 +0200
Subject: [PATCH 01/10] datasets: include AFRICA_POS implementation

---
 flair/datasets/__init__.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/flair/datasets/__init__.py b/flair/datasets/__init__.py
index e549810d1..092338e4a 100644
--- a/flair/datasets/__init__.py
+++ b/flair/datasets/__init__.py
@@ -163,6 +163,7 @@
 # standard NER datasets
 # Expose all sequence labeling datasets
 from .sequence_labeling import (
+    AFRICA_POS,
     BIOSCOPE,
     CONLL_03,
     CONLL_03_DUTCH,
@@ -312,6 +313,7 @@
     "SentenceDataset",
     "MongoDataset",
     "StringDataset",
+    "AFRICA_POS",
     "ANAT_EM",
     "AZDZ",
     "BC2GM",

From 2bc445b9e3053db8f5fe9207e46401ae73cf3dcb Mon Sep 17 00:00:00 2001
From: Stefan Schweter
Date: Tue, 23 May 2023 23:44:55 +0200
Subject: [PATCH 02/10] datasets: add support for AfricaPOS dataset

---
 flair/datasets/sequence_labeling.py | 103 ++++++++++++++++++++++++++++
 1 file changed, 103 insertions(+)

diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py
index d214873b5..3020f15c1 100644
--- a/flair/datasets/sequence_labeling.py
+++ b/flair/datasets/sequence_labeling.py
@@ -4795,3 +4795,106 @@ def __init__(
             sample_missing_splits=False,
             name="nermud",
         )
+
+
+class AFRICA_POS(MultiCorpus):
+    def __init__(
+        self,
+        languages: Union[str, List[str]] = "bam",
+        version: str = "v1",
+        base_path: Optional[Union[str, Path]] = None,
+        in_memory: bool = True,
+        **corpusargs,
+    ) -> None:
+        """Initialize the AfricaPOS corpus available on https://github.com/masakhane-io/masakhane-pos.
+
+        It consists of ten African languages. Pass a language code or a list of language codes to initialize the corpus
+        with the languages you require. If you pass "all", all languages will be initialized.
+        :version: Specifies version of the dataset. Currently, only "v1" is supported.
+        :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
+        to point to a different folder but typically this should not be necessary.
+        :param in_memory: If True, keeps dataset in memory giving speedups in training.
+        """
+        base_path = flair.cache_root / "datasets" if not base_path else Path(base_path)
+
+        # if only one language is given
+        if type(languages) == str:
+            languages = [languages]
+
+        # column format
+        columns = {0: "text", 1: "pos"}
+
+        # this dataset name
+        dataset_name = self.__class__.__name__.lower()
+
+        supported_versions = ["v1"]
+
+        if version not in supported_versions:
+            log.error(f"The specified version '{version}' is not in the list of supported version!")
+            log.error(f"Supported versions are '{supported_versions}'!")
+            raise Exception
+
+        data_folder = base_path / dataset_name / version
+
+        supported_languages = [
+            "bam",
+            "bbj",
+            "ewe",
+            "fon",
+            "hau",
+            "ibo",
+            "kin",
+            "lug",
+            "luo",
+            "mos",
+            "pcm",
+            "nya",
+            "sna",
+            "swa",
+            "tsn",
+            "twi",
+            "wol",
+            "xho",
+            "yor",
+            "zul",
+        ]
+
+        data_paths = {
+            "v1": "https://raw.githubusercontent.com/masakhane-io/masakhane-pos/main/AfricaPOS",
+        }
+
+        # use all languages if explicitly set to "all"
+        if languages == ["all"]:
+            languages = supported_languages
+
+        corpora: List[Corpus] = []
+        for language in languages:
+            if language not in supported_languages:
+                log.error(f"Language '{language}' is not in list of supported languages!")
+                log.error(f"Supported are '{supported_languages}'!")
+                log.error("Instantiate this Corpus for instance like so 'corpus = AFRICA_POS(languages='luo')'")
+                raise Exception
+
+            language_folder = data_folder / language
+
+            # download data if necessary
+            data_path = f"{data_paths[version]}/{language}"
+            cached_path(f"{data_path}/dev.txt", language_folder)
+            cached_path(f"{data_path}/test.txt", language_folder)
+            cached_path(f"{data_path}/train.txt", language_folder)
+
+            # initialize comlumncorpus and add it to list
+            log.info(f"Reading data for language {language}@{version}")
+            corp = ColumnCorpus(
+                data_folder=language_folder,
+                column_format=columns,
+                encoding="utf-8",
+                in_memory=in_memory,
+                name=language,
+                **corpusargs,
+            )
+            corpora.append(corp)
+        super().__init__(
+            corpora,
+            name="africa-pos-" + "-".join(languages),
+        )

From 9e1e26e18a6f11b87fef708c74bd43ba9dc5bc37 Mon Sep 17 00:00:00 2001
From: Stefan Schweter
Date: Tue, 8 Aug 2023 21:47:29 +0200
Subject: [PATCH 03/10] tests: adjust test cases for MasakhaPOS dataset

---
 tests/test_datasets.py | 68 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 68 insertions(+)

diff --git a/tests/test_datasets.py b/tests/test_datasets.py
index d642be251..36b97391c 100644
--- a/tests/test_datasets.py
+++ b/tests/test_datasets.py
@@ -808,6 +808,74 @@ def test_german_ler_corpus(tasks_base_path):
     assert len(corpus.test) == 6673, "Mismatch in number of sentences for test split"
 
 
+def test_afri_pos_corpus(tasks_base_path):
+    # This test covers the complete AfricaPOS dataset.
+    supported_versions = ["v1"]
+
+    supported_languages = {
+        "v1": [
+            "bam",
+            "bbj",
+            "ewe",
+            "fon",
+            "hau",
+            "ibo",
+            "kin",
+            "lug",
+            "luo",
+            "mos",
+            "pcm",
+            "nya",
+            "sna",
+            "swa",
+            "tsn",
+            "twi",
+            "wol",
+            "xho",
+            "yor",
+            "zul",
+        ],
+    }
+
+    africa_pos_stats = {
+        "v1": {
+            "bam": {"train": 793, "dev": 158, "test": 634},
+            "bbj": {"train": 750, "dev": 149, "test": 599},
+            "ewe": {"train": 728, "dev": 145, "test": 582},
+            "fon": {"train": 798, "dev": 159, "test": 637},
+            "hau": {"train": 753, "dev": 150, "test": 601},
+            "ibo": {"train": 803, "dev": 160, "test": 642},
+            "kin": {"train": 757, "dev": 151, "test": 604},
+            "lug": {"train": 733, "dev": 146, "test": 586},
+            "luo": {"train": 757, "dev": 151, "test": 604},
+            "mos": {"train": 757, "dev": 151, "test": 604},
+            "pcm": {"train": 752, "dev": 150, "test": 600},
+            "nya": {"train": 728, "dev": 145, "test": 582},
+            "sna": {"train": 747, "dev": 149, "test": 596},
+            "swa": {"train": 675, "dev": 134, "test": 539},
+            "tsn": {"train": 753, "dev": 150, "test": 602},
+            "twi": {"train": 775, "dev": 154, "test": 618},
+            "wol": {"train": 770, "dev": 154, "test": 616},
+            "xho": {"train": 752, "dev": 150, "test": 601},
+            "yor": {"train": 875, "dev": 174, "test": 698},
+            "zul": {"train": 753, "dev": 150, "test": 601},
+        },
+    }
+
+    def check_number_sentences(reference: int, actual: int, split_name: str, language: str, version: str):
+        assert actual == reference, f"Mismatch in number of sentences for {language}@{version}/{split_name}"
+
+    for version in supported_versions:
+        for language in supported_languages[version]:
+            corpus = flair.datasets.AFRICA_POS(languages=language, version=version)
+
+            gold_stats = africa_pos_stats[version][language]
+
+            check_number_sentences(len(corpus.train), gold_stats["train"], "train", language, version)
+            check_number_sentences(len(corpus.dev), gold_stats["dev"], "dev", language, version)
+            check_number_sentences(len(corpus.test), gold_stats["test"], "test", language, version)
+
+
 def test_multi_file_jsonl_corpus_should_use_label_type(tasks_base_path):
     corpus = MultiFileJsonlCorpus(
         train_files=[tasks_base_path / "jsonl/train.jsonl"],

From da36e0fded6bd8a08778fd7289d706e4f98ab148 Mon Sep 17 00:00:00 2001
From: Stefan Schweter
Date: Wed, 24 May 2023 00:03:08 +0200
Subject: [PATCH 04/10] datasets: fix MASAKHA_POS name

---
 flair/datasets/__init__.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/flair/datasets/__init__.py b/flair/datasets/__init__.py
index 092338e4a..8100e4821 100644
--- a/flair/datasets/__init__.py
+++ b/flair/datasets/__init__.py
@@ -163,7 +163,6 @@
 # standard NER datasets
 # Expose all sequence labeling datasets
 from .sequence_labeling import (
-    AFRICA_POS,
     BIOSCOPE,
     CONLL_03,
     CONLL_03_DUTCH,
@@ -174,6 +173,7 @@
     KEYPHRASE_INSPEC,
     KEYPHRASE_SEMEVAL2010,
     KEYPHRASE_SEMEVAL2017,
+    MASAKHA_POS,
     NER_ARABIC_ANER,
     NER_ARABIC_AQMAR,
     NER_BASQUE,
@@ -313,7 +313,6 @@
     "SentenceDataset",
     "MongoDataset",
     "StringDataset",
-    "AFRICA_POS",
     "ANAT_EM",
     "AZDZ",
     "BC2GM",
@@ -449,6 +448,7 @@
     "KEYPHRASE_INSPEC",
     "KEYPHRASE_SEMEVAL2010",
     "KEYPHRASE_SEMEVAL2017",
+    "MASAKHA_POS",
     "NER_ARABIC_ANER",
     "NER_ARABIC_AQMAR",
     "NER_BASQUE",

From d077266ea60ae2a6cb137186cdf90679782271b6 Mon Sep 17 00:00:00 2001
From: Stefan Schweter
Date: Wed, 24 May 2023 00:03:31 +0200
Subject: [PATCH 05/10] datasets: add support for MasakhaPOS dataset

---
 flair/datasets/sequence_labeling.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py
index 3020f15c1..9a69f9aa8 100644
--- a/flair/datasets/sequence_labeling.py
+++ b/flair/datasets/sequence_labeling.py
@@ -4797,7 +4797,7 @@ def __init__(
         )
 
 
-class AFRICA_POS(MultiCorpus):
+class MASAKHA_POS(MultiCorpus):
     def __init__(
         self,
         languages: Union[str, List[str]] = "bam",
@@ -4806,9 +4806,9 @@ def __init__(
         in_memory: bool = True,
         **corpusargs,
     ) -> None:
-        """Initialize the AfricaPOS corpus available on https://github.com/masakhane-io/masakhane-pos.
+        """Initialize the MasakhaPOS corpus available on https://github.com/masakhane-io/masakhane-pos.
 
-        It consists of ten African languages. Pass a language code or a list of language codes to initialize the corpus
+        It consists of 20 African languages. Pass a language code or a list of language codes to initialize the corpus
         with the languages you require. If you pass "all", all languages will be initialized.
         :version: Specifies version of the dataset. Currently, only "v1" is supported.
         :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
@@ -4872,7 +4872,7 @@ def __init__(
             if language not in supported_languages:
                 log.error(f"Language '{language}' is not in list of supported languages!")
                 log.error(f"Supported are '{supported_languages}'!")
-                log.error("Instantiate this Corpus for instance like so 'corpus = AFRICA_POS(languages='luo')'")
+                log.error("Instantiate this Corpus for instance like so 'corpus = MASAKHA_POS(languages='luo')'")
                 raise Exception
 
             language_folder = data_folder / language

From 5c53910f4b6ecfcd2e61d69891b93bb69142f4cb Mon Sep 17 00:00:00 2001
From: Stefan Schweter
Date: Tue, 8 Aug 2023 21:50:21 +0200
Subject: [PATCH 06/10] tests: adjust test cases for MasakhaPOS dataset

---
 tests/test_datasets.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/test_datasets.py b/tests/test_datasets.py
index 36b97391c..9aa7f9c80 100644
--- a/tests/test_datasets.py
+++ b/tests/test_datasets.py
@@ -808,8 +808,8 @@ def test_german_ler_corpus(tasks_base_path):
     assert len(corpus.test) == 6673, "Mismatch in number of sentences for test split"
 
 
-def test_afri_pos_corpus(tasks_base_path):
-    # This test covers the complete AfricaPOS dataset.
+def test_masakha_pos_corpus(tasks_base_path):
+    # This test covers the complete MasakhaPOS dataset.
     supported_versions = ["v1"]
 
     supported_languages = {
@@ -867,7 +867,7 @@ def check_number_sentences(reference: int, actual: int, split_name: str, languag
 
     for version in supported_versions:
        for language in supported_languages[version]:
-            corpus = flair.datasets.AFRICA_POS(languages=language, version=version)
+            corpus = flair.datasets.MASAKHA_POS(languages=language, version=version)
 
             gold_stats = africa_pos_stats[version][language]

From d84092c5bb03d4ace01c80164e28f11b00be6bc1 Mon Sep 17 00:00:00 2001
From: Stefan Schweter
Date: Sun, 11 Jun 2023 11:05:03 +0200
Subject: [PATCH 07/10] datasets: sync with latest MasakhaPOS GitHub version:
 test splits are currently missing and luo + tsn are missing

---
 flair/datasets/sequence_labeling.py |  9 +++++----
 tests/test_datasets.py              | 10 +++++-----
 2 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py
index 9a69f9aa8..22fe85b47 100644
--- a/flair/datasets/sequence_labeling.py
+++ b/flair/datasets/sequence_labeling.py
@@ -4845,13 +4845,13 @@ def __init__(
             "ibo",
             "kin",
             "lug",
-            "luo",
+            #"luo",
             "mos",
             "pcm",
             "nya",
             "sna",
             "swa",
-            "tsn",
+            #"tsn",
             "twi",
             "wol",
             "xho",
@@ -4860,7 +4860,7 @@ def __init__(
         ]
 
         data_paths = {
-            "v1": "https://raw.githubusercontent.com/masakhane-io/masakhane-pos/main/AfricaPOS",
+            "v1": "https://raw.githubusercontent.com/masakhane-io/masakhane-pos/main/data",
         }
 
         # use all languages if explicitly set to "all"
@@ -4880,7 +4880,7 @@ def __init__(
             # download data if necessary
             data_path = f"{data_paths[version]}/{language}"
             cached_path(f"{data_path}/dev.txt", language_folder)
-            cached_path(f"{data_path}/test.txt", language_folder)
+            #cached_path(f"{data_path}/test.txt", language_folder)
             cached_path(f"{data_path}/train.txt", language_folder)
 
             # initialize comlumncorpus and add it to list
@@ -4891,6 +4891,7 @@ def __init__(
                 encoding="utf-8",
                 in_memory=in_memory,
                 name=language,
+                test_file=None,
                 **corpusargs,
             )
             corpora.append(corp)
diff --git a/tests/test_datasets.py b/tests/test_datasets.py
index 9aa7f9c80..ce2224aa0 100644
--- a/tests/test_datasets.py
+++ b/tests/test_datasets.py
@@ -822,13 +822,13 @@ def test_masakha_pos_corpus(tasks_base_path):
             "ibo",
             "kin",
             "lug",
-            "luo",
+            #"luo",
             "mos",
             "pcm",
             "nya",
             "sna",
             "swa",
-            "tsn",
+            #"tsn",
             "twi",
             "wol",
             "xho",
@@ -847,13 +847,13 @@ def test_masakha_pos_corpus(tasks_base_path):
             "ibo": {"train": 803, "dev": 160, "test": 642},
             "kin": {"train": 757, "dev": 151, "test": 604},
             "lug": {"train": 733, "dev": 146, "test": 586},
-            "luo": {"train": 757, "dev": 151, "test": 604},
+            #"luo": {"train": 757, "dev": 151, "test": 604},
             "mos": {"train": 757, "dev": 151, "test": 604},
             "pcm": {"train": 752, "dev": 150, "test": 600},
             "nya": {"train": 728, "dev": 145, "test": 582},
             "sna": {"train": 747, "dev": 149, "test": 596},
             "swa": {"train": 675, "dev": 134, "test": 539},
-            "tsn": {"train": 753, "dev": 150, "test": 602},
+            #"tsn": {"train": 753, "dev": 150, "test": 602},
             "twi": {"train": 775, "dev": 154, "test": 618},
             "wol": {"train": 770, "dev": 154, "test": 616},
             "xho": {"train": 752, "dev": 150, "test": 601},
@@ -873,7 +873,7 @@ def check_number_sentences(reference: int, actual: int, split_name: str, languag
 
             check_number_sentences(len(corpus.train), gold_stats["train"], "train", language, version)
             check_number_sentences(len(corpus.dev), gold_stats["dev"], "dev", language, version)
-            check_number_sentences(len(corpus.test), gold_stats["test"], "test", language, version)
+            #check_number_sentences(len(corpus.test), gold_stats["test"], "test", language, version)

From fccf83bb1c6c0023dc11876cbb250b3a7e473001 Mon Sep 17 00:00:00 2001
From: Stefan Schweter
Date: Thu, 13 Jul 2023 23:42:20 +0200
Subject: [PATCH 08/10] datasets: some minor work on MasakhaPOS dataset parsing

---
 flair/datasets/sequence_labeling.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py
index 22fe85b47..8735a81d9 100644
--- a/flair/datasets/sequence_labeling.py
+++ b/flair/datasets/sequence_labeling.py
@@ -4845,13 +4845,11 @@ def __init__(
             "ibo",
             "kin",
             "lug",
-            #"luo",
             "mos",
             "pcm",
             "nya",
             "sna",
             "swa",
-            #"tsn",
             "twi",
             "wol",
             "xho",
@@ -4872,7 +4870,7 @@ def __init__(
             if language not in supported_languages:
                 log.error(f"Language '{language}' is not in list of supported languages!")
                 log.error(f"Supported are '{supported_languages}'!")
-                log.error("Instantiate this Corpus for instance like so 'corpus = MASAKHA_POS(languages='luo')'")
+                log.error("Instantiate this Corpus for instance like so 'corpus = MASAKHA_POS(languages='bam')'")
                 raise Exception
 
             language_folder = data_folder / language
@@ -4880,7 +4878,7 @@ def __init__(
             # download data if necessary
             data_path = f"{data_paths[version]}/{language}"
             cached_path(f"{data_path}/dev.txt", language_folder)
-            #cached_path(f"{data_path}/test.txt", language_folder)
+            cached_path(f"{data_path}/test.txt", language_folder)
             cached_path(f"{data_path}/train.txt", language_folder)
 
             # initialize comlumncorpus and add it to list
@@ -4891,7 +4889,6 @@ def __init__(
                 encoding="utf-8",
                 in_memory=in_memory,
                 name=language,
-                test_file=None,
                 **corpusargs,
             )
             corpora.append(corp)

From 5bd45264f2198a013e12e869e8e4da2c67dd5857 Mon Sep 17 00:00:00 2001
From: Stefan Schweter
Date: Thu, 13 Jul 2023 23:42:46 +0200
Subject: [PATCH 09/10] tests: sync MasakhaPOS tests with upstream repo

---
 tests/test_datasets.py | 18 +++++++-----------
 1 file changed, 7 insertions(+), 11 deletions(-)

diff --git a/tests/test_datasets.py b/tests/test_datasets.py
index ce2224aa0..56d524d04 100644
--- a/tests/test_datasets.py
+++ b/tests/test_datasets.py
@@ -822,13 +822,11 @@ def test_masakha_pos_corpus(tasks_base_path):
             "ibo",
             "kin",
             "lug",
-            #"luo",
             "mos",
             "pcm",
             "nya",
             "sna",
             "swa",
-            #"tsn",
             "twi",
             "wol",
             "xho",
@@ -839,25 +837,23 @@ def test_masakha_pos_corpus(tasks_base_path):
     africa_pos_stats = {
         "v1": {
-            "bam": {"train": 793, "dev": 158, "test": 634},
+            "bam": {"train": 775, "dev": 154, "test": 619},
             "bbj": {"train": 750, "dev": 149, "test": 599},
             "ewe": {"train": 728, "dev": 145, "test": 582},
-            "fon": {"train": 798, "dev": 159, "test": 637},
+            "fon": {"train": 810, "dev": 161, "test": 646},
             "hau": {"train": 753, "dev": 150, "test": 601},
             "ibo": {"train": 803, "dev": 160, "test": 642},
             "kin": {"train": 757, "dev": 151, "test": 604},
             "lug": {"train": 733, "dev": 146, "test": 586},
-            #"luo": {"train": 757, "dev": 151, "test": 604},
             "mos": {"train": 757, "dev": 151, "test": 604},
             "pcm": {"train": 752, "dev": 150, "test": 600},
             "nya": {"train": 728, "dev": 145, "test": 582},
             "sna": {"train": 747, "dev": 149, "test": 596},
-            "swa": {"train": 675, "dev": 134, "test": 539},
+            "swa": {"train": 693, "dev": 138, "test": 553},
-            #"tsn": {"train": 753, "dev": 150, "test": 602},
-            "twi": {"train": 775, "dev": 154, "test": 618},
+            "twi": {"train": 785, "dev": 157, "test": 628},
-            "wol": {"train": 770, "dev": 154, "test": 616},
+            "wol": {"train": 782, "dev": 156, "test": 625},
             "xho": {"train": 752, "dev": 150, "test": 601},
-            "yor": {"train": 875, "dev": 174, "test": 698},
+            "yor": {"train": 893, "dev": 178, "test": 713},
             "zul": {"train": 753, "dev": 150, "test": 601},
         },
     }
@@ -873,7 +869,7 @@ def check_number_sentences(reference: int, actual: int, split_name: str, languag
 
             check_number_sentences(len(corpus.train), gold_stats["train"], "train", language, version)
             check_number_sentences(len(corpus.dev), gold_stats["dev"], "dev", language, version)
-            #check_number_sentences(len(corpus.test), gold_stats["test"], "test", language, version)
+            check_number_sentences(len(corpus.test), gold_stats["test"], "test", language, version)

From 2ddae63624cc5519d00c7375a18a51148c9e80d3 Mon Sep 17 00:00:00 2001
From: Stefan Schweter
Date: Fri, 11 Aug 2023 12:49:57 +0200
Subject: [PATCH 10/10] datasets: type -> isinstance fix

---
 flair/datasets/sequence_labeling.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py
index 8735a81d9..0a5bf1b58 100644
--- a/flair/datasets/sequence_labeling.py
+++ b/flair/datasets/sequence_labeling.py
@@ -4818,7 +4818,7 @@ def __init__(
         base_path = flair.cache_root / "datasets" if not base_path else Path(base_path)
 
         # if only one language is given
-        if type(languages) == str:
+        if isinstance(languages, str):
            languages = [languages]
 
         # column format