From d8fe0b55d07504bcd067ea71df8b05de517ab8ac Mon Sep 17 00:00:00 2001
From: Stefan Schweter
Date: Tue, 23 May 2023 23:44:30 +0200
Subject: [PATCH 01/10] datasets: include AFRICA_POS implementation

---
 flair/datasets/__init__.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/flair/datasets/__init__.py b/flair/datasets/__init__.py
index e549810d1..092338e4a 100644
--- a/flair/datasets/__init__.py
+++ b/flair/datasets/__init__.py
@@ -163,6 +163,7 @@
 # standard NER datasets
 # Expose all sequence labeling datasets
 from .sequence_labeling import (
+    AFRICA_POS,
     BIOSCOPE,
     CONLL_03,
     CONLL_03_DUTCH,
@@ -312,6 +313,7 @@
     "SentenceDataset",
     "MongoDataset",
     "StringDataset",
+    "AFRICA_POS",
     "ANAT_EM",
     "AZDZ",
     "BC2GM",

From 2bc445b9e3053db8f5fe9207e46401ae73cf3dcb Mon Sep 17 00:00:00 2001
From: Stefan Schweter
Date: Tue, 23 May 2023 23:44:55 +0200
Subject: [PATCH 02/10] datasets: add support for AfricaPOS dataset

---
 flair/datasets/sequence_labeling.py | 103 ++++++++++++++++++++++++++++
 1 file changed, 103 insertions(+)

diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py
index d214873b5..3020f15c1 100644
--- a/flair/datasets/sequence_labeling.py
+++ b/flair/datasets/sequence_labeling.py
@@ -4795,3 +4795,106 @@ def __init__(
             sample_missing_splits=False,
             name="nermud",
         )
+
+
+class AFRICA_POS(MultiCorpus):
+    def __init__(
+        self,
+        languages: Union[str, List[str]] = "bam",
+        version: str = "v1",
+        base_path: Optional[Union[str, Path]] = None,
+        in_memory: bool = True,
+        **corpusargs,
+    ) -> None:
+        """Initialize the AfricaPOS corpus available on https://github.com/masakhane-io/masakhane-pos.
+
+        It consists of ten African languages. Pass a language code or a list of language codes to initialize the corpus
+        with the languages you require. If you pass "all", all languages will be initialized.
+        :version: Specifies version of the dataset. Currently, only "v1" is supported.
+        :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
+        to point to a different folder but typically this should not be necessary.
+        :param in_memory: If True, keeps dataset in memory giving speedups in training.
+        """
+        base_path = flair.cache_root / "datasets" if not base_path else Path(base_path)
+
+        # if only one language is given
+        if type(languages) == str:
+            languages = [languages]
+
+        # column format
+        columns = {0: "text", 1: "pos"}
+
+        # this dataset name
+        dataset_name = self.__class__.__name__.lower()
+
+        supported_versions = ["v1"]
+
+        if version not in supported_versions:
+            log.error(f"The specified version '{version}' is not in the list of supported version!")
+            log.error(f"Supported versions are '{supported_versions}'!")
+            raise Exception
+
+        data_folder = base_path / dataset_name / version
+
+        supported_languages = [
+            "bam",
+            "bbj",
+            "ewe",
+            "fon",
+            "hau",
+            "ibo",
+            "kin",
+            "lug",
+            "luo",
+            "mos",
+            "pcm",
+            "nya",
+            "sna",
+            "swa",
+            "tsn",
+            "twi",
+            "wol",
+            "xho",
+            "yor",
+            "zul",
+        ]
+
+        data_paths = {
+            "v1": "https://raw.githubusercontent.com/masakhane-io/masakhane-pos/main/AfricaPOS",
+        }
+
+        # use all languages if explicitly set to "all"
+        if languages == ["all"]:
+            languages = supported_languages
+
+        corpora: List[Corpus] = []
+        for language in languages:
+            if language not in supported_languages:
+                log.error(f"Language '{language}' is not in list of supported languages!")
+                log.error(f"Supported are '{supported_languages}'!")
+                log.error("Instantiate this Corpus for instance like so 'corpus = AFRICA_POS(languages='luo')'")
+                raise Exception
+
+            language_folder = data_folder / language
+
+            # download data if necessary
+            data_path = f"{data_paths[version]}/{language}"
+            cached_path(f"{data_path}/dev.txt", language_folder)
+            cached_path(f"{data_path}/test.txt", language_folder)
+            cached_path(f"{data_path}/train.txt", language_folder)
+
+            # initialize comlumncorpus and add it to list
+            log.info(f"Reading data for language {language}@{version}")
+            corp = ColumnCorpus(
+                data_folder=language_folder,
+                column_format=columns,
+                encoding="utf-8",
+                in_memory=in_memory,
+                name=language,
+                **corpusargs,
+            )
+            corpora.append(corp)
+        super().__init__(
+            corpora,
+            name="africa-pos-" + "-".join(languages),
+        )

From 9e1e26e18a6f11b87fef708c74bd43ba9dc5bc37 Mon Sep 17 00:00:00 2001
From: Stefan Schweter
Date: Tue, 8 Aug 2023 21:47:29 +0200
Subject: [PATCH 03/10] tests: adjust test cases for MasakhaPOS dataset

---
 tests/test_datasets.py | 68 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 68 insertions(+)

diff --git a/tests/test_datasets.py b/tests/test_datasets.py
index d642be251..36b97391c 100644
--- a/tests/test_datasets.py
+++ b/tests/test_datasets.py
@@ -808,6 +808,74 @@ def test_german_ler_corpus(tasks_base_path):
     assert len(corpus.test) == 6673, "Mismatch in number of sentences for test split"
 
 
+def test_afri_pos_corpus(tasks_base_path):
+    # This test covers the complete AfricaPOS dataset.
+    supported_versions = ["v1"]
+
+    supported_languages = {
+        "v1": [
+            "bam",
+            "bbj",
+            "ewe",
+            "fon",
+            "hau",
+            "ibo",
+            "kin",
+            "lug",
+            "luo",
+            "mos",
+            "pcm",
+            "nya",
+            "sna",
+            "swa",
+            "tsn",
+            "twi",
+            "wol",
+            "xho",
+            "yor",
+            "zul",
+        ],
+    }
+
+    africa_pos_stats = {
+        "v1": {
+            "bam": {"train": 793, "dev": 158, "test": 634},
+            "bbj": {"train": 750, "dev": 149, "test": 599},
+            "ewe": {"train": 728, "dev": 145, "test": 582},
+            "fon": {"train": 798, "dev": 159, "test": 637},
+            "hau": {"train": 753, "dev": 150, "test": 601},
+            "ibo": {"train": 803, "dev": 160, "test": 642},
+            "kin": {"train": 757, "dev": 151, "test": 604},
+            "lug": {"train": 733, "dev": 146, "test": 586},
+            "luo": {"train": 757, "dev": 151, "test": 604},
+            "mos": {"train": 757, "dev": 151, "test": 604},
+            "pcm": {"train": 752, "dev": 150, "test": 600},
+            "nya": {"train": 728, "dev": 145, "test": 582},
+            "sna": {"train": 747, "dev": 149, "test": 596},
+            "swa": {"train": 675, "dev": 134, "test": 539},
+            "tsn": {"train": 753, "dev": 150, "test": 602},
+            "twi": {"train": 775, "dev": 154, "test": 618},
+            "wol": {"train": 770, "dev": 154, "test": 616},
+            "xho": {"train": 752, "dev": 150, "test": 601},
+            "yor": {"train": 875, "dev": 174, "test": 698},
+            "zul": {"train": 753, "dev": 150, "test": 601},
+        },
+    }
+
+    def check_number_sentences(reference: int, actual: int, split_name: str, language: str, version: str):
+        assert actual == reference, f"Mismatch in number of sentences for {language}@{version}/{split_name}"
+
+    for version in supported_versions:
+        for language in supported_languages[version]:
+            corpus = flair.datasets.AFRICA_POS(languages=language, version=version)
+
+            gold_stats = africa_pos_stats[version][language]
+
+            check_number_sentences(len(corpus.train), gold_stats["train"], "train", language, version)
+            check_number_sentences(len(corpus.dev), gold_stats["dev"], "dev", language, version)
+            check_number_sentences(len(corpus.test), gold_stats["test"], "test", language, version)
+
+
 def test_multi_file_jsonl_corpus_should_use_label_type(tasks_base_path):
     corpus = MultiFileJsonlCorpus(
         train_files=[tasks_base_path / "jsonl/train.jsonl"],

From da36e0fded6bd8a08778fd7289d706e4f98ab148 Mon Sep 17 00:00:00 2001
From: Stefan Schweter
Date: Wed, 24 May 2023 00:03:08 +0200
Subject: [PATCH 04/10] datasets: fix MASAKHA_POS name

---
 flair/datasets/__init__.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/flair/datasets/__init__.py b/flair/datasets/__init__.py
index 092338e4a..8100e4821 100644
--- a/flair/datasets/__init__.py
+++ b/flair/datasets/__init__.py
@@ -163,7 +163,6 @@
 # standard NER datasets
 # Expose all sequence labeling datasets
 from .sequence_labeling import (
-    AFRICA_POS,
     BIOSCOPE,
     CONLL_03,
     CONLL_03_DUTCH,
@@ -174,6 +173,7 @@
     KEYPHRASE_INSPEC,
     KEYPHRASE_SEMEVAL2010,
     KEYPHRASE_SEMEVAL2017,
+    MASAKHA_POS,
     NER_ARABIC_ANER,
     NER_ARABIC_AQMAR,
     NER_BASQUE,
@@ -313,7 +313,6 @@
     "SentenceDataset",
     "MongoDataset",
     "StringDataset",
-    "AFRICA_POS",
     "ANAT_EM",
     "AZDZ",
     "BC2GM",
@@ -449,6 +448,7 @@
     "KEYPHRASE_INSPEC",
     "KEYPHRASE_SEMEVAL2010",
     "KEYPHRASE_SEMEVAL2017",
+    "MASAKHA_POS",
     "NER_ARABIC_ANER",
     "NER_ARABIC_AQMAR",
     "NER_BASQUE",

From d077266ea60ae2a6cb137186cdf90679782271b6 Mon Sep 17 00:00:00 2001
From: Stefan Schweter
Date: Wed, 24 May 2023 00:03:31 +0200
Subject: [PATCH 05/10] datasets: add support for MasakhaPOS dataset

---
 flair/datasets/sequence_labeling.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py
index 3020f15c1..9a69f9aa8 100644
--- a/flair/datasets/sequence_labeling.py
+++ b/flair/datasets/sequence_labeling.py
@@ -4797,7 +4797,7 @@ def __init__(
         )
 
 
-class AFRICA_POS(MultiCorpus):
+class MASAKHA_POS(MultiCorpus):
     def __init__(
         self,
         languages: Union[str, List[str]] = "bam",
@@ -4806,9 +4806,9 @@ def __init__(
         in_memory: bool = True,
         **corpusargs,
     ) -> None:
-        """Initialize the AfricaPOS corpus available on https://github.com/masakhane-io/masakhane-pos.
+        """Initialize the MasakhaPOS corpus available on https://github.com/masakhane-io/masakhane-pos.
 
-        It consists of ten African languages. Pass a language code or a list of language codes to initialize the corpus
+        It consists of 20 African languages. Pass a language code or a list of language codes to initialize the corpus
         with the languages you require. If you pass "all", all languages will be initialized.
         :version: Specifies version of the dataset. Currently, only "v1" is supported.
         :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
@@ -4872,7 +4872,7 @@ def __init__(
             if language not in supported_languages:
                 log.error(f"Language '{language}' is not in list of supported languages!")
                 log.error(f"Supported are '{supported_languages}'!")
-                log.error("Instantiate this Corpus for instance like so 'corpus = AFRICA_POS(languages='luo')'")
+                log.error("Instantiate this Corpus for instance like so 'corpus = MASAKHA_POS(languages='luo')'")
                 raise Exception
 
             language_folder = data_folder / language

From 5c53910f4b6ecfcd2e61d69891b93bb69142f4cb Mon Sep 17 00:00:00 2001
From: Stefan Schweter
Date: Tue, 8 Aug 2023 21:50:21 +0200
Subject: [PATCH 06/10] tests: adjust test cases for MasakhaPOS dataset

---
 tests/test_datasets.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/test_datasets.py b/tests/test_datasets.py
index 36b97391c..9aa7f9c80 100644
--- a/tests/test_datasets.py
+++ b/tests/test_datasets.py
@@ -808,8 +808,8 @@ def test_german_ler_corpus(tasks_base_path):
     assert len(corpus.test) == 6673, "Mismatch in number of sentences for test split"
 
 
-def test_afri_pos_corpus(tasks_base_path):
-    # This test covers the complete AfricaPOS dataset.
+def test_masakha_pos_corpus(tasks_base_path):
+    # This test covers the complete MasakhaPOS dataset.
     supported_versions = ["v1"]
 
     supported_languages = {
@@ -867,7 +867,7 @@ def check_number_sentences(reference: int, actual: int, split_name: str, languag
 
     for version in supported_versions:
        for language in supported_languages[version]:
-            corpus = flair.datasets.AFRICA_POS(languages=language, version=version)
+            corpus = flair.datasets.MASAKHA_POS(languages=language, version=version)
 
             gold_stats = africa_pos_stats[version][language]

From d84092c5bb03d4ace01c80164e28f11b00be6bc1 Mon Sep 17 00:00:00 2001
From: Stefan Schweter
Date: Sun, 11 Jun 2023 11:05:03 +0200
Subject: [PATCH 07/10] datasets: sync with latest MasakhaPOS GitHub version:
 test splits are currently missing and luo + tsn are missing

---
 flair/datasets/sequence_labeling.py |  9 +++++----
 tests/test_datasets.py              | 10 +++++-----
 2 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py
index 9a69f9aa8..22fe85b47 100644
--- a/flair/datasets/sequence_labeling.py
+++ b/flair/datasets/sequence_labeling.py
@@ -4845,13 +4845,13 @@ def __init__(
             "ibo",
             "kin",
             "lug",
-            "luo",
+            #"luo",
             "mos",
             "pcm",
             "nya",
             "sna",
             "swa",
-            "tsn",
+            #"tsn",
             "twi",
             "wol",
             "xho",
@@ -4860,7 +4860,7 @@ def __init__(
         ]
 
         data_paths = {
-            "v1": "https://raw.githubusercontent.com/masakhane-io/masakhane-pos/main/AfricaPOS",
+            "v1": "https://raw.githubusercontent.com/masakhane-io/masakhane-pos/main/data",
         }
 
         # use all languages if explicitly set to "all"
@@ -4880,7 +4880,7 @@ def __init__(
             # download data if necessary
             data_path = f"{data_paths[version]}/{language}"
             cached_path(f"{data_path}/dev.txt", language_folder)
-            cached_path(f"{data_path}/test.txt", language_folder)
+            #cached_path(f"{data_path}/test.txt", language_folder)
             cached_path(f"{data_path}/train.txt", language_folder)
 
             # initialize comlumncorpus and add it to list
@@ -4891,6 +4891,7 @@ def __init__(
                 encoding="utf-8",
                 in_memory=in_memory,
                 name=language,
+                test_file=None,
                 **corpusargs,
             )
             corpora.append(corp)
diff --git a/tests/test_datasets.py b/tests/test_datasets.py
index 9aa7f9c80..ce2224aa0 100644
--- a/tests/test_datasets.py
+++ b/tests/test_datasets.py
@@ -822,13 +822,13 @@ def test_masakha_pos_corpus(tasks_base_path):
             "ibo",
             "kin",
             "lug",
-            "luo",
+            #"luo",
             "mos",
             "pcm",
             "nya",
             "sna",
             "swa",
-            "tsn",
+            #"tsn",
             "twi",
             "wol",
             "xho",
@@ -847,13 +847,13 @@ def test_masakha_pos_corpus(tasks_base_path):
             "ibo": {"train": 803, "dev": 160, "test": 642},
             "kin": {"train": 757, "dev": 151, "test": 604},
             "lug": {"train": 733, "dev": 146, "test": 586},
-            "luo": {"train": 757, "dev": 151, "test": 604},
+            #"luo": {"train": 757, "dev": 151, "test": 604},
             "mos": {"train": 757, "dev": 151, "test": 604},
             "pcm": {"train": 752, "dev": 150, "test": 600},
             "nya": {"train": 728, "dev": 145, "test": 582},
             "sna": {"train": 747, "dev": 149, "test": 596},
             "swa": {"train": 675, "dev": 134, "test": 539},
-            "tsn": {"train": 753, "dev": 150, "test": 602},
+            #"tsn": {"train": 753, "dev": 150, "test": 602},
             "twi": {"train": 775, "dev": 154, "test": 618},
             "wol": {"train": 770, "dev": 154, "test": 616},
             "xho": {"train": 752, "dev": 150, "test": 601},
@@ -873,7 +873,7 @@ def check_number_sentences(reference: int, actual: int, split_name: str, languag
 
             check_number_sentences(len(corpus.train), gold_stats["train"], "train", language, version)
             check_number_sentences(len(corpus.dev), gold_stats["dev"], "dev", language, version)
-            check_number_sentences(len(corpus.test), gold_stats["test"], "test", language, version)
+            #check_number_sentences(len(corpus.test), gold_stats["test"], "test", language, version)

From fccf83bb1c6c0023dc11876cbb250b3a7e473001 Mon Sep 17 00:00:00 2001
From: Stefan Schweter
Date: Thu, 13 Jul 2023 23:42:20 +0200
Subject: [PATCH 08/10] datasets: some minor work on MasakhaPOS dataset parsing

---
 flair/datasets/sequence_labeling.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py
index 22fe85b47..8735a81d9 100644
--- a/flair/datasets/sequence_labeling.py
+++ b/flair/datasets/sequence_labeling.py
@@ -4845,13 +4845,11 @@ def __init__(
             "ibo",
             "kin",
             "lug",
-            #"luo",
             "mos",
             "pcm",
             "nya",
             "sna",
             "swa",
-            #"tsn",
             "twi",
             "wol",
             "xho",
@@ -4872,7 +4870,7 @@ def __init__(
             if language not in supported_languages:
                 log.error(f"Language '{language}' is not in list of supported languages!")
                 log.error(f"Supported are '{supported_languages}'!")
-                log.error("Instantiate this Corpus for instance like so 'corpus = MASAKHA_POS(languages='luo')'")
+                log.error("Instantiate this Corpus for instance like so 'corpus = MASAKHA_POS(languages='bam')'")
                 raise Exception
 
             language_folder = data_folder / language
@@ -4880,7 +4878,7 @@ def __init__(
             # download data if necessary
             data_path = f"{data_paths[version]}/{language}"
             cached_path(f"{data_path}/dev.txt", language_folder)
-            #cached_path(f"{data_path}/test.txt", language_folder)
+            cached_path(f"{data_path}/test.txt", language_folder)
             cached_path(f"{data_path}/train.txt", language_folder)
 
             # initialize comlumncorpus and add it to list
@@ -4891,7 +4889,6 @@ def __init__(
                 encoding="utf-8",
                 in_memory=in_memory,
                 name=language,
-                test_file=None,
                 **corpusargs,
             )
             corpora.append(corp)

From 5bd45264f2198a013e12e869e8e4da2c67dd5857 Mon Sep 17 00:00:00 2001
From: Stefan Schweter
Date: Thu, 13 Jul 2023 23:42:46 +0200
Subject: [PATCH 09/10] tests: sync MasakhaPOS tests with upstream repo

---
 tests/test_datasets.py | 18 +++++++-----------
 1 file changed, 7 insertions(+), 11 deletions(-)

diff --git a/tests/test_datasets.py b/tests/test_datasets.py
index ce2224aa0..56d524d04 100644
--- a/tests/test_datasets.py
+++ b/tests/test_datasets.py
@@ -822,13 +822,11 @@ def test_masakha_pos_corpus(tasks_base_path):
             "ibo",
             "kin",
             "lug",
-            #"luo",
             "mos",
             "pcm",
             "nya",
             "sna",
             "swa",
-            #"tsn",
             "twi",
             "wol",
             "xho",
@@ -839,25 +837,23 @@ def test_masakha_pos_corpus(tasks_base_path):
     africa_pos_stats = {
         "v1": {
-            "bam": {"train": 793, "dev": 158, "test": 634},
+            "bam": {"train": 775, "dev": 154, "test": 619},
             "bbj": {"train": 750, "dev": 149, "test": 599},
             "ewe": {"train": 728, "dev": 145, "test": 582},
-            "fon": {"train": 798, "dev": 159, "test": 637},
+            "fon": {"train": 810, "dev": 161, "test": 646},
             "hau": {"train": 753, "dev": 150, "test": 601},
             "ibo": {"train": 803, "dev": 160, "test": 642},
             "kin": {"train": 757, "dev": 151, "test": 604},
             "lug": {"train": 733, "dev": 146, "test": 586},
-            #"luo": {"train": 757, "dev": 151, "test": 604},
             "mos": {"train": 757, "dev": 151, "test": 604},
             "pcm": {"train": 752, "dev": 150, "test": 600},
             "nya": {"train": 728, "dev": 145, "test": 582},
             "sna": {"train": 747, "dev": 149, "test": 596},
-            "swa": {"train": 675, "dev": 134, "test": 539},
+            "swa": {"train": 693, "dev": 138, "test": 553},
-            #"tsn": {"train": 753, "dev": 150, "test": 602},
-            "twi": {"train": 775, "dev": 154, "test": 618},
+            "twi": {"train": 785, "dev": 157, "test": 628},
-            "wol": {"train": 770, "dev": 154, "test": 616},
+            "wol": {"train": 782, "dev": 156, "test": 625},
             "xho": {"train": 752, "dev": 150, "test": 601},
-            "yor": {"train": 875, "dev": 174, "test": 698},
+            "yor": {"train": 893, "dev": 178, "test": 713},
             "zul": {"train": 753, "dev": 150, "test": 601},
         },
     }
@@ -873,7 +869,7 @@ def check_number_sentences(reference: int, actual: int, split_name: str, languag
 
             check_number_sentences(len(corpus.train), gold_stats["train"], "train", language, version)
             check_number_sentences(len(corpus.dev), gold_stats["dev"], "dev", language, version)
-            #check_number_sentences(len(corpus.test), gold_stats["test"], "test", language, version)
+            check_number_sentences(len(corpus.test), gold_stats["test"], "test", language, version)

From 2ddae63624cc5519d00c7375a18a51148c9e80d3 Mon Sep 17 00:00:00 2001
From: Stefan Schweter
Date: Fri, 11 Aug 2023 12:49:57 +0200
Subject: [PATCH 10/10] datasets: type -> isinstance fix

---
 flair/datasets/sequence_labeling.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py
index 8735a81d9..0a5bf1b58 100644
--- a/flair/datasets/sequence_labeling.py
+++ b/flair/datasets/sequence_labeling.py
@@ -4818,7 +4818,7 @@ def __init__(
         base_path = flair.cache_root / "datasets" if not base_path else Path(base_path)
 
         # if only one language is given
-        if type(languages) == str:
+        if isinstance(languages, str):
            languages = [languages]
 
         # column format